tlb.c source code [Linux/arch/x86/mm/tlb.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	#include <linux/init.h>
3
4	#include <linux/mm.h>
5	#include <linux/spinlock.h>
6	#include <linux/smp.h>
7	#include <linux/interrupt.h>
8	#include <linux/export.h>
9	#include <linux/cpu.h>
10	#include <linux/debugfs.h>
11	#include <linux/sched/smt.h>
12	#include <linux/task_work.h>
13	#include <linux/mmu_notifier.h>
14	#include <linux/mmu_context.h>
15
16	#include <asm/tlbflush.h>
17	#include <asm/mmu_context.h>
18	#include <asm/nospec-branch.h>
19	#include <asm/cache.h>
20	#include <asm/cacheflush.h>
21	#include <asm/apic.h>
22	#include <asm/msr.h>
23	#include <asm/perf_event.h>
24	#include <asm/tlb.h>
25
26	#include "mm_internal.h"
27
/*
 * Paravirt glue for the TLB flush primitives.
 *
 * With CONFIG_PARAVIRT, STATIC_NOPV expands to nothing and the
 * __flush_tlb_*() names are not defined here.  Without it, STATIC_NOPV
 * expands to 'static' and the __flush_tlb_*() names map directly onto
 * the native implementations.
 *
 * NOTE(review): STATIC_NOPV is presumably applied to the
 * native_flush_tlb_*() definitions further down in this file -- confirm.
 */
#ifdef CONFIG_PARAVIRT
# define STATIC_NOPV
#else
# define STATIC_NOPV			static
# define __flush_tlb_local		native_flush_tlb_local
# define __flush_tlb_global		native_flush_tlb_global
# define __flush_tlb_one_user(addr)	native_flush_tlb_one_user(addr)
# define __flush_tlb_multi(msk, info)	native_flush_tlb_multi(msk, info)
#endif
37
38	/*
39	* TLB flushing, formerly SMP-only
40	* c/o Linus Torvalds.
41	*
42	* These mean you can really definitely utterly forget about
43	* writing to user space from interrupts. (It's not allowed anyway).
44	*
45	* Optimizations Manfred Spraul <manfred@colorfullife.com>
46	*
47	* More scalable flush, from Andi Kleen
48	*
49	* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
50	*/
51
/*
 * Bits to mangle the TIF_SPEC_* state into the mm pointer which is
 * stored in cpu_tlbstate.last_user_mm_spec.
 *
 * Bit 0 carries the IBPB request, bit 1 the L1D flush request;
 * LAST_USER_MM_SPEC_MASK covers both.
 */
#define LAST_USER_MM_IBPB	0x1UL
#define LAST_USER_MM_L1D_FLUSH	0x2UL
#define LAST_USER_MM_SPEC_MASK	(LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH)

/* Bits to set when tlbstate and flush is (re)initialized */
#define LAST_USER_MM_INIT	LAST_USER_MM_IBPB
62
63	/*
64	* The x86 feature is called PCID (Process Context IDentifier). It is similar
65	* to what is traditionally called ASID on the RISC processors.
66	*
67	* We don't use the traditional ASID implementation, where each process/mm gets
68	* its own ASID and flush/restart when we run out of ASID space.
69	*
70	* Instead we have a small per-cpu array of ASIDs and cache the last few mm's
71	* that came by on this CPU, allowing cheaper switch_mm between processes on
72	* this CPU.
73	*
74	* We end up with different spaces for different things. To avoid confusion we
75	* use different names for each of them:
76	*
77	* ASID - [0, TLB_NR_DYN_ASIDS-1]
78	* the canonical identifier for an mm, dynamically allocated on each CPU
79	* [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
80	* the canonical, global identifier for an mm, identical across all CPUs
81	*
82	* kPCID - [1, MAX_ASID_AVAILABLE]
83	* the value we write into the PCID part of CR3; corresponds to the
84	* ASID+1, because PCID 0 is special.
85	*
86	* uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
87	* for KPTI each mm has two address spaces and thus needs two
88	* PCID values, but we can still do with a single ASID denomination
89	* for each mm. Corresponds to kPCID + 2048.
90	*
91	*/
92
/*
 * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
 * user/kernel switches
 */
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS	1
#else
# define PTI_CONSUMED_PCID_BITS	0
#endif

/* Number of CR3 PCID bits left over for ASIDs once PTI took its share. */
#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)

/*
 * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
 * for them being zero-based.  Another -1 is because PCID 0 is reserved for
 * use by non-PCID-aware users.
 */
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
111
112	/*
113	* Given @asid, compute kPCID
114	*/
115	static inline u16 kern_pcid(u16 asid)
116	{
117	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
118
119	#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
120	/*
121	* Make sure that the dynamic ASID space does not conflict with the
122	* bit we are using to switch between user and kernel ASIDs.
123	*/
124	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (`1` << X86_CR3_PTI_PCID_USER_BIT));
125
126	/*
127	* The ASID being passed in here should have respected the
128	* MAX_ASID_AVAILABLE and thus never have the switch bit set.
129	*/
130	VM_WARN_ON_ONCE(asid & (`1` << X86_CR3_PTI_PCID_USER_BIT));
131	#endif
132	/*
133	* The dynamically-assigned ASIDs that get passed in are small
134	* (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
135	* so do not bother to clear it.
136	*
137	* If PCID is on, ASID-aware code paths put the ASID+1 into the
138	* PCID bits. This serves two purposes. It prevents a nasty
139	* situation in which PCID-unaware code saves CR3, loads some other
140	* value (with PCID == 0), and then restores CR3, thus corrupting
141	* the TLB for ASID 0 if the saved ASID was nonzero. It also means
142	* that any bugs involving loading a PCID-enabled CR3 with
143	* CR4.PCIDE off will trigger deterministically.
144	*/
145	return asid + `1`;
146	}
147
148	/*
149	* Given @asid, compute uPCID
150	*/
151	static inline u16 user_pcid(u16 asid)
152	{
153	u16 ret = kern_pcid(asid);
154	#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
155	ret \|= `1` << X86_CR3_PTI_PCID_USER_BIT;
156	#endif
157	return ret;
158	}
159
160	static inline unsigned long build_cr3(pgd_t pgd, u16 asid, unsigned* long lam)
161	{
162	unsigned long cr3 = __sme_pa(pgd) \| lam;
163
164	if (static_cpu_has(X86_FEATURE_PCID)) {
165	cr3 \|= kern_pcid(asid);
166	} else {
167	VM_WARN_ON_ONCE(asid != `0`);
168	}
169
170	return cr3;
171	}
172
173	static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
174	unsigned long lam)
175	{
176	/*
177	* Use boot_cpu_has() instead of this_cpu_has() as this function
178	* might be called during early boot. This should work even after
179	* boot because all CPU's the have same capabilities:
180	*/
181	VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
182	return build_cr3(pgd, asid, lam) \| CR3_NOFLUSH;
183	}
184
185	/*
186	* We get here when we do something requiring a TLB invalidation
187	* but could not go invalidate all of the contexts. We do the
188	* necessary invalidation by clearing out the 'ctx_id' which
189	* forces a TLB flush when the context is loaded.
190	*/
191	static void clear_asid_other(void)
192	{
193	u16 asid;
194
195	/*
196	* This is only expected to be set if we have disabled
197	* kernel _PAGE_GLOBAL pages.
198	*/
199	if (!static_cpu_has(X86_FEATURE_PTI)) {
200	WARN_ON_ONCE(`1`);
201	return;
202	}
203
204	for (asid = `0`; asid < TLB_NR_DYN_ASIDS; asid++) {
205	/ Do not need to flush the current asid /
206	if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
207	continue;
208	/*
209	* Make sure the next time we go to switch to
210	* this asid, we do a flush:
211	*/
212	this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, `0`);
213	}
214	this_cpu_write(cpu_tlbstate.invalidate_other, false);
215	}
216
217	atomic64_t last_mm_ctx_id = ATOMIC64_INIT(`1`);
218
/*
 * Result of choose_new_asid(): the ASID to load and whether the TLB
 * entries tagged with it have to be flushed when it is loaded.
 */
struct new_asid {
	unsigned int asid	: 16;
	unsigned int need_flush : 1;
};
223
224	static struct new_asid choose_new_asid(struct mm_struct *next, u64 next_tlb_gen)
225	{
226	struct new_asid ns;
227	u16 asid;
228
229	if (!static_cpu_has(X86_FEATURE_PCID)) {
230	ns.asid = `0`;
231	ns.need_flush = `1`;
232	return ns;
233	}
234
235	/*
236	* TLB consistency for global ASIDs is maintained with hardware assisted
237	* remote TLB flushing. Global ASIDs are always up to date.
238	*/
239	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
240	u16 global_asid = mm_global_asid(mm: next);
241
242	if (global_asid) {
243	ns.asid = global_asid;
244	ns.need_flush = `0`;
245	return ns;
246	}
247	}
248
249	if (this_cpu_read(cpu_tlbstate.invalidate_other))
250	clear_asid_other();
251
252	for (asid = `0`; asid < TLB_NR_DYN_ASIDS; asid++) {
253	if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
254	next->context.ctx_id)
255	continue;
256
257	ns.asid = asid;
258	ns.need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < next_tlb_gen);
259	return ns;
260	}
261
262	/*
263	* We don't currently own an ASID slot on this CPU.
264	* Allocate a slot.
265	*/
266	ns.asid = this_cpu_add_return(cpu_tlbstate.next_asid, `1`) - `1`;
267	if (ns.asid >= TLB_NR_DYN_ASIDS) {
268	ns.asid = `0`;
269	this_cpu_write(cpu_tlbstate.next_asid, `1`);
270	}
271	ns.need_flush = true;
272
273	return ns;
274	}
275
276	/*
277	* Global ASIDs are allocated for multi-threaded processes that are
278	* active on multiple CPUs simultaneously, giving each of those
279	* processes the same PCID on every CPU, for use with hardware-assisted
280	* TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR.
281	*
282	* These global ASIDs are held for the lifetime of the process.
283	*/
284	static DEFINE_RAW_SPINLOCK(global_asid_lock);
285	static u16 last_global_asid = MAX_ASID_AVAILABLE;
286	static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE);
287	static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE);
288	static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - `1`;
289
290	/*
291	* When the search for a free ASID in the global ASID space reaches
292	* MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously
293	* freed global ASIDs are safe to re-use.
294	*
295	* This way the global flush only needs to happen at ASID rollover
296	* time, and not at ASID allocation time.
297	*/
298	static void reset_global_asid_space(void)
299	{
300	lockdep_assert_held(&global_asid_lock);
301
302	invlpgb_flush_all_nonglobals();
303
304	/*
305	* The TLB flush above makes it safe to re-use the previously
306	* freed global ASIDs.
307	*/
308	bitmap_andnot(dst: global_asid_used, src1: global_asid_used,
309	src2: global_asid_freed, MAX_ASID_AVAILABLE);
310	bitmap_clear(map: global_asid_freed, start: `0`, MAX_ASID_AVAILABLE);
311
312	/ Restart the search from the start of global ASID space. /
313	last_global_asid = TLB_NR_DYN_ASIDS;
314	}
315
316	static u16 allocate_global_asid(void)
317	{
318	u16 asid;
319
320	lockdep_assert_held(&global_asid_lock);
321
322	/ The previous allocation hit the edge of available address space /
323	if (last_global_asid >= MAX_ASID_AVAILABLE - `1`)
324	reset_global_asid_space();
325
326	asid = find_next_zero_bit(addr: global_asid_used, MAX_ASID_AVAILABLE, offset: last_global_asid);
327
328	if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) {
329	/ This should never happen. /
330	VM_WARN_ONCE(`1`, "Unable to allocate global ASID despite %d available\n",
331	global_asid_available);
332	return `0`;
333	}
334
335	/ Claim this global ASID. /
336	__set_bit(asid, global_asid_used);
337	last_global_asid = asid;
338	global_asid_available--;
339	return asid;
340	}
341
342	/*
343	* Check whether a process is currently active on more than @threshold CPUs.
344	* This is a cheap estimation on whether or not it may make sense to assign
345	* a global ASID to this process, and use broadcast TLB invalidation.
346	*/
347	static bool mm_active_cpus_exceeds(struct mm_struct mm, int* threshold)
348	{
349	int count = `0`;
350	int cpu;
351
352	/ This quick check should eliminate most single threaded programs. /
353	if (cpumask_weight(srcp: mm_cpumask(mm)) <= threshold)
354	return false;
355
356	/ Slower check to make sure. /
357	for_each_cpu(cpu, mm_cpumask(mm)) {
358	/ Skip the CPUs that aren't really running this process. /
359	if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
360	continue;
361
362	if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
363	continue;
364
365	if (++count > threshold)
366	return true;
367	}
368	return false;
369	}
370
371	/*
372	* Assign a global ASID to the current process, protecting against
373	* races between multiple threads in the process.
374	*/
375	static void use_global_asid(struct mm_struct *mm)
376	{
377	u16 asid;
378
379	guard(raw_spinlock_irqsave)(l: &global_asid_lock);
380
381	/ This process is already using broadcast TLB invalidation. /
382	if (mm_global_asid(mm))
383	return;
384
385	/*
386	* The last global ASID was consumed while waiting for the lock.
387	*
388	* If this fires, a more aggressive ASID reuse scheme might be
389	* needed.
390	*/
391	if (!global_asid_available) {
392	VM_WARN_ONCE(`1`, "Ran out of global ASIDs\n");
393	return;
394	}
395
396	asid = allocate_global_asid();
397	if (!asid)
398	return;
399
400	mm_assign_global_asid(mm, asid);
401	}
402
403	void mm_free_global_asid(struct mm_struct *mm)
404	{
405	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
406	return;
407
408	if (!mm_global_asid(mm))
409	return;
410
411	guard(raw_spinlock_irqsave)(l: &global_asid_lock);
412
413	/ The global ASID can be re-used only after flush at wrap-around. /
414	#ifdef CONFIG_BROADCAST_TLB_FLUSH
415	__set_bit(mm->context.global_asid, global_asid_freed);
416
417	mm->context.global_asid = `0`;
418	global_asid_available++;
419	#endif
420	}
421
422	/*
423	* Is the mm transitioning from a CPU-local ASID to a global ASID?
424	*/
425	static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
426	{
427	u16 global_asid = mm_global_asid(mm);
428
429	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
430	return false;
431
432	/ Process is transitioning to a global ASID /
433	if (global_asid && asid != global_asid)
434	return true;
435
436	return false;
437	}
438
439	/*
440	* x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86
441	* systems have over 8k CPUs. Because of this potential ASID shortage,
442	* global ASIDs are handed out to processes that have frequent TLB
443	* flushes and are active on 4 or more CPUs simultaneously.
444	*/
445	static void consider_global_asid(struct mm_struct *mm)
446	{
447	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
448	return;
449
450	/ Check every once in a while. /
451	if ((current->pid & `0x1f`) != (jiffies & `0x1f`))
452	return;
453
454	/*
455	* Assign a global ASID if the process is active on
456	* 4 or more CPUs simultaneously.
457	*/
458	if (mm_active_cpus_exceeds(mm, threshold: `3`))
459	use_global_asid(mm);
460	}
461
462	static void finish_asid_transition(struct flush_tlb_info *info)
463	{
464	struct mm_struct *mm = info->mm;
465	int bc_asid = mm_global_asid(mm);
466	int cpu;
467
468	if (!mm_in_asid_transition(mm))
469	return;
470
471	for_each_cpu(cpu, mm_cpumask(mm)) {
472	/*
473	* The remote CPU is context switching. Wait for that to
474	* finish, to catch the unlikely case of it switching to
475	* the target mm with an out of date ASID.
476	*/
477	while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
478	cpu_relax();
479
480	if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
481	continue;
482
483	/*
484	* If at least one CPU is not using the global ASID yet,
485	* send a TLB flush IPI. The IPI should cause stragglers
486	* to transition soon.
487	*
488	* This can race with the CPU switching to another task;
489	* that results in a (harmless) extra IPI.
490	*/
491	if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
492	flush_tlb_multi(cpumask: mm_cpumask(mm: info->mm), info);
493	return;
494	}
495	}
496
497	/ All the CPUs running this process are using the global ASID. /
498	mm_clear_asid_transition(mm);
499	}
500
501	static void broadcast_tlb_flush(struct flush_tlb_info *info)
502	{
503	bool pmd = info->stride_shift == PMD_SHIFT;
504	unsigned long asid = mm_global_asid(mm: info->mm);
505	unsigned long addr = info->start;
506
507	/*
508	* TLB flushes with INVLPGB are kicked off asynchronously.
509	* The inc_mm_tlb_gen() guarantees page table updates are done
510	* before these TLB flushes happen.
511	*/
512	if (info->end == TLB_FLUSH_ALL) {
513	invlpgb_flush_single_pcid_nosync(pcid: kern_pcid(asid));
514	/ Do any CPUs supporting INVLPGB need PTI? /
515	if (cpu_feature_enabled(X86_FEATURE_PTI))
516	invlpgb_flush_single_pcid_nosync(pcid: user_pcid(asid));
517	} else do {
518	unsigned long nr = `1`;
519
520	if (info->stride_shift <= PMD_SHIFT) {
521	nr = (info->end - addr) >> info->stride_shift;
522	nr = clamp_val(nr, `1`, invlpgb_count_max);
523	}
524
525	invlpgb_flush_user_nr_nosync(pcid: kern_pcid(asid), addr, nr, stride: pmd);
526	if (cpu_feature_enabled(X86_FEATURE_PTI))
527	invlpgb_flush_user_nr_nosync(pcid: user_pcid(asid), addr, nr, stride: pmd);
528
529	addr += nr << info->stride_shift;
530	} while (addr < info->end);
531
532	finish_asid_transition(info);
533
534	/ Wait for the INVLPGBs kicked off above to finish. /
535	__tlbsync();
536	}
537
538	/*
539	* Given an ASID, flush the corresponding user ASID. We can delay this
540	* until the next time we switch to it.
541	*
542	* See SWITCH_TO_USER_CR3.
543	*/
544	static inline void invalidate_user_asid(u16 asid)
545	{
546	/ There is no user ASID if address space separation is off /
547	if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
548	return;
549
550	/*
551	* We only have a single ASID if PCID is off and the CR3
552	* write will have flushed it.
553	*/
554	if (!cpu_feature_enabled(X86_FEATURE_PCID))
555	return;
556
557	if (!static_cpu_has(X86_FEATURE_PTI))
558	return;
559
560	__set_bit(kern_pcid(asid),
561	(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
562	}
563
564	static void load_new_mm_cr3(pgd_t pgdir, u16 new_asid, unsigned* long lam,
565	bool need_flush)
566	{
567	unsigned long new_mm_cr3;
568
569	if (need_flush) {
570	invalidate_user_asid(asid: new_asid);
571	new_mm_cr3 = build_cr3(pgd: pgdir, asid: new_asid, lam);
572	} else {
573	new_mm_cr3 = build_cr3_noflush(pgd: pgdir, asid: new_asid, lam);
574	}
575
576	/*
577	* Caution: many callers of this function expect
578	* that load_cr3() is serializing and orders TLB
579	* fills with respect to the mm_cpumask writes.
580	*/
581	write_cr3(x: new_mm_cr3);
582	}
583
584	void leave_mm(void)
585	{
586	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
587
588	/*
589	* It's plausible that we're in lazy TLB mode while our mm is init_mm.
590	* If so, our callers still expect us to flush the TLB, but there
591	* aren't any user TLB entries in init_mm to worry about.
592	*
593	* This needs to happen before any other sanity checks due to
594	* intel_idle's shenanigans.
595	*/
596	if (loaded_mm == &init_mm)
597	return;
598
599	/ Warn if we're not lazy. /
600	WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));
601
602	switch_mm(NULL, next: &init_mm, NULL);
603	}
604	EXPORT_SYMBOL_GPL(leave_mm);
605
606	void switch_mm(struct mm_struct prev, struct* mm_struct *next,
607	struct task_struct *tsk)
608	{
609	unsigned long flags;
610
611	local_irq_save(flags);
612	switch_mm_irqs_off(NULL, next, tsk);
613	local_irq_restore(flags);
614	}
615
616	/*
617	* Invoked from return to user/guest by a task that opted-in to L1D
618	* flushing but ended up running on an SMT enabled core due to wrong
619	* affinity settings or CPU hotplug. This is part of the paranoid L1D flush
620	* contract which this task requested.
621	*/
622	static void l1d_flush_force_sigbus(struct callback_head *ch)
623	{
624	force_sig(SIGBUS);
625	}
626
627	static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
628	struct task_struct *next)
629	{
630	/ Flush L1D if the outgoing task requests it /
631	if (prev_mm & LAST_USER_MM_L1D_FLUSH)
632	wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
633
634	/ Check whether the incoming task opted in for L1D flush /
635	if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH)))
636	return;
637
638	/*
639	* Validate that it is not running on an SMT sibling as this would
640	* make the exercise pointless because the siblings share L1D. If
641	* it runs on a SMT sibling, notify it with SIGBUS on return to
642	* user/guest
643	*/
644	if (this_cpu_read(cpu_info.smt_active)) {
645	clear_ti_thread_flag(ti: &next->thread_info, TIF_SPEC_L1D_FLUSH);
646	next->l1d_flush_kill.func = l1d_flush_force_sigbus;
647	task_work_add(task: next, twork: &next->l1d_flush_kill, mode: TWA_RESUME);
648	}
649	}
650
651	static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
652	{
653	unsigned long next_tif = read_task_thread_flags(next);
654	unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;
655
656	/*
657	* Ensure that the bit shift above works as expected and the two flags
658	* end up in bit 0 and 1.
659	*/
660	BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + `1`);
661
662	return (unsigned long)next->mm \| spec_bits;
663	}
664
665	static void cond_mitigation(struct task_struct *next)
666	{
667	unsigned long prev_mm, next_mm;
668
669	if (!next \|\| !next->mm)
670	return;
671
672	next_mm = mm_mangle_tif_spec_bits(next);
673	prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
674
675	/*
676	* Avoid user->user BTB/RSB poisoning by flushing them when switching
677	* between processes. This stops one process from doing Spectre-v2
678	* attacks on another.
679	*
680	* Both, the conditional and the always IBPB mode use the mm
681	* pointer to avoid the IBPB when switching between tasks of the
682	* same process. Using the mm pointer instead of mm->context.ctx_id
683	* opens a hypothetical hole vs. mm_struct reuse, which is more or
684	* less impossible to control by an attacker. Aside of that it
685	* would only affect the first schedule so the theoretically
686	* exposed data is not really interesting.
687	*/
688	if (static_branch_likely(&switch_mm_cond_ibpb)) {
689	/*
690	* This is a bit more complex than the always mode because
691	* it has to handle two cases:
692	*
693	* 1) Switch from a user space task (potential attacker)
694	* which has TIF_SPEC_IB set to a user space task
695	* (potential victim) which has TIF_SPEC_IB not set.
696	*
697	* 2) Switch from a user space task (potential attacker)
698	* which has TIF_SPEC_IB not set to a user space task
699	* (potential victim) which has TIF_SPEC_IB set.
700	*
701	* This could be done by unconditionally issuing IBPB when
702	* a task which has TIF_SPEC_IB set is either scheduled in
703	* or out. Though that results in two flushes when:
704	*
705	* - the same user space task is scheduled out and later
706	* scheduled in again and only a kernel thread ran in
707	* between.
708	*
709	* - a user space task belonging to the same process is
710	* scheduled in after a kernel thread ran in between
711	*
712	* - a user space task belonging to the same process is
713	* scheduled in immediately.
714	*
715	* Optimize this with reasonably small overhead for the
716	* above cases. Mangle the TIF_SPEC_IB bit into the mm
717	* pointer of the incoming task which is stored in
718	* cpu_tlbstate.last_user_mm_spec for comparison.
719	*
720	* Issue IBPB only if the mm's are different and one or
721	* both have the IBPB bit set.
722	*/
723	if (next_mm != prev_mm &&
724	(next_mm \| prev_mm) & LAST_USER_MM_IBPB)
725	indirect_branch_prediction_barrier();
726	}
727
728	if (static_branch_unlikely(&switch_mm_always_ibpb)) {
729	/*
730	* Only flush when switching to a user space task with a
731	* different context than the user space task which ran
732	* last on this CPU.
733	*/
734	if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != (unsigned long)next->mm)
735	indirect_branch_prediction_barrier();
736	}
737
738	if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) {
739	/*
740	* Flush L1D when the outgoing task requested it and/or
741	* check whether the incoming task requested L1D flushing
742	* and ended up on an SMT sibling.
743	*/
744	if (unlikely((prev_mm \| next_mm) & LAST_USER_MM_L1D_FLUSH))
745	l1d_flush_evaluate(prev_mm, next_mm, next);
746	}
747
748	this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
749	}
750
#ifdef CONFIG_PERF_EVENTS
/*
 * Set or clear CR4.PCE for @mm.
 *
 * PCE is set when RDPMC is always available, or when it is not globally
 * forbidden and @mm has perf_rdpmc_allowed set; it is cleared otherwise.
 * Uses the *_irqsoff CR4 helpers.
 */
static inline void cr4_update_pce_mm(struct mm_struct *mm)
{
	if (static_branch_unlikely(&rdpmc_always_available_key) ||
	    (!static_branch_unlikely(&rdpmc_never_available_key) &&
	     atomic_read(&mm->context.perf_rdpmc_allowed))) {
		/*
		 * Clear the existing dirty counters to
		 * prevent the leak for an RDPMC task.
		 */
		perf_clear_dirty_counters();
		cr4_set_bits_irqsoff(X86_CR4_PCE);
	} else
		cr4_clear_bits_irqsoff(X86_CR4_PCE);
}

/*
 * Re-evaluate CR4.PCE for the mm currently loaded on this CPU.
 * @ignored is not used.
 */
void cr4_update_pce(void *ignored)
{
	cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
}

#else
/* Without perf events there is no CR4.PCE state to maintain. */
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif
775
776	/*
777	* This optimizes when not actually switching mm's. Some architectures use the
778	* 'unused' argument for this optimization, but x86 must use
779	* 'cpu_tlbstate.loaded_mm' instead because it does not always keep
780	* 'current->active_mm' up to date.
781	*/
782	void switch_mm_irqs_off(struct mm_struct unused, struct* mm_struct *next,
783	struct task_struct *tsk)
784	{
785	struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
786	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
787	bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
788	unsigned cpu = smp_processor_id();
789	unsigned long new_lam;
790	struct new_asid ns;
791	u64 next_tlb_gen;
792
793
794	/ We don't want flush_tlb_func() to run concurrently with us. /
795	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
796	WARN_ON_ONCE(!irqs_disabled());
797
798	/*
799	* Verify that CR3 is what we think it is. This will catch
800	* hypothetical buggy code that directly switches to swapper_pg_dir
801	* without going through leave_mm() / switch_mm_irqs_off() or that
802	* does something like write_cr3(read_cr3_pa()).
803	*
804	* Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
805	* isn't free.
806	*/
807	#ifdef CONFIG_DEBUG_VM
808	if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid,
809	tlbstate_lam_cr3_mask()))) {
810	/*
811	* If we were to BUG here, we'd be very likely to kill
812	* the system so hard that we don't see the call trace.
813	* Try to recover instead by ignoring the error and doing
814	* a global flush to minimize the chance of corruption.
815	*
816	* (This is far from being a fully correct recovery.
817	* Architecturally, the CPU could prefetch something
818	* back into an incorrect ASID slot and leave it there
819	* to cause trouble down the road. It's better than
820	* nothing, though.)
821	*/
822	__flush_tlb_all();
823	}
824	#endif
825	if (was_lazy)
826	this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
827
828	/*
829	* The membarrier system call requires a full memory barrier and
830	* core serialization before returning to user-space, after
831	* storing to rq->curr, when changing mm. This is because
832	* membarrier() sends IPIs to all CPUs that are in the target mm
833	* to make them issue memory barriers. However, if another CPU
834	* switches to/from the target mm concurrently with
835	* membarrier(), it can cause that CPU not to receive an IPI
836	* when it really should issue a memory barrier. Writing to CR3
837	* provides that full memory barrier and core serializing
838	* instruction.
839	*/
840	if (prev == next) {
841	/ Not actually switching mm's /
842	VM_WARN_ON(is_dyn_asid(prev_asid) &&
843	this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
844	next->context.ctx_id);
845
846	/*
847	* If this races with another thread that enables lam, 'new_lam'
848	* might not match tlbstate_lam_cr3_mask().
849	*/
850
851	/*
852	* Even in lazy TLB mode, the CPU should stay set in the
853	* mm_cpumask. The TLB shootdown code can figure out from
854	* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
855	*/
856	if (IS_ENABLED(CONFIG_DEBUG_VM) &&
857	WARN_ON_ONCE(prev != &init_mm && !is_notrack_mm(prev) &&
858	!cpumask_test_cpu(cpu, mm_cpumask(next))))
859	cpumask_set_cpu(cpu, dstp: mm_cpumask(mm: next));
860
861	/ Check if the current mm is transitioning to a global ASID /
862	if (mm_needs_global_asid(mm: next, asid: prev_asid)) {
863	next_tlb_gen = atomic64_read(v: &next->context.tlb_gen);
864	ns = choose_new_asid(next, next_tlb_gen);
865	goto reload_tlb;
866	}
867
868	/*
869	* Broadcast TLB invalidation keeps this ASID up to date
870	* all the time.
871	*/
872	if (is_global_asid(asid: prev_asid))
873	return;
874
875	/*
876	* If the CPU is not in lazy TLB mode, we are just switching
877	* from one thread in a process to another thread in the same
878	* process. No TLB flush required.
879	*/
880	if (!was_lazy)
881	return;
882
883	/*
884	* Read the tlb_gen to check whether a flush is needed.
885	* If the TLB is up to date, just use it.
886	* The barrier synchronizes with the tlb_gen increment in
887	* the TLB shootdown code.
888	*/
889	smp_mb();
890	next_tlb_gen = atomic64_read(v: &next->context.tlb_gen);
891	if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
892	next_tlb_gen)
893	return;
894
895	/*
896	* TLB contents went out of date while we were in lazy
897	* mode. Fall through to the TLB switching code below.
898	*/
899	ns.asid = prev_asid;
900	ns.need_flush = true;
901	} else {
902	/*
903	* Apply process to process speculation vulnerability
904	* mitigations if applicable.
905	*/
906	cond_mitigation(next: tsk);
907
908	/*
909	* Indicate that CR3 is about to change. nmi_uaccess_okay()
910	* and others are sensitive to the window where mm_cpumask(),
911	* CR3 and cpu_tlbstate.loaded_mm are not all in sync.
912	*/
913	this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
914	barrier();
915
916	/ Start receiving IPIs and then read tlb_gen (and LAM below) /
917	if (next != &init_mm && !cpumask_test_cpu(cpu, cpumask: mm_cpumask(mm: next)))
918	cpumask_set_cpu(cpu, dstp: mm_cpumask(mm: next));
919	next_tlb_gen = atomic64_read(v: &next->context.tlb_gen);
920
921	ns = choose_new_asid(next, next_tlb_gen);
922	}
923
924	reload_tlb:
925	new_lam = mm_lam_cr3_mask(mm: next);
926	if (ns.need_flush) {
927	VM_WARN_ON_ONCE(is_global_asid(ns.asid));
928	this_cpu_write(cpu_tlbstate.ctxs[ns.asid].ctx_id, next->context.ctx_id);
929	this_cpu_write(cpu_tlbstate.ctxs[ns.asid].tlb_gen, next_tlb_gen);
930	load_new_mm_cr3(pgdir: next->pgd, new_asid: ns.asid, lam: new_lam, need_flush: true);
931
932	trace_tlb_flush(reason: TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
933	} else {
934	/ The new ASID is already up to date. /
935	load_new_mm_cr3(pgdir: next->pgd, new_asid: ns.asid, lam: new_lam, need_flush: false);
936
937	trace_tlb_flush(reason: TLB_FLUSH_ON_TASK_SWITCH, pages: `0`);
938	}
939
940	/ Make sure we write CR3 before loaded_mm. /
941	barrier();
942
943	this_cpu_write(cpu_tlbstate.loaded_mm, next);
944	this_cpu_write(cpu_tlbstate.loaded_mm_asid, ns.asid);
945	cpu_tlbstate_update_lam(lam: new_lam, untag_mask: mm_untag_mask(mm: next));
946
947	if (next != prev) {
948	cr4_update_pce_mm(mm: next);
949	switch_ldt(prev, next);
950	}
951	}
952
953	/*
954	* Please ignore the name of this function. It should be called
955	* switch_to_kernel_thread().
956	*
957	* enter_lazy_tlb() is a hint from the scheduler that we are entering a
958	* kernel thread or other context without an mm. Acceptable implementations
959	* include doing nothing whatsoever, switching to init_mm, or various clever
960	* lazy tricks to try to minimize TLB flushes.
961	*
962	* The scheduler reserves the right to call enter_lazy_tlb() several times
963	* in a row. It will notify us that we're going back to a real mm by
964	* calling switch_mm_irqs_off().
965	*/
966	void enter_lazy_tlb(struct mm_struct mm, struct* task_struct *tsk)
967	{
968	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
969	return;
970
971	this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
972	}
973
974	/*
975	* Using a temporary mm allows to set temporary mappings that are not accessible
976	* by other CPUs. Such mappings are needed to perform sensitive memory writes
977	* that override the kernel memory protections (e.g., W^X), without exposing the
978	* temporary page-table mappings that are required for these write operations to
979	* other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
980	* mapping is torn down. Temporary mms can also be used for EFI runtime service
981	* calls or similar functionality.
982	*
983	* It is illegal to schedule while using a temporary mm -- the context switch
984	* code is unaware of the temporary mm and does not know how to context switch.
985	* Use a real (non-temporary) mm in a kernel thread if you need to sleep.
986	*
987	* Note: For sensitive memory writes, the temporary mm needs to be used
988	* exclusively by a single core, and IRQs should be disabled while the
989	* temporary mm is loaded, thereby preventing interrupt handler bugs from
990	* overriding the kernel memory protection.
991	*/
992	struct mm_struct use_temporary_mm(struct* mm_struct *temp_mm)
993	{
994	struct mm_struct *prev_mm;
995
996	lockdep_assert_preemption_disabled();
997	guard(irqsave)();
998
999	/*
1000	* Make sure not to be in TLB lazy mode, as otherwise we'll end up
1001	* with a stale address space WITHOUT being in lazy mode after
1002	* restoring the previous mm.
1003	*/
1004	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
1005	leave_mm();
1006
1007	prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1008	switch_mm_irqs_off(NULL, next: temp_mm, current);
1009
1010	/*
1011	* If breakpoints are enabled, disable them while the temporary mm is
1012	* used. Userspace might set up watchpoints on addresses that are used
1013	* in the temporary mm, which would lead to wrong signals being sent or
1014	* crashes.
1015	*
1016	* Note that breakpoints are not disabled selectively, which also causes
1017	* kernel breakpoints (e.g., perf's) to be disabled. This might be
1018	* undesirable, but still seems reasonable as the code that runs in the
1019	* temporary mm should be short.
1020	*/
1021	if (hw_breakpoint_active())
1022	hw_breakpoint_disable();
1023
1024	return prev_mm;
1025	}
1026
1027	void unuse_temporary_mm(struct mm_struct *prev_mm)
1028	{
1029	lockdep_assert_preemption_disabled();
1030	guard(irqsave)();
1031
1032	/ Clear the cpumask, to indicate no TLB flushing is needed anywhere /
1033	cpumask_clear_cpu(smp_processor_id(), dstp: mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm)));
1034
1035	switch_mm_irqs_off(NULL, next: prev_mm, current);
1036
1037	/*
1038	* Restore the breakpoints if they were disabled before the temporary mm
1039	* was loaded.
1040	*/
1041	if (hw_breakpoint_active())
1042	hw_breakpoint_restore();
1043	}
1044
1045	/*
1046	* Call this when reinitializing a CPU. It fixes the following potential
1047	* problems:
1048	*
1049	* - The ASID changed from what cpu_tlbstate thinks it is (most likely
1050	* because the CPU was taken down and came back up with CR3's PCID
1051	* bits clear. CPU hotplug can do this.
1052	*
1053	* - The TLB contains junk in slots corresponding to inactive ASIDs.
1054	*
1055	* - The CPU went so far out to lunch that it may have missed a TLB
1056	* flush.
1057	*/
1058	void initialize_tlbstate_and_flush(void)
1059	{
1060	int i;
1061	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1062	u64 tlb_gen = atomic64_read(v: &init_mm.context.tlb_gen);
1063	unsigned long lam = mm_lam_cr3_mask(mm);
1064	unsigned long cr3 = __read_cr3();
1065
1066	/ Assert that CR3 already references the right mm. /
1067	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
1068
1069	/ LAM expected to be disabled /
1070	WARN_ON(cr3 & (X86_CR3_LAM_U48 \| X86_CR3_LAM_U57));
1071	WARN_ON(lam);
1072
1073	/*
1074	* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
1075	* doesn't work like other CR4 bits because it can only be set from
1076	* long mode.)
1077	*/
1078	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
1079	!(cr4_read_shadow() & X86_CR4_PCIDE));
1080
1081	/ Disable LAM, force ASID 0 and force a TLB flush. /
1082	write_cr3(x: build_cr3(pgd: mm->pgd, asid: `0`, lam: `0`));
1083
1084	/ Reinitialize tlbstate. /
1085	this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
1086	this_cpu_write(cpu_tlbstate.loaded_mm_asid, `0`);
1087	this_cpu_write(cpu_tlbstate.next_asid, `1`);
1088	this_cpu_write(cpu_tlbstate.ctxs[`0`].ctx_id, mm->context.ctx_id);
1089	this_cpu_write(cpu_tlbstate.ctxs[`0`].tlb_gen, tlb_gen);
1090	cpu_tlbstate_update_lam(lam, untag_mask: mm_untag_mask(mm));
1091
1092	for (i = `1`; i < TLB_NR_DYN_ASIDS; i++)
1093	this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, `0`);
1094	}
1095
1096	/*
1097	* flush_tlb_func()'s memory ordering requirement is that any
1098	* TLB fills that happen after we flush the TLB are ordered after we
1099	* read active_mm's tlb_gen. We don't need any explicit barriers
1100	* because all x86 flush operations are serializing and the
1101	* atomic64_read operation won't be reordered by the compiler.
1102	*/
1103	static void flush_tlb_func(void *info)
1104	{
1105	/*
1106	* We have three different tlb_gen values in here. They are:
1107	*
1108	* - mm_tlb_gen: the latest generation.
1109	* - local_tlb_gen: the generation that this CPU has already caught
1110	* up to.
1111	* - f->new_tlb_gen: the generation that the requester of the flush
1112	* wants us to catch up to.
1113	*/
1114	const struct flush_tlb_info *f = info;
1115	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1116	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
1117	u64 local_tlb_gen;
1118	bool local = smp_processor_id() == f->initiating_cpu;
1119	unsigned long nr_invalidate = `0`;
1120	u64 mm_tlb_gen;
1121
1122	/ This code cannot presently handle being reentered. /
1123	VM_WARN_ON(!irqs_disabled());
1124
1125	if (!local) {
1126	inc_irq_stat(irq_tlb_count);
1127	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
1128	}
1129
1130	/ The CPU was left in the mm_cpumask of the target mm. Clear it. /
1131	if (f->mm && f->mm != loaded_mm) {
1132	cpumask_clear_cpu(raw_smp_processor_id(), dstp: mm_cpumask(mm: f->mm));
1133	trace_tlb_flush(reason: TLB_REMOTE_WRONG_CPU, pages: `0`);
1134	return;
1135	}
1136
1137	if (unlikely(loaded_mm == &init_mm))
1138	return;
1139
1140	/ Reload the ASID if transitioning into or out of a global ASID /
1141	if (mm_needs_global_asid(mm: loaded_mm, asid: loaded_mm_asid)) {
1142	switch_mm_irqs_off(NULL, next: loaded_mm, NULL);
1143	loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
1144	}
1145
1146	/ Broadcast ASIDs are always kept up to date with INVLPGB. /
1147	if (is_global_asid(asid: loaded_mm_asid))
1148	return;
1149
1150	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
1151	loaded_mm->context.ctx_id);
1152
1153	if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) {
1154	/*
1155	* We're in lazy mode. We need to at least flush our
1156	* paging-structure cache to avoid speculatively reading
1157	* garbage into our TLB. Since switching to init_mm is barely
1158	* slower than a minimal flush, just switch to init_mm.
1159	*
1160	* This should be rare, with native_flush_tlb_multi() skipping
1161	* IPIs to lazy TLB mode CPUs.
1162	*/
1163	switch_mm_irqs_off(NULL, next: &init_mm, NULL);
1164	return;
1165	}
1166
1167	local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
1168
1169	if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
1170	f->new_tlb_gen <= local_tlb_gen)) {
1171	/*
1172	* The TLB is already up to date in respect to f->new_tlb_gen.
1173	* While the core might be still behind mm_tlb_gen, checking
1174	* mm_tlb_gen unnecessarily would have negative caching effects
1175	* so avoid it.
1176	*/
1177	return;
1178	}
1179
1180	/*
1181	* Defer mm_tlb_gen reading as long as possible to avoid cache
1182	* contention.
1183	*/
1184	mm_tlb_gen = atomic64_read(v: &loaded_mm->context.tlb_gen);
1185
1186	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
1187	/*
1188	* There's nothing to do: we're already up to date. This can
1189	* happen if two concurrent flushes happen -- the first flush to
1190	* be handled can catch us all the way up, leaving no work for
1191	* the second flush.
1192	*/
1193	goto done;
1194	}
1195
1196	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
1197	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
1198
1199	/*
1200	* If we get to this point, we know that our TLB is out of date.
1201	* This does not strictly imply that we need to flush (it's
1202	* possible that f->new_tlb_gen <= local_tlb_gen), but we're
1203	* going to need to flush in the very near future, so we might
1204	* as well get it over with.
1205	*
1206	* The only question is whether to do a full or partial flush.
1207	*
1208	* We do a partial flush if requested and two extra conditions
1209	* are met:
1210	*
1211	* 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
1212	* we've always done all needed flushes to catch up to
1213	* local_tlb_gen. If, for example, local_tlb_gen == 2 and
1214	* f->new_tlb_gen == 3, then we know that the flush needed to bring
1215	* us up to date for tlb_gen 3 is the partial flush we're
1216	* processing.
1217	*
1218	* As an example of why this check is needed, suppose that there
1219	* are two concurrent flushes. The first is a full flush that
1220	* changes context.tlb_gen from 1 to 2. The second is a partial
1221	* flush that changes context.tlb_gen from 2 to 3. If they get
1222	* processed on this CPU in reverse order, we'll see
1223	* local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
1224	* If we were to use __flush_tlb_one_user() and set local_tlb_gen to
1225	* 3, we'd be break the invariant: we'd update local_tlb_gen above
1226	* 1 without the full flush that's needed for tlb_gen 2.
1227	*
1228	* 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
1229	* Partial TLB flushes are not all that much cheaper than full TLB
1230	* flushes, so it seems unlikely that it would be a performance win
1231	* to do a partial flush if that won't bring our TLB fully up to
1232	* date. By doing a full flush instead, we can increase
1233	* local_tlb_gen all the way to mm_tlb_gen and we can probably
1234	* avoid another flush in the very near future.
1235	*/
1236	if (f->end != TLB_FLUSH_ALL &&
1237	f->new_tlb_gen == local_tlb_gen + `1` &&
1238	f->new_tlb_gen == mm_tlb_gen) {
1239	/ Partial flush /
1240	unsigned long addr = f->start;
1241
1242	/ Partial flush cannot have invalid generations /
1243	VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID);
1244
1245	/ Partial flush must have valid mm /
1246	VM_WARN_ON(f->mm == NULL);
1247
1248	nr_invalidate = (f->end - f->start) >> f->stride_shift;
1249
1250	while (addr < f->end) {
1251	flush_tlb_one_user(addr);
1252	addr += `1UL` << f->stride_shift;
1253	}
1254	if (local)
1255	count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
1256	} else {
1257	/ Full flush. /
1258	nr_invalidate = TLB_FLUSH_ALL;
1259
1260	flush_tlb_local();
1261	if (local)
1262	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
1263	}
1264
1265	/ Both paths above update our state to mm_tlb_gen. /
1266	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
1267
1268	/ Tracing is done in a unified manner to reduce the code size /
1269	done:
1270	trace_tlb_flush(reason: !local ? TLB_REMOTE_SHOOTDOWN :
1271	(f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN :
1272	TLB_LOCAL_MM_SHOOTDOWN,
1273	pages: nr_invalidate);
1274	}
1275
1276	static bool should_flush_tlb(int cpu, void *data)
1277	{
1278	struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu);
1279	struct flush_tlb_info *info = data;
1280
1281	/*
1282	* Order the 'loaded_mm' and 'is_lazy' against their
1283	* write ordering in switch_mm_irqs_off(). Ensure
1284	* 'is_lazy' is at least as new as 'loaded_mm'.
1285	*/
1286	smp_rmb();
1287
1288	/ Lazy TLB will get flushed at the next context switch. /
1289	if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
1290	return false;
1291
1292	/ No mm means kernel memory flush. /
1293	if (!info->mm)
1294	return true;
1295
1296	/*
1297	* While switching, the remote CPU could have state from
1298	* either the prev or next mm. Assume the worst and flush.
1299	*/
1300	if (loaded_mm == LOADED_MM_SWITCHING)
1301	return true;
1302
1303	/ The target mm is loaded, and the CPU is not lazy. /
1304	if (loaded_mm == info->mm)
1305	return true;
1306
1307	/ In cpumask, but not the loaded mm? Periodically remove by flushing. /
1308	if (info->trim_cpumask)
1309	return true;
1310
1311	return false;
1312	}
1313
1314	static bool should_trim_cpumask(struct mm_struct *mm)
1315	{
1316	if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) {
1317	WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ);
1318	return true;
1319	}
1320	return false;
1321	}
1322
/*
 * Per-CPU TLB state that is also inspected from other CPUs, e.g. the
 * 'is_lazy' flag that should_flush_tlb() reads for a remote CPU.
 */
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared);
1325
1326	STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
1327	const struct flush_tlb_info *info)
1328	{
1329	/*
1330	* Do accounting and tracing. Note that there are (and have always been)
1331	* cases in which a remote TLB flush will be traced, but eventually
1332	* would not happen.
1333	*/
1334	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
1335	if (info->end == TLB_FLUSH_ALL)
1336	trace_tlb_flush(reason: TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
1337	else
1338	trace_tlb_flush(reason: TLB_REMOTE_SEND_IPI,
1339	pages: (info->end - info->start) >> PAGE_SHIFT);
1340
1341	/*
1342	* If no page tables were freed, we can skip sending IPIs to
1343	* CPUs in lazy TLB mode. They will flush the CPU themselves
1344	* at the next context switch.
1345	*
1346	* However, if page tables are getting freed, we need to send the
1347	* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
1348	* up on the new contents of what used to be page tables, while
1349	* doing a speculative memory access.
1350	*/
1351	if (info->freed_tables \|\| mm_in_asid_transition(mm: info->mm))
1352	on_each_cpu_mask(mask: cpumask, func: flush_tlb_func, info: (void *)info, wait: true);
1353	else
1354	on_each_cpu_cond_mask(cond_func: should_flush_tlb, func: flush_tlb_func,
1355	info: (void *)info, wait: `1`, mask: cpumask);
1356	}
1357
/*
 * Public entry point for a multi-CPU TLB flush; forwards to
 * __flush_tlb_multi(), which is native_flush_tlb_multi() when
 * CONFIG_PARAVIRT is not set.
 */
void flush_tlb_multi(const struct cpumask *cpumask,
		      const struct flush_tlb_info *info)
{
	__flush_tlb_multi(cpumask, info);
}
1363
1364	/*
1365	* See Documentation/arch/x86/tlb.rst for details. We choose 33
1366	* because it is large enough to cover the vast majority (at
1367	* least 95%) of allocations, and is small enough that we are
1368	* confident it will not cause too much overhead. Each single
1369	* flush is about 100 ns, so this caps the maximum overhead at
1370	* _about_ 3,000 ns.
1371	*
1372	* This is in units of pages.
1373	*/
1374	unsigned long tlb_single_page_flush_ceiling __read_mostly = `33`;
1375
1376	static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
1377
1378	#ifdef CONFIG_DEBUG_VM
1379	static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
1380	#endif
1381
1382	static struct flush_tlb_info get_flush_tlb_info(struct* mm_struct *mm,
1383	unsigned long start, unsigned long end,
1384	unsigned int stride_shift, bool freed_tables,
1385	u64 new_tlb_gen)
1386	{
1387	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
1388
1389	#ifdef CONFIG_DEBUG_VM
1390	/*
1391	* Ensure that the following code is non-reentrant and flush_tlb_info
1392	* is not overwritten. This means no TLB flushing is initiated by
1393	* interrupt handlers and machine-check exception handlers.
1394	*/
1395	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != `1`);
1396	#endif
1397
1398	/*
1399	* If the number of flushes is so large that a full flush
1400	* would be faster, do a full flush.
1401	*/
1402	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
1403	start = `0`;
1404	end = TLB_FLUSH_ALL;
1405	}
1406
1407	info->start = start;
1408	info->end = end;
1409	info->mm = mm;
1410	info->stride_shift = stride_shift;
1411	info->freed_tables = freed_tables;
1412	info->new_tlb_gen = new_tlb_gen;
1413	info->initiating_cpu = smp_processor_id();
1414	info->trim_cpumask = `0`;
1415
1416	return info;
1417	}
1418
/*
 * Release the descriptor obtained from get_flush_tlb_info().  Only does
 * work under CONFIG_DEBUG_VM, where it drops the reentrancy counter.
 */
static void put_flush_tlb_info(void)
{
#ifdef CONFIG_DEBUG_VM
	/* Complete reentrancy prevention checks */
	barrier();
	this_cpu_dec(flush_tlb_info_idx);
#endif
}
1427
1428	void flush_tlb_mm_range(struct mm_struct mm, unsigned* long start,
1429	unsigned long end, unsigned int stride_shift,
1430	bool freed_tables)
1431	{
1432	struct flush_tlb_info *info;
1433	int cpu = get_cpu();
1434	u64 new_tlb_gen;
1435
1436	/ This is also a barrier that synchronizes with switch_mm(). /
1437	new_tlb_gen = inc_mm_tlb_gen(mm);
1438
1439	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
1440	new_tlb_gen);
1441
1442	/*
1443	* flush_tlb_multi() is not optimized for the common case in which only
1444	* a local TLB flush is needed. Optimize this use-case by calling
1445	* flush_tlb_func_local() directly in this case.
1446	*/
1447	if (mm_global_asid(mm)) {
1448	broadcast_tlb_flush(info);
1449	} else if (cpumask_any_but(mask: mm_cpumask(mm), cpu) < nr_cpu_ids) {
1450	info->trim_cpumask = should_trim_cpumask(mm);
1451	flush_tlb_multi(cpumask: mm_cpumask(mm), info);
1452	consider_global_asid(mm);
1453	} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
1454	lockdep_assert_irqs_enabled();
1455	local_irq_disable();
1456	flush_tlb_func(info);
1457	local_irq_enable();
1458	}
1459
1460	put_flush_tlb_info();
1461	put_cpu();
1462	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
1463	}
1464
/*
 * Cross-CPU callback used by the on_each_cpu() fallback paths: account the
 * received flush and flush everything on this CPU.  @info is unused.
 */
static void do_flush_tlb_all(void *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	__flush_tlb_all();
}
1470
1471	void flush_tlb_all(void)
1472	{
1473	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
1474
1475	/ First try (faster) hardware-assisted TLB invalidation. /
1476	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
1477	invlpgb_flush_all();
1478	else
1479	/ Fall back to the IPI-based invalidation. /
1480	on_each_cpu(func: do_flush_tlb_all, NULL, wait: `1`);
1481	}
1482
1483	/ Flush an arbitrarily large range of memory with INVLPGB. /
1484	static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
1485	{
1486	unsigned long addr, nr;
1487
1488	for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
1489	nr = (info->end - addr) >> PAGE_SHIFT;
1490
1491	/*
1492	* INVLPGB has a limit on the size of ranges it can
1493	* flush. Break up large flushes.
1494	*/
1495	nr = clamp_val(nr, `1`, invlpgb_count_max);
1496
1497	invlpgb_flush_addr_nosync(addr, nr);
1498	}
1499	__tlbsync();
1500	}
1501
1502	static void do_kernel_range_flush(void *info)
1503	{
1504	struct flush_tlb_info *f = info;
1505	unsigned long addr;
1506
1507	/ flush range by one by one 'invlpg' /
1508	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
1509	flush_tlb_one_kernel(addr);
1510	}
1511
1512	static void kernel_tlb_flush_all(struct flush_tlb_info *info)
1513	{
1514	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
1515	invlpgb_flush_all();
1516	else
1517	on_each_cpu(func: do_flush_tlb_all, NULL, wait: `1`);
1518	}
1519
1520	static void kernel_tlb_flush_range(struct flush_tlb_info *info)
1521	{
1522	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
1523	invlpgb_kernel_range_flush(info);
1524	else
1525	on_each_cpu(func: do_kernel_range_flush, info, wait: `1`);
1526	}
1527
1528	void flush_tlb_kernel_range(unsigned long start, unsigned long end)
1529	{
1530	struct flush_tlb_info *info;
1531
1532	guard(preempt)();
1533
1534	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, freed_tables: false,
1535	TLB_GENERATION_INVALID);
1536
1537	if (info->end == TLB_FLUSH_ALL)
1538	kernel_tlb_flush_all(info);
1539	else
1540	kernel_tlb_flush_range(info);
1541
1542	put_flush_tlb_info();
1543	}
1544
1545	/*
1546	* This can be used from process context to figure out what the value of
1547	* CR3 is without needing to do a (slow) __read_cr3().
1548	*
1549	* It's intended to be used for code like KVM that sneakily changes CR3
1550	* and needs to restore it. It needs to be used very carefully.
1551	*/
1552	unsigned long __get_current_cr3_fast(void)
1553	{
1554	unsigned long cr3 =
1555	build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
1556	this_cpu_read(cpu_tlbstate.loaded_mm_asid),
1557	lam: tlbstate_lam_cr3_mask());
1558
1559	/ For now, be very restrictive about when this can be called. /
1560	VM_WARN_ON(in_nmi() \|\| preemptible());
1561
1562	VM_BUG_ON(cr3 != __read_cr3());
1563	return cr3;
1564	}
1565	EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
1566
1567	/*
1568	* Flush one page in the kernel mapping
1569	*/
1570	void flush_tlb_one_kernel(unsigned long addr)
1571	{
1572	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
1573
1574	/*
1575	* If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
1576	* paravirt equivalent. Even with PCID, this is sufficient: we only
1577	* use PCID if we also use global PTEs for the kernel mapping, and
1578	* INVLPG flushes global translations across all address spaces.
1579	*
1580	* If PTI is on, then the kernel is mapped with non-global PTEs, and
1581	* __flush_tlb_one_user() will flush the given address for the current
1582	* kernel address space and for its usermode counterpart, but it does
1583	* not flush it for other address spaces.
1584	*/
1585	flush_tlb_one_user(addr);
1586
1587	if (!static_cpu_has(X86_FEATURE_PTI))
1588	return;
1589
1590	/*
1591	* See above. We need to propagate the flush to all other address
1592	* spaces. In principle, we only need to propagate it to kernelmode
1593	* address spaces, but the extra bookkeeping we would need is not
1594	* worth it.
1595	*/
1596	this_cpu_write(cpu_tlbstate.invalidate_other, true);
1597	}
1598
1599	/*
1600	* Flush one page in the user mapping
1601	*/
1602	STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
1603	{
1604	u32 loaded_mm_asid;
1605	bool cpu_pcide;
1606
1607	/ Flush 'addr' from the kernel PCID: /
1608	invlpg(addr);
1609
1610	/ If PTI is off there is no user PCID and nothing to flush. /
1611	if (!static_cpu_has(X86_FEATURE_PTI))
1612	return;
1613
1614	loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
1615	cpu_pcide = this_cpu_read(cpu_tlbstate.cr4) & X86_CR4_PCIDE;
1616
1617	/*
1618	* invpcid_flush_one(pcid>0) will #GP if CR4.PCIDE==0. Check
1619	* 'cpu_pcide' to ensure that this CPU will not trigger those
1620	* #GP's even if called before CR4.PCIDE has been initialized.
1621	*/
1622	if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_pcide)
1623	invpcid_flush_one(pcid: user_pcid(asid: loaded_mm_asid), addr);
1624	else
1625	invalidate_user_asid(asid: loaded_mm_asid);
1626	}
1627
/*
 * Public entry point for flushing a single user page; forwards to
 * __flush_tlb_one_user(), which is native_flush_tlb_one_user() when
 * CONFIG_PARAVIRT is not set.
 */
void flush_tlb_one_user(unsigned long addr)
{
	__flush_tlb_one_user(addr);
}
1632
1633	/*
1634	* Flush everything
1635	*/
1636	STATIC_NOPV void native_flush_tlb_global(void)
1637	{
1638	unsigned long flags;
1639
1640	if (static_cpu_has(X86_FEATURE_INVPCID)) {
1641	/*
1642	* Using INVPCID is considerably faster than a pair of writes
1643	* to CR4 sandwiched inside an IRQ flag save/restore.
1644	*
1645	* Note, this works with CR4.PCIDE=0 or 1.
1646	*/
1647	invpcid_flush_all();
1648	return;
1649	}
1650
1651	/*
1652	* Read-modify-write to CR4 - protect it from preemption and
1653	* from interrupts. (Use the raw variant because this code can
1654	* be called from deep inside debugging code.)
1655	*/
1656	raw_local_irq_save(flags);
1657
1658	__native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
1659
1660	raw_local_irq_restore(flags);
1661	}
1662
1663	/*
1664	* Flush the entire current user mapping
1665	*/
1666	STATIC_NOPV void native_flush_tlb_local(void)
1667	{
1668	/*
1669	* Preemption or interrupts must be disabled to protect the access
1670	* to the per CPU variable and to prevent being preempted between
1671	* read_cr3() and write_cr3().
1672	*/
1673	WARN_ON_ONCE(preemptible());
1674
1675	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
1676
1677	/ If current->mm == NULL then the read_cr3() "borrows" an mm /
1678	native_write_cr3(val: __native_read_cr3());
1679	}
1680
/*
 * Public entry point for a local TLB flush; forwards to
 * __flush_tlb_local(), which is native_flush_tlb_local() when
 * CONFIG_PARAVIRT is not set.
 */
void flush_tlb_local(void)
{
	__flush_tlb_local();
}
1685
1686	/*
1687	* Flush everything
1688	*/
1689	void __flush_tlb_all(void)
1690	{
1691	/*
1692	* This is to catch users with enabled preemption and the PGE feature
1693	* and don't trigger the warning in __native_flush_tlb().
1694	*/
1695	VM_WARN_ON_ONCE(preemptible());
1696
1697	if (cpu_feature_enabled(X86_FEATURE_PGE)) {
1698	__flush_tlb_global();
1699	} else {
1700	/*
1701	* !PGE -> !PCID (setup_pcid()), thus every flush is total.
1702	*/
1703	flush_tlb_local();
1704	}
1705	}
1706	EXPORT_SYMBOL_GPL(__flush_tlb_all);
1707
1708	void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
1709	{
1710	struct flush_tlb_info *info;
1711
1712	int cpu = get_cpu();
1713
1714	info = get_flush_tlb_info(NULL, start: `0`, TLB_FLUSH_ALL, stride_shift: `0`, freed_tables: false,
1715	TLB_GENERATION_INVALID);
1716	/*
1717	* flush_tlb_multi() is not optimized for the common case in which only
1718	* a local TLB flush is needed. Optimize this use-case by calling
1719	* flush_tlb_func_local() directly in this case.
1720	*/
1721	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) {
1722	invlpgb_flush_all_nonglobals();
1723	batch->unmapped_pages = false;
1724	} else if (cpumask_any_but(mask: &batch->cpumask, cpu) < nr_cpu_ids) {
1725	flush_tlb_multi(cpumask: &batch->cpumask, info);
1726	} else if (cpumask_test_cpu(cpu, cpumask: &batch->cpumask)) {
1727	lockdep_assert_irqs_enabled();
1728	local_irq_disable();
1729	flush_tlb_func(info);
1730	local_irq_enable();
1731	}
1732
1733	cpumask_clear(dstp: &batch->cpumask);
1734
1735	put_flush_tlb_info();
1736	put_cpu();
1737	}
1738
1739	/*
1740	* Blindly accessing user memory from NMI context can be dangerous
1741	* if we're in the middle of switching the current user task or
1742	* switching the loaded mm. It can also be dangerous if we
1743	* interrupted some kernel code that was temporarily using a
1744	* different mm.
1745	*/
1746	bool nmi_uaccess_okay(void)
1747	{
1748	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1749	struct mm_struct *current_mm = current->mm;
1750
1751	VM_WARN_ON_ONCE(!loaded_mm);
1752
1753	/*
1754	* The condition we want to check is
1755	* current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
1756	* if we're running in a VM with shadow paging, and nmi_uaccess_okay()
1757	* is supposed to be reasonably fast.
1758	*
1759	* Instead, we check the almost equivalent but somewhat conservative
1760	* condition below, and we rely on the fact that switch_mm_irqs_off()
1761	* sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
1762	*/
1763	if (loaded_mm != current_mm)
1764	return false;
1765
1766	VM_WARN_ON_ONCE(__pa(current_mm->pgd) != read_cr3_pa());
1767
1768	return true;
1769	}
1770
1771	static ssize_t tlbflush_read_file(struct file file, char* __user *user_buf,
1772	size_t count, loff_t *ppos)
1773	{
1774	char buf[`32`];
1775	unsigned int len;
1776
1777	len = sprintf(buf, fmt: "%ld\n", tlb_single_page_flush_ceiling);
1778	return simple_read_from_buffer(to: user_buf, count, ppos, from: buf, available: len);
1779	}
1780
1781	static ssize_t tlbflush_write_file(struct file *file,
1782	const char __user user_buf, size_t count, loff_t ppos)
1783	{
1784	char buf[`32`];
1785	ssize_t len;
1786	int ceiling;
1787
1788	len = min(count, sizeof(buf) - `1`);
1789	if (copy_from_user(to: buf, from: user_buf, n: len))
1790	return -EFAULT;
1791
1792	buf[len] = `'\0'`;
1793	if (kstrtoint(s: buf, base: `0`, res: &ceiling))
1794	return -EINVAL;
1795
1796	if (ceiling < `0`)
1797	return -EINVAL;
1798
1799	tlb_single_page_flush_ceiling = ceiling;
1800	return count;
1801	}
1802
/* File operations backing the tlb_single_page_flush_ceiling debugfs file. */
static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};
1808
1809	static int __init create_tlb_single_page_flush_ceiling(void)
1810	{
1811	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR \| S_IWUSR,
1812	arch_debugfs_dir, NULL, &fops_tlbflush);
1813	return `0`;
1814	}
1815	late_initcall(create_tlb_single_page_flush_ceiling);
1816

/* Source: Linux/arch/x86/mm/tlb.c */