/* kvm.c source code [Linux/arch/x86/kernel/kvm.c] */

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * KVM paravirt_ops implementation
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <aliguori@us.ibm.com>
 */
9
10	#define pr_fmt(fmt) "kvm-guest: " fmt
11
12	#include <linux/context_tracking.h>
13	#include <linux/init.h>
14	#include <linux/irq.h>
15	#include <linux/kernel.h>
16	#include <linux/kvm_para.h>
17	#include <linux/cpu.h>
18	#include <linux/mm.h>
19	#include <linux/highmem.h>
20	#include <linux/hardirq.h>
21	#include <linux/notifier.h>
22	#include <linux/reboot.h>
23	#include <linux/hash.h>
24	#include <linux/sched.h>
25	#include <linux/slab.h>
26	#include <linux/kprobes.h>
27	#include <linux/nmi.h>
28	#include <linux/swait.h>
29	#include <linux/syscore_ops.h>
30	#include <linux/cc_platform.h>
31	#include <linux/efi.h>
32	#include <asm/timer.h>
33	#include <asm/cpu.h>
34	#include <asm/traps.h>
35	#include <asm/desc.h>
36	#include <asm/tlbflush.h>
37	#include <asm/apic.h>
38	#include <asm/apicdef.h>
39	#include <asm/hypervisor.h>
40	#include <asm/mtrr.h>
41	#include <asm/tlb.h>
42	#include <asm/cpuidle_haltpoll.h>
43	#include <asm/msr.h>
44	#include <asm/ptrace.h>
45	#include <asm/reboot.h>
46	#include <asm/svm.h>
47	#include <asm/e820/api.h>
48
49	DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);
50
51	static int kvmapf = `1`;
52
53	static int __init parse_no_kvmapf(char *arg)
54	{
55	kvmapf = `0`;
56	return `0`;
57	}
58
59	early_param("no-kvmapf", parse_no_kvmapf);
60
61	static int steal_acc = `1`;
62	static int __init parse_no_stealacc(char *arg)
63	{
64	steal_acc = `0`;
65	return `0`;
66	}
67
68	early_param("no-steal-acc", parse_no_stealacc);
69
70	static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
71	static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(`64`);
72	DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(`64`) __visible;
73	static int has_steal_clock = `0`;
74
75	static int has_guest_poll = `0`;
/*
 * No need for any "IO delay" on KVM.
 *
 * Deliberately empty; installed as pv_ops.cpu.io_delay by
 * paravirt_ops_setup() when KVM_FEATURE_NOP_IO_DELAY is present.
 */
static void kvm_io_delay(void)
{
}
82
83	#define KVM_TASK_SLEEP_HASHBITS 8
84	#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
85
86	struct kvm_task_sleep_node {
87	struct hlist_node link;
88	struct swait_queue_head wq;
89	u32 token;
90	int cpu;
91	};
92
93	static struct kvm_task_sleep_head {
94	raw_spinlock_t lock;
95	struct hlist_head list;
96	} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
97
98	static struct kvm_task_sleep_node _find_apf_task(struct* kvm_task_sleep_head *b,
99	u32 token)
100	{
101	struct hlist_node *p;
102
103	hlist_for_each(p, &b->list) {
104	struct kvm_task_sleep_node *n =
105	hlist_entry(p, typeof(*n), link);
106	if (n->token == token)
107	return n;
108	}
109
110	return NULL;
111	}
112
113	static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
114	{
115	u32 key = hash_32(val: token, KVM_TASK_SLEEP_HASHBITS);
116	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
117	struct kvm_task_sleep_node *e;
118
119	raw_spin_lock(&b->lock);
120	e = _find_apf_task(b, token);
121	if (e) {
122	/ dummy entry exist -> wake up was delivered ahead of PF /
123	hlist_del(n: &e->link);
124	raw_spin_unlock(&b->lock);
125	kfree(objp: e);
126	return false;
127	}
128
129	n->token = token;
130	n->cpu = smp_processor_id();
131	init_swait_queue_head(&n->wq);
132	hlist_add_head(n: &n->link, h: &b->list);
133	raw_spin_unlock(&b->lock);
134	return true;
135	}
136
137	/*
138	* kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
139	* @token: Token to identify the sleep node entry
140	*
141	* Invoked from the async pagefault handling code or from the VM exit page
142	* fault handler. In both cases RCU is watching.
143	*/
144	void kvm_async_pf_task_wait_schedule(u32 token)
145	{
146	struct kvm_task_sleep_node n;
147	DECLARE_SWAITQUEUE(wait);
148
149	lockdep_assert_irqs_disabled();
150
151	if (!kvm_async_pf_queue_task(token, n: &n))
152	return;
153
154	for (;;) {
155	prepare_to_swait_exclusive(q: &n.wq, wait: &wait, TASK_UNINTERRUPTIBLE);
156	if (hlist_unhashed(h: &n.link))
157	break;
158
159	local_irq_enable();
160	schedule();
161	local_irq_disable();
162	}
163	finish_swait(q: &n.wq, wait: &wait);
164	}
165	EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
166
167	static void apf_task_wake_one(struct kvm_task_sleep_node *n)
168	{
169	hlist_del_init(n: &n->link);
170	if (swq_has_sleeper(wq: &n->wq))
171	swake_up_one(q: &n->wq);
172	}
173
174	static void apf_task_wake_all(void)
175	{
176	int i;
177
178	for (i = `0`; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
179	struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
180	struct kvm_task_sleep_node *n;
181	struct hlist_node p, next;
182
183	raw_spin_lock(&b->lock);
184	hlist_for_each_safe(p, next, &b->list) {
185	n = hlist_entry(p, typeof(*n), link);
186	if (n->cpu == smp_processor_id())
187	apf_task_wake_one(n);
188	}
189	raw_spin_unlock(&b->lock);
190	}
191	}
192
193	static void kvm_async_pf_task_wake(u32 token)
194	{
195	u32 key = hash_32(val: token, KVM_TASK_SLEEP_HASHBITS);
196	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
197	struct kvm_task_sleep_node n, dummy = NULL;
198
199	if (token == ~`0`) {
200	apf_task_wake_all();
201	return;
202	}
203
204	again:
205	raw_spin_lock(&b->lock);
206	n = _find_apf_task(b, token);
207	if (!n) {
208	/*
209	* Async #PF not yet handled, add a dummy entry for the token.
210	* Allocating the token must be down outside of the raw lock
211	* as the allocator is preemptible on PREEMPT_RT kernels.
212	*/
213	if (!dummy) {
214	raw_spin_unlock(&b->lock);
215	dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);
216
217	/*
218	* Continue looping on allocation failure, eventually
219	* the async #PF will be handled and allocating a new
220	* node will be unnecessary.
221	*/
222	if (!dummy)
223	cpu_relax();
224
225	/*
226	* Recheck for async #PF completion before enqueueing
227	* the dummy token to avoid duplicate list entries.
228	*/
229	goto again;
230	}
231	dummy->token = token;
232	dummy->cpu = smp_processor_id();
233	init_swait_queue_head(&dummy->wq);
234	hlist_add_head(n: &dummy->link, h: &b->list);
235	dummy = NULL;
236	} else {
237	apf_task_wake_one(n);
238	}
239	raw_spin_unlock(&b->lock);
240
241	/ A dummy token might be allocated and ultimately not used. /
242	kfree(objp: dummy);
243	}
244
245	noinstr u32 kvm_read_and_reset_apf_flags(void)
246	{
247	u32 flags = `0`;
248
249	if (__this_cpu_read(async_pf_enabled)) {
250	flags = __this_cpu_read(apf_reason.flags);
251	__this_cpu_write(apf_reason.flags, `0`);
252	}
253
254	return flags;
255	}
256	EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
257
258	noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
259	{
260	u32 flags = kvm_read_and_reset_apf_flags();
261	irqentry_state_t state;
262
263	if (!flags)
264	return false;
265
266	state = irqentry_enter(regs);
267	instrumentation_begin();
268
269	/*
270	* If the host managed to inject an async #PF into an interrupt
271	* disabled region, then die hard as this is not going to end well
272	* and the host side is seriously broken.
273	*/
274	if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
275	panic(fmt: "Host injected async #PF in interrupt disabled region\n");
276
277	if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
278	if (unlikely(!(user_mode(regs))))
279	panic(fmt: "Host injected async #PF in kernel mode\n");
280	/ Page is swapped out by the host. /
281	kvm_async_pf_task_wait_schedule(token);
282	} else {
283	WARN_ONCE(`1`, "Unexpected async PF flags: %x\n", flags);
284	}
285
286	instrumentation_end();
287	irqentry_exit(regs, state);
288	return true;
289	}
290
291	DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
292	{
293	struct pt_regs *old_regs = set_irq_regs(regs);
294	u32 token;
295
296	apic_eoi();
297
298	inc_irq_stat(irq_hv_callback_count);
299
300	if (__this_cpu_read(async_pf_enabled)) {
301	token = __this_cpu_read(apf_reason.token);
302	kvm_async_pf_task_wake(token);
303	__this_cpu_write(apf_reason.token, `0`);
304	wrmsrq(MSR_KVM_ASYNC_PF_ACK, val: `1`);
305	}
306
307	set_irq_regs(old_regs);
308	}
309
310	static void __init paravirt_ops_setup(void)
311	{
312	pv_info.name = "KVM";
313
314	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
315	pv_ops.cpu.io_delay = kvm_io_delay;
316
317	#ifdef CONFIG_X86_IO_APIC
318	no_timer_check = `1`;
319	#endif
320	}
321
322	static void kvm_register_steal_time(void)
323	{
324	int cpu = smp_processor_id();
325	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
326
327	if (!has_steal_clock)
328	return;
329
330	wrmsrq(MSR_KVM_STEAL_TIME, val: (slow_virt_to_phys(address: st) \| KVM_MSR_ENABLED));
331	pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
332	(unsigned long long) slow_virt_to_phys(st));
333	}
334
335	static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
336
337	static notrace __maybe_unused void kvm_guest_apic_eoi_write(void)
338	{
339	/**
340	* This relies on __test_and_clear_bit to modify the memory
341	* in a way that is atomic with respect to the local CPU.
342	* The hypervisor only accesses this memory from the local CPU so
343	* there's no need for lock or memory barriers.
344	* An optimization barrier is implied in apic write.
345	*/
346	if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
347	return;
348	apic_native_eoi();
349	}
350
351	static void kvm_guest_cpu_init(void)
352	{
353	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
354	u64 pa;
355
356	WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
357
358	pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
359	pa \|= KVM_ASYNC_PF_ENABLED \| KVM_ASYNC_PF_DELIVERY_AS_INT;
360
361	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
362	pa \|= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
363
364	wrmsrq(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
365
366	wrmsrq(MSR_KVM_ASYNC_PF_EN, val: pa);
367	__this_cpu_write(async_pf_enabled, true);
368	pr_debug("setup async PF for cpu %d\n", smp_processor_id());
369	}
370
371	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
372	unsigned long pa;
373
374	/ Size alignment is implied but just to make it explicit. /
375	BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < `4`);
376	__this_cpu_write(kvm_apic_eoi, `0`);
377	pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
378	\| KVM_MSR_ENABLED;
379	wrmsrq(MSR_KVM_PV_EOI_EN, val: pa);
380	}
381
382	if (has_steal_clock)
383	kvm_register_steal_time();
384	}
385
386	static void kvm_pv_disable_apf(void)
387	{
388	if (!__this_cpu_read(async_pf_enabled))
389	return;
390
391	wrmsrq(MSR_KVM_ASYNC_PF_EN, val: `0`);
392	__this_cpu_write(async_pf_enabled, false);
393
394	pr_debug("disable async PF for cpu %d\n", smp_processor_id());
395	}
396
397	static void kvm_disable_steal_time(void)
398	{
399	if (!has_steal_clock)
400	return;
401
402	wrmsrq(MSR_KVM_STEAL_TIME, val: `0`);
403	}
404
405	static u64 kvm_steal_clock(int cpu)
406	{
407	u64 steal;
408	struct kvm_steal_time *src;
409	int version;
410
411	src = &per_cpu(steal_time, cpu);
412	do {
413	version = src->version;
414	virt_rmb();
415	steal = src->steal;
416	virt_rmb();
417	} while ((version & `1`) \|\| (version != src->version));
418
419	return steal;
420	}
421
422	static inline __init void __set_percpu_decrypted(void ptr, unsigned* long size)
423	{
424	early_set_memory_decrypted(vaddr: (unsigned long) ptr, size);
425	}
426
427	/*
428	* Iterate through all possible CPUs and map the memory region pointed
429	* by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
430	*
431	* Note: we iterate through all possible CPUs to ensure that CPUs
432	* hotplugged will have their per-cpu variable already mapped as
433	* decrypted.
434	*/
435	static void __init sev_map_percpu_data(void)
436	{
437	int cpu;
438
439	if (cc_vendor != CC_VENDOR_AMD \|\|
440	!cc_platform_has(attr: CC_ATTR_GUEST_MEM_ENCRYPT))
441	return;
442
443	for_each_possible_cpu(cpu) {
444	__set_percpu_decrypted(ptr: &per_cpu(apf_reason, cpu), size: sizeof(apf_reason));
445	__set_percpu_decrypted(ptr: &per_cpu(steal_time, cpu), size: sizeof(steal_time));
446	__set_percpu_decrypted(ptr: &per_cpu(kvm_apic_eoi, cpu), size: sizeof(kvm_apic_eoi));
447	}
448	}
449
450	static void kvm_guest_cpu_offline(bool shutdown)
451	{
452	kvm_disable_steal_time();
453	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
454	wrmsrq(MSR_KVM_PV_EOI_EN, val: `0`);
455	if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
456	wrmsrq(MSR_KVM_MIGRATION_CONTROL, val: `0`);
457	kvm_pv_disable_apf();
458	if (!shutdown)
459	apf_task_wake_all();
460	kvmclock_disable();
461	}
462
/*
 * Run kvm_guest_cpu_init() with local interrupts disabled.
 * @cpu is not used by the body. Always returns 0.
 */
static int kvm_cpu_online(unsigned int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	kvm_guest_cpu_init();
	local_irq_restore(flags);
	return 0;
}
472
473	#ifdef CONFIG_SMP
474
/* Per-CPU scratch cpumask used by the PV IPI and PV TLB flush paths. */
static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
476
477	static bool pv_tlb_flush_supported(void)
478	{
479	return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
480	!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
481	kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
482	!boot_cpu_has(X86_FEATURE_MWAIT) &&
483	(num_possible_cpus() != `1`));
484	}
485
486	static bool pv_ipi_supported(void)
487	{
488	return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
489	(num_possible_cpus() != `1`));
490	}
491
492	static bool pv_sched_yield_supported(void)
493	{
494	return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
495	!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
496	kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
497	!boot_cpu_has(X86_FEATURE_MWAIT) &&
498	(num_possible_cpus() != `1`));
499	}
500
501	#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG)
502
503	static void __send_ipi_mask(const struct cpumask mask, int* vector)
504	{
505	unsigned long flags;
506	int cpu, min = `0`, max = `0`;
507	#ifdef CONFIG_X86_64
508	__uint128_t ipi_bitmap = `0`;
509	#else
510	u64 ipi_bitmap = `0`;
511	#endif
512	u32 apic_id, icr;
513	long ret;
514
515	if (cpumask_empty(srcp: mask))
516	return;
517
518	local_irq_save(flags);
519
520	switch (vector) {
521	default:
522	icr = APIC_DM_FIXED \| vector;
523	break;
524	case NMI_VECTOR:
525	icr = APIC_DM_NMI;
526	break;
527	}
528
529	for_each_cpu(cpu, mask) {
530	apic_id = per_cpu(x86_cpu_to_apicid, cpu);
531	if (!ipi_bitmap) {
532	min = max = apic_id;
533	} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
534	ipi_bitmap <<= min - apic_id;
535	min = apic_id;
536	} else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
537	max = apic_id < max ? max : apic_id;
538	} else {
539	ret = kvm_hypercall4(KVM_HC_SEND_IPI, p1: (unsigned long)ipi_bitmap,
540	p2: (unsigned long)(ipi_bitmap >> BITS_PER_LONG), p3: min, p4: icr);
541	WARN_ONCE(ret < `0`, "kvm-guest: failed to send PV IPI: %ld",
542	ret);
543	min = max = apic_id;
544	ipi_bitmap = `0`;
545	}
546	__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
547	}
548
549	if (ipi_bitmap) {
550	ret = kvm_hypercall4(KVM_HC_SEND_IPI, p1: (unsigned long)ipi_bitmap,
551	p2: (unsigned long)(ipi_bitmap >> BITS_PER_LONG), p3: min, p4: icr);
552	WARN_ONCE(ret < `0`, "kvm-guest: failed to send PV IPI: %ld",
553	ret);
554	}
555
556	local_irq_restore(flags);
557	}
558
/* send_IPI_mask callback: forward @mask and @vector to __send_ipi_mask(). */
static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
{
	__send_ipi_mask(mask, vector);
}
563
564	static void kvm_send_ipi_mask_allbutself(const struct cpumask mask, int* vector)
565	{
566	unsigned int this_cpu = smp_processor_id();
567	struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
568	const struct cpumask *local_mask;
569
570	cpumask_copy(dstp: new_mask, srcp: mask);
571	cpumask_clear_cpu(cpu: this_cpu, dstp: new_mask);
572	local_mask = new_mask;
573	__send_ipi_mask(mask: local_mask, vector);
574	}
575
576	static int __init setup_efi_kvm_sev_migration(void)
577	{
578	efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
579	efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID;
580	efi_status_t status;
581	unsigned long size;
582	bool enabled;
583
584	if (!cc_platform_has(attr: CC_ATTR_GUEST_MEM_ENCRYPT) \|\|
585	!kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
586	return `0`;
587
588	if (!efi_enabled(EFI_BOOT))
589	return `0`;
590
591	if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
592	pr_info("%s : EFI runtime services are not enabled\n", __func__);
593	return `0`;
594	}
595
596	size = sizeof(enabled);
597
598	/ Get variable contents into buffer /
599	status = efi.get_variable(efi_sev_live_migration_enabled,
600	&efi_variable_guid, NULL, &size, &enabled);
601
602	if (status == EFI_NOT_FOUND) {
603	pr_info("%s : EFI live migration variable not found\n", __func__);
604	return `0`;
605	}
606
607	if (status != EFI_SUCCESS) {
608	pr_info("%s : EFI variable retrieval failed\n", __func__);
609	return `0`;
610	}
611
612	if (enabled == `0`) {
613	pr_info("%s: live migration disabled in EFI\n", __func__);
614	return `0`;
615	}
616
617	pr_info("%s : live migration enabled in EFI\n", __func__);
618	wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);
619
620	return `1`;
621	}
622
623	late_initcall(setup_efi_kvm_sev_migration);
624
625	/*
626	* Set the IPI entry points
627	*/
628	static __init void kvm_setup_pv_ipi(void)
629	{
630	apic_update_callback(send_IPI_mask, kvm_send_ipi_mask);
631	apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself);
632	pr_info("setup PV IPIs\n");
633	}
634
635	static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
636	{
637	int cpu;
638
639	native_send_call_func_ipi(mask);
640
641	/ Make sure other vCPUs get a chance to run if they need to. /
642	for_each_cpu(cpu, mask) {
643	if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) {
644	kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
645	break;
646	}
647	}
648	}
649
650	static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
651	const struct flush_tlb_info *info)
652	{
653	u8 state;
654	int cpu;
655	struct kvm_steal_time *src;
656	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
657
658	cpumask_copy(dstp: flushmask, srcp: cpumask);
659	/*
660	* We have to call flush only on online vCPUs. And
661	* queue flush_on_enter for pre-empted vCPUs
662	*/
663	for_each_cpu(cpu, flushmask) {
664	/*
665	* The local vCPU is never preempted, so we do not explicitly
666	* skip check for local vCPU - it will never be cleared from
667	* flushmask.
668	*/
669	src = &per_cpu(steal_time, cpu);
670	state = READ_ONCE(src->preempted);
671	if ((state & KVM_VCPU_PREEMPTED)) {
672	if (try_cmpxchg(&src->preempted, &state,
673	state \| KVM_VCPU_FLUSH_TLB))
674	__cpumask_clear_cpu(cpu, dstp: flushmask);
675	}
676	}
677
678	native_flush_tlb_multi(cpumask: flushmask, info);
679	}
680
681	static __init int kvm_alloc_cpumask(void)
682	{
683	int cpu;
684
685	if (!kvm_para_available() \|\| nopv)
686	return `0`;
687
688	if (pv_tlb_flush_supported() \|\| pv_ipi_supported())
689	for_each_possible_cpu(cpu) {
690	zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
691	GFP_KERNEL, node: cpu_to_node(cpu));
692	}
693
694	return `0`;
695	}
696	arch_initcall(kvm_alloc_cpumask);
697
698	static void __init kvm_smp_prepare_boot_cpu(void)
699	{
700	/*
701	* Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
702	* shares the guest physical address with the hypervisor.
703	*/
704	sev_map_percpu_data();
705
706	kvm_guest_cpu_init();
707	native_smp_prepare_boot_cpu();
708	kvm_spinlock_init();
709	}
710
711	static int kvm_cpu_down_prepare(unsigned int cpu)
712	{
713	unsigned long flags;
714
715	local_irq_save(flags);
716	kvm_guest_cpu_offline(shutdown: false);
717	local_irq_restore(flags);
718	return `0`;
719	}
720
721	#endif
722
723	static int kvm_suspend(void)
724	{
725	u64 val = `0`;
726
727	kvm_guest_cpu_offline(shutdown: false);
728
729	#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
730	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
731	rdmsrq(MSR_KVM_POLL_CONTROL, val);
732	has_guest_poll = !(val & `1`);
733	#endif
734	return `0`;
735	}
736
/*
 * syscore resume hook: re-register the PV features on the current CPU
 * and, if kvm_suspend() recorded has_guest_poll, write 0 to
 * MSR_KVM_POLL_CONTROL again.
 */
static void kvm_resume(void)
{
	kvm_cpu_online(raw_smp_processor_id());

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
		wrmsrq(MSR_KVM_POLL_CONTROL, 0);
#endif
}
746
747	static struct syscore_ops kvm_syscore_ops = {
748	.suspend = kvm_suspend,
749	.resume = kvm_resume,
750	};
751
752	static void kvm_pv_guest_cpu_reboot(void *unused)
753	{
754	kvm_guest_cpu_offline(shutdown: true);
755	}
756
757	static int kvm_pv_reboot_notify(struct notifier_block *nb,
758	unsigned long code, void *unused)
759	{
760	if (code == SYS_RESTART)
761	on_each_cpu(func: kvm_pv_guest_cpu_reboot, NULL, wait: `1`);
762	return NOTIFY_DONE;
763	}
764
765	static struct notifier_block kvm_pv_reboot_nb = {
766	.notifier_call = kvm_pv_reboot_notify,
767	};
768
/*
 * After a PV feature is registered, the host will keep writing to the
 * registered memory location. If the guest happens to shutdown, this memory
 * won't be valid. In cases like kexec, in which you install a new kernel, this
 * means a random memory location will be kept being written.
 */
#ifdef CONFIG_CRASH_DUMP
/* crash_shutdown hook: unregister PV features, then do the native shutdown. */
static void kvm_crash_shutdown(struct pt_regs *regs)
{
	kvm_guest_cpu_offline(true);
	native_machine_crash_shutdown(regs);
}
#endif
782
783	#if defined(CONFIG_X86_32) \|\| !defined(CONFIG_SMP)
784	bool __kvm_vcpu_is_preempted(long cpu);
785
786	__visible bool __kvm_vcpu_is_preempted(long cpu)
787	{
788	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
789
790	return !!(src->preempted & KVM_VCPU_PREEMPTED);
791	}
792	PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
793
794	#else
795
796	#include <asm/asm-offsets.h>
797
798	extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
799
800	/*
801	* Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
802	* restoring to/from the stack.
803	*/
804	#define PV_VCPU_PREEMPTED_ASM \
805	"movq __per_cpu_offset(,%rdi,8), %rax\n\t" \
806	"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
807	"setne %al\n\t"
808
809	DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
810	PV_VCPU_PREEMPTED_ASM, .text);
811	#endif
812
813	static void __init kvm_guest_init(void)
814	{
815	int i;
816
817	paravirt_ops_setup();
818	register_reboot_notifier(&kvm_pv_reboot_nb);
819	for (i = `0`; i < KVM_TASK_SLEEP_HASHSIZE; i++)
820	raw_spin_lock_init(&async_pf_sleepers[i].lock);
821
822	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
823	has_steal_clock = `1`;
824	static_call_update(pv_steal_clock, kvm_steal_clock);
825
826	pv_ops.lock.vcpu_is_preempted =
827	PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
828	}
829
830	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
831	apic_update_callback(eoi, kvm_guest_apic_eoi_write);
832
833	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
834	static_branch_enable(&kvm_async_pf_enabled);
835	sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
836	}
837
838	#ifdef CONFIG_SMP
839	if (pv_tlb_flush_supported()) {
840	pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
841	pr_info("KVM setup pv remote TLB flush\n");
842	}
843
844	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
845	if (pv_sched_yield_supported()) {
846	smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
847	pr_info("setup PV sched yield\n");
848	}
849	if (cpuhp_setup_state_nocalls(state: CPUHP_AP_ONLINE_DYN, name: "x86/kvm:online",
850	startup: kvm_cpu_online, teardown: kvm_cpu_down_prepare) < `0`)
851	pr_err("failed to install cpu hotplug callbacks\n");
852	#else
853	sev_map_percpu_data();
854	kvm_guest_cpu_init();
855	#endif
856
857	#ifdef CONFIG_CRASH_DUMP
858	machine_ops.crash_shutdown = kvm_crash_shutdown;
859	#endif
860
861	register_syscore_ops(ops: &kvm_syscore_ops);
862
863	/*
864	* Hard lockup detection is enabled by default. Disable it, as guests
865	* can get false positives too easily, for example if the host is
866	* overcommitted.
867	*/
868	hardlockup_detector_disable();
869	}
870
871	static noinline uint32_t __kvm_cpuid_base(void)
872	{
873	if (boot_cpu_data.cpuid_level < `0`)
874	return `0`; / So we don't blow up on old processors /
875
876	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
877	return cpuid_base_hypervisor(KVM_SIGNATURE, leaves: `0`);
878
879	return `0`;
880	}
881
882	static inline uint32_t kvm_cpuid_base(void)
883	{
884	static int kvm_cpuid_base = -`1`;
885
886	if (kvm_cpuid_base == -`1`)
887	kvm_cpuid_base = __kvm_cpuid_base();
888
889	return kvm_cpuid_base;
890	}
891
892	bool kvm_para_available(void)
893	{
894	return kvm_cpuid_base() != `0`;
895	}
896	EXPORT_SYMBOL_GPL(kvm_para_available);
897
898	unsigned int kvm_arch_para_features(void)
899	{
900	return cpuid_eax(op: kvm_cpuid_base() \| KVM_CPUID_FEATURES);
901	}
902
903	unsigned int kvm_arch_para_hints(void)
904	{
905	return cpuid_edx(op: kvm_cpuid_base() \| KVM_CPUID_FEATURES);
906	}
907	EXPORT_SYMBOL_GPL(kvm_arch_para_hints);
908
909	static uint32_t __init kvm_detect(void)
910	{
911	return kvm_cpuid_base();
912	}
913
914	static void __init kvm_apic_init(void)
915	{
916	#ifdef CONFIG_SMP
917	if (pv_ipi_supported())
918	kvm_setup_pv_ipi();
919	#endif
920	}
921
922	static bool __init kvm_msi_ext_dest_id(void)
923	{
924	return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
925	}
926
927	static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
928	{
929	kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, p1: pfn << PAGE_SHIFT, p2: npages,
930	KVM_MAP_GPA_RANGE_ENC_STAT(enc) \| KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
931	}
932
933	static void __init kvm_init_platform(void)
934	{
935	u64 tolud = PFN_PHYS(e820__end_of_low_ram_pfn());
936	/*
937	* Note, hardware requires variable MTRR ranges to be power-of-2 sized
938	* and naturally aligned. But when forcing guest MTRR state, Linux
939	* doesn't program the forced ranges into hardware. Don't bother doing
940	* the math to generate a technically-legal range.
941	*/
942	struct mtrr_var_range pci_hole = {
943	.base_lo = tolud \| X86_MEMTYPE_UC,
944	.mask_lo = (u32)(~(SZ_4G - tolud - `1`)) \| MTRR_PHYSMASK_V,
945	.mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - `1`) >> `32`,
946	};
947
948	if (cc_platform_has(attr: CC_ATTR_GUEST_MEM_ENCRYPT) &&
949	kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
950	unsigned long nr_pages;
951	int i;
952
953	pv_ops.mmu.notify_page_enc_status_changed =
954	kvm_sev_hc_page_enc_status;
955
956	/*
957	* Reset the host's shared pages list related to kernel
958	* specific page encryption status settings before we load a
959	* new kernel by kexec. Reset the page encryption status
960	* during early boot instead of just before kexec to avoid SMP
961	* races during kvm_pv_guest_cpu_reboot().
962	* NOTE: We cannot reset the complete shared pages list
963	* here as we need to retain the UEFI/OVMF firmware
964	* specific settings.
965	*/
966
967	for (i = `0`; i < e820_table->nr_entries; i++) {
968	struct e820_entry *entry = &e820_table->entries[i];
969
970	if (entry->type != E820_TYPE_RAM)
971	continue;
972
973	nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
974
975	kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, p1: entry->addr,
976	p2: nr_pages,
977	KVM_MAP_GPA_RANGE_ENCRYPTED \| KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
978	}
979
980	/*
981	* Ensure that _bss_decrypted section is marked as decrypted in the
982	* shared pages list.
983	*/
984	early_set_mem_enc_dec_hypercall(vaddr: (unsigned long)__start_bss_decrypted,
985	size: __end_bss_decrypted - __start_bss_decrypted, enc: `0`);
986
987	/*
988	* If not booted using EFI, enable Live migration support.
989	*/
990	if (!efi_enabled(EFI_BOOT))
991	wrmsrq(MSR_KVM_MIGRATION_CONTROL,
992	KVM_MIGRATION_READY);
993	}
994	kvmclock_init();
995	x86_platform.apic_post_init = kvm_apic_init;
996
997	/*
998	* Set WB as the default cache mode for SEV-SNP and TDX, with a single
999	* UC range for the legacy PCI hole, e.g. so that devices that expect
1000	* to get UC/WC mappings don't get surprised with WB.
1001	*/
1002	guest_force_mtrr_state(var: &pci_hole, num_var: `1`, MTRR_TYPE_WRBACK);
1003	}
1004
1005	#if defined(CONFIG_AMD_MEM_ENCRYPT)
1006	static void kvm_sev_es_hcall_prepare(struct ghcb ghcb, struct* pt_regs *regs)
1007	{
1008	/ RAX and CPL are already in the GHCB /
1009	ghcb_set_rbx(ghcb, regs->bx);
1010	ghcb_set_rcx(ghcb, regs->cx);
1011	ghcb_set_rdx(ghcb, regs->dx);
1012	ghcb_set_rsi(ghcb, regs->si);
1013	}
1014
/*
 * SEV-ES hypercall completion hook. Neither argument is inspected;
 * always returns true.
 */
static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
{
	/* No checking of the return state needed */
	return true;
}
1020	#endif
1021
1022	const __initconst struct hypervisor_x86 x86_hyper_kvm = {
1023	.name = "KVM",
1024	.detect = kvm_detect,
1025	.type = X86_HYPER_KVM,
1026	.init.guest_late_init = kvm_guest_init,
1027	.init.x2apic_available = kvm_para_available,
1028	.init.msi_ext_dest_id = kvm_msi_ext_dest_id,
1029	.init.init_platform = kvm_init_platform,
1030	#if defined(CONFIG_AMD_MEM_ENCRYPT)
1031	.runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare,
1032	.runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish,
1033	#endif
1034	};
1035
1036	static __init int activate_jump_labels(void)
1037	{
1038	if (has_steal_clock) {
1039	static_key_slow_inc(key: &paravirt_steal_enabled);
1040	if (steal_acc)
1041	static_key_slow_inc(key: &paravirt_steal_rq_enabled);
1042	}
1043
1044	return `0`;
1045	}
1046	arch_initcall(activate_jump_labels);
1047
1048	#ifdef CONFIG_PARAVIRT_SPINLOCKS
1049
1050	/ Kick a cpu by its apicid. Used to wake up a halted vcpu /
1051	static void kvm_kick_cpu(int cpu)
1052	{
1053	unsigned long flags = `0`;
1054	u32 apicid;
1055
1056	apicid = per_cpu(x86_cpu_to_apicid, cpu);
1057	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
1058	}
1059
1060	#include <asm/qspinlock.h>
1061
1062	static void kvm_wait(u8 *ptr, u8 val)
1063	{
1064	if (in_nmi())
1065	return;
1066
1067	/*
1068	* halt until it's our turn and kicked. Note that we do safe halt
1069	* for irq enabled case to avoid hang when lock info is overwritten
1070	* in irq spinlock slowpath and no spurious interrupt occur to save us.
1071	*/
1072	if (irqs_disabled()) {
1073	if (READ_ONCE(*ptr) == val)
1074	halt();
1075	} else {
1076	local_irq_disable();
1077
1078	/ safe_halt() will enable IRQ /
1079	if (READ_ONCE(*ptr) == val)
1080	safe_halt();
1081	else
1082	local_irq_enable();
1083	}
1084	}
1085
1086	/*
1087	* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
1088	*/
1089	void __init kvm_spinlock_init(void)
1090	{
1091	/*
1092	* Disable PV spinlocks and use native qspinlock when dedicated pCPUs
1093	* are available.
1094	*/
1095	if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
1096	pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
1097	goto out;
1098	}
1099
1100	if (num_possible_cpus() == `1`) {
1101	pr_info("PV spinlocks disabled, single CPU\n");
1102	goto out;
1103	}
1104
1105	if (nopvspin) {
1106	pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
1107	goto out;
1108	}
1109
1110	/*
1111	* In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
1112	* advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
1113	* preferred over native qspinlock when vCPU is preempted.
1114	*/
1115	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
1116	pr_info("PV spinlocks disabled, no host support\n");
1117	return;
1118	}
1119
1120	pr_info("PV spinlocks enabled\n");
1121
1122	__pv_init_lock_hash();
1123	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
1124	pv_ops.lock.queued_spin_unlock =
1125	PV_CALLEE_SAVE(__pv_queued_spin_unlock);
1126	pv_ops.lock.wait = kvm_wait;
1127	pv_ops.lock.kick = kvm_kick_cpu;
1128
1129	/*
1130	* When PV spinlock is enabled which is preferred over
1131	* virt_spin_lock(), virt_spin_lock_key's value is meaningless.
1132	* Just disable it anyway.
1133	*/
1134	out:
1135	static_branch_disable(&virt_spin_lock_key);
1136	}
1137
1138	#endif /* CONFIG_PARAVIRT_SPINLOCKS */
1139
1140	#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
1141
1142	static void kvm_disable_host_haltpoll(void *i)
1143	{
1144	wrmsrq(MSR_KVM_POLL_CONTROL, val: `0`);
1145	}
1146
1147	static void kvm_enable_host_haltpoll(void *i)
1148	{
1149	wrmsrq(MSR_KVM_POLL_CONTROL, val: `1`);
1150	}
1151
1152	void arch_haltpoll_enable(unsigned int cpu)
1153	{
1154	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
1155	pr_err_once("host does not support poll control\n");
1156	pr_err_once("host upgrade recommended\n");
1157	return;
1158	}
1159
1160	/ Enable guest halt poll disables host halt poll /
1161	smp_call_function_single(cpuid: cpu, func: kvm_disable_host_haltpoll, NULL, wait: `1`);
1162	}
1163	EXPORT_SYMBOL_GPL(arch_haltpoll_enable);
1164
1165	void arch_haltpoll_disable(unsigned int cpu)
1166	{
1167	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
1168	return;
1169
1170	/ Disable guest halt poll enables host halt poll /
1171	smp_call_function_single(cpuid: cpu, func: kvm_enable_host_haltpoll, NULL, wait: `1`);
1172	}
1173	EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
1174	#endif
1175

/* Browse the source code of Linux/arch/x86/kernel/kvm.c */