core.c source code [Linux/arch/x86/kernel/cpu/mce/core.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Machine check handler.
4	*
5	* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
6	* Rest from unknown author(s).
7	* 2004 Andi Kleen. Rewrote most of it.
8	* Copyright 2008 Intel Corporation
9	* Author: Andi Kleen
10	*/
11
12	#include <linux/thread_info.h>
13	#include <linux/capability.h>
14	#include <linux/miscdevice.h>
15	#include <linux/ratelimit.h>
16	#include <linux/rcupdate.h>
17	#include <linux/kobject.h>
18	#include <linux/uaccess.h>
19	#include <linux/kdebug.h>
20	#include <linux/kernel.h>
21	#include <linux/percpu.h>
22	#include <linux/string.h>
23	#include <linux/device.h>
24	#include <linux/syscore_ops.h>
25	#include <linux/delay.h>
26	#include <linux/ctype.h>
27	#include <linux/sched.h>
28	#include <linux/sysfs.h>
29	#include <linux/types.h>
30	#include <linux/slab.h>
31	#include <linux/init.h>
32	#include <linux/kmod.h>
33	#include <linux/poll.h>
34	#include <linux/nmi.h>
35	#include <linux/cpu.h>
36	#include <linux/ras.h>
37	#include <linux/smp.h>
38	#include <linux/fs.h>
39	#include <linux/mm.h>
40	#include <linux/debugfs.h>
41	#include <linux/irq_work.h>
42	#include <linux/export.h>
43	#include <linux/set_memory.h>
44	#include <linux/sync_core.h>
45	#include <linux/task_work.h>
46	#include <linux/hardirq.h>
47	#include <linux/kexec.h>
48
49	#include <asm/fred.h>
50	#include <asm/cpu_device_id.h>
51	#include <asm/processor.h>
52	#include <asm/traps.h>
53	#include <asm/tlbflush.h>
54	#include <asm/mce.h>
55	#include <asm/msr.h>
56	#include <asm/reboot.h>
57	#include <asm/tdx.h>
58
59	#include "internal.h"
60
/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);
63
64	#define CREATE_TRACE_POINTS
65	#include <trace/events/mce.h>
66
67	#define SPINUNIT 100 /* 100ns */
68
69	DEFINE_PER_CPU(unsigned, mce_exception_count);
70
71	DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
72
73	DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
74
75	#define ATTR_LEN 16
76	/ One object for each MCE bank, shared by all CPUs /
77	struct mce_bank_dev {
78	struct device_attribute attr; / device attribute /
79	char attrname[ATTR_LEN]; / attribute name /
80	u8 bank; / bank number /
81	};
82	static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
83
84	struct mce_vendor_flags mce_flags __read_mostly;
85
86	struct mca_config mca_cfg __read_mostly = {
87	.bootlog = -`1`,
88	.monarch_timeout = -`1`
89	};
90
91	static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen);
92	static unsigned long mce_need_notify;
93
94	/*
95	* MCA banks polled by the period polling timer for corrected events.
96	* With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
97	*/
98	DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
99	[`0` ... BITS_TO_LONGS(MAX_NR_BANKS)-`1`] = ~`0UL`
100	};
101
102	/*
103	* MCA banks controlled through firmware first for corrected errors.
104	* This is a global list of banks for which we won't enable CMCI and we
105	* won't poll. Firmware controls these banks and is responsible for
106	* reporting corrected errors through GHES. Uncorrected/recoverable
107	* errors are still notified through a machine check.
108	*/
109	mce_banks_t mce_banks_ce_disabled;
110
111	static struct work_struct mce_work;
112	static struct irq_work mce_irq_work;
113
114	/*
115	* CPU/chipset specific EDAC code can register a notifier call here to print
116	* MCE errors in a human-readable form.
117	*/
118	BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
119
120	void mce_prep_record_common(struct mce *m)
121	{
122	m->cpuid = cpuid_eax(op: `1`);
123	m->cpuvendor = boot_cpu_data.x86_vendor;
124	m->mcgcap = native_rdmsrq(MSR_IA32_MCG_CAP);
125	/ need the internal __ version to avoid deadlocks /
126	m->time = __ktime_get_real_seconds();
127	}
128
129	void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m)
130	{
131	m->cpu = cpu;
132	m->extcpu = cpu;
133	m->apicid = cpu_data(cpu).topo.initial_apicid;
134	m->microcode = cpu_data(cpu).microcode;
135	m->ppin = topology_ppin(cpu);
136	m->socketid = topology_physical_package_id(cpu);
137	}
138
139	/ Do initial initialization of struct mce_hw_err /
140	void mce_prep_record(struct mce_hw_err *err)
141	{
142	struct mce *m = &err->m;
143
144	memset(s: err, c: `0`, n: sizeof(struct mce_hw_err));
145	mce_prep_record_common(m);
146	mce_prep_record_per_cpu(smp_processor_id(), m);
147	}
148
149	DEFINE_PER_CPU(struct mce, injectm);
150	EXPORT_PER_CPU_SYMBOL_GPL(injectm);
151
152	void mce_log(struct mce_hw_err *err)
153	{
154	if (mce_gen_pool_add(err))
155	irq_work_queue(work: &mce_irq_work);
156	}
157	EXPORT_SYMBOL_GPL(mce_log);
158
159	void mce_register_decode_chain(struct notifier_block *nb)
160	{
161	if (WARN_ON(nb->priority < MCE_PRIO_LOWEST \|\|
162	nb->priority > MCE_PRIO_HIGHEST))
163	return;
164
165	blocking_notifier_chain_register(nh: &x86_mce_decoder_chain, nb);
166	}
167	EXPORT_SYMBOL_GPL(mce_register_decode_chain);
168
169	void mce_unregister_decode_chain(struct notifier_block *nb)
170	{
171	blocking_notifier_chain_unregister(nh: &x86_mce_decoder_chain, nb);
172	}
173	EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
174
175	static void __print_mce(struct mce_hw_err *err)
176	{
177	struct mce *m = &err->m;
178
179	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
180	m->extcpu,
181	(m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
182	m->mcgstatus, m->bank, m->status);
183
184	if (m->ip) {
185	pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
186	!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
187	m->cs, m->ip);
188
189	if (m->cs == __KERNEL_CS)
190	pr_cont("{%pS}", (void )(unsigned* long)m->ip);
191	pr_cont("\n");
192	}
193
194	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
195	if (m->addr)
196	pr_cont("ADDR %llx ", m->addr);
197	if (m->misc)
198	pr_cont("MISC %llx ", m->misc);
199	if (m->ppin)
200	pr_cont("PPIN %llx ", m->ppin);
201
202	if (mce_flags.smca) {
203	if (m->synd)
204	pr_cont("SYND %llx ", m->synd);
205	if (err->vendor.amd.synd1)
206	pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
207	if (err->vendor.amd.synd2)
208	pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
209	if (m->ipid)
210	pr_cont("IPID %llx ", m->ipid);
211	}
212
213	pr_cont("\n");
214
215	/*
216	* Note this output is parsed by external tools and old fields
217	* should not be changed.
218	*/
219	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
220	m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
221	m->microcode);
222	}
223
224	static void print_mce(struct mce_hw_err *err)
225	{
226	struct mce *m = &err->m;
227
228	__print_mce(err);
229
230	if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
231	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
232	}
233
234	#define PANIC_TIMEOUT 5 /* 5 seconds */
235
236	static atomic_t mce_panicked;
237
238	static int fake_panic;
239	static atomic_t mce_fake_panicked;
240
241	/ Panic in progress. Enable interrupts and wait for final IPI /
242	static void wait_for_panic(void)
243	{
244	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
245
246	preempt_disable();
247	local_irq_enable();
248	while (timeout-- > `0`)
249	udelay(usec: `1`);
250	if (panic_timeout == `0`)
251	panic_timeout = mca_cfg.panic_timeout;
252	panic(fmt: "Panicing machine check CPU died");
253	}
254
255	static const char mce_dump_aux_info(struct* mce *m)
256	{
257	if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
258	return tdx_dump_mce_info(m);
259
260	return NULL;
261	}
262
263	static noinstr void mce_panic(const char msg, struct* mce_hw_err final, char* *exp)
264	{
265	struct llist_node *pending;
266	struct mce_evt_llist *l;
267	int apei_err = `0`;
268	const char *memmsg;
269
270	/*
271	* Allow instrumentation around external facilities usage. Not that it
272	* matters a whole lot since the machine is going to panic anyway.
273	*/
274	instrumentation_begin();
275
276	if (!fake_panic) {
277	/*
278	* Make sure only one CPU runs in machine check panic
279	*/
280	if (atomic_inc_return(v: &mce_panicked) > `1`)
281	wait_for_panic();
282	barrier();
283
284	bust_spinlocks(yes: `1`);
285	console_verbose();
286	} else {
287	/ Don't log too much for fake panic /
288	if (atomic_inc_return(v: &mce_fake_panicked) > `1`)
289	goto out;
290	}
291	pending = mce_gen_pool_prepare_records();
292	/ First print corrected ones that are still unlogged /
293	llist_for_each_entry(l, pending, llnode) {
294	struct mce_hw_err *err = &l->err;
295	struct mce *m = &err->m;
296	if (!(m->status & MCI_STATUS_UC)) {
297	print_mce(err);
298	if (!apei_err)
299	apei_err = apei_write_mce(m);
300	}
301	}
302	/ Now print uncorrected but with the final one last /
303	llist_for_each_entry(l, pending, llnode) {
304	struct mce_hw_err *err = &l->err;
305	struct mce *m = &err->m;
306	if (!(m->status & MCI_STATUS_UC))
307	continue;
308	if (!final \|\| mce_cmp(m1: m, m2: &final->m)) {
309	print_mce(err);
310	if (!apei_err)
311	apei_err = apei_write_mce(m);
312	}
313	}
314	if (final) {
315	print_mce(err: final);
316	if (!apei_err)
317	apei_err = apei_write_mce(m: &final->m);
318	}
319	if (exp)
320	pr_emerg(HW_ERR "Machine check: %s\n", exp);
321
322	memmsg = mce_dump_aux_info(m: &final->m);
323	if (memmsg)
324	pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
325
326	if (!fake_panic) {
327	if (panic_timeout == `0`)
328	panic_timeout = mca_cfg.panic_timeout;
329
330	/*
331	* Kdump skips the poisoned page in order to avoid
332	* touching the error bits again. Poison the page even
333	* if the error is fatal and the machine is about to
334	* panic.
335	*/
336	if (kexec_crash_loaded()) {
337	if (final && (final->m.status & MCI_STATUS_ADDRV)) {
338	struct page *p;
339	p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT);
340	if (p)
341	SetPageHWPoison(p);
342	}
343	}
344	panic(fmt: msg);
345	} else
346	pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
347
348	out:
349	instrumentation_end();
350	}
351
/* Support code for software error injection */
353
354	static int msr_to_offset(u32 msr)
355	{
356	unsigned bank = __this_cpu_read(injectm.bank);
357
358	if (msr == mca_cfg.rip_msr)
359	return offsetof(struct mce, ip);
360	if (msr == mca_msr_reg(bank, reg: MCA_STATUS))
361	return offsetof(struct mce, status);
362	if (msr == mca_msr_reg(bank, reg: MCA_ADDR))
363	return offsetof(struct mce, addr);
364	if (msr == mca_msr_reg(bank, reg: MCA_MISC))
365	return offsetof(struct mce, misc);
366	if (msr == MSR_IA32_MCG_STATUS)
367	return offsetof(struct mce, mcgstatus);
368	return -`1`;
369	}
370
371	void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
372	{
373	if (wrmsr) {
374	pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
375	(unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
376	regs->ip, (void *)regs->ip);
377	} else {
378	pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
379	(unsigned int)regs->cx, regs->ip, (void *)regs->ip);
380	}
381
382	show_stack_regs(regs);
383
384	panic(fmt: "MCA architectural violation!\n");
385
386	while (true)
387	cpu_relax();
388	}
389
390	/ MSR access wrappers used for error injection /
391	noinstr u64 mce_rdmsrq(u32 msr)
392	{
393	EAX_EDX_DECLARE_ARGS(val, low, high);
394
395	if (__this_cpu_read(injectm.finished)) {
396	int offset;
397	u64 ret;
398
399	instrumentation_begin();
400
401	offset = msr_to_offset(msr);
402	if (offset < `0`)
403	ret = `0`;
404	else
405	ret = (u64 )((char *)this_cpu_ptr(&injectm) + offset);
406
407	instrumentation_end();
408
409	return ret;
410	}
411
412	/*
413	* RDMSR on MCA MSRs should not fault. If they do, this is very much an
414	* architectural violation and needs to be reported to hw vendor. Panic
415	* the box to not allow any further progress.
416	*/
417	asm volatile("1: rdmsr\n"
418	"2:\n"
419	_ASM_EXTABLE_TYPE(`1b`, `2b`, EX_TYPE_RDMSR_IN_MCE)
420	: EAX_EDX_RET(val, low, high) : "c" (msr));
421
422
423	return EAX_EDX_VAL(val, low, high);
424	}
425
426	noinstr void mce_wrmsrq(u32 msr, u64 v)
427	{
428	u32 low, high;
429
430	if (__this_cpu_read(injectm.finished)) {
431	int offset;
432
433	instrumentation_begin();
434
435	offset = msr_to_offset(msr);
436	if (offset >= `0`)
437	(u64 )((char *)this_cpu_ptr(&injectm) + offset) = v;
438
439	instrumentation_end();
440
441	return;
442	}
443
444	low = (u32)v;
445	high = (u32)(v >> `32`);
446
447	/ See comment in mce_rdmsrq() /
448	asm volatile("1: wrmsr\n"
449	"2:\n"
450	_ASM_EXTABLE_TYPE(`1b`, `2b`, EX_TYPE_WRMSR_IN_MCE)
451	: : "c" (msr), "a"(low), "d" (high) : "memory");
452	}
453
454	/*
455	* Collect all global (w.r.t. this processor) status about this machine
456	* check into our "mce" struct so that we can use it later to assess
457	* the severity of the problem as we read per-bank specific details.
458	*/
459	static noinstr void mce_gather_info(struct mce_hw_err err, struct* pt_regs *regs)
460	{
461	struct mce *m;
462	/*
463	* Enable instrumentation around mce_prep_record() which calls external
464	* facilities.
465	*/
466	instrumentation_begin();
467	mce_prep_record(err);
468	instrumentation_end();
469
470	m = &err->m;
471	m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
472	if (regs) {
473	/*
474	* Get the address of the instruction at the time of
475	* the machine check error.
476	*/
477	if (m->mcgstatus & (MCG_STATUS_RIPV\|MCG_STATUS_EIPV)) {
478	m->ip = regs->ip;
479	m->cs = regs->cs;
480
481	/*
482	* When in VM86 mode make the cs look like ring 3
483	* always. This is a lie, but it's better than passing
484	* the additional vm86 bit around everywhere.
485	*/
486	if (v8086_mode(regs))
487	m->cs \|= `3`;
488	}
489	/ Use accurate RIP reporting if available. /
490	if (mca_cfg.rip_msr)
491	m->ip = mce_rdmsrq(msr: mca_cfg.rip_msr);
492	}
493	}
494
495	bool mce_available(struct cpuinfo_x86 *c)
496	{
497	if (mca_cfg.disabled)
498	return false;
499	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
500	}
501
502	static void mce_schedule_work(void)
503	{
504	if (!mce_gen_pool_empty())
505	schedule_work(work: &mce_work);
506	}
507
/*
 * irq_work callback: all it does is kick the regular work item through
 * mce_schedule_work(). @entry is not used.
 */
static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_schedule_work();
}
512
513	bool mce_usable_address(struct mce *m)
514	{
515	if (!(m->status & MCI_STATUS_ADDRV))
516	return false;
517
518	switch (m->cpuvendor) {
519	case X86_VENDOR_AMD:
520	return amd_mce_usable_address(m);
521
522	case X86_VENDOR_INTEL:
523	case X86_VENDOR_ZHAOXIN:
524	return intel_mce_usable_address(m);
525
526	default:
527	return true;
528	}
529	}
530	EXPORT_SYMBOL_GPL(mce_usable_address);
531
532	bool mce_is_memory_error(struct mce *m)
533	{
534	switch (m->cpuvendor) {
535	case X86_VENDOR_AMD:
536	case X86_VENDOR_HYGON:
537	return amd_mce_is_memory_error(m);
538
539	case X86_VENDOR_INTEL:
540	case X86_VENDOR_ZHAOXIN:
541	/*
542	* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
543	*
544	* Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
545	* indicating a memory error. Bit 8 is used for indicating a
546	* cache hierarchy error. The combination of bit 2 and bit 3
547	* is used for indicating a `generic' cache hierarchy error
548	* But we can't just blindly check the above bits, because if
549	* bit 11 is set, then it is a bus/interconnect error - and
550	* either way the above bits just gives more detail on what
551	* bus/interconnect error happened. Note that bit 12 can be
552	* ignored, as it's the "filter" bit.
553	*/
554	return (m->status & `0xef80`) == BIT(`7`) \|\|
555	(m->status & `0xef00`) == BIT(`8`) \|\|
556	(m->status & `0xeffc`) == `0xc`;
557
558	default:
559	return false;
560	}
561	}
562	EXPORT_SYMBOL_GPL(mce_is_memory_error);
563
564	static bool whole_page(struct mce *m)
565	{
566	if (!mca_cfg.ser \|\| !(m->status & MCI_STATUS_MISCV))
567	return true;
568
569	return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
570	}
571
572	bool mce_is_correctable(struct mce *m)
573	{
574	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
575	return false;
576
577	if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
578	return false;
579
580	if (m->status & MCI_STATUS_UC)
581	return false;
582
583	return true;
584	}
585	EXPORT_SYMBOL_GPL(mce_is_correctable);
586
587	/*
588	* Notify the user(s) about new machine check events.
589	* Can be called from interrupt context, but not from machine check/NMI
590	* context.
591	*/
592	static bool mce_notify_irq(void)
593	{
594	/ Not more than two messages every minute /
595	static DEFINE_RATELIMIT_STATE(ratelimit, `60`*HZ, `2`);
596
597	if (test_and_clear_bit(nr: `0`, addr: &mce_need_notify)) {
598	mce_work_trigger();
599
600	if (__ratelimit(&ratelimit))
601	pr_info(HW_ERR "Machine check events logged\n");
602
603	return true;
604	}
605
606	return false;
607	}
608
609	static int mce_early_notifier(struct notifier_block nb, unsigned* long val,
610	void *data)
611	{
612	struct mce_hw_err *err = to_mce_hw_err(data);
613
614	if (!err)
615	return NOTIFY_DONE;
616
617	/ Emit the trace record: /
618	trace_mce_record(err);
619
620	set_bit(nr: `0`, addr: &mce_need_notify);
621
622	mce_notify_irq();
623
624	return NOTIFY_DONE;
625	}
626
627	static struct notifier_block early_nb = {
628	.notifier_call = mce_early_notifier,
629	.priority = MCE_PRIO_EARLY,
630	};
631
632	static int uc_decode_notifier(struct notifier_block nb, unsigned* long val,
633	void *data)
634	{
635	struct mce mce = (struct* mce *)data;
636	unsigned long pfn;
637
638	if (!mce \|\| !mce_usable_address(mce))
639	return NOTIFY_DONE;
640
641	if (mce->severity != MCE_AO_SEVERITY &&
642	mce->severity != MCE_DEFERRED_SEVERITY)
643	return NOTIFY_DONE;
644
645	pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
646	if (!memory_failure(pfn, flags: `0`)) {
647	set_mce_nospec(pfn);
648	mce->kflags \|= MCE_HANDLED_UC;
649	}
650
651	return NOTIFY_OK;
652	}
653
654	static struct notifier_block mce_uc_nb = {
655	.notifier_call = uc_decode_notifier,
656	.priority = MCE_PRIO_UC,
657	};
658
659	static int mce_default_notifier(struct notifier_block nb, unsigned* long val,
660	void *data)
661	{
662	struct mce_hw_err *err = to_mce_hw_err(data);
663
664	if (!err)
665	return NOTIFY_DONE;
666
667	if (mca_cfg.print_all \|\| !(err->m.kflags))
668	__print_mce(err);
669
670	return NOTIFY_DONE;
671	}
672
673	static struct notifier_block mce_default_nb = {
674	.notifier_call = mce_default_notifier,
675	/ lowest prio, we want it to run last. /
676	.priority = MCE_PRIO_LOWEST,
677	};
678
679	/*
680	* Read ADDR and MISC registers.
681	*/
682	static noinstr void mce_read_aux(struct mce_hw_err err, int* i)
683	{
684	struct mce *m = &err->m;
685
686	if (m->status & MCI_STATUS_MISCV)
687	m->misc = mce_rdmsrq(msr: mca_msr_reg(bank: i, reg: MCA_MISC));
688
689	if (m->status & MCI_STATUS_ADDRV) {
690	m->addr = mce_rdmsrq(msr: mca_msr_reg(bank: i, reg: MCA_ADDR));
691
692	/*
693	* Mask the reported address by the reported granularity.
694	*/
695	if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
696	u8 shift = MCI_MISC_ADDR_LSB(m->misc);
697	m->addr >>= shift;
698	m->addr <<= shift;
699	}
700
701	smca_extract_err_addr(m);
702	}
703
704	if (mce_flags.smca) {
705	m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i));
706
707	if (m->status & MCI_STATUS_SYNDV) {
708	m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i));
709	err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i));
710	err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i));
711	}
712	}
713	}
714
715	DEFINE_PER_CPU(unsigned, mce_poll_count);
716
717	/*
718	* Newer Intel systems that support software error
719	* recovery need to make additional checks. Other
720	* CPUs should skip over uncorrected errors, but log
721	* everything else.
722	*/
723	static bool ser_should_log_poll_error(struct mce *m)
724	{
725	/ Log "not enabled" (speculative) errors /
726	if (!(m->status & MCI_STATUS_EN))
727	return true;
728
729	/*
730	* Log UCNA (SDM: 15.6.3 "UCR Error Classification")
731	* UC == 1 && PCC == 0 && S == 0
732	*/
733	if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S))
734	return true;
735
736	return false;
737	}
738
739	static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
740	{
741	struct mce *m = &err->m;
742
743	/ If this entry is not valid, ignore it. /
744	if (!(m->status & MCI_STATUS_VAL))
745	return false;
746
747	/*
748	* If we are logging everything (at CPU online) or this
749	* is a corrected error, then we must log it.
750	*/
751	if ((flags & MCP_UC) \|\| !(m->status & MCI_STATUS_UC))
752	return true;
753
754	if (mca_cfg.ser)
755	return ser_should_log_poll_error(m);
756
757	if (m->status & MCI_STATUS_UC)
758	return false;
759
760	return true;
761	}
762
763	static void clear_bank(struct mce *m)
764	{
765	if (m->cpuvendor == X86_VENDOR_AMD)
766	return amd_clear_bank(m);
767
768	mce_wrmsrq(msr: mca_msr_reg(bank: m->bank, reg: MCA_STATUS), v: `0`);
769	}
770
771	/*
772	* Poll for corrected events or events that happened before reset.
773	* Those are just logged through /dev/mcelog.
774	*
775	* This is executed in standard interrupt context.
776	*
777	* Note: spec recommends to panic for fatal unsignalled
778	* errors here. However this would be quite problematic --
779	* we would need to reimplement the Monarch handling and
780	* it would mess up the exclusion between exception handler
781	* and poll handler -- * so we skip this for now.
782	* These cases should not happen anyways, or only when the CPU
783	* is already totally * confused. In this case it's likely it will
784	* not fully execute the machine check handler either.
785	*/
786	void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
787	{
788	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
789	struct mce_hw_err err;
790	struct mce *m;
791	int i;
792
793	this_cpu_inc(mce_poll_count);
794
795	mce_gather_info(err: &err, NULL);
796	m = &err.m;
797
798	if (flags & MCP_TIMESTAMP)
799	m->tsc = rdtsc();
800
801	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
802	if (!mce_banks[i].ctl \|\| !test_bit(i, *b))
803	continue;
804
805	m->misc = `0`;
806	m->addr = `0`;
807	m->bank = i;
808
809	barrier();
810	m->status = mce_rdmsrq(msr: mca_msr_reg(bank: i, reg: MCA_STATUS));
811
812	/*
813	* Update storm tracking here, before checking for the
814	* MCI_STATUS_VAL bit. Valid corrected errors count
815	* towards declaring, or maintaining, storm status. No
816	* error in a bank counts towards avoiding, or ending,
817	* storm status.
818	*/
819	if (!mca_cfg.cmci_disabled)
820	mce_track_storm(mce: m);
821
822	/ Verify that the error should be logged based on hardware conditions. /
823	if (!should_log_poll_error(flags, err: &err))
824	continue;
825
826	mce_read_aux(err: &err, i);
827	m->severity = mce_severity(a: m, NULL, NULL, is_excp: false);
828	/*
829	* Don't get the IP here because it's unlikely to
830	* have anything to do with the actual error location.
831	*/
832
833	if (mca_cfg.dont_log_ce && !mce_usable_address(m))
834	goto clear_it;
835
836	if (flags & MCP_QUEUE_LOG)
837	mce_gen_pool_add(err: &err);
838	else
839	mce_log(&err);
840
841	clear_it:
842	clear_bank(m);
843	}
844
845	/*
846	* Don't clear MCG_STATUS here because it's only defined for
847	* exceptions.
848	*/
849
850	sync_core();
851	}
852	EXPORT_SYMBOL_GPL(machine_check_poll);
853
854	/*
855	* During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
856	* EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
857	* Vol 3B Table 15-20). But this confuses both the code that determines
858	* whether the machine check occurred in kernel or user mode, and also
859	* the severity assessment code. Pretend that EIPV was set, and take the
860	* ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
861	*/
862	static __always_inline void
863	quirk_sandybridge_ifu(int bank, struct mce m, struct* pt_regs *regs)
864	{
865	if (bank != `0`)
866	return;
867	if ((m->mcgstatus & (MCG_STATUS_EIPV\|MCG_STATUS_RIPV)) != `0`)
868	return;
869	if ((m->status & (MCI_STATUS_OVER\|MCI_STATUS_UC\|
870	MCI_STATUS_EN\|MCI_STATUS_MISCV\|MCI_STATUS_ADDRV\|
871	MCI_STATUS_PCC\|MCI_STATUS_S\|MCI_STATUS_AR\|
872	MCACOD)) !=
873	(MCI_STATUS_UC\|MCI_STATUS_EN\|
874	MCI_STATUS_MISCV\|MCI_STATUS_ADDRV\|MCI_STATUS_S\|
875	MCI_STATUS_AR\|MCACOD_INSTR))
876	return;
877
878	m->mcgstatus \|= MCG_STATUS_EIPV;
879	m->ip = regs->ip;
880	m->cs = regs->cs;
881	}
882
883	/*
884	* Disable fast string copy and return from the MCE handler upon the first SRAR
885	* MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
886	* CPUs.
887	* The fast string copy instructions ("REP; MOVS*") could consume an
888	* uncorrectable memory error in the cache line _right after_ the desired region
889	* to copy and raise an MCE with RIP pointing to the instruction _after_ the
890	* "REP; MOVS*".
891	* This mitigation addresses the issue completely with the caveat of performance
892	* degradation on the CPU affected. This is still better than the OS crashing on
893	* MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
894	* kernel context (e.g., copy_page).
895	*
896	* Returns true when fast string copy on CPU has been disabled.
897	*/
898	static noinstr bool quirk_skylake_repmov(void)
899	{
900	u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
901	u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE);
902	u64 mc1_status;
903
904	/*
905	* Apply the quirk only to local machine checks, i.e., no broadcast
906	* sync is needed.
907	*/
908	if (!(mcgstatus & MCG_STATUS_LMCES) \|\|
909	!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
910	return false;
911
912	mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(`1`));
913
914	/ Check for a software-recoverable data fetch error. /
915	if ((mc1_status &
916	(MCI_STATUS_VAL \| MCI_STATUS_OVER \| MCI_STATUS_UC \| MCI_STATUS_EN \|
917	MCI_STATUS_ADDRV \| MCI_STATUS_MISCV \| MCI_STATUS_PCC \|
918	MCI_STATUS_AR \| MCI_STATUS_S)) ==
919	(MCI_STATUS_VAL \| MCI_STATUS_UC \| MCI_STATUS_EN \|
920	MCI_STATUS_ADDRV \| MCI_STATUS_MISCV \|
921	MCI_STATUS_AR \| MCI_STATUS_S)) {
922	misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
923	mce_wrmsrq(MSR_IA32_MISC_ENABLE, v: misc_enable);
924	mce_wrmsrq(MSR_IA32_MCx_STATUS(`1`), v: `0`);
925
926	instrumentation_begin();
927	pr_err_once("Erratum detected, disable fast string copy instructions.\n");
928	instrumentation_end();
929
930	return true;
931	}
932
933	return false;
934	}
935
936	/*
937	* Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
938	* errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
939	*
940	* However, the context is still valid, so save the "cs" register for later use.
941	*
942	* The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
943	*
944	* The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
945	*/
946	static __always_inline void quirk_zen_ifu(int bank, struct mce m, struct* pt_regs *regs)
947	{
948	if (bank != `1`)
949	return;
950	if (!(m->status & MCI_STATUS_POISON))
951	return;
952
953	m->cs = regs->cs;
954	}
955
956	/*
957	* Do a quick check if any of the events requires a panic.
958	* This decides if we keep the events around or clear them.
959	*/
960	static __always_inline int mce_no_way_out(struct mce_hw_err err, char* *msg, unsigned* long *validp,
961	struct pt_regs *regs)
962	{
963	struct mce *m = &err->m;
964	char tmp = msg;
965	int i;
966
967	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
968	m->status = mce_rdmsrq(msr: mca_msr_reg(bank: i, reg: MCA_STATUS));
969	if (!(m->status & MCI_STATUS_VAL))
970	continue;
971
972	arch___set_bit(nr: i, addr: validp);
973	if (mce_flags.snb_ifu_quirk)
974	quirk_sandybridge_ifu(bank: i, m, regs);
975
976	if (mce_flags.zen_ifu_quirk)
977	quirk_zen_ifu(bank: i, m, regs);
978
979	m->bank = i;
980	if (mce_severity(a: m, regs, msg: &tmp, is_excp: true) >= MCE_PANIC_SEVERITY) {
981	mce_read_aux(err, i);
982	*msg = tmp;
983	return `1`;
984	}
985	}
986	return `0`;
987	}
988
989	/*
990	* Variable to establish order between CPUs while scanning.
991	* Each CPU spins initially until executing is equal its number.
992	*/
993	static atomic_t mce_executing;
994
995	/*
996	* Defines order of CPUs on entry. First CPU becomes Monarch.
997	*/
998	static atomic_t mce_callin;
999
1000	/*
1001	* Track which CPUs entered the MCA broadcast synchronization and which not in
1002	* order to print holdouts.
1003	*/
1004	static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
1005
1006	/*
1007	* Check if a timeout waiting for other CPUs happened.
1008	*/
1009	static noinstr int mce_timed_out(u64 t, const* char *msg)
1010	{
1011	int ret = `0`;
1012
1013	/ Enable instrumentation around calls to external facilities /
1014	instrumentation_begin();
1015
1016	/*
1017	* The others already did panic for some reason.
1018	* Bail out like in a timeout.
1019	* rmb() to tell the compiler that system_state
1020	* might have been modified by someone else.
1021	*/
1022	rmb();
1023	if (atomic_read(v: &mce_panicked))
1024	wait_for_panic();
1025	if (!mca_cfg.monarch_timeout)
1026	goto out;
1027	if ((s64)*t < SPINUNIT) {
1028	if (cpumask_and(dstp: &mce_missing_cpus, cpu_online_mask, src2p: &mce_missing_cpus))
1029	pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
1030	cpumask_pr_args(&mce_missing_cpus));
1031	mce_panic(msg, NULL, NULL);
1032
1033	ret = `1`;
1034	goto out;
1035	}
1036	*t -= SPINUNIT;
1037
1038	out:
1039	touch_nmi_watchdog();
1040
1041	instrumentation_end();
1042
1043	return ret;
1044	}
1045
1046	/*
1047	* The Monarch's reign. The Monarch is the CPU who entered
1048	* the machine check handler first. It waits for the others to
1049	* raise the exception too and then grades them. When any
1050	* error is fatal panic. Only then let the others continue.
1051	*
1052	* The other CPUs entering the MCE handler will be controlled by the
1053	* Monarch. They are called Subjects.
1054	*
1055	* This way we prevent any potential data corruption in a unrecoverable case
1056	* and also makes sure always all CPU's errors are examined.
1057	*
1058	* Also this detects the case of a machine check event coming from outer
1059	* space (not detected by any CPUs) In this case some external agent wants
1060	* us to shut down, so panic too.
1061	*
1062	* The other CPUs might still decide to panic if the handler happens
1063	* in a unrecoverable place, but in this case the system is in a semi-stable
1064	* state and won't corrupt anything by itself. It's ok to let the others
1065	* continue for a bit first.
1066	*
1067	* All the spin loops have timeouts; when a timeout happens a CPU
1068	* typically elects itself to be Monarch.
1069	*/
1070	static void mce_reign(void)
1071	{
1072	struct mce_hw_err *err = NULL;
1073	struct mce *m = NULL;
1074	int global_worst = `0`;
1075	char *msg = NULL;
1076	int cpu;
1077
1078	/*
1079	* This CPU is the Monarch and the other CPUs have run
1080	* through their handlers.
1081	* Grade the severity of the errors of all the CPUs.
1082	*/
1083	for_each_possible_cpu(cpu) {
1084	struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu);
1085	struct mce *mtmp = &etmp->m;
1086
1087	if (mtmp->severity > global_worst) {
1088	global_worst = mtmp->severity;
1089	err = &per_cpu(hw_errs_seen, cpu);
1090	m = &err->m;
1091	}
1092	}
1093
1094	/*
1095	* Cannot recover? Panic here then.
1096	* This dumps all the mces in the log buffer and stops the
1097	* other CPUs.
1098	*/
1099	if (m && global_worst >= MCE_PANIC_SEVERITY) {
1100	/ call mce_severity() to get "msg" for panic /
1101	mce_severity(a: m, NULL, msg: &msg, is_excp: true);
1102	mce_panic(msg: "Fatal machine check", final: err, exp: msg);
1103	}
1104
1105	/*
1106	* For UC somewhere we let the CPU who detects it handle it.
1107	* Also must let continue the others, otherwise the handling
1108	* CPU could deadlock on a lock.
1109	*/
1110
1111	/*
1112	* No machine check event found. Must be some external
1113	* source or one CPU is hung. Panic.
1114	*/
1115	if (global_worst <= MCE_KEEP_SEVERITY)
1116	mce_panic(msg: "Fatal machine check from unknown source", NULL, NULL);
1117
1118	/*
1119	* Now clear all the hw_errs_seen so that they don't reappear on
1120	* the next mce.
1121	*/
1122	for_each_possible_cpu(cpu)
1123	memset(s: &per_cpu(hw_errs_seen, cpu), c: `0`, n: sizeof(struct mce_hw_err));
1124	}
1125
1126	static atomic_t global_nwo;
1127
1128	/*
1129	* Start of Monarch synchronization. This waits until all CPUs have
1130	* entered the exception handler and then determines if any of them
1131	* saw a fatal event that requires panic. Then it executes them
1132	* in the entry order.
1133	* TBD double check parallel CPU hotunplug
1134	*/
1135	static noinstr int mce_start(int *no_way_out)
1136	{
1137	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
1138	int order, ret = -`1`;
1139
1140	if (!timeout)
1141	return ret;
1142
1143	raw_atomic_add(i: *no_way_out, v: &global_nwo);
1144	/*
1145	* Rely on the implied barrier below, such that global_nwo
1146	* is updated before mce_callin.
1147	*/
1148	order = raw_atomic_inc_return(v: &mce_callin);
1149	arch_cpumask_clear_cpu(smp_processor_id(), dstp: &mce_missing_cpus);
1150
1151	/ Enable instrumentation around calls to external facilities /
1152	instrumentation_begin();
1153
1154	/*
1155	* Wait for everyone.
1156	*/
1157	while (raw_atomic_read(v: &mce_callin) != num_online_cpus()) {
1158	if (mce_timed_out(t: &timeout,
1159	msg: "Timeout: Not all CPUs entered broadcast exception handler")) {
1160	raw_atomic_set(v: &global_nwo, i: `0`);
1161	goto out;
1162	}
1163	ndelay(SPINUNIT);
1164	}
1165
1166	/*
1167	* mce_callin should be read before global_nwo
1168	*/
1169	smp_rmb();
1170
1171	if (order == `1`) {
1172	/*
1173	* Monarch: Starts executing now, the others wait.
1174	*/
1175	raw_atomic_set(v: &mce_executing, i: `1`);
1176	} else {
1177	/*
1178	* Subject: Now start the scanning loop one by one in
1179	* the original callin order.
1180	* This way when there are any shared banks it will be
1181	* only seen by one CPU before cleared, avoiding duplicates.
1182	*/
1183	while (raw_atomic_read(v: &mce_executing) < order) {
1184	if (mce_timed_out(t: &timeout,
1185	msg: "Timeout: Subject CPUs unable to finish machine check processing")) {
1186	raw_atomic_set(v: &global_nwo, i: `0`);
1187	goto out;
1188	}
1189	ndelay(SPINUNIT);
1190	}
1191	}
1192
1193	/*
1194	* Cache the global no_way_out state.
1195	*/
1196	*no_way_out = raw_atomic_read(v: &global_nwo);
1197
1198	ret = order;
1199
1200	out:
1201	instrumentation_end();
1202
1203	return ret;
1204	}
1205
1206	/*
1207	* Synchronize between CPUs after main scanning loop.
1208	* This invokes the bulk of the Monarch processing.
1209	*/
1210	static noinstr int mce_end(int order)
1211	{
1212	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
1213	int ret = -`1`;
1214
1215	/ Allow instrumentation around external facilities. /
1216	instrumentation_begin();
1217
1218	if (!timeout)
1219	goto reset;
1220	if (order < `0`)
1221	goto reset;
1222
1223	/*
1224	* Allow others to run.
1225	*/
1226	atomic_inc(v: &mce_executing);
1227
1228	if (order == `1`) {
1229	/*
1230	* Monarch: Wait for everyone to go through their scanning
1231	* loops.
1232	*/
1233	while (atomic_read(v: &mce_executing) <= num_online_cpus()) {
1234	if (mce_timed_out(t: &timeout,
1235	msg: "Timeout: Monarch CPU unable to finish machine check processing"))
1236	goto reset;
1237	ndelay(SPINUNIT);
1238	}
1239
1240	mce_reign();
1241	barrier();
1242	ret = `0`;
1243	} else {
1244	/*
1245	* Subject: Wait for Monarch to finish.
1246	*/
1247	while (atomic_read(v: &mce_executing) != `0`) {
1248	if (mce_timed_out(t: &timeout,
1249	msg: "Timeout: Monarch CPU did not finish machine check processing"))
1250	goto reset;
1251	ndelay(SPINUNIT);
1252	}
1253
1254	/*
1255	* Don't reset anything. That's done by the Monarch.
1256	*/
1257	ret = `0`;
1258	goto out;
1259	}
1260
1261	/*
1262	* Reset all global state.
1263	*/
1264	reset:
1265	atomic_set(v: &global_nwo, i: `0`);
1266	atomic_set(v: &mce_callin, i: `0`);
1267	cpumask_setall(dstp: &mce_missing_cpus);
1268	barrier();
1269
1270	/*
1271	* Let others run again.
1272	*/
1273	atomic_set(v: &mce_executing, i: `0`);
1274
1275	out:
1276	instrumentation_end();
1277
1278	return ret;
1279	}
1280
1281	static __always_inline void mce_clear_state(unsigned long *toclear)
1282	{
1283	int i;
1284
1285	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
1286	if (arch_test_bit(nr: i, addr: toclear))
1287	mce_wrmsrq(msr: mca_msr_reg(bank: i, reg: MCA_STATUS), v: `0`);
1288	}
1289	}
1290
1291	/*
1292	* Cases where we avoid rendezvous handler timeout:
1293	* 1) If this CPU is offline.
1294	*
1295	* 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
1296	* skip those CPUs which remain looping in the 1st kernel - see
1297	* crash_nmi_callback().
1298	*
1299	* Note: there still is a small window between kexec-ing and the new,
1300	* kdump kernel establishing a new #MC handler where a broadcasted MCE
1301	* might not get handled properly.
1302	*/
1303	static noinstr bool mce_check_crashing_cpu(void)
1304	{
1305	unsigned int cpu = smp_processor_id();
1306
1307	if (arch_cpu_is_offline(cpu) \|\|
1308	(crashing_cpu != -`1` && crashing_cpu != cpu)) {
1309	u64 mcgstatus;
1310
1311	mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS);
1312
1313	if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
1314	if (mcgstatus & MCG_STATUS_LMCES)
1315	return false;
1316	}
1317
1318	if (mcgstatus & MCG_STATUS_RIPV) {
1319	native_wrmsrq(MSR_IA32_MCG_STATUS, `0`);
1320	return true;
1321	}
1322	}
1323	return false;
1324	}
1325
1326	static __always_inline int
1327	__mc_scan_banks(struct mce_hw_err err, struct* pt_regs *regs,
1328	struct mce_hw_err final, unsigned* long *toclear,
1329	unsigned long valid_banks, int* no_way_out, int *worst)
1330	{
1331	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1332	struct mca_config *cfg = &mca_cfg;
1333	int severity, i, taint = `0`;
1334	struct mce *m = &err->m;
1335
1336	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
1337	arch___clear_bit(nr: i, addr: toclear);
1338	if (!arch_test_bit(nr: i, addr: valid_banks))
1339	continue;
1340
1341	if (!mce_banks[i].ctl)
1342	continue;
1343
1344	m->misc = `0`;
1345	m->addr = `0`;
1346	m->bank = i;
1347
1348	m->status = mce_rdmsrq(msr: mca_msr_reg(bank: i, reg: MCA_STATUS));
1349	if (!(m->status & MCI_STATUS_VAL))
1350	continue;
1351
1352	/*
1353	* Corrected or non-signaled errors are handled by
1354	* machine_check_poll(). Leave them alone, unless this panics.
1355	*/
1356	if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1357	!no_way_out)
1358	continue;
1359
1360	/ Set taint even when machine check was not enabled. /
1361	taint++;
1362
1363	severity = mce_severity(a: m, regs, NULL, is_excp: true);
1364
1365	/*
1366	* When machine check was for corrected/deferred handler don't
1367	* touch, unless we're panicking.
1368	*/
1369	if ((severity == MCE_KEEP_SEVERITY \|\|
1370	severity == MCE_UCNA_SEVERITY) && !no_way_out)
1371	continue;
1372
1373	arch___set_bit(nr: i, addr: toclear);
1374
1375	/ Machine check event was not enabled. Clear, but ignore. /
1376	if (severity == MCE_NO_SEVERITY)
1377	continue;
1378
1379	mce_read_aux(err, i);
1380
1381	/ assuming valid severity level != 0 /
1382	m->severity = severity;
1383
1384	/*
1385	* Enable instrumentation around the mce_log() call which is
1386	* done in #MC context, where instrumentation is disabled.
1387	*/
1388	instrumentation_begin();
1389	mce_log(err);
1390	instrumentation_end();
1391
1392	if (severity > *worst) {
1393	final = err;
1394	*worst = severity;
1395	}
1396	}
1397
1398	/ mce_clear_state will clear final, save locally for use later /*
1399	err = final;
1400
1401	return taint;
1402	}
1403
1404	static void kill_me_now(struct callback_head *ch)
1405	{
1406	struct task_struct p = container_of(ch, struct* task_struct, mce_kill_me);
1407
1408	p->mce_count = `0`;
1409	force_sig(SIGBUS);
1410	}
1411
1412	static void kill_me_maybe(struct callback_head *cb)
1413	{
1414	struct task_struct p = container_of(cb, struct* task_struct, mce_kill_me);
1415	int flags = MF_ACTION_REQUIRED;
1416	unsigned long pfn;
1417	int ret;
1418
1419	p->mce_count = `0`;
1420	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
1421
1422	if (!p->mce_ripv)
1423	flags \|= MF_MUST_KILL;
1424
1425	pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
1426	ret = memory_failure(pfn, flags);
1427	if (!ret) {
1428	set_mce_nospec(pfn);
1429	sync_core();
1430	return;
1431	}
1432
1433	/*
1434	* -EHWPOISON from memory_failure() means that it already sent SIGBUS
1435	* to the current process with the proper error info,
1436	* -EOPNOTSUPP means hwpoison_filter() filtered the error event,
1437	*
1438	* In both cases, no further processing is required.
1439	*/
1440	if (ret == -EHWPOISON \|\| ret == -EOPNOTSUPP)
1441	return;
1442
1443	pr_err("Memory error not recovered");
1444	kill_me_now(ch: cb);
1445	}
1446
1447	static void kill_me_never(struct callback_head *cb)
1448	{
1449	struct task_struct p = container_of(cb, struct* task_struct, mce_kill_me);
1450	unsigned long pfn;
1451
1452	p->mce_count = `0`;
1453	pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
1454	pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
1455	if (!memory_failure(pfn, flags: `0`))
1456	set_mce_nospec(pfn);
1457	}
1458
1459	static void queue_task_work(struct mce_hw_err err, char* msg, void* (func)(struct* callback_head *))
1460	{
1461	int count = ++current->mce_count;
1462	struct mce *m = &err->m;
1463
1464	/ First call, save all the details /
1465	if (count == `1`) {
1466	current->mce_addr = m->addr;
1467	current->mce_kflags = m->kflags;
1468	current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
1469	current->mce_whole_page = whole_page(m);
1470	current->mce_kill_me.func = func;
1471	}
1472
1473	/ Ten is likely overkill. Don't expect more than two faults before task_work() /
1474	if (count > `10`)
1475	mce_panic(msg: "Too many consecutive machine checks while accessing user data",
1476	final: err, exp: msg);
1477
1478	/ Second or later call, make sure page address matches the one from first call /
1479	if (count > `1` && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
1480	mce_panic(msg: "Consecutive machine checks to different user pages", final: err, exp: msg);
1481
1482	/ Do not call task_work_add() more than once /
1483	if (count > `1`)
1484	return;
1485
1486	task_work_add(current, twork: &current->mce_kill_me, mode: TWA_RESUME);
1487	}
1488
1489	/ Handle unconfigured int18 (should never happen) /
1490	static noinstr void unexpected_machine_check(struct pt_regs *regs)
1491	{
1492	instrumentation_begin();
1493	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1494	smp_processor_id());
1495	instrumentation_end();
1496	}
1497
1498	/*
1499	* The actual machine check handler. This only handles real exceptions when
1500	* something got corrupted coming in through int 18.
1501	*
1502	* This is executed in #MC context not subject to normal locking rules.
1503	* This implies that most kernel services cannot be safely used. Don't even
1504	* think about putting a printk in there!
1505	*
1506	* On Intel systems this is entered on all CPUs in parallel through
1507	* MCE broadcast. However some CPUs might be broken beyond repair,
1508	* so be always careful when synchronizing with others.
1509	*
1510	* Tracing and kprobes are disabled: if we interrupted a kernel context
1511	* with IF=1, we need to minimize stack usage. There are also recursion
1512	* issues: if the machine check was due to a failure of the memory
1513	* backing the user stack, tracing that reads the user stack will cause
1514	* potentially infinite recursion.
1515	*
1516	* Currently, the #MC handler calls out to a number of external facilities
1517	* and, therefore, allows instrumentation around them. The optimal thing to
1518	* have would be to do the absolutely minimal work required in #MC context
1519	* and have instrumentation disabled only around that. Further processing can
1520	* then happen in process context where instrumentation is allowed. Achieving
1521	* that requires careful auditing and modifications. Until then, the code
1522	* allows instrumentation temporarily, where required. *
1523	*/
1524	noinstr void do_machine_check(struct pt_regs *regs)
1525	{
1526	int worst = `0`, order, no_way_out, kill_current_task, lmce, taint = `0`;
1527	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { `0` };
1528	DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { `0` };
1529	struct mce_hw_err *final;
1530	struct mce_hw_err err;
1531	char *msg = NULL;
1532	struct mce *m;
1533
1534	if (unlikely(mce_flags.p5))
1535	return pentium_machine_check(regs);
1536	else if (unlikely(mce_flags.winchip))
1537	return winchip_machine_check(regs);
1538	else if (unlikely(!mca_cfg.initialized))
1539	return unexpected_machine_check(regs);
1540
1541	if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
1542	goto clear;
1543
1544	/*
1545	* Establish sequential order between the CPUs entering the machine
1546	* check handler.
1547	*/
1548	order = -`1`;
1549
1550	/*
1551	* If no_way_out gets set, there is no safe way to recover from this
1552	* MCE.
1553	*/
1554	no_way_out = `0`;
1555
1556	/*
1557	* If kill_current_task is not set, there might be a way to recover from this
1558	* error.
1559	*/
1560	kill_current_task = `0`;
1561
1562	/*
1563	* MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1564	* on Intel.
1565	*/
1566	lmce = `1`;
1567
1568	this_cpu_inc(mce_exception_count);
1569
1570	mce_gather_info(err: &err, regs);
1571	m = &err.m;
1572	m->tsc = rdtsc();
1573
1574	final = this_cpu_ptr(&hw_errs_seen);
1575	*final = err;
1576
1577	no_way_out = mce_no_way_out(err: &err, msg: &msg, validp: valid_banks, regs);
1578
1579	barrier();
1580
1581	/*
1582	* When no restart IP might need to kill or panic.
1583	* Assume the worst for now, but if we find the
1584	* severity is MCE_AR_SEVERITY we have other options.
1585	*/
1586	if (!(m->mcgstatus & MCG_STATUS_RIPV))
1587	kill_current_task = `1`;
1588	/*
1589	* Check if this MCE is signaled to only this logical processor,
1590	* on Intel, Zhaoxin only.
1591	*/
1592	if (m->cpuvendor == X86_VENDOR_INTEL \|\|
1593	m->cpuvendor == X86_VENDOR_ZHAOXIN)
1594	lmce = m->mcgstatus & MCG_STATUS_LMCES;
1595
1596	/*
1597	* Local machine check may already know that we have to panic.
1598	* Broadcast machine check begins rendezvous in mce_start()
1599	* Go through all banks in exclusion of the other CPUs. This way we
1600	* don't report duplicated events on shared banks because the first one
1601	* to see it will clear it.
1602	*/
1603	if (lmce) {
1604	if (no_way_out)
1605	mce_panic(msg: "Fatal local machine check", final: &err, exp: msg);
1606	} else {
1607	order = mce_start(no_way_out: &no_way_out);
1608	}
1609
1610	taint = __mc_scan_banks(err: &err, regs, final, toclear, valid_banks, no_way_out, worst: &worst);
1611
1612	if (!no_way_out)
1613	mce_clear_state(toclear);
1614
1615	/*
1616	* Do most of the synchronization with other CPUs.
1617	* When there's any problem use only local no_way_out state.
1618	*/
1619	if (!lmce) {
1620	if (mce_end(order) < `0`) {
1621	if (!no_way_out)
1622	no_way_out = worst >= MCE_PANIC_SEVERITY;
1623
1624	if (no_way_out)
1625	mce_panic(msg: "Fatal machine check on current CPU", final: &err, exp: msg);
1626	}
1627	} else {
1628	/*
1629	* If there was a fatal machine check we should have
1630	* already called mce_panic earlier in this function.
1631	* Since we re-read the banks, we might have found
1632	* something new. Check again to see if we found a
1633	* fatal error. We call "mce_severity()" again to
1634	* make sure we have the right "msg".
1635	*/
1636	if (worst >= MCE_PANIC_SEVERITY) {
1637	mce_severity(a: m, regs, msg: &msg, is_excp: true);
1638	mce_panic(msg: "Local fatal machine check!", final: &err, exp: msg);
1639	}
1640	}
1641
1642	/*
1643	* Enable instrumentation around the external facilities like task_work_add()
1644	* (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
1645	* properly would need a lot more involved reorganization.
1646	*/
1647	instrumentation_begin();
1648
1649	if (taint)
1650	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1651
1652	if (worst != MCE_AR_SEVERITY && !kill_current_task)
1653	goto out;
1654
1655	/ Fault was in user mode and we need to take some action /
1656	if ((m->cs & `3`) == `3`) {
1657	/ If this triggers there is no way to recover. Die hard. /
1658	BUG_ON(!on_thread_stack() \|\| !user_mode(regs));
1659
1660	if (!mce_usable_address(m))
1661	queue_task_work(err: &err, msg, func: kill_me_now);
1662	else
1663	queue_task_work(err: &err, msg, func: kill_me_maybe);
1664
1665	} else if (m->mcgstatus & MCG_STATUS_SEAM_NR) {
1666	/*
1667	* Saved RIP on stack makes it look like the machine check
1668	* was taken in the kernel on the instruction following
1669	* the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
1670	* that the machine check was taken inside SEAM non-root
1671	* mode. CPU core has already marked that guest as dead.
1672	* It is OK for the kernel to resume execution at the
1673	* apparent point of the machine check as the fault did
1674	* not occur there. Mark the page as poisoned so it won't
1675	* be added to free list when the guest is terminated.
1676	*/
1677	if (mce_usable_address(m)) {
1678	struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT);
1679
1680	if (p)
1681	SetPageHWPoison(p);
1682	}
1683	} else {
1684	/*
1685	* Handle an MCE which has happened in kernel space but from
1686	* which the kernel can recover: ex_has_fault_handler() has
1687	* already verified that the rIP at which the error happened is
1688	* a rIP from which the kernel can recover (by jumping to
1689	* recovery code specified in _ASM_EXTABLE_FAULT()) and the
1690	* corresponding exception handler which would do that is the
1691	* proper one.
1692	*/
1693	if (m->kflags & MCE_IN_KERNEL_RECOV) {
1694	if (!fixup_exception(regs, X86_TRAP_MC, error_code: `0`, fault_addr: `0`))
1695	mce_panic(msg: "Failed kernel mode recovery", final: &err, exp: msg);
1696	}
1697
1698	if (m->kflags & MCE_IN_KERNEL_COPYIN)
1699	queue_task_work(err: &err, msg, func: kill_me_never);
1700	}
1701
1702	out:
1703	instrumentation_end();
1704
1705	clear:
1706	mce_wrmsrq(MSR_IA32_MCG_STATUS, v: `0`);
1707	}
1708	EXPORT_SYMBOL_GPL(do_machine_check);
1709
1710	#ifndef CONFIG_MEMORY_FAILURE
1711	int memory_failure(unsigned long pfn, int flags)
1712	{
1713	/ mce_severity() should not hand us an ACTION_REQUIRED error /
1714	BUG_ON(flags & MF_ACTION_REQUIRED);
1715	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1716	"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1717	pfn);
1718
1719	return `0`;
1720	}
1721	#endif
1722
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
/* Upper bound of the polling period, in seconds (multiplied by HZ where used). */
static unsigned long check_interval = INITIAL_CHECK_INTERVAL;

/* Per-CPU period to use for the next poll. */
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
/* Per-CPU polling timer; its callback is mce_timer_fn(). */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
1732
1733	static void __start_timer(struct timer_list t, unsigned* long interval)
1734	{
1735	unsigned long when = jiffies + interval;
1736	unsigned long flags;
1737
1738	local_irq_save(flags);
1739
1740	if (!timer_pending(timer: t) \|\| time_before(when, t->expires))
1741	mod_timer(timer: t, expires: round_jiffies(j: when));
1742
1743	local_irq_restore(flags);
1744	}
1745
1746	static void mc_poll_banks_default(void)
1747	{
1748	machine_check_poll(`0`, this_cpu_ptr(&mce_poll_banks));
1749	}
1750
1751	void (mc_poll_banks)(void*) = mc_poll_banks_default;
1752
1753	static bool should_enable_timer(unsigned long iv)
1754	{
1755	return !mca_cfg.ignore_ce && iv;
1756	}
1757
1758	static void mce_timer_fn(struct timer_list *t)
1759	{
1760	struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
1761	unsigned long iv;
1762
1763	WARN_ON(cpu_t != t);
1764
1765	iv = __this_cpu_read(mce_next_interval);
1766
1767	if (mce_available(this_cpu_ptr(&cpu_info)))
1768	mc_poll_banks();
1769
1770	/*
1771	* Alert userspace if needed. If we logged an MCE, reduce the polling
1772	* interval, otherwise increase the polling interval.
1773	*/
1774	if (mce_notify_irq())
1775	iv = max(iv / `2`, (unsigned long) HZ/`100`);
1776	else
1777	iv = min(iv * `2`, round_jiffies_relative(check_interval * HZ));
1778
1779	if (mce_get_storm_mode()) {
1780	__start_timer(t, HZ);
1781	} else if (should_enable_timer(iv)) {
1782	__this_cpu_write(mce_next_interval, iv);
1783	__start_timer(t, interval: iv);
1784	}
1785	}
1786
1787	/*
1788	* When a storm starts on any bank on this CPU, switch to polling
1789	* once per second. When the storm ends, revert to the default
1790	* polling interval.
1791	*/
1792	void mce_timer_kick(bool storm)
1793	{
1794	struct timer_list *t = this_cpu_ptr(&mce_timer);
1795
1796	mce_set_storm_mode(storm);
1797
1798	if (storm)
1799	__start_timer(t, HZ);
1800	else
1801	__this_cpu_write(mce_next_interval, check_interval * HZ);
1802	}
1803
1804	/ Must not be called in IRQ context where timer_delete_sync() can deadlock /
1805	static void mce_timer_delete_all(void)
1806	{
1807	int cpu;
1808
1809	for_each_online_cpu(cpu)
1810	timer_delete_sync(timer: &per_cpu(mce_timer, cpu));
1811	}
1812
1813	static void __mcheck_cpu_mce_banks_init(void)
1814	{
1815	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1816	u8 n_banks = this_cpu_read(mce_num_banks);
1817	int i;
1818
1819	for (i = `0`; i < n_banks; i++) {
1820	struct mce_bank *b = &mce_banks[i];
1821
1822	/*
1823	* Init them all by default.
1824	*
1825	* The required vendor quirks will be applied before
1826	* __mcheck_cpu_init_prepare_banks() does the final bank setup.
1827	*/
1828	b->ctl = -`1ULL`;
1829	b->init = true;
1830	}
1831	}
1832
1833	/*
1834	* Initialize Machine Checks for a CPU.
1835	*/
1836	static void __mcheck_cpu_cap_init(void)
1837	{
1838	u64 cap;
1839	u8 b;
1840
1841	rdmsrq(MSR_IA32_MCG_CAP, cap);
1842
1843	b = cap & MCG_BANKCNT_MASK;
1844
1845	if (b > MAX_NR_BANKS) {
1846	pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
1847	smp_processor_id(), MAX_NR_BANKS, b);
1848	b = MAX_NR_BANKS;
1849	}
1850
1851	this_cpu_write(mce_num_banks, b);
1852
1853	__mcheck_cpu_mce_banks_init();
1854	}
1855
1856	static void __mcheck_cpu_init_generic(void)
1857	{
1858	u64 cap;
1859
1860	rdmsrq(MSR_IA32_MCG_CAP, cap);
1861	if (cap & MCG_CTL_P)
1862	wrmsr(MSR_IA32_MCG_CTL, low: `0xffffffff`, high: `0xffffffff`);
1863	}
1864
1865	static void __mcheck_cpu_init_prepare_banks(void)
1866	{
1867	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1868	u64 msrval;
1869	int i;
1870
1871	/*
1872	* Log the machine checks left over from the previous reset. Log them
1873	* only, do not start processing them. That will happen in mcheck_late_init()
1874	* when all consumers have been registered on the notifier chain.
1875	*/
1876	if (mca_cfg.bootlog) {
1877	mce_banks_t all_banks;
1878
1879	bitmap_fill(dst: all_banks, MAX_NR_BANKS);
1880	machine_check_poll(MCP_UC \| MCP_QUEUE_LOG, &all_banks);
1881	}
1882
1883	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
1884	struct mce_bank *b = &mce_banks[i];
1885
1886	if (!b->init)
1887	continue;
1888
1889	wrmsrq(msr: mca_msr_reg(bank: i, reg: MCA_CTL), val: b->ctl);
1890	wrmsrq(msr: mca_msr_reg(bank: i, reg: MCA_STATUS), val: `0`);
1891
1892	rdmsrq(mca_msr_reg(i, MCA_CTL), msrval);
1893	b->init = !!msrval;
1894	}
1895	}
1896
1897	static void amd_apply_global_quirks(struct cpuinfo_x86 *c)
1898	{
1899	if (c->x86 < `0x11` && mca_cfg.bootlog < `0`) {
1900	/*
1901	* Lots of broken BIOS around that don't clear them
1902	* by default and leave crap in there. Don't log:
1903	*/
1904	mca_cfg.bootlog = `0`;
1905	}
1906
1907	/*
1908	* overflow_recov is supported for F15h Models 00h-0fh
1909	* even though we don't have a CPUID bit for it.
1910	*/
1911	if (c->x86 == `0x15` && c->x86_model <= `0xf`)
1912	mce_flags.overflow_recov = `1`;
1913
1914	if (c->x86 >= `0x17` && c->x86 <= `0x1A`)
1915	mce_flags.zen_ifu_quirk = `1`;
1916	}
1917
1918	static void intel_apply_global_quirks(struct cpuinfo_x86 *c)
1919	{
1920	/ Older CPUs (prior to family 6) don't need quirks. /
1921	if (c->x86_vfm < INTEL_PENTIUM_PRO)
1922	return;
1923
1924	/*
1925	* All newer Intel systems support MCE broadcasting. Enable
1926	* synchronization with a one second timeout.
1927	*/
1928	if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < `0`)
1929	mca_cfg.monarch_timeout = USEC_PER_SEC;
1930
1931	/*
1932	* There are also broken BIOSes on some Pentium M and
1933	* earlier systems:
1934	*/
1935	if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < `0`)
1936	mca_cfg.bootlog = `0`;
1937
1938	if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
1939	mce_flags.snb_ifu_quirk = `1`;
1940
1941	/*
1942	* Skylake, Cascacde Lake and Cooper Lake require a quirk on
1943	* rep movs.
1944	*/
1945	if (c->x86_vfm == INTEL_SKYLAKE_X)
1946	mce_flags.skx_repmov_quirk = `1`;
1947	}
1948
1949	static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c)
1950	{
1951	/*
1952	* All newer Zhaoxin CPUs support MCE broadcasting. Enable
1953	* synchronization with a one second timeout.
1954	*/
1955	if (c->x86 > `6` \|\| (c->x86_model == `0x19` \|\| c->x86_model == `0x1f`)) {
1956	if (mca_cfg.monarch_timeout < `0`)
1957	mca_cfg.monarch_timeout = USEC_PER_SEC;
1958	}
1959	}
1960
1961	static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1962	{
1963	if (c->x86 != `5`)
1964	return false;
1965
1966	switch (c->x86_vendor) {
1967	case X86_VENDOR_INTEL:
1968	intel_p5_mcheck_init(c);
1969	mce_flags.p5 = `1`;
1970	return true;
1971	case X86_VENDOR_CENTAUR:
1972	winchip_mcheck_init(c);
1973	mce_flags.winchip = `1`;
1974	return true;
1975	default:
1976	return false;
1977	}
1978
1979	return false;
1980	}
1981
1982	static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
1983	{
1984	struct mca_config *cfg = &mca_cfg;
1985
1986	/*
1987	* All newer Centaur CPUs support MCE broadcasting. Enable
1988	* synchronization with a one second timeout.
1989	*/
1990	if ((c->x86 == `6` && c->x86_model == `0xf` && c->x86_stepping >= `0xe`) \|\|
1991	c->x86 > `6`) {
1992	if (cfg->monarch_timeout < `0`)
1993	cfg->monarch_timeout = USEC_PER_SEC;
1994	}
1995	}
1996
1997	static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
1998	{
1999	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2000
2001	/*
2002	* These CPUs have MCA bank 8 which reports only one error type called
2003	* SVAD (System View Address Decoder). The reporting of that error is
2004	* controlled by IA32_MC8.CTL.0.
2005	*
2006	* If enabled, prefetching on these CPUs will cause SVAD MCE when
2007	* virtual machines start and result in a system panic. Always disable
2008	* bank 8 SVAD error by default.
2009	*/
2010	if ((c->x86 == `7` && c->x86_model == `0x1b`) \|\|
2011	(c->x86_model == `0x19` \|\| c->x86_model == `0x1f`)) {
2012	if (this_cpu_read(mce_num_banks) > `8`)
2013	mce_banks[`8`].ctl = `0`;
2014	}
2015
2016	intel_init_cmci();
2017	intel_init_lmce();
2018	}
2019
/*
 * Undo the Zhaoxin-specific opt-ins. Only intel_clear_lmce() is called;
 * @c is unused.
 */
static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
{
	intel_clear_lmce();
}
2024
2025	static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
2026	{
2027	switch (c->x86_vendor) {
2028	case X86_VENDOR_INTEL:
2029	mce_intel_feature_init(c);
2030	break;
2031
2032	case X86_VENDOR_AMD:
2033	case X86_VENDOR_HYGON:
2034	mce_amd_feature_init(c);
2035	break;
2036
2037	case X86_VENDOR_CENTAUR:
2038	mce_centaur_feature_init(c);
2039	break;
2040
2041	case X86_VENDOR_ZHAOXIN:
2042	mce_zhaoxin_feature_init(c);
2043	break;
2044
2045	default:
2046	break;
2047	}
2048	}
2049
2050	static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2051	{
2052	switch (c->x86_vendor) {
2053	case X86_VENDOR_INTEL:
2054	mce_intel_feature_clear(c);
2055	break;
2056
2057	case X86_VENDOR_ZHAOXIN:
2058	mce_zhaoxin_feature_clear(c);
2059	break;
2060
2061	default:
2062	break;
2063	}
2064	}
2065
2066	static void mce_start_timer(struct timer_list *t)
2067	{
2068	unsigned long iv = check_interval * HZ;
2069
2070	if (should_enable_timer(iv)) {
2071	this_cpu_write(mce_next_interval, iv);
2072	__start_timer(t, interval: iv);
2073	}
2074	}
2075
/*
 * Initialize this CPU's polling timer with mce_timer_fn() as its callback
 * and the TIMER_PINNED flag. The timer is not started here.
 */
static void __mcheck_cpu_setup_timer(void)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);

	timer_setup(t, mce_timer_fn, TIMER_PINNED);
}
2082
/*
 * Initialize this CPU's polling timer (callback mce_timer_fn(),
 * TIMER_PINNED) and start it through mce_start_timer().
 */
static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);

	timer_setup(t, mce_timer_fn, TIMER_PINNED);
	mce_start_timer(t);
}
2090
2091	bool filter_mce(struct mce *m)
2092	{
2093	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
2094	return amd_filter_mce(m);
2095	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2096	return intel_filter_mce(m);
2097
2098	return false;
2099	}
2100
/*
 * #MC path for a machine check taken in kernel mode: runs
 * do_machine_check() between irqentry_nmi_enter() and irqentry_nmi_exit().
 */
static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
	irqentry_state_t irq_state;

	/* This path is only meant for register frames from kernel mode. */
	WARN_ON_ONCE(user_mode(regs));

	/*
	 * Only required when from kernel mode. See
	 * mce_check_crashing_cpu() for details.
	 */
	if (mca_cfg.initialized && mce_check_crashing_cpu())
		return;

	irq_state = irqentry_nmi_enter(regs);

	do_machine_check(regs);

	irqentry_nmi_exit(regs, irq_state);
}
2120
/*
 * #MC path for a machine check taken in user mode: runs
 * do_machine_check() between irqentry_enter_from_user_mode() and
 * irqentry_exit_to_user_mode().
 */
static __always_inline void exc_machine_check_user(struct pt_regs *regs)
{
	irqentry_enter_from_user_mode(regs);

	do_machine_check(regs);

	irqentry_exit_to_user_mode(regs);
}
2129
2130	#ifdef CONFIG_X86_64
2131	/ MCE hit kernel mode /
2132	DEFINE_IDTENTRY_MCE(exc_machine_check)
2133	{
2134	unsigned long dr7;
2135
2136	dr7 = local_db_save();
2137	exc_machine_check_kernel(regs);
2138	local_db_restore(dr7);
2139	}
2140
2141	/ The user mode variant. /
2142	DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
2143	{
2144	unsigned long dr7;
2145
2146	dr7 = local_db_save();
2147	exc_machine_check_user(regs);
2148	local_db_restore(dr7);
2149	}
2150
2151	#ifdef CONFIG_X86_FRED
2152	/*
2153	* When occurred on different ring level, i.e., from user or kernel
2154	* context, #MCE needs to be handled on different stack: User #MCE
2155	* on current task stack, while kernel #MCE on a dedicated stack.
2156	*
2157	* This is exactly how FRED event delivery invokes an exception
2158	* handler: ring 3 event on level 0 stack, i.e., current task stack;
2159	* ring 0 event on the #MCE dedicated stack specified in the
2160	* IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry
2161	* stub doesn't do stack switch.
2162	*/
2163	DEFINE_FREDENTRY_MCE(exc_machine_check)
2164	{
2165	unsigned long dr7;
2166
2167	dr7 = local_db_save();
2168	if (user_mode(regs))
2169	exc_machine_check_user(regs);
2170	else
2171	exc_machine_check_kernel(regs);
2172	local_db_restore(dr7);
2173	}
2174	#endif
2175	#else
2176	/ 32bit unified entry point /
2177	DEFINE_IDTENTRY_RAW(exc_machine_check)
2178	{
2179	unsigned long dr7;
2180
2181	dr7 = local_db_save();
2182	if (user_mode(regs))
2183	exc_machine_check_user(regs);
2184	else
2185	exc_machine_check_kernel(regs);
2186	local_db_restore(dr7);
2187	}
2188	#endif
2189
2190	void mca_bsp_init(struct cpuinfo_x86 *c)
2191	{
2192	u64 cap;
2193
2194	if (!mce_available(c))
2195	return;
2196
2197	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
2198	mca_cfg.disabled = `1`;
2199	pr_info("unknown CPU type - not enabling MCE support\n");
2200	return;
2201	}
2202
2203	mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV);
2204	mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR);
2205	mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA);
2206
2207	if (mce_flags.smca)
2208	smca_bsp_init();
2209
2210	rdmsrq(MSR_IA32_MCG_CAP, cap);
2211
2212	/ Use accurate RIP reporting if available. /
2213	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= `9`)
2214	mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
2215
2216	if (cap & MCG_SER_P)
2217	mca_cfg.ser = `1`;
2218
2219	switch (c->x86_vendor) {
2220	case X86_VENDOR_AMD:
2221	amd_apply_global_quirks(c);
2222	break;
2223	case X86_VENDOR_INTEL:
2224	intel_apply_global_quirks(c);
2225	break;
2226	case X86_VENDOR_ZHAOXIN:
2227	zhaoxin_apply_global_quirks(c);
2228	break;
2229	}
2230
2231	if (mca_cfg.monarch_timeout < `0`)
2232	mca_cfg.monarch_timeout = `0`;
2233	if (mca_cfg.bootlog != `0`)
2234	mca_cfg.panic_timeout = `30`;
2235	}
2236
2237	/*
2238	* Called for each booted CPU to set up machine checks.
2239	* Must be called with preempt off:
2240	*/
2241	void mcheck_cpu_init(struct cpuinfo_x86 *c)
2242	{
2243	if (mca_cfg.disabled)
2244	return;
2245
2246	if (__mcheck_cpu_ancient_init(c))
2247	return;
2248
2249	if (!mce_available(c))
2250	return;
2251
2252	__mcheck_cpu_cap_init();
2253
2254	if (!mce_gen_pool_init()) {
2255	mca_cfg.disabled = `1`;
2256	pr_emerg("Couldn't allocate MCE records pool!\n");
2257	return;
2258	}
2259
2260	mca_cfg.initialized = `1`;
2261
2262	__mcheck_cpu_init_generic();
2263	__mcheck_cpu_init_vendor(c);
2264	__mcheck_cpu_init_prepare_banks();
2265	__mcheck_cpu_setup_timer();
2266	cr4_set_bits(X86_CR4_MCE);
2267	}
2268
2269	/*
2270	* Called for each booted CPU to clear some machine checks opt-ins
2271	*/
2272	void mcheck_cpu_clear(struct cpuinfo_x86 *c)
2273	{
2274	if (mca_cfg.disabled)
2275	return;
2276
2277	if (!mce_available(c))
2278	return;
2279
2280	/*
2281	* Possibly to clear general settings generic to x86
2282	* __mcheck_cpu_clear_generic(c);
2283	*/
2284	__mcheck_cpu_clear_vendor(c);
2285
2286	}
2287
2288	static void __mce_disable_bank(void *arg)
2289	{
2290	int bank = ((int* *)arg);
2291	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
2292	cmci_disable_bank(bank);
2293	}
2294
2295	void mce_disable_bank(int bank)
2296	{
2297	if (bank >= this_cpu_read(mce_num_banks)) {
2298	pr_warn(FW_BUG
2299	"Ignoring request to disable invalid MCA bank %d.\n",
2300	bank);
2301	return;
2302	}
2303	set_bit(nr: bank, addr: mce_banks_ce_disabled);
2304	on_each_cpu(func: __mce_disable_bank, info: &bank, wait: `1`);
2305	}
2306
2307	/*
2308	* mce=off Disables machine check
2309	* mce=no_cmci Disables CMCI
2310	* mce=no_lmce Disables LMCE
2311	* mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2312	* mce=print_all Print all machine check logs to console
2313	* mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
2314	* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2315	* monarchtimeout is how long to wait for other CPUs on machine
2316	* check, or 0 to not wait
2317	* mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
2318	and older.
2319	* mce=nobootlog Don't log MCEs from before booting.
2320	* mce=bios_cmci_threshold Don't program the CMCI threshold
2321	* mce=recovery force enable copy_mc_fragile()
2322	*/
2323	static int __init mcheck_enable(char *str)
2324	{
2325	struct mca_config *cfg = &mca_cfg;
2326
2327	if (*str == `0`) {
2328	enable_p5_mce();
2329	return `1`;
2330	}
2331	if (*str == `'='`)
2332	str++;
2333	if (!strcmp(str, "off"))
2334	cfg->disabled = `1`;
2335	else if (!strcmp(str, "no_cmci"))
2336	cfg->cmci_disabled = true;
2337	else if (!strcmp(str, "no_lmce"))
2338	cfg->lmce_disabled = `1`;
2339	else if (!strcmp(str, "dont_log_ce"))
2340	cfg->dont_log_ce = true;
2341	else if (!strcmp(str, "print_all"))
2342	cfg->print_all = true;
2343	else if (!strcmp(str, "ignore_ce"))
2344	cfg->ignore_ce = true;
2345	else if (!strcmp(str, "bootlog") \|\| !strcmp(str, "nobootlog"))
2346	cfg->bootlog = (str[`0`] == `'b'`);
2347	else if (!strcmp(str, "bios_cmci_threshold"))
2348	cfg->bios_cmci_threshold = `1`;
2349	else if (!strcmp(str, "recovery"))
2350	cfg->recovery = `1`;
2351	else if (isdigit(c: str[`0`]))
2352	get_option(str: &str, pint: &(cfg->monarch_timeout));
2353	else {
2354	pr_info("mce argument %s ignored. Please use /sys\n", str);
2355	return `0`;
2356	}
2357	return `1`;
2358	}
2359	__setup("mce", mcheck_enable);
2360
2361	int __init mcheck_init(void)
2362	{
2363	mce_register_decode_chain(&early_nb);
2364	mce_register_decode_chain(&mce_uc_nb);
2365	mce_register_decode_chain(&mce_default_nb);
2366
2367	INIT_WORK(&mce_work, mce_gen_pool_process);
2368	init_irq_work(work: &mce_irq_work, func: mce_irq_work_cb);
2369
2370	return `0`;
2371	}
2372
/*
 * mce_syscore: PM support
 */
2376
2377	/*
2378	* Disable machine checks on suspend and shutdown. We can't really handle
2379	* them later.
2380	*/
2381	static void mce_disable_error_reporting(void)
2382	{
2383	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2384	int i;
2385
2386	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
2387	struct mce_bank *b = &mce_banks[i];
2388
2389	if (b->init)
2390	wrmsrq(msr: mca_msr_reg(bank: i, reg: MCA_CTL), val: `0`);
2391	}
2392	return;
2393	}
2394
2395	static void vendor_disable_error_reporting(void)
2396	{
2397	/*
2398	* Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
2399	* MSRs are socket-wide. Disabling them for just a single offlined CPU
2400	* is bad, since it will inhibit reporting for all shared resources on
2401	* the socket like the last level cache (LLC), the integrated memory
2402	* controller (iMC), etc.
2403	*/
2404	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL \|\|
2405	boot_cpu_data.x86_vendor == X86_VENDOR_HYGON \|\|
2406	boot_cpu_data.x86_vendor == X86_VENDOR_AMD \|\|
2407	boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
2408	return;
2409
2410	mce_disable_error_reporting();
2411	}
2412
/* syscore ->suspend hook: stop error reporting; always returns 0. */
static int mce_syscore_suspend(void)
{
	vendor_disable_error_reporting();
	return 0;
}
2418
/* syscore ->shutdown hook: stop error reporting where the vendor allows it. */
static void mce_syscore_shutdown(void)
{
	vendor_disable_error_reporting();
}
2423
2424	/*
2425	* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2426	* Only one CPU is active at this time, the others get re-added later using
2427	* CPU hotplug:
2428	*/
2429	static void mce_syscore_resume(void)
2430	{
2431	__mcheck_cpu_init_generic();
2432	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2433	__mcheck_cpu_init_prepare_banks();
2434	cr4_set_bits(X86_CR4_MCE);
2435	}
2436
2437	static struct syscore_ops mce_syscore_ops = {
2438	.suspend = mce_syscore_suspend,
2439	.shutdown = mce_syscore_shutdown,
2440	.resume = mce_syscore_resume,
2441	};
2442
/*
 * mce_device: Sysfs support
 */
2446
2447	static void mce_cpu_restart(void *data)
2448	{
2449	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2450	return;
2451	__mcheck_cpu_init_generic();
2452	__mcheck_cpu_init_prepare_banks();
2453	__mcheck_cpu_init_timer();
2454	cr4_set_bits(X86_CR4_MCE);
2455	}
2456
2457	/ Reinit MCEs after user configuration changes /
2458	static void mce_restart(void)
2459	{
2460	mce_timer_delete_all();
2461	on_each_cpu(func: mce_cpu_restart, NULL, wait: `1`);
2462	mce_schedule_work();
2463	}
2464
2465	/ Toggle features for corrected errors /
2466	static void mce_disable_cmci(void *data)
2467	{
2468	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2469	return;
2470	cmci_clear();
2471	}
2472
2473	static void mce_enable_ce(void *all)
2474	{
2475	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2476	return;
2477	cmci_reenable();
2478	cmci_recheck();
2479	if (all)
2480	__mcheck_cpu_init_timer();
2481	}
2482
2483	static const struct bus_type mce_subsys = {
2484	.name = "machinecheck",
2485	.dev_name = "machinecheck",
2486	};
2487
2488	DEFINE_PER_CPU(struct device *, mce_device);
2489
2490	static inline struct mce_bank_dev attr_to_bank(struct* device_attribute *attr)
2491	{
2492	return container_of(attr, struct mce_bank_dev, attr);
2493	}
2494
2495	static ssize_t show_bank(struct device s, struct* device_attribute *attr,
2496	char *buf)
2497	{
2498	u8 bank = attr_to_bank(attr)->bank;
2499	struct mce_bank *b;
2500
2501	if (bank >= per_cpu(mce_num_banks, s->id))
2502	return -EINVAL;
2503
2504	b = &per_cpu(mce_banks_array, s->id)[bank];
2505
2506	if (!b->init)
2507	return -ENODEV;
2508
2509	return sprintf(buf, fmt: "%llx\n", b->ctl);
2510	}
2511
2512	static ssize_t set_bank(struct device s, struct* device_attribute *attr,
2513	const char *buf, size_t size)
2514	{
2515	u8 bank = attr_to_bank(attr)->bank;
2516	struct mce_bank *b;
2517	u64 new;
2518
2519	if (kstrtou64(s: buf, base: `0`, res: &new) < `0`)
2520	return -EINVAL;
2521
2522	if (bank >= per_cpu(mce_num_banks, s->id))
2523	return -EINVAL;
2524
2525	b = &per_cpu(mce_banks_array, s->id)[bank];
2526	if (!b->init)
2527	return -ENODEV;
2528
2529	b->ctl = new;
2530
2531	mutex_lock(lock: &mce_sysfs_mutex);
2532	mce_restart();
2533	mutex_unlock(lock: &mce_sysfs_mutex);
2534
2535	return size;
2536	}
2537
2538	static ssize_t set_ignore_ce(struct device *s,
2539	struct device_attribute *attr,
2540	const char *buf, size_t size)
2541	{
2542	u64 new;
2543
2544	if (kstrtou64(s: buf, base: `0`, res: &new) < `0`)
2545	return -EINVAL;
2546
2547	mutex_lock(lock: &mce_sysfs_mutex);
2548	if (mca_cfg.ignore_ce ^ !!new) {
2549	if (new) {
2550	/ disable ce features /
2551	mce_timer_delete_all();
2552	on_each_cpu(func: mce_disable_cmci, NULL, wait: `1`);
2553	mca_cfg.ignore_ce = true;
2554	} else {
2555	/ enable ce features /
2556	mca_cfg.ignore_ce = false;
2557	on_each_cpu(func: mce_enable_ce, info: (void *)`1`, wait: `1`);
2558	}
2559	}
2560	mutex_unlock(lock: &mce_sysfs_mutex);
2561
2562	return size;
2563	}
2564
2565	static ssize_t set_cmci_disabled(struct device *s,
2566	struct device_attribute *attr,
2567	const char *buf, size_t size)
2568	{
2569	u64 new;
2570
2571	if (kstrtou64(s: buf, base: `0`, res: &new) < `0`)
2572	return -EINVAL;
2573
2574	mutex_lock(lock: &mce_sysfs_mutex);
2575	if (mca_cfg.cmci_disabled ^ !!new) {
2576	if (new) {
2577	/ disable cmci /
2578	on_each_cpu(func: mce_disable_cmci, NULL, wait: `1`);
2579	mca_cfg.cmci_disabled = true;
2580	} else {
2581	/ enable cmci /
2582	mca_cfg.cmci_disabled = false;
2583	on_each_cpu(func: mce_enable_ce, NULL, wait: `1`);
2584	}
2585	}
2586	mutex_unlock(lock: &mce_sysfs_mutex);
2587
2588	return size;
2589	}
2590
2591	static ssize_t store_int_with_restart(struct device *s,
2592	struct device_attribute *attr,
2593	const char *buf, size_t size)
2594	{
2595	unsigned long old_check_interval = check_interval;
2596	ssize_t ret = device_store_ulong(dev: s, attr, buf, count: size);
2597
2598	if (check_interval == old_check_interval)
2599	return ret;
2600
2601	mutex_lock(lock: &mce_sysfs_mutex);
2602	mce_restart();
2603	mutex_unlock(lock: &mce_sysfs_mutex);
2604
2605	return ret;
2606	}
2607
2608	static DEVICE_INT_ATTR(monarch_timeout, `0644`, mca_cfg.monarch_timeout);
2609	static DEVICE_BOOL_ATTR(dont_log_ce, `0644`, mca_cfg.dont_log_ce);
2610	static DEVICE_BOOL_ATTR(print_all, `0644`, mca_cfg.print_all);
2611
2612	static struct dev_ext_attribute dev_attr_check_interval = {
2613	__ATTR(check_interval, `0644`, device_show_int, store_int_with_restart),
2614	&check_interval
2615	};
2616
2617	static struct dev_ext_attribute dev_attr_ignore_ce = {
2618	__ATTR(ignore_ce, `0644`, device_show_bool, set_ignore_ce),
2619	&mca_cfg.ignore_ce
2620	};
2621
2622	static struct dev_ext_attribute dev_attr_cmci_disabled = {
2623	__ATTR(cmci_disabled, `0644`, device_show_bool, set_cmci_disabled),
2624	&mca_cfg.cmci_disabled
2625	};
2626
2627	static struct device_attribute *mce_device_attrs[] = {
2628	&dev_attr_check_interval.attr,
2629	#ifdef CONFIG_X86_MCELOG_LEGACY
2630	&dev_attr_trigger,
2631	#endif
2632	&dev_attr_monarch_timeout.attr,
2633	&dev_attr_dont_log_ce.attr,
2634	&dev_attr_print_all.attr,
2635	&dev_attr_ignore_ce.attr,
2636	&dev_attr_cmci_disabled.attr,
2637	NULL
2638	};
2639
2640	static cpumask_var_t mce_device_initialized;
2641
/* ->release callback: free the device allocated in mce_device_create(). */
static void mce_device_release(struct device *dev)
{
	kfree(dev);
}
2646
2647	/ Per CPU device init. All of the CPUs still share the same bank device: /
2648	static int mce_device_create(unsigned int cpu)
2649	{
2650	struct device *dev;
2651	int err;
2652	int i, j;
2653
2654	dev = per_cpu(mce_device, cpu);
2655	if (dev)
2656	return `0`;
2657
2658	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
2659	if (!dev)
2660	return -ENOMEM;
2661	dev->id = cpu;
2662	dev->bus = &mce_subsys;
2663	dev->release = &mce_device_release;
2664
2665	err = device_register(dev);
2666	if (err) {
2667	put_device(dev);
2668	return err;
2669	}
2670
2671	for (i = `0`; mce_device_attrs[i]; i++) {
2672	err = device_create_file(device: dev, entry: mce_device_attrs[i]);
2673	if (err)
2674	goto error;
2675	}
2676	for (j = `0`; j < per_cpu(mce_num_banks, cpu); j++) {
2677	err = device_create_file(device: dev, entry: &mce_bank_devs[j].attr);
2678	if (err)
2679	goto error2;
2680	}
2681	cpumask_set_cpu(cpu, dstp: mce_device_initialized);
2682	per_cpu(mce_device, cpu) = dev;
2683
2684	return `0`;
2685	error2:
2686	while (--j >= `0`)
2687	device_remove_file(dev, attr: &mce_bank_devs[j].attr);
2688	error:
2689	while (--i >= `0`)
2690	device_remove_file(dev, attr: mce_device_attrs[i]);
2691
2692	device_unregister(dev);
2693
2694	return err;
2695	}
2696
2697	static void mce_device_remove(unsigned int cpu)
2698	{
2699	struct device *dev = per_cpu(mce_device, cpu);
2700	int i;
2701
2702	if (!cpumask_test_cpu(cpu, cpumask: mce_device_initialized))
2703	return;
2704
2705	for (i = `0`; mce_device_attrs[i]; i++)
2706	device_remove_file(dev, attr: mce_device_attrs[i]);
2707
2708	for (i = `0`; i < per_cpu(mce_num_banks, cpu); i++)
2709	device_remove_file(dev, attr: &mce_bank_devs[i].attr);
2710
2711	device_unregister(dev);
2712	cpumask_clear_cpu(cpu, dstp: mce_device_initialized);
2713	per_cpu(mce_device, cpu) = NULL;
2714	}
2715
2716	/ Make sure there are no machine checks on offlined CPUs. /
2717	static void mce_disable_cpu(void)
2718	{
2719	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2720	return;
2721
2722	if (!cpuhp_tasks_frozen)
2723	cmci_clear();
2724
2725	vendor_disable_error_reporting();
2726	}
2727
2728	static void mce_reenable_cpu(void)
2729	{
2730	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2731	int i;
2732
2733	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2734	return;
2735
2736	if (!cpuhp_tasks_frozen)
2737	cmci_reenable();
2738	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
2739	struct mce_bank *b = &mce_banks[i];
2740
2741	if (b->init)
2742	wrmsrq(msr: mca_msr_reg(bank: i, reg: MCA_CTL), val: b->ctl);
2743	}
2744	}
2745
2746	static int mce_cpu_dead(unsigned int cpu)
2747	{
2748	/ intentionally ignoring frozen here /
2749	if (!cpuhp_tasks_frozen)
2750	cmci_rediscover();
2751	return `0`;
2752	}
2753
2754	static int mce_cpu_online(unsigned int cpu)
2755	{
2756	struct timer_list *t = this_cpu_ptr(&mce_timer);
2757
2758	mce_device_create(cpu);
2759	mce_threshold_create_device(cpu);
2760	mce_reenable_cpu();
2761	mce_start_timer(t);
2762	return `0`;
2763	}
2764
2765	static int mce_cpu_pre_down(unsigned int cpu)
2766	{
2767	struct timer_list *t = this_cpu_ptr(&mce_timer);
2768
2769	mce_disable_cpu();
2770	timer_delete_sync(timer: t);
2771	mce_threshold_remove_device(cpu);
2772	mce_device_remove(cpu);
2773	return `0`;
2774	}
2775
2776	static __init void mce_init_banks(void)
2777	{
2778	int i;
2779
2780	for (i = `0`; i < MAX_NR_BANKS; i++) {
2781	struct mce_bank_dev *b = &mce_bank_devs[i];
2782	struct device_attribute *a = &b->attr;
2783
2784	b->bank = i;
2785
2786	sysfs_attr_init(&a->attr);
2787	a->attr.name = b->attrname;
2788	snprintf(buf: b->attrname, ATTR_LEN, fmt: "bank%d", i);
2789
2790	a->attr.mode = `0644`;
2791	a->show = show_bank;
2792	a->store = set_bank;
2793	}
2794	}
2795
2796	/*
2797	* When running on XEN, this initcall is ordered against the XEN mcelog
2798	* initcall:
2799	*
2800	* device_initcall(xen_late_init_mcelog);
2801	* device_initcall_sync(mcheck_init_device);
2802	*/
2803	static __init int mcheck_init_device(void)
2804	{
2805	int err;
2806
2807	/*
2808	* Check if we have a spare virtual bit. This will only become
2809	* a problem if/when we move beyond 5-level page tables.
2810	*/
2811	MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= `63`);
2812
2813	if (!mce_available(c: &boot_cpu_data)) {
2814	err = -EIO;
2815	goto err_out;
2816	}
2817
2818	if (!zalloc_cpumask_var(mask: &mce_device_initialized, GFP_KERNEL)) {
2819	err = -ENOMEM;
2820	goto err_out;
2821	}
2822
2823	mce_init_banks();
2824
2825	err = subsys_system_register(subsys: &mce_subsys, NULL);
2826	if (err)
2827	goto err_out_mem;
2828
2829	err = cpuhp_setup_state(state: CPUHP_X86_MCE_DEAD, name: "x86/mce:dead", NULL,
2830	teardown: mce_cpu_dead);
2831	if (err)
2832	goto err_out_mem;
2833
2834	/*
2835	* Invokes mce_cpu_online() on all CPUs which are online when
2836	* the state is installed.
2837	*/
2838	err = cpuhp_setup_state(state: CPUHP_AP_ONLINE_DYN, name: "x86/mce:online",
2839	startup: mce_cpu_online, teardown: mce_cpu_pre_down);
2840	if (err < `0`)
2841	goto err_out_online;
2842
2843	register_syscore_ops(ops: &mce_syscore_ops);
2844
2845	return `0`;
2846
2847	err_out_online:
2848	cpuhp_remove_state(state: CPUHP_X86_MCE_DEAD);
2849
2850	err_out_mem:
2851	free_cpumask_var(mask: mce_device_initialized);
2852
2853	err_out:
2854	pr_err("Unable to init MCE device (rc: %d)\n", err);
2855
2856	return err;
2857	}
2858	device_initcall_sync(mcheck_init_device);
2859
2860	/*
2861	* Old style boot options parsing. Only for compatibility.
2862	*/
2863	static int __init mcheck_disable(char *str)
2864	{
2865	mca_cfg.disabled = `1`;
2866	return `1`;
2867	}
2868	__setup("nomce", mcheck_disable);
2869
2870	#ifdef CONFIG_DEBUG_FS
2871	struct dentry mce_get_debugfs_dir(void*)
2872	{
2873	static struct dentry *dmce;
2874
2875	if (!dmce)
2876	dmce = debugfs_create_dir(name: "mce", NULL);
2877
2878	return dmce;
2879	}
2880
2881	static void mce_reset(void)
2882	{
2883	atomic_set(v: &mce_fake_panicked, i: `0`);
2884	atomic_set(v: &mce_executing, i: `0`);
2885	atomic_set(v: &mce_callin, i: `0`);
2886	atomic_set(v: &global_nwo, i: `0`);
2887	cpumask_setall(dstp: &mce_missing_cpus);
2888	}
2889
2890	static int fake_panic_get(void data, u64 val)
2891	{
2892	*val = fake_panic;
2893	return `0`;
2894	}
2895
2896	static int fake_panic_set(void *data, u64 val)
2897	{
2898	mce_reset();
2899	fake_panic = val;
2900	return `0`;
2901	}
2902
2903	DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
2904	"%llu\n");
2905
2906	static void __init mcheck_debugfs_init(void)
2907	{
2908	struct dentry *dmce;
2909
2910	dmce = mce_get_debugfs_dir();
2911	debugfs_create_file_unsafe(name: "fake_panic", mode: `0444`, parent: dmce, NULL,
2912	fops: &fake_panic_fops);
2913	}
2914	#else
2915	static void __init mcheck_debugfs_init(void) { }
2916	#endif
2917
2918	static int __init mcheck_late_init(void)
2919	{
2920	if (mca_cfg.recovery)
2921	enable_copy_mc_fragile();
2922
2923	mcheck_debugfs_init();
2924
2925	/*
2926	* Flush out everything that has been logged during early boot, now that
2927	* everything has been initialized (workqueues, decoders, ...).
2928	*/
2929	mce_schedule_work();
2930
2931	return `0`;
2932	}
2933	late_initcall(mcheck_late_init);
2934

Browse the source code of Linux/arch/x86/kernel/cpu/mce/core.c