| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | 
|---|
| 2 | #include <linux/acpi.h> | 
|---|
| 3 | #include <linux/cpu.h> | 
|---|
| 4 | #include <linux/delay.h> | 
|---|
| 5 | #include <linux/io.h> | 
|---|
| 6 | #include <linux/kexec.h> | 
|---|
| 7 | #include <linux/memblock.h> | 
|---|
| 8 | #include <linux/pgtable.h> | 
|---|
| 9 | #include <linux/sched/hotplug.h> | 
|---|
| 10 | #include <asm/apic.h> | 
|---|
| 11 | #include <asm/barrier.h> | 
|---|
| 12 | #include <asm/init.h> | 
|---|
| 13 | #include <asm/intel_pt.h> | 
|---|
| 14 | #include <asm/nmi.h> | 
|---|
| 15 | #include <asm/processor.h> | 
|---|
| 16 | #include <asm/reboot.h> | 
|---|
| 17 |  | 
|---|
| 18 | /* Physical address of the Multiprocessor Wakeup Structure mailbox */ | 
|---|
| 19 | static u64 acpi_mp_wake_mailbox_paddr __ro_after_init; | 
|---|
| 20 |  | 
|---|
| 21 | /* Virtual address of the Multiprocessor Wakeup Structure mailbox */ | 
|---|
| 22 | static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; | 
|---|
| 23 |  | 
|---|
| 24 | static u64 acpi_mp_pgd __ro_after_init; | 
|---|
| 25 | static u64 acpi_mp_reset_vector_paddr __ro_after_init; | 
|---|
| 26 |  | 
|---|
| 27 | static void acpi_mp_stop_this_cpu(void) | 
|---|
| 28 | { | 
|---|
| 29 | asm_acpi_mp_play_dead(reset_vector: acpi_mp_reset_vector_paddr, pgd_pa: acpi_mp_pgd); | 
|---|
| 30 | } | 
|---|
| 31 |  | 
|---|
| 32 | static void acpi_mp_play_dead(void) | 
|---|
| 33 | { | 
|---|
| 34 | play_dead_common(); | 
|---|
| 35 | asm_acpi_mp_play_dead(reset_vector: acpi_mp_reset_vector_paddr, pgd_pa: acpi_mp_pgd); | 
|---|
| 36 | } | 
|---|
| 37 |  | 
|---|
| 38 | static void acpi_mp_cpu_die(unsigned int cpu) | 
|---|
| 39 | { | 
|---|
| 40 | u32 apicid = per_cpu(x86_cpu_to_apicid, cpu); | 
|---|
| 41 | unsigned long timeout; | 
|---|
| 42 |  | 
|---|
| 43 | /* | 
|---|
| 44 | * Use TEST mailbox command to prove that BIOS got control over | 
|---|
| 45 | * the CPU before declaring it dead. | 
|---|
| 46 | * | 
|---|
| 47 | * BIOS has to clear 'command' field of the mailbox. | 
|---|
| 48 | */ | 
|---|
| 49 | acpi_mp_wake_mailbox->apic_id = apicid; | 
|---|
| 50 | smp_store_release(&acpi_mp_wake_mailbox->command, | 
|---|
| 51 | ACPI_MP_WAKE_COMMAND_TEST); | 
|---|
| 52 |  | 
|---|
| 53 | /* Don't wait longer than a second. */ | 
|---|
| 54 | timeout = USEC_PER_SEC; | 
|---|
| 55 | while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout) | 
|---|
| 56 | udelay(usec: 1); | 
|---|
| 57 |  | 
|---|
| 58 | if (!timeout) | 
|---|
| 59 | pr_err( "Failed to hand over CPU %d to BIOS\n", cpu); | 
|---|
| 60 | } | 
|---|
| 61 |  | 
|---|
| 62 | /* The argument is required to match type of x86_mapping_info::alloc_pgt_page */ | 
|---|
| 63 | static void __init *alloc_pgt_page(void *dummy) | 
|---|
| 64 | { | 
|---|
| 65 | return memblock_alloc(PAGE_SIZE, PAGE_SIZE); | 
|---|
| 66 | } | 
|---|
| 67 |  | 
|---|
| 68 | static void __init free_pgt_page(void *pgt, void *dummy) | 
|---|
| 69 | { | 
|---|
| 70 | return memblock_free(ptr: pgt, PAGE_SIZE); | 
|---|
| 71 | } | 
|---|
| 72 |  | 
|---|
| 73 | static int __init acpi_mp_setup_reset(u64 reset_vector) | 
|---|
| 74 | { | 
|---|
| 75 | struct x86_mapping_info info = { | 
|---|
| 76 | .alloc_pgt_page = alloc_pgt_page, | 
|---|
| 77 | .free_pgt_page	= free_pgt_page, | 
|---|
| 78 | .page_flag      = __PAGE_KERNEL_LARGE_EXEC, | 
|---|
| 79 | .kernpg_flag    = _KERNPG_TABLE_NOENC, | 
|---|
| 80 | }; | 
|---|
| 81 | unsigned long mstart, mend; | 
|---|
| 82 | pgd_t *pgd; | 
|---|
| 83 |  | 
|---|
| 84 | pgd = alloc_pgt_page(NULL); | 
|---|
| 85 | if (!pgd) | 
|---|
| 86 | return -ENOMEM; | 
|---|
| 87 |  | 
|---|
| 88 | for (int i = 0; i < nr_pfn_mapped; i++) { | 
|---|
| 89 | mstart = pfn_mapped[i].start << PAGE_SHIFT; | 
|---|
| 90 | mend   = pfn_mapped[i].end << PAGE_SHIFT; | 
|---|
| 91 | if (kernel_ident_mapping_init(info: &info, pgd_page: pgd, pstart: mstart, pend: mend)) { | 
|---|
| 92 | kernel_ident_mapping_free(info: &info, pgd); | 
|---|
| 93 | return -ENOMEM; | 
|---|
| 94 | } | 
|---|
| 95 | } | 
|---|
| 96 |  | 
|---|
| 97 | mstart = PAGE_ALIGN_DOWN(reset_vector); | 
|---|
| 98 | mend = mstart + PAGE_SIZE; | 
|---|
| 99 | if (kernel_ident_mapping_init(info: &info, pgd_page: pgd, pstart: mstart, pend: mend)) { | 
|---|
| 100 | kernel_ident_mapping_free(info: &info, pgd); | 
|---|
| 101 | return -ENOMEM; | 
|---|
| 102 | } | 
|---|
| 103 |  | 
|---|
| 104 | /* | 
|---|
| 105 | * Make sure asm_acpi_mp_play_dead() is present in the identity mapping | 
|---|
| 106 | * at the same place as in the kernel page tables. | 
|---|
| 107 | * asm_acpi_mp_play_dead() switches to the identity mapping and the | 
|---|
| 108 | * function must be present at the same spot in the virtual address space | 
|---|
| 109 | * before and after switching page tables. | 
|---|
| 110 | */ | 
|---|
| 111 | info.offset = __START_KERNEL_map - phys_base; | 
|---|
| 112 | mstart = PAGE_ALIGN_DOWN(__pa(asm_acpi_mp_play_dead)); | 
|---|
| 113 | mend = mstart + PAGE_SIZE; | 
|---|
| 114 | if (kernel_ident_mapping_init(info: &info, pgd_page: pgd, pstart: mstart, pend: mend)) { | 
|---|
| 115 | kernel_ident_mapping_free(info: &info, pgd); | 
|---|
| 116 | return -ENOMEM; | 
|---|
| 117 | } | 
|---|
| 118 |  | 
|---|
| 119 | smp_ops.play_dead = acpi_mp_play_dead; | 
|---|
| 120 | smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu; | 
|---|
| 121 | smp_ops.cpu_die = acpi_mp_cpu_die; | 
|---|
| 122 |  | 
|---|
| 123 | acpi_mp_reset_vector_paddr = reset_vector; | 
|---|
| 124 | acpi_mp_pgd = __pa(pgd); | 
|---|
| 125 |  | 
|---|
| 126 | return 0; | 
|---|
| 127 | } | 
|---|
| 128 |  | 
|---|
| 129 | static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip, unsigned int cpu) | 
|---|
| 130 | { | 
|---|
| 131 | if (!acpi_mp_wake_mailbox_paddr) { | 
|---|
| 132 | pr_warn_once( "No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n"); | 
|---|
| 133 | return -EOPNOTSUPP; | 
|---|
| 134 | } | 
|---|
| 135 |  | 
|---|
| 136 | /* | 
|---|
| 137 | * Remap mailbox memory only for the first call to acpi_wakeup_cpu(). | 
|---|
| 138 | * | 
|---|
| 139 | * Wakeup of secondary CPUs is fully serialized in the core code. | 
|---|
| 140 | * No need to protect acpi_mp_wake_mailbox from concurrent accesses. | 
|---|
| 141 | */ | 
|---|
| 142 | if (!acpi_mp_wake_mailbox) { | 
|---|
| 143 | acpi_mp_wake_mailbox = memremap(offset: acpi_mp_wake_mailbox_paddr, | 
|---|
| 144 | size: sizeof(*acpi_mp_wake_mailbox), | 
|---|
| 145 | flags: MEMREMAP_WB); | 
|---|
| 146 | } | 
|---|
| 147 |  | 
|---|
| 148 | /* | 
|---|
| 149 | * Mailbox memory is shared between the firmware and OS. Firmware will | 
|---|
| 150 | * listen on mailbox command address, and once it receives the wakeup | 
|---|
| 151 | * command, the CPU associated with the given apicid will be booted. | 
|---|
| 152 | * | 
|---|
| 153 | * The value of 'apic_id' and 'wakeup_vector' must be visible to the | 
|---|
| 154 | * firmware before the wakeup command is visible.  smp_store_release() | 
|---|
| 155 | * ensures ordering and visibility. | 
|---|
| 156 | */ | 
|---|
| 157 | acpi_mp_wake_mailbox->apic_id	    = apicid; | 
|---|
| 158 | acpi_mp_wake_mailbox->wakeup_vector = start_ip; | 
|---|
| 159 | smp_store_release(&acpi_mp_wake_mailbox->command, | 
|---|
| 160 | ACPI_MP_WAKE_COMMAND_WAKEUP); | 
|---|
| 161 |  | 
|---|
| 162 | /* | 
|---|
| 163 | * Wait for the CPU to wake up. | 
|---|
| 164 | * | 
|---|
| 165 | * The CPU being woken up is essentially in a spin loop waiting to be | 
|---|
| 166 | * woken up. It should not take long for it wake up and acknowledge by | 
|---|
| 167 | * zeroing out ->command. | 
|---|
| 168 | * | 
|---|
| 169 | * ACPI specification doesn't provide any guidance on how long kernel | 
|---|
| 170 | * has to wait for a wake up acknowledgment. It also doesn't provide | 
|---|
| 171 | * a way to cancel a wake up request if it takes too long. | 
|---|
| 172 | * | 
|---|
| 173 | * In TDX environment, the VMM has control over how long it takes to | 
|---|
| 174 | * wake up secondary. It can postpone scheduling secondary vCPU | 
|---|
| 175 | * indefinitely. Giving up on wake up request and reporting error opens | 
|---|
| 176 | * possible attack vector for VMM: it can wake up a secondary CPU when | 
|---|
| 177 | * kernel doesn't expect it. Wait until positive result of the wake up | 
|---|
| 178 | * request. | 
|---|
| 179 | */ | 
|---|
| 180 | while (READ_ONCE(acpi_mp_wake_mailbox->command)) | 
|---|
| 181 | cpu_relax(); | 
|---|
| 182 |  | 
|---|
| 183 | return 0; | 
|---|
| 184 | } | 
|---|
| 185 |  | 
|---|
| 186 | static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake) | 
|---|
| 187 | { | 
|---|
| 188 | cpu_hotplug_disable_offlining(); | 
|---|
| 189 |  | 
|---|
| 190 | /* | 
|---|
| 191 | * ACPI MADT doesn't allow to offline a CPU after it was onlined. This | 
|---|
| 192 | * limits kexec: the second kernel won't be able to use more than one CPU. | 
|---|
| 193 | * | 
|---|
| 194 | * To prevent a kexec kernel from onlining secondary CPUs invalidate the | 
|---|
| 195 | * mailbox address in the ACPI MADT wakeup structure which prevents a | 
|---|
| 196 | * kexec kernel to use it. | 
|---|
| 197 | * | 
|---|
| 198 | * This is safe as the booting kernel has the mailbox address cached | 
|---|
| 199 | * already and acpi_wakeup_cpu() uses the cached value to bring up the | 
|---|
| 200 | * secondary CPUs. | 
|---|
| 201 | * | 
|---|
| 202 | * Note: This is a Linux specific convention and not covered by the | 
|---|
| 203 | *       ACPI specification. | 
|---|
| 204 | */ | 
|---|
| 205 | mp_wake->mailbox_address = 0; | 
|---|
| 206 | } | 
|---|
| 207 |  | 
|---|
| 208 | int __init acpi_parse_mp_wake(union acpi_subtable_headers *, | 
|---|
| 209 | const unsigned long end) | 
|---|
| 210 | { | 
|---|
| 211 | struct acpi_madt_multiproc_wakeup *mp_wake; | 
|---|
| 212 |  | 
|---|
| 213 | mp_wake = (struct acpi_madt_multiproc_wakeup *)header; | 
|---|
| 214 |  | 
|---|
| 215 | /* | 
|---|
| 216 | * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake | 
|---|
| 217 | * entry.  'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger | 
|---|
| 218 | * than the actual size of the MP wakeup entry in ACPI table because the | 
|---|
| 219 | * 'reset_vector' is only available in the V1 MP wakeup structure. | 
|---|
| 220 | */ | 
|---|
| 221 | if (!mp_wake) | 
|---|
| 222 | return -EINVAL; | 
|---|
| 223 | if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0) | 
|---|
| 224 | return -EINVAL; | 
|---|
| 225 | if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0) | 
|---|
| 226 | return -EINVAL; | 
|---|
| 227 |  | 
|---|
| 228 | acpi_table_print_madt_entry(madt: &header->common); | 
|---|
| 229 |  | 
|---|
| 230 | acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address; | 
|---|
| 231 |  | 
|---|
| 232 | if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 && | 
|---|
| 233 | mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) { | 
|---|
| 234 | if (acpi_mp_setup_reset(reset_vector: mp_wake->reset_vector)) { | 
|---|
| 235 | pr_warn( "Failed to setup MADT reset vector\n"); | 
|---|
| 236 | acpi_mp_disable_offlining(mp_wake); | 
|---|
| 237 | } | 
|---|
| 238 | } else { | 
|---|
| 239 | /* | 
|---|
| 240 | * CPU offlining requires version 1 of the ACPI MADT wakeup | 
|---|
| 241 | * structure. | 
|---|
| 242 | */ | 
|---|
| 243 | acpi_mp_disable_offlining(mp_wake); | 
|---|
| 244 | } | 
|---|
| 245 |  | 
|---|
| 246 | apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu); | 
|---|
| 247 |  | 
|---|
| 248 | return 0; | 
|---|
| 249 | } | 
|---|
| 250 |  | 
|---|