| 1 | // SPDX-License-Identifier: GPL-2.0-only | 
|---|
| 2 | /* ----------------------------------------------------------------------- * | 
|---|
| 3 | * | 
|---|
| 4 | *   Copyright 2014 Intel Corporation; author: H. Peter Anvin | 
|---|
| 5 | * | 
|---|
| 6 | * ----------------------------------------------------------------------- */ | 
|---|
| 7 |  | 
|---|
| 8 | /* | 
|---|
| 9 | * The IRET instruction, when returning to a 16-bit segment, only | 
|---|
| 10 | * restores the bottom 16 bits of the user space stack pointer.  This | 
|---|
| 11 | * causes some 16-bit software to break, but it also leaks kernel state | 
|---|
| 12 | * to user space. | 
|---|
| 13 | * | 
|---|
| 14 | * This works around this by creating percpu "ministacks", each of which | 
|---|
| 15 | * is mapped 2^16 times 64K apart.  When we detect that the return SS is | 
|---|
| 16 | * on the LDT, we copy the IRET frame to the ministack and use the | 
|---|
| 17 | * relevant alias to return to userspace.  The ministacks are mapped | 
|---|
| 18 | * readonly, so if the IRET faults we promote #GP to #DF, which is an IST | 
|---|
| 19 | * vector and thus has its own stack; we then do the fixup in the #DF | 
|---|
| 20 | * handler. | 
|---|
| 21 | * | 
|---|
| 22 | * This file sets up the ministacks and the related page tables.  The | 
|---|
| 23 | * actual ministack invocation is in entry_64.S. | 
|---|
| 24 | */ | 
|---|
| 25 |  | 
|---|
| 26 | #include <linux/init.h> | 
|---|
| 27 | #include <linux/init_task.h> | 
|---|
| 28 | #include <linux/kernel.h> | 
|---|
| 29 | #include <linux/percpu.h> | 
|---|
| 30 | #include <linux/gfp.h> | 
|---|
| 31 | #include <linux/random.h> | 
|---|
| 32 | #include <linux/pgtable.h> | 
|---|
| 33 | #include <asm/pgalloc.h> | 
|---|
| 34 | #include <asm/setup.h> | 
|---|
| 35 | #include <asm/espfix.h> | 
|---|
| 36 |  | 
|---|
/*
 * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
 * it up to a cache line to avoid unnecessary sharing.
 */
#define ESPFIX_STACK_SIZE	(8*8UL)
/* Number of ministacks that fit into a single page */
#define ESPFIX_STACKS_PER_PAGE	(PAGE_SIZE/ESPFIX_STACK_SIZE)

/* There is address space for how many espfix pages? */
#define ESPFIX_PAGE_SPACE	(1UL << (P4D_SHIFT-PAGE_SHIFT-16))

/* Upper bound on supported CPUs: stacks per page times available pages */
#define ESPFIX_MAX_CPUS		(ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
# error "Need more virtual address space for the ESPFIX hack"
#endif

/* GFP flags for page-table pages: they must start out zeroed */
#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)

/* This contains the *bottom* address of the espfix stack */
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
/*
 * Address of the same stack inside the backing page as returned by
 * page_address() (set in init_espfix_ap()); presumably the writable
 * alias, since the ministack mapping itself is read-only.
 */
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);

/* Initialization mutex - should this be a spinlock? */
static DEFINE_MUTEX(espfix_init_mutex);

/*
 * Backing pages of the espfix stacks, indexed by
 * cpu / ESPFIX_STACKS_PER_PAGE - each page serves
 * ESPFIX_STACKS_PER_PAGE CPUs.  A NULL entry means the page has not
 * been set up yet.
 */
#define ESPFIX_MAX_PAGES  DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
static void *espfix_pages[ESPFIX_MAX_PAGES];

/* PUD page installed into the kernel page tables by init_espfix_bsp() */
static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
	__aligned(PAGE_SIZE);

/*
 * Random values mixed into the page and slot selection done by
 * espfix_base_addr(); assigned once in init_espfix_random().
 */
static unsigned int page_random, slot_random;
|---|
| 69 |  | 
|---|
| 70 | /* | 
|---|
| 71 | * This returns the bottom address of the espfix stack for a specific CPU. | 
|---|
| 72 | * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case | 
|---|
| 73 | * we have to account for some amount of padding at the end of each page. | 
|---|
| 74 | */ | 
|---|
| 75 | static inline unsigned long espfix_base_addr(unsigned int cpu) | 
|---|
| 76 | { | 
|---|
| 77 | unsigned long page, slot; | 
|---|
| 78 | unsigned long addr; | 
|---|
| 79 |  | 
|---|
| 80 | page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random; | 
|---|
| 81 | slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE; | 
|---|
| 82 | addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE); | 
|---|
| 83 | addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16); | 
|---|
| 84 | addr += ESPFIX_BASE_ADDR; | 
|---|
| 85 | return addr; | 
|---|
| 86 | } | 
|---|
| 87 |  | 
|---|
/* Distance, in PTE slots, between two aliases of a page (64K apart) */
#define PTE_STRIDE        (65536/PAGE_SIZE)
/* Number of aliases of one stack page inside a single PTE page */
#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
/* Every entry of a PMD page is pointed at the same PTE page */
#define ESPFIX_PMD_CLONES PTRS_PER_PMD
/* PUD entries needed so that PTE * PMD * PUD clones add up to 65536 aliases */
#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))

/* Intermediate page-table entries: kernel table bits with RW cleared and NX set */
#define PGTABLE_PROT	  ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
|---|
| 94 |  | 
|---|
| 95 | static void init_espfix_random(void) | 
|---|
| 96 | { | 
|---|
| 97 | unsigned long rand = get_random_long(); | 
|---|
| 98 |  | 
|---|
| 99 | slot_random = rand % ESPFIX_STACKS_PER_PAGE; | 
|---|
| 100 | page_random = (rand / ESPFIX_STACKS_PER_PAGE) | 
|---|
| 101 | & (ESPFIX_PAGE_SPACE - 1); | 
|---|
| 102 | } | 
|---|
| 103 |  | 
|---|
| 104 | void __init init_espfix_bsp(void) | 
|---|
| 105 | { | 
|---|
| 106 | pgd_t *pgd; | 
|---|
| 107 | p4d_t *p4d; | 
|---|
| 108 |  | 
|---|
| 109 | /* FRED systems always restore the full value of %rsp */ | 
|---|
| 110 | if (cpu_feature_enabled(X86_FEATURE_FRED)) | 
|---|
| 111 | return; | 
|---|
| 112 |  | 
|---|
| 113 | /* Install the espfix pud into the kernel page directory */ | 
|---|
| 114 | pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; | 
|---|
| 115 | p4d = p4d_alloc(mm: &init_mm, pgd, ESPFIX_BASE_ADDR); | 
|---|
| 116 | p4d_populate(mm: &init_mm, p4d, pud: espfix_pud_page); | 
|---|
| 117 |  | 
|---|
| 118 | /* Randomize the locations */ | 
|---|
| 119 | init_espfix_random(); | 
|---|
| 120 |  | 
|---|
| 121 | /* The rest is the same as for any other processor */ | 
|---|
| 122 | init_espfix_ap(cpu: 0); | 
|---|
| 123 | } | 
|---|
| 124 |  | 
|---|
| 125 | void init_espfix_ap(int cpu) | 
|---|
| 126 | { | 
|---|
| 127 | unsigned int page; | 
|---|
| 128 | unsigned long addr; | 
|---|
| 129 | pud_t pud, *pud_p; | 
|---|
| 130 | pmd_t pmd, *pmd_p; | 
|---|
| 131 | pte_t pte, *pte_p; | 
|---|
| 132 | int n, node; | 
|---|
| 133 | void *stack_page; | 
|---|
| 134 | pteval_t ptemask; | 
|---|
| 135 |  | 
|---|
| 136 | /* FRED systems always restore the full value of %rsp */ | 
|---|
| 137 | if (cpu_feature_enabled(X86_FEATURE_FRED)) | 
|---|
| 138 | return; | 
|---|
| 139 |  | 
|---|
| 140 | /* We only have to do this once... */ | 
|---|
| 141 | if (likely(per_cpu(espfix_stack, cpu))) | 
|---|
| 142 | return;		/* Already initialized */ | 
|---|
| 143 |  | 
|---|
| 144 | addr = espfix_base_addr(cpu); | 
|---|
| 145 | page = cpu/ESPFIX_STACKS_PER_PAGE; | 
|---|
| 146 |  | 
|---|
| 147 | /* Did another CPU already set this up? */ | 
|---|
| 148 | stack_page = READ_ONCE(espfix_pages[page]); | 
|---|
| 149 | if (likely(stack_page)) | 
|---|
| 150 | goto done; | 
|---|
| 151 |  | 
|---|
| 152 | mutex_lock(lock: &espfix_init_mutex); | 
|---|
| 153 |  | 
|---|
| 154 | /* Did we race on the lock? */ | 
|---|
| 155 | stack_page = READ_ONCE(espfix_pages[page]); | 
|---|
| 156 | if (stack_page) | 
|---|
| 157 | goto unlock_done; | 
|---|
| 158 |  | 
|---|
| 159 | node = cpu_to_node(cpu); | 
|---|
| 160 | ptemask = __supported_pte_mask; | 
|---|
| 161 |  | 
|---|
| 162 | pud_p = &espfix_pud_page[pud_index(address: addr)]; | 
|---|
| 163 | pud = *pud_p; | 
|---|
| 164 | if (!pud_present(pud)) { | 
|---|
| 165 | struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); | 
|---|
| 166 |  | 
|---|
| 167 | pmd_p = (pmd_t *)page_address(page); | 
|---|
| 168 | pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); | 
|---|
| 169 | paravirt_alloc_pmd(mm: &init_mm, __pa(pmd_p) >> PAGE_SHIFT); | 
|---|
| 170 | for (n = 0; n < ESPFIX_PUD_CLONES; n++) | 
|---|
| 171 | set_pud(&pud_p[n], pud); | 
|---|
| 172 | } | 
|---|
| 173 |  | 
|---|
| 174 | pmd_p = pmd_offset(pud: &pud, address: addr); | 
|---|
| 175 | pmd = *pmd_p; | 
|---|
| 176 | if (!pmd_present(pmd)) { | 
|---|
| 177 | struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); | 
|---|
| 178 |  | 
|---|
| 179 | pte_p = (pte_t *)page_address(page); | 
|---|
| 180 | pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); | 
|---|
| 181 | paravirt_alloc_pte(mm: &init_mm, __pa(pte_p) >> PAGE_SHIFT); | 
|---|
| 182 | for (n = 0; n < ESPFIX_PMD_CLONES; n++) | 
|---|
| 183 | set_pmd(&pmd_p[n], pmd); | 
|---|
| 184 | } | 
|---|
| 185 |  | 
|---|
| 186 | pte_p = pte_offset_kernel(pmd: &pmd, address: addr); | 
|---|
| 187 | stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); | 
|---|
| 188 | /* | 
|---|
| 189 | * __PAGE_KERNEL_* includes _PAGE_GLOBAL, which we want since | 
|---|
| 190 | * this is mapped to userspace. | 
|---|
| 191 | */ | 
|---|
| 192 | pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); | 
|---|
| 193 | for (n = 0; n < ESPFIX_PTE_CLONES; n++) | 
|---|
| 194 | set_pte(&pte_p[n*PTE_STRIDE], pte); | 
|---|
| 195 |  | 
|---|
| 196 | /* Job is done for this CPU and any CPU which shares this page */ | 
|---|
| 197 | WRITE_ONCE(espfix_pages[page], stack_page); | 
|---|
| 198 |  | 
|---|
| 199 | unlock_done: | 
|---|
| 200 | mutex_unlock(lock: &espfix_init_mutex); | 
|---|
| 201 | done: | 
|---|
| 202 | per_cpu(espfix_stack, cpu) = addr; | 
|---|
| 203 | per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page | 
|---|
| 204 | + (addr & ~PAGE_MASK); | 
|---|
| 205 | } | 
|---|
| 206 |  | 
|---|