// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/syscalls.h>
#include <linux/iommu.h>

#include <asm/processor.h>
#include <asm/pkru.h>
#include <asm/fpu/sched.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/resctrl.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#include <asm/fred.h>
#include <asm/msr.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
#endif

#include "process.h"

/* Also prints some state that isn't saved in the pt_regs. */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
		 const char *log_lvl)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, es;

	show_iret_regs(regs, log_lvl);

	if (regs->orig_ax != -1)
		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
	else
		pr_cont("\n");

	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
	       log_lvl, regs->ax, regs->bx, regs->cx);
	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
	       log_lvl, regs->dx, regs->si, regs->di);
	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
	       log_lvl, regs->bp, regs->r8, regs->r9);
	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
	       log_lvl, regs->r10, regs->r11, regs->r12);
	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
	       log_lvl, regs->r13, regs->r14, regs->r15);

	if (mode == SHOW_REGS_SHORT)
		return;

	if (mode == SHOW_REGS_USER) {
		rdmsrq(MSR_FS_BASE, fs);
		rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
		printk("%sFS:  %016lx GS:  %016lx\n",
		       log_lvl, fs, shadowgs);
		return;
	}

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrq(MSR_FS_BASE, fs);
	rdmsrq(MSR_GS_BASE, gs);
	rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = __read_cr3();
	cr4 = __read_cr4();

	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
	printk("%sCS:  %04x DS: %04x ES: %04x CR0: %016lx\n",
	       log_lvl, regs->cs, ds, es, cr0);
	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
	       log_lvl, cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	      (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) {
		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
		       log_lvl, d0, d1, d2);
		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
		       log_lvl, d3, d6, d7);
	}

	if (cr4 & X86_CR4_PKE)
		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}

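/*
 * Note: by the time a dead task reaches release_thread(), its mm should
 * already have been dropped; the WARN_ON() below flags the unexpected case.
 */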
void release_thread(struct task_struct *dead_task)
{
	WARN_ON(dead_task->mm);
}

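/* Which segment register the legacy FS/GS save/restore helpers operate on. */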
enum which_selector {
	FS,
	GS
};

/*
 * Out of line to be protected from kprobes and tracing. If this would be
 * traced or probed then any access to a per CPU variable happens with
 * the wrong GS.
 *
 * It is not used on Xen paravirt. When paravirt support is needed, it
 * needs to be renamed with native_ prefix.
 */
static noinstr unsigned long __rdgsbase_inactive(void)
{
	unsigned long gsbase;

	lockdep_assert_irqs_disabled();

	/*
	 * SWAPGS is no longer needed thus NOT allowed with FRED because
	 * FRED transitions ensure that an operating system can _always_
	 * operate with its own GS base address:
	 * - For events that occur in ring 3, FRED event delivery swaps
	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
	 * - ERETU (the FRED transition that returns to ring 3) also swaps
	 *   the GS base address with the IA32_KERNEL_GS_BASE MSR.
	 *
	 * And the operating system can still setup the GS segment for a
	 * user thread without the need of loading a user thread GS with:
	 * - Using LKGS, available with FRED, to modify other attributes
	 *   of the GS segment without compromising its ability always to
	 *   operate with its own GS base address.
	 * - Accessing the GS segment base address for a user thread as
	 *   before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
	 *
	 * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
	 * MSR instead of the GS segment's descriptor cache. As such, the
	 * operating system never changes its runtime GS base address.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
		native_swapgs();
		gsbase = rdgsbase();
		native_swapgs();
	} else {
		instrumentation_begin();
		rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}

	return gsbase;
}

/*
 * Out of line to be protected from kprobes and tracing. If this would be
 * traced or probed then any access to a per CPU variable happens with
 * the wrong GS.
 *
 * It is not used on Xen paravirt. When paravirt support is needed, it
 * needs to be renamed with native_ prefix.
 */
static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
	lockdep_assert_irqs_disabled();

	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
	    !cpu_feature_enabled(X86_FEATURE_XENPV)) {
		native_swapgs();
		wrgsbase(gsbase);
		native_swapgs();
	} else {
		instrumentation_begin();
		wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}
}

/*
 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
 * It's forcibly inlined because it'll generate better code and this function
 * is hot.
 */
static __always_inline void save_base_legacy(struct task_struct *prev_p,
					     unsigned short selector,
					     enum which_selector which)
{
	if (likely(selector == 0)) {
		/*
		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
		 * be the pre-existing saved base or it could be zero.  On AMD
		 * (with X86_BUG_NULL_SEG), the segment base could be almost
		 * anything.
		 *
		 * This branch is very hot (it's hit twice on almost every
		 * context switch between 64-bit programs), and avoiding
		 * the RDMSR helps a lot, so we just assume that whatever
		 * value is already saved is correct.  This matches historical
		 * Linux behavior, so it won't break existing applications.
		 *
		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
		 * report that the base is zero, it needs to actually be zero:
		 * see the corresponding logic in load_seg_legacy.
		 */
	} else {
		/*
		 * If the selector is 1, 2, or 3, then the base is zero on
		 * !X86_BUG_NULL_SEG CPUs and could be anything on
		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
		 * has never attempted to preserve the base across context
		 * switches.
		 *
		 * If selector > 3, then it refers to a real segment, and
		 * saving the base isn't necessary.
		 */
		if (which == FS)
			prev_p->thread.fsbase = 0;
		else
			prev_p->thread.gsbase = 0;
	}
}

static __always_inline void save_fsgs(struct task_struct *task)
{
	savesegment(fs, task->thread.fsindex);
	savesegment(gs, task->thread.gsindex);
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/*
		 * If FSGSBASE is enabled, we can't make any useful guesses
		 * about the base, and user code expects us to save the current
		 * value.  Fortunately, reading the base directly is efficient.
		 */
		task->thread.fsbase = rdfsbase();
		task->thread.gsbase = __rdgsbase_inactive();
	} else {
		save_base_legacy(task, task->thread.fsindex, FS);
		save_base_legacy(task, task->thread.gsindex, GS);
	}
}

/*
 * While a process is running, current->thread.fsbase and current->thread.gsbase
 * may not match the corresponding CPU registers (see save_base_legacy()).
 */
void current_save_fsgs(void)
{
	unsigned long flags;

	/* Interrupts need to be off for FSGSBASE */
	local_irq_save(flags);
	save_fsgs(current);
	local_irq_restore(flags);
}
#if IS_ENABLED(CONFIG_KVM)
EXPORT_SYMBOL_GPL(current_save_fsgs);
#endif

static __always_inline void loadseg(enum which_selector which,
				    unsigned short sel)
{
	if (which == FS)
		loadsegment(fs, sel);
	else
		load_gs_index(sel);
}

static __always_inline void load_seg_legacy(unsigned short prev_index,
					    unsigned long prev_base,
					    unsigned short next_index,
					    unsigned long next_base,
					    enum which_selector which)
{
	if (likely(next_index <= 3)) {
		/*
		 * The next task is using 64-bit TLS, is not using this
		 * segment at all, or is having fun with arcane CPU features.
		 */
		if (next_base == 0) {
			/*
			 * Nasty case: on AMD CPUs, we need to forcibly zero
			 * the base.
			 */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				loadseg(which, __USER_DS);
				loadseg(which, next_index);
			} else {
				/*
				 * We could try to exhaustively detect cases
				 * under which we can skip the segment load,
				 * but there's really only one case that matters
				 * for performance: if both the previous and
				 * next states are fully zeroed, we can skip
				 * the load.
				 *
				 * (This assumes that prev_base == 0 has no
				 * false positives.  This is the case on
				 * Intel-style CPUs.)
				 */
				if (likely(prev_index | next_index | prev_base))
					loadseg(which, next_index);
			}
		} else {
			if (prev_index != next_index)
				loadseg(which, next_index);
			wrmsrq(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
			       next_base);
		}
	} else {
		/*
		 * The next task is using a real segment.  Loading the selector
		 * is sufficient.
		 */
		loadseg(which, next_index);
	}
}

/*
 * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
 * is not XSTATE managed on context switch because that would require a
 * lookup in the task's FPU xsave buffer and would require keeping that
 * buffer updated in various places.
 */
static __always_inline void x86_pkru_load(struct thread_struct *prev,
					  struct thread_struct *next)
{
	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return;

	/* Stash the prev task's value: */
	prev->pkru = rdpkru();

	/*
	 * PKRU writes are slightly expensive.  Avoid them when not
	 * strictly necessary:
	 */
	if (prev->pkru != next->pkru)
		wrpkru(next->pkru);
}

static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
					      struct thread_struct *next)
{
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/* Update the FS and GS selectors if they could have changed. */
		if (unlikely(prev->fsindex || next->fsindex))
			loadseg(FS, next->fsindex);
		if (unlikely(prev->gsindex || next->gsindex))
			loadseg(GS, next->gsindex);

		/* Update the bases. */
		wrfsbase(next->fsbase);
		__wrgsbase_inactive(next->gsbase);
	} else {
		load_seg_legacy(prev->fsindex, prev->fsbase,
				next->fsindex, next->fsbase, FS);
		load_seg_legacy(prev->gsindex, prev->gsbase,
				next->gsindex, next->gsbase, GS);
	}
}

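/*
 * Resolve the base address that a given FS/GS selector refers to for @task:
 * TLS selectors are looked up in the task's GDT TLS slots, LDT selectors in
 * the task's LDT (if CONFIG_MODIFY_LDT_SYSCALL); anything else has base 0.
 */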
unsigned long x86_fsgsbase_read_task(struct task_struct *task,
				     unsigned short selector)
{
	unsigned short idx = selector >> 3;
	unsigned long base;

	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
		if (unlikely(idx >= GDT_ENTRIES))
			return 0;

		/*
		 * There are no user segments in the GDT with nonzero bases
		 * other than the TLS segments.
		 */
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			return 0;

		idx -= GDT_ENTRY_TLS_MIN;
		base = get_desc_base(&task->thread.tls_array[idx]);
	} else {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		struct ldt_struct *ldt;

		/*
		 * If performance here mattered, we could protect the LDT
		 * with RCU.  This is a slow path, though, so we can just
		 * take the mutex.
		 */
		mutex_lock(&task->mm->context.lock);
		ldt = task->mm->context.ldt;
		if (unlikely(!ldt || idx >= ldt->nr_entries))
			base = 0;
		else
			base = get_desc_base(ldt->entries + idx);
		mutex_unlock(&task->mm->context.lock);
#else
		base = 0;
#endif
	}

	return base;
}

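/*
 * Read/write the "inactive" (user) GS base on the current CPU, i.e. the
 * value that SWAPGS would bring in: via the FSGSBASE fast path when it is
 * available, otherwise through the IA32_KERNEL_GS_BASE MSR.
 */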
unsigned long x86_gsbase_read_cpu_inactive(void)
{
	unsigned long gsbase;

	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		local_irq_save(flags);
		gsbase = __rdgsbase_inactive();
		local_irq_restore(flags);
	} else {
		rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
	}

	return gsbase;
}

void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		local_irq_save(flags);
		__wrgsbase_inactive(gsbase);
		local_irq_restore(flags);
	} else {
		wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
	}
}

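/*
 * Read a task's FS/GS base: directly from the CPU for current, from the
 * saved thread state when that is known to be authoritative, and via the
 * selector's descriptor otherwise.
 */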
unsigned long x86_fsbase_read_task(struct task_struct *task)
{
	unsigned long fsbase;

	if (task == current)
		fsbase = x86_fsbase_read_cpu();
	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
		 (task->thread.fsindex == 0))
		fsbase = task->thread.fsbase;
	else
		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);

	return fsbase;
}

unsigned long x86_gsbase_read_task(struct task_struct *task)
{
	unsigned long gsbase;

	if (task == current)
		gsbase = x86_gsbase_read_cpu_inactive();
	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
		 (task->thread.gsindex == 0))
		gsbase = task->thread.gsbase;
	else
		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);

	return gsbase;
}

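/*
 * These only update the saved thread state; callers operate on a task other
 * than current (e.g. via ptrace), hence the WARN_ON_ONCE() below.
 */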
void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.fsbase = fsbase;
}

void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.gsbase = gsbase;
}

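/*
 * Set up the register state for the first return to user space after exec():
 * reset the segment registers, point pt_regs at the new entry point and
 * stack, and (re)initialize the flags.
 */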
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    u16 _cs, u16 _ss, u16 _ds)
{
	WARN_ON_ONCE(regs != current_pt_regs());

	if (static_cpu_has(X86_BUG_NULL_SEG)) {
		/* Loading zero below won't clear the base. */
		loadsegment(fs, __USER_DS);
		load_gs_index(__USER_DS);
	}

	reset_thread_features();

	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);

	regs->ip	= new_ip;
	regs->sp	= new_sp;
	regs->csx	= _cs;
	regs->ssx	= _ss;
	/*
	 * Allow single-step trap and NMI when starting a new task, thus
	 * once the new task enters user space, single-step trap and NMI
	 * are both enabled immediately.
	 *
	 * Entering a new task is logically speaking a return from a
	 * system call (exec, fork, clone, etc.). As such, if ptrace
	 * enables single stepping a single step exception should be
	 * allowed to trigger immediately upon entering user space.
	 * This is not optional.
	 *
	 * NMI should *never* be disabled in user space. As such, this
	 * is an optional, opportunistic way to catch errors.
	 *
	 * Paranoia: High-order 48 bits above the lowest 16 bit SS are
	 * discarded by the legacy IRET instruction on all Intel, AMD,
	 * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally,
	 * even when FRED is not enabled. But we choose the safer side
	 * to use these bits only when FRED is enabled.
	 */
	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
		regs->fred_ss.swevent	= true;
		regs->fred_ss.nmi	= true;
	}

	regs->flags	= X86_EFLAGS_IF | X86_EFLAGS_FIXED;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);

#ifdef CONFIG_COMPAT
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
	start_thread_common(regs, new_ip, new_sp,
			    x32 ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported either.
 */
__no_kmsan_checks
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();

	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(hardirq_stack_inuse));

	switch_fpu(prev_p, cpu);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	save_fsgs(prev_p);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.  This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT.  The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	x86_fsgsbase_load(prev, next);

	x86_pkru_load(prev, next);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	raw_cpu_write(current_task, next_p);
	raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

	/* Reload sp0. */
	update_task_stack(next_p);

	switch_to_extra(prev_p, next_p);

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor.  As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths.  Instead, we ensure that SS is never NULL in
		 * system call context.  We do this by replacing NULL SS
		 * selectors at every context switch.  SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt.  Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Load the Intel cache allocation PQR MSR. */
	resctrl_arch_sched_in(next_p);

	/* Reset hw history on AMD CPUs */
	if (cpu_feature_enabled(X86_FEATURE_AMD_WORKLOAD_CLASS))
		wrmsrl(MSR_AMD_WORKLOAD_HRST, 0x1);

	return prev_p;
}

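/* Exec-time personality setup for a native 64-bit binary. */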
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_ADDR32);
	/* Pretend that this comes from a 64bit execve */
	task_pt_regs(current)->orig_ax = __NR_execve;
	current_thread_info()->status &= ~TS_COMPAT;
	if (current->mm)
		__set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32_ABI
	if (current->mm)
		current->mm->context.flags = 0;

	current->personality &= ~READ_IMPLIES_EXEC;
	/*
	 * in_32bit_syscall() uses the presence of the x32 syscall bit
	 * flag to determine compat status.  The x86 mmap() code relies on
	 * the syscall bitness so set x32 syscall bit right here to make
	 * in_32bit_syscall() work during exec().
	 *
	 * Pretend to come from an x32 execve.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}

static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	if (current->mm) {
		/*
		 * uprobes applied to this MM need to know this and
		 * cannot use user_64bit_mode() at that time.
		 */
		__set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
	}

	current->personality |= force_personality32;
	/* Prepare the first "return" to user space */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}

void set_personality_ia32(bool x32)
{
	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	if (x32)
		__set_personality_x32();
	else
		__set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

#ifdef CONFIG_CHECKPOINT_RESTORE
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int ret;

	ret = map_vdso_once(image, addr);
	if (ret)
		return ret;

	return (long)image->size;
}
#endif

#ifdef CONFIG_ADDRESS_MASKING

#define LAM_U57_BITS 6

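/*
 * Runs on each CPU that may be using @__mm: if that mm is the one currently
 * loaded, fold the LAM mask into CR3 and refresh the per-CPU untag mask.
 */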
static void enable_lam_func(void *__mm)
{
	struct mm_struct *mm = __mm;
	unsigned long lam;

	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
		lam = mm_lam_cr3_mask(mm);
		write_cr3(__read_cr3() | lam);
		cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
	}
}

static void mm_enable_lam(struct mm_struct *mm)
{
	mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
	mm->context.untag_mask = ~GENMASK(62, 57);

	/*
	 * Even though the process must still be single-threaded at this
	 * point, kernel threads may be using the mm.  IPI those kernel
	 * threads if they exist.
	 */
	on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
}

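/*
 * Enable LAM (Linear Address Masking) for @mm, subject to the checks below.
 *
 * Illustrative only (not kernel code): a single-threaded process would
 * typically request this through arch_prctl(2), e.g.
 *
 *	syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, 6);
 *
 * which reaches this function with nr_bits == 6 (the LAM_U57 layout).
 */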
static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
{
	if (!cpu_feature_enabled(X86_FEATURE_LAM))
		return -ENODEV;

	/* PTRACE_ARCH_PRCTL */
	if (current->mm != mm)
		return -EINVAL;

	if (mm_valid_pasid(mm) &&
	    !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
		return -EINVAL;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * MM_CONTEXT_LOCK_LAM is set on clone.  Prevent LAM from
	 * being enabled unless the process is single threaded:
	 */
	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
		mmap_write_unlock(mm);
		return -EBUSY;
	}

	if (!nr_bits || nr_bits > LAM_U57_BITS) {
		mmap_write_unlock(mm);
		return -EINVAL;
	}

	mm_enable_lam(mm);

	mmap_write_unlock(mm);

	return 0;
}
#endif

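/*
 * Handle the 64-bit arch_prctl() options for @task (current, or another
 * task via PTRACE_ARCH_PRCTL).
 *
 * Illustrative only (not kernel code): from user space these options are
 * reached through the arch_prctl(2) syscall, for example
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, some_base);
 */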
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
	int ret = 0;

	switch (option) {
	case ARCH_SET_GS: {
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * ARCH_SET_GS has always overwritten the index
		 * and the base. Zero is the most sensible value
		 * to put in the index, and is the only value that
		 * makes any sense if FSGSBASE is unavailable.
		 */
		if (task == current) {
			loadseg(GS, 0);
			x86_gsbase_write_cpu_inactive(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.gsbase.
			 */
			task->thread.gsbase = arg2;

		} else {
			task->thread.gsindex = 0;
			x86_gsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_SET_FS: {
		/*
		 * Not strictly needed for %fs, but do it for symmetry
		 * with %gs
		 */
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * Set the selector to 0 for the same reason
		 * as %gs above.
		 */
		if (task == current) {
			loadseg(FS, 0);
			x86_fsbase_write_cpu(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.fsbase.
			 */
			task->thread.fsbase = arg2;
		} else {
			task->thread.fsindex = 0;
			x86_fsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_GET_FS: {
		unsigned long base = x86_fsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base = x86_gsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
	case ARCH_MAP_VDSO_X32:
		return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# ifdef CONFIG_IA32_EMULATION
	case ARCH_MAP_VDSO_32:
		return prctl_map_vdso(&vdso_image_32, arg2);
# endif
	case ARCH_MAP_VDSO_64:
		return prctl_map_vdso(&vdso_image_64, arg2);
#endif
#ifdef CONFIG_ADDRESS_MASKING
	case ARCH_GET_UNTAG_MASK:
		return put_user(task->mm->context.untag_mask,
				(unsigned long __user *)arg2);
	case ARCH_ENABLE_TAGGED_ADDR:
		return prctl_enable_tagged_addr(task->mm, arg2);
	case ARCH_FORCE_TAGGED_SVA:
		if (current != task)
			return -EINVAL;
		set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
		return 0;
	case ARCH_GET_MAX_TAG_BITS:
		if (!cpu_feature_enabled(X86_FEATURE_LAM))
			return put_user(0, (unsigned long __user *)arg2);
		else
			return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
#endif
	case ARCH_SHSTK_ENABLE:
	case ARCH_SHSTK_DISABLE:
	case ARCH_SHSTK_LOCK:
	case ARCH_SHSTK_UNLOCK:
	case ARCH_SHSTK_STATUS:
		return shstk_prctl(task, option, arg2);
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}