/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Compatibility mode system call entry point for x86-64.
 *
 * Copyright 2000-2002 Andi Kleen, SuSE Labs.
 */
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/nospec-branch.h>
#include <linux/linkage.h>
#include <linux/err.h>

#include "calling.h"

	.section .entry.text, "ax"

/*
 * 32-bit SYSENTER entry.
 *
 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
 * on 64-bit kernels running on Intel CPUs.
 *
 * The SYSENTER instruction, in principle, should *only* occur in the
 * vDSO.  In practice, a small number of Android devices were shipped
 * with a copy of Bionic that inlined a SYSENTER instruction.  This
 * never happened in any of Google's Bionic versions -- it only happened
 * in a narrow range of Intel-provided versions.
 *
 * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs.
 * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
 * SYSENTER does not save anything on the stack,
 * and does not save old RIP (!!!), RSP, or RFLAGS.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  user stack
 * 0(%ebp) arg6
 */
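/*
 * Editorial sketch of the caller's side of this convention -- a rough
 * illustration, not the actual vDSO source.  A __kernel_vsyscall-style
 * stub saves the registers the fast-syscall path clobbers, then points
 * %ebp at the user stack so the kernel can fetch arg6 from 0(%ebp):
 *
 *	push	%ecx		// preserve caller's ecx/edx/ebp
 *	push	%edx
 *	push	%ebp		// caller passed arg6 in %ebp
 *	movl	%esp, %ebp	// now 0(%ebp) = saved %ebp = arg6
 *	sysenter
 *
 * Since SYSENTER does not save the old RIP or RSP, execution resumes
 * at a fixed landing pad in the vDSO, not at the instruction after
 * SYSENTER.
 */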
SYM_CODE_START(entry_SYSENTER_compat)
	UNWIND_HINT_ENTRY
	ENDBR
	/* Interrupts are off on entry. */
	swapgs

	/* SWITCH_TO_KERNEL_CR3 needs a scratch register; preserve user RAX. */
	pushq	%rax
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
	popq	%rax

	/* Switch to the task stack. */
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS		/* pt_regs->ss */
	pushq	$0			/* pt_regs->sp = 0 (placeholder) */

	/*
	 * Push flags.  This is nasty.  First, interrupts are currently
	 * off, but we need pt_regs->flags to have IF set.  Second, if TF
	 * was set in usermode, it's still set, and we're single-stepping
	 * through this code.  do_SYSENTER_32() will fix up IF.
	 */
	pushfq				/* pt_regs->flags (except IF = 0) */
	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	$0			/* pt_regs->ip = 0 (placeholder) */
SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)

	/*
	 * User tracing code (ptrace or signal handlers) might assume that
	 * the saved RAX contains a 32-bit number when we're invoking a 32-bit
	 * syscall.  Just in case the high bits are nonzero, zero-extend
	 * the syscall number.  (This could almost certainly be deleted
	 * with no ill effects.)
	 */
	movl	%eax, %eax

	pushq	%rax			/* pt_regs->orig_ax */
	PUSH_AND_CLEAR_REGS rax=$-ENOSYS
	UNWIND_HINT_REGS

	/* SYSENTER does not mask RFLAGS, so DF may still be set: clear it. */
	cld

	/*
	 * SYSENTER doesn't filter flags, so we need to clear NT and AC
	 * ourselves.  To save a few cycles, we can check whether
	 * either was set instead of doing an unconditional popfq.
	 * This needs to happen before enabling interrupts so that
	 * we don't get preempted with NT set.
	 *
	 * If TF is set, we will single-step all the way to here -- do_debug
	 * will ignore all the traps.  (Yes, this is slow, but so is
	 * single-stepping in general.  This allows us to avoid having
	 * more complicated code to handle the case where a user program
	 * forces us to single-step through the SYSENTER entry code.)
	 *
	 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
	 * out-of-line as an optimization: NT is unlikely to be set in the
	 * majority of the cases and instead of polluting the I$ unnecessarily,
	 * we're keeping that code behind a branch which will predict as
	 * not-taken and therefore its instructions won't be fetched.
	 */
	testl	$X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
	jnz	.Lsysenter_fix_flags
.Lsysenter_flags_fixed:

	/*
	 * CPU bug mitigation mechanisms can call other functions.  They
	 * should be invoked after making sure TF is cleared, because
	 * single-step is ignored only for instructions inside the
	 * entry_SYSENTER_compat function.
	 */
	IBRS_ENTER
	UNTRAIN_RET
	CLEAR_BRANCH_HISTORY

	movq	%rsp, %rdi		/* pt_regs is do_SYSENTER_32()'s only argument */
	call	do_SYSENTER_32
	jmp	sysret32_from_system_call

.Lsysenter_fix_flags:
	/* Reload RFLAGS with a known-good value; this clears NT, AC and TF. */
	pushq	$X86_EFLAGS_FIXED
	popfq
	jmp	.Lsysenter_flags_fixed
SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL)
SYM_CODE_END(entry_SYSENTER_compat)
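
/*
 * Editorial note: the entry_SYSENTER_compat/__end_entry_SYSENTER_compat
 * symbol pair bounds this entry code for the #DB handler, which is how
 * "do_debug will ignore all the traps" above works: single-step traps
 * whose RIP falls inside this range are discarded (see
 * is_sysenter_singlestep() in the kernel's traps code).
 */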

/*
 * 32-bit SYSCALL entry.
 *
 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
 * on 64-bit kernels running on AMD CPUs.
 *
 * The SYSCALL instruction, in principle, should *only* occur in the
 * vDSO.  In practice, it appears that this really is the case.
 * As evidence:
 *
 *  - The calling convention for SYSCALL has changed several times without
 *    anyone noticing.
 *
 *  - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, any user
 *    task that did SYSCALL without immediately reloading SS
 *    would randomly crash.
 *
 *  - Most programmers do not directly target AMD CPUs, and the 32-bit
 *    SYSCALL instruction does not exist on Intel CPUs.  Even on AMD
 *    CPUs, Linux disables the SYSCALL instruction on 32-bit kernels
 *    because the SYSCALL instruction in legacy/native 32-bit mode (as
 *    opposed to compat mode) is sufficiently poorly designed as to be
 *    essentially unusable.
 *
 * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves
 * RFLAGS to R11, then loads new SS, CS, and RIP from previously
 * programmed MSRs.  RFLAGS gets masked by a value from another MSR
 * (so CLD and CLAC are not needed).  SYSCALL does not save anything on
 * the stack and does not change RSP.
 *
 * Note: RFLAGS saving+masking-with-MSR happens only in Long mode
 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
 * Don't get confused: RFLAGS saving+masking depends on the Long Mode
 * Active bit (EFER.LMA=1), NOT on the bitness of the userspace where
 * SYSCALL executes or the target CS descriptor's L bit (SYSCALL does
 * not read segment descriptors).
 *
 * Arguments:
 * eax  system call number
 * ecx  return address
 * ebx  arg1
 * ebp  arg2	(note: not saved in the stack frame, should not be touched)
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * esp  user stack
 * 0(%esp) arg6
 */
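/*
 * Editorial note on the convention above, inferred from the register
 * handling below: SYSCALL itself overwrites %ecx with the return RIP,
 * so a caller following the usual 32-bit convention (arg2 in %ecx)
 * must move arg2 elsewhere first.  The vDSO stashes it in %ebp, and
 * PUSH_AND_CLEAR_REGS rcx=%rbp below stores that value as pt_regs->cx,
 * so C code still sees the normal 32-bit argument layout.
 */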
SYM_CODE_START(entry_SYSCALL_compat)
	UNWIND_HINT_ENTRY
	ENDBR
	/* Interrupts are off on entry. */
	swapgs

	/* Stash user ESP */
	movl	%esp, %r8d

	/* Use %rsp as scratch reg.  User ESP is stashed in r8 */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp

	/* Switch to the kernel stack */
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS		/* pt_regs->ss */
	pushq	%r8			/* pt_regs->sp */
	pushq	%r11			/* pt_regs->flags */
	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	%rcx			/* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
	movl	%eax, %eax		/* discard orig_ax high bits */
	pushq	%rax			/* pt_regs->orig_ax */
	PUSH_AND_CLEAR_REGS rcx=%rbp rax=$-ENOSYS
	UNWIND_HINT_REGS

	IBRS_ENTER
	UNTRAIN_RET
	CLEAR_BRANCH_HISTORY

	movq	%rsp, %rdi
	call	do_fast_syscall_32

sysret32_from_system_call:
	/*
	 * XEN PV guests always use the IRET path.  Otherwise, a nonzero
	 * %al from do_fast_syscall_32() means SYSRET is safe to use.
	 */
	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
		    "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV

	/*
	 * Opportunistic SYSRET
	 *
	 * We are not going to return to userspace from the trampoline
	 * stack.  So let's erase the thread stack right now.
	 */
	STACKLEAK_ERASE

	IBRS_EXIT

	movq	RBX(%rsp), %rbx		/* pt_regs->rbx */
	movq	RBP(%rsp), %rbp		/* pt_regs->rbp */
	movq	EFLAGS(%rsp), %r11	/* pt_regs->flags (in r11) */
	movq	RIP(%rsp), %rcx		/* pt_regs->ip (in rcx) */
	addq	$RAX, %rsp		/* Skip r8-r15 */
	popq	%rax			/* pt_regs->rax */
	popq	%rdx			/* Skip pt_regs->cx */
	popq	%rdx			/* pt_regs->dx */
	popq	%rsi			/* pt_regs->si */
	popq	%rdi			/* pt_regs->di */

	/*
	 * USERGS_SYSRET32 does:
	 *  GSBASE = user's GS base
	 *  EIP = ECX
	 *  RFLAGS = R11
	 *  CS = __USER32_CS
	 *  SS = __USER_DS
	 *
	 * ECX will not match pt_regs->cx, but we're returning to a vDSO
	 * trampoline that will fix up RCX, so this is okay.
	 *
	 * R12-R15 are callee-saved, so they contain whatever was in them
	 * when the system call started, which is already known to user
	 * code.  We zero R8-R10 to avoid info leaks.
	 */
	movq	RSP-ORIG_RAX(%rsp), %rsp
SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR

	/*
	 * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
	 * on the process stack, which is not mapped to userspace and
	 * not readable after we SWITCH_TO_USER_CR3.  Delay the CR3
	 * switch until after the last reference to the process stack.
	 *
	 * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
	 */
	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9

	xorl	%r8d, %r8d
	xorl	%r9d, %r9d
	xorl	%r10d, %r10d
	swapgs
	CLEAR_CPU_BUFFERS
	sysretl
SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR
	int3
SYM_CODE_END(entry_SYSCALL_compat)
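
/*
 * Editorial sketch (not the actual vDSO source) of the trampoline that
 * "will fix up RCX" above: the vDSO landing pad simply restores what
 * the entry stub pushed, so the clobbered ECX/EDX don't matter:
 *
 *	pop	%ebp
 *	pop	%edx
 *	pop	%ecx
 *	ret
 */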

/*
 * int 0x80 is used by 32-bit mode as a system call entry.  Normally IDT
 * entries point to C routines; however, since this is a system call
 * interface, the branch history needs to be scrubbed to protect against
 * BHI attacks, and that scrubbing needs to take place in assembly code
 * prior to entering any C routines.
 */
SYM_CODE_START(int80_emulation)
	ANNOTATE_NOENDBR
	UNWIND_HINT_FUNC
	CLEAR_BRANCH_HISTORY
	jmp	do_int80_emulation
SYM_CODE_END(int80_emulation)