| 1 | // SPDX-License-Identifier: GPL-2.0-only | 
|---|
| 2 | /* 32-bit system call dispatch */ | 
|---|
| 3 |  | 
|---|
| 4 | #include <linux/linkage.h> | 
|---|
| 5 | #include <linux/sys.h> | 
|---|
| 6 | #include <linux/cache.h> | 
|---|
| 7 | #include <linux/syscalls.h> | 
|---|
| 8 | #include <linux/entry-common.h> | 
|---|
| 9 | #include <linux/nospec.h> | 
|---|
| 10 | #include <linux/uaccess.h> | 
|---|
| 11 | #include <asm/apic.h> | 
|---|
| 12 | #include <asm/traps.h> | 
|---|
| 13 | #include <asm/cpufeature.h> | 
|---|
| 14 | #include <asm/syscall.h> | 
|---|
| 15 |  | 
|---|
/*
 * Pick which symbol of a (nr, native, compat) table entry is forwarded to
 * __SYSCALL(): the compat one when IA32 emulation is configured, the
 * native one otherwise.
 */
#ifdef CONFIG_IA32_EMULATION
#define __SYSCALL_WITH_COMPAT(nr, native, compat)	__SYSCALL(nr, compat)
#else
#define __SYSCALL_WITH_COMPAT(nr, native, compat)	__SYSCALL(nr, native)
#endif
|---|
| 21 |  | 
|---|
| 22 | #define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *); | 
|---|
| 23 | #define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __ia32_##sym(const struct pt_regs *); | 
|---|
| 24 | #include <asm/syscalls_32.h> | 
|---|
| 25 | #undef  __SYSCALL | 
|---|
| 26 |  | 
|---|
| 27 | #undef  __SYSCALL_NORETURN | 
|---|
| 28 | #define __SYSCALL_NORETURN __SYSCALL | 
|---|
| 29 |  | 
|---|
/*
 * The sys_call_table[] is no longer used for system calls, but
 * kernel/trace/trace_syscalls.c still wants to know the system
 * call address.
 *
 * Only built for CONFIG_X86_32. Each __SYSCALL() entry contributes the
 * address of its __ia32_* handler as one array element.
 */
#ifdef CONFIG_X86_32
#define __SYSCALL(nr, sym) __ia32_##sym,
const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_32.h>
};
#undef  __SYSCALL
#endif
|---|
| 42 |  | 
|---|
| 43 | #define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs); | 
|---|
| 44 | long ia32_sys_call(const struct pt_regs *regs, unsigned int nr) | 
|---|
| 45 | { | 
|---|
| 46 | switch (nr) { | 
|---|
| 47 | #include <asm/syscalls_32.h> | 
|---|
| 48 | default: return __ia32_sys_ni_syscall(regs); | 
|---|
| 49 | } | 
|---|
| 50 | } | 
|---|
| 51 |  | 
|---|
| 52 | static __always_inline int syscall_32_enter(struct pt_regs *regs) | 
|---|
| 53 | { | 
|---|
| 54 | if (IS_ENABLED(CONFIG_IA32_EMULATION)) | 
|---|
| 55 | current_thread_info()->status |= TS_COMPAT; | 
|---|
| 56 |  | 
|---|
| 57 | return (int)regs->orig_ax; | 
|---|
| 58 | } | 
|---|
| 59 |  | 
|---|
#ifdef CONFIG_IA32_EMULATION
/*
 * Boot-time switch for IA32 emulation. Defaults to enabled unless
 * CONFIG_IA32_EMULATION_DEFAULT_DISABLED is set.
 */
bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);

/*
 * Handler for the "ia32_emulation" early parameter: parse @arg as a
 * boolean into __ia32_enabled and return kstrtobool()'s result.
 */
static int __init ia32_emulation_override_cmdline(char *arg)
{
	return kstrtobool(arg, &__ia32_enabled);
}
early_param("ia32_emulation", ia32_emulation_override_cmdline);
#endif
|---|
| 69 |  | 
|---|
| 70 | /* | 
|---|
| 71 | * Invoke a 32-bit syscall.  Called with IRQs on in CT_STATE_KERNEL. | 
|---|
| 72 | */ | 
|---|
| 73 | static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) | 
|---|
| 74 | { | 
|---|
| 75 | /* | 
|---|
| 76 | * Convert negative numbers to very high and thus out of range | 
|---|
| 77 | * numbers for comparisons. | 
|---|
| 78 | */ | 
|---|
| 79 | unsigned int unr = nr; | 
|---|
| 80 |  | 
|---|
| 81 | if (likely(unr < IA32_NR_syscalls)) { | 
|---|
| 82 | unr = array_index_nospec(unr, IA32_NR_syscalls); | 
|---|
| 83 | regs->ax = ia32_sys_call(regs, nr: unr); | 
|---|
| 84 | } else if (nr != -1) { | 
|---|
| 85 | regs->ax = __ia32_sys_ni_syscall(regs); | 
|---|
| 86 | } | 
|---|
| 87 | } | 
|---|
| 88 |  | 
|---|
| 89 | #ifdef CONFIG_IA32_EMULATION | 
|---|
| 90 | static __always_inline bool int80_is_external(void) | 
|---|
| 91 | { | 
|---|
| 92 | const unsigned int offs = (0x80 / 32) * 0x10; | 
|---|
| 93 | const u32 bit = BIT(0x80 % 32); | 
|---|
| 94 |  | 
|---|
| 95 | /* The local APIC on XENPV guests is fake */ | 
|---|
| 96 | if (cpu_feature_enabled(X86_FEATURE_XENPV)) | 
|---|
| 97 | return false; | 
|---|
| 98 |  | 
|---|
| 99 | /* | 
|---|
| 100 | * If vector 0x80 is set in the APIC ISR then this is an external | 
|---|
| 101 | * interrupt. Either from broken hardware or injected by a VMM. | 
|---|
| 102 | * | 
|---|
| 103 | * Note: In guest mode this is only valid for secure guests where | 
|---|
| 104 | * the secure module fully controls the vAPIC exposed to the guest. | 
|---|
| 105 | */ | 
|---|
| 106 | return apic_read(APIC_ISR + offs) & bit; | 
|---|
| 107 | } | 
|---|
| 108 |  | 
|---|
| 109 | /** | 
|---|
| 110 | * do_int80_emulation - 32-bit legacy syscall C entry from asm | 
|---|
| 111 | * @regs: syscall arguments in struct pt_args on the stack. | 
|---|
| 112 | * | 
|---|
| 113 | * This entry point can be used by 32-bit and 64-bit programs to perform | 
|---|
| 114 | * 32-bit system calls.  Instances of INT $0x80 can be found inline in | 
|---|
| 115 | * various programs and libraries.  It is also used by the vDSO's | 
|---|
| 116 | * __kernel_vsyscall fallback for hardware that doesn't support a faster | 
|---|
| 117 | * entry method.  Restarted 32-bit system calls also fall back to INT | 
|---|
| 118 | * $0x80 regardless of what instruction was originally used to do the | 
|---|
| 119 | * system call. | 
|---|
| 120 | * | 
|---|
| 121 | * This is considered a slow path.  It is not used by most libc | 
|---|
| 122 | * implementations on modern hardware except during process startup. | 
|---|
| 123 | * | 
|---|
| 124 | * The arguments for the INT $0x80 based syscall are on stack in the | 
|---|
| 125 | * pt_regs structure: | 
|---|
| 126 | *   eax:				system call number | 
|---|
| 127 | *   ebx, ecx, edx, esi, edi, ebp:	arg1 - arg 6 | 
|---|
| 128 | */ | 
|---|
| 129 | __visible noinstr void do_int80_emulation(struct pt_regs *regs) | 
|---|
| 130 | { | 
|---|
| 131 | int nr; | 
|---|
| 132 |  | 
|---|
| 133 | /* Kernel does not use INT $0x80! */ | 
|---|
| 134 | if (unlikely(!user_mode(regs))) { | 
|---|
| 135 | irqentry_enter(regs); | 
|---|
| 136 | instrumentation_begin(); | 
|---|
| 137 | panic(fmt: "Unexpected external interrupt 0x80\n"); | 
|---|
| 138 | } | 
|---|
| 139 |  | 
|---|
| 140 | /* | 
|---|
| 141 | * Establish kernel context for instrumentation, including for | 
|---|
| 142 | * int80_is_external() below which calls into the APIC driver. | 
|---|
| 143 | * Identical for soft and external interrupts. | 
|---|
| 144 | */ | 
|---|
| 145 | enter_from_user_mode(regs); | 
|---|
| 146 |  | 
|---|
| 147 | instrumentation_begin(); | 
|---|
| 148 | add_random_kstack_offset(); | 
|---|
| 149 |  | 
|---|
| 150 | /* Validate that this is a soft interrupt to the extent possible */ | 
|---|
| 151 | if (unlikely(int80_is_external())) | 
|---|
| 152 | panic(fmt: "Unexpected external interrupt 0x80\n"); | 
|---|
| 153 |  | 
|---|
| 154 | /* | 
|---|
| 155 | * The low level idtentry code pushed -1 into regs::orig_ax | 
|---|
| 156 | * and regs::ax contains the syscall number. | 
|---|
| 157 | * | 
|---|
| 158 | * User tracing code (ptrace or signal handlers) might assume | 
|---|
| 159 | * that the regs::orig_ax contains a 32-bit number on invoking | 
|---|
| 160 | * a 32-bit syscall. | 
|---|
| 161 | * | 
|---|
| 162 | * Establish the syscall convention by saving the 32bit truncated | 
|---|
| 163 | * syscall number in regs::orig_ax and by invalidating regs::ax. | 
|---|
| 164 | */ | 
|---|
| 165 | regs->orig_ax = regs->ax & GENMASK(31, 0); | 
|---|
| 166 | regs->ax = -ENOSYS; | 
|---|
| 167 |  | 
|---|
| 168 | nr = syscall_32_enter(regs); | 
|---|
| 169 |  | 
|---|
| 170 | local_irq_enable(); | 
|---|
| 171 | nr = syscall_enter_from_user_mode_work(regs, syscall: nr); | 
|---|
| 172 | do_syscall_32_irqs_on(regs, nr); | 
|---|
| 173 |  | 
|---|
| 174 | instrumentation_end(); | 
|---|
| 175 | syscall_exit_to_user_mode(regs); | 
|---|
| 176 | } | 
|---|
| 177 |  | 
|---|
#ifdef CONFIG_X86_FRED
/*
 * A FRED-specific INT80 handler is warranted for the following reasons:
 *
 * 1) As INT instructions and hardware interrupts are separate event
 *    types, FRED does not preclude the use of vector 0x80 for external
 *    interrupts. As a result, the FRED setup code does not reserve
 *    vector 0x80 and calling int80_is_external() is not merely
 *    suboptimal but actively incorrect: it could cause a system call
 *    to be incorrectly ignored.
 *
 * 2) It is called only for handling vector 0x80 of event type
 *    EVENT_TYPE_SWINT and will never be called to handle any external
 *    interrupt (event type EVENT_TYPE_EXTINT).
 *
 * 3) FRED has separate entry flows depending on if the event came from
 *    user space or kernel space, and because the kernel does not use
 *    INT insns, the FRED kernel entry handler fred_entry_from_kernel()
 *    falls through to fred_bad_type() if the event type is
 *    EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
 *    an INT insn, it can only be from a user level.
 *
 * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. FRED will likely
 *    take a different approach if it is ever needed: it probably
 *    belongs in either fred_intx()/fred_other() or
 *    asm_fred_entrypoint_user(), depending on if this ought to be done
 *    for all entries from userspace or only system calls.
 *
 * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
 */
DEFINE_FREDENTRY_RAW(int80_emulation)
{
	int nr;

	enter_from_user_mode(regs);

	instrumentation_begin();
	add_random_kstack_offset();

	/*
	 * FRED pushed 0 into regs::orig_ax and regs::ax contains the
	 * syscall number.
	 *
	 * User tracing code (ptrace or signal handlers) might assume
	 * that the regs::orig_ax contains a 32-bit number on invoking
	 * a 32-bit syscall.
	 *
	 * Establish the syscall convention by saving the 32bit truncated
	 * syscall number in regs::orig_ax and by invalidating regs::ax.
	 */
	regs->orig_ax = regs->ax & GENMASK(31, 0);
	regs->ax = -ENOSYS;

	nr = syscall_32_enter(regs);

	local_irq_enable();
	/* The entry work may hand back a different syscall number. */
	nr = syscall_enter_from_user_mode_work(regs, nr);
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
#endif /* CONFIG_X86_FRED */
|---|
| 242 |  | 
|---|
| 243 | #else /* CONFIG_IA32_EMULATION */ | 
|---|
| 244 |  | 
|---|
| 245 | /* Handles int $0x80 on a 32bit kernel */ | 
|---|
| 246 | __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) | 
|---|
| 247 | { | 
|---|
| 248 | int nr = syscall_32_enter(regs); | 
|---|
| 249 |  | 
|---|
| 250 | add_random_kstack_offset(); | 
|---|
| 251 | /* | 
|---|
| 252 | * Subtlety here: if ptrace pokes something larger than 2^31-1 into | 
|---|
| 253 | * orig_ax, the int return value truncates it. This matches | 
|---|
| 254 | * the semantics of syscall_get_nr(). | 
|---|
| 255 | */ | 
|---|
| 256 | nr = syscall_enter_from_user_mode(regs, nr); | 
|---|
| 257 | instrumentation_begin(); | 
|---|
| 258 |  | 
|---|
| 259 | do_syscall_32_irqs_on(regs, nr); | 
|---|
| 260 |  | 
|---|
| 261 | instrumentation_end(); | 
|---|
| 262 | syscall_exit_to_user_mode(regs); | 
|---|
| 263 | } | 
|---|
| 264 | #endif /* !CONFIG_IA32_EMULATION */ | 
|---|
| 265 |  | 
|---|
| 266 | static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) | 
|---|
| 267 | { | 
|---|
| 268 | int nr = syscall_32_enter(regs); | 
|---|
| 269 | int res; | 
|---|
| 270 |  | 
|---|
| 271 | add_random_kstack_offset(); | 
|---|
| 272 | /* | 
|---|
| 273 | * This cannot use syscall_enter_from_user_mode() as it has to | 
|---|
| 274 | * fetch EBP before invoking any of the syscall entry work | 
|---|
| 275 | * functions. | 
|---|
| 276 | */ | 
|---|
| 277 | syscall_enter_from_user_mode_prepare(regs); | 
|---|
| 278 |  | 
|---|
| 279 | instrumentation_begin(); | 
|---|
| 280 | /* Fetch EBP from where the vDSO stashed it. */ | 
|---|
| 281 | if (IS_ENABLED(CONFIG_X86_64)) { | 
|---|
| 282 | /* | 
|---|
| 283 | * Micro-optimization: the pointer we're following is | 
|---|
| 284 | * explicitly 32 bits, so it can't be out of range. | 
|---|
| 285 | */ | 
|---|
| 286 | res = __get_user(*(u32 *)®s->bp, | 
|---|
| 287 | (u32 __user __force *)(unsigned long)(u32)regs->sp); | 
|---|
| 288 | } else { | 
|---|
| 289 | res = get_user(*(u32 *)®s->bp, | 
|---|
| 290 | (u32 __user __force *)(unsigned long)(u32)regs->sp); | 
|---|
| 291 | } | 
|---|
| 292 |  | 
|---|
| 293 | if (res) { | 
|---|
| 294 | /* User code screwed up. */ | 
|---|
| 295 | regs->ax = -EFAULT; | 
|---|
| 296 |  | 
|---|
| 297 | local_irq_disable(); | 
|---|
| 298 | instrumentation_end(); | 
|---|
| 299 | irqentry_exit_to_user_mode(regs); | 
|---|
| 300 | return false; | 
|---|
| 301 | } | 
|---|
| 302 |  | 
|---|
| 303 | nr = syscall_enter_from_user_mode_work(regs, syscall: nr); | 
|---|
| 304 |  | 
|---|
| 305 | /* Now this is just like a normal syscall. */ | 
|---|
| 306 | do_syscall_32_irqs_on(regs, nr); | 
|---|
| 307 |  | 
|---|
| 308 | instrumentation_end(); | 
|---|
| 309 | syscall_exit_to_user_mode(regs); | 
|---|
| 310 | return true; | 
|---|
| 311 | } | 
|---|
| 312 |  | 
|---|
| 313 | /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ | 
|---|
| 314 | __visible noinstr bool do_fast_syscall_32(struct pt_regs *regs) | 
|---|
| 315 | { | 
|---|
| 316 | /* | 
|---|
| 317 | * Called using the internal vDSO SYSENTER/SYSCALL32 calling | 
|---|
| 318 | * convention.  Adjust regs so it looks like we entered using int80. | 
|---|
| 319 | */ | 
|---|
| 320 | unsigned long landing_pad = (unsigned long)current->mm->context.vdso + | 
|---|
| 321 | vdso_image_32.sym_int80_landing_pad; | 
|---|
| 322 |  | 
|---|
| 323 | /* | 
|---|
| 324 | * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward | 
|---|
| 325 | * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. | 
|---|
| 326 | * Fix it up. | 
|---|
| 327 | */ | 
|---|
| 328 | regs->ip = landing_pad; | 
|---|
| 329 |  | 
|---|
| 330 | /* Invoke the syscall. If it failed, keep it simple: use IRET. */ | 
|---|
| 331 | if (!__do_fast_syscall_32(regs)) | 
|---|
| 332 | return false; | 
|---|
| 333 |  | 
|---|
| 334 | /* | 
|---|
| 335 | * Check that the register state is valid for using SYSRETL/SYSEXIT | 
|---|
| 336 | * to exit to userspace.  Otherwise use the slower but fully capable | 
|---|
| 337 | * IRET exit path. | 
|---|
| 338 | */ | 
|---|
| 339 |  | 
|---|
| 340 | /* XEN PV guests always use the IRET path */ | 
|---|
| 341 | if (cpu_feature_enabled(X86_FEATURE_XENPV)) | 
|---|
| 342 | return false; | 
|---|
| 343 |  | 
|---|
| 344 | /* EIP must point to the VDSO landing pad */ | 
|---|
| 345 | if (unlikely(regs->ip != landing_pad)) | 
|---|
| 346 | return false; | 
|---|
| 347 |  | 
|---|
| 348 | /* CS and SS must match the values set in MSR_STAR */ | 
|---|
| 349 | if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS)) | 
|---|
| 350 | return false; | 
|---|
| 351 |  | 
|---|
| 352 | /* If the TF, RF, or VM flags are set, use IRET */ | 
|---|
| 353 | if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM))) | 
|---|
| 354 | return false; | 
|---|
| 355 |  | 
|---|
| 356 | /* Use SYSRETL/SYSEXIT to exit to userspace */ | 
|---|
| 357 | return true; | 
|---|
| 358 | } | 
|---|
| 359 |  | 
|---|
| 360 | /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ | 
|---|
| 361 | __visible noinstr bool do_SYSENTER_32(struct pt_regs *regs) | 
|---|
| 362 | { | 
|---|
| 363 | /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ | 
|---|
| 364 | regs->sp = regs->bp; | 
|---|
| 365 |  | 
|---|
| 366 | /* SYSENTER clobbers EFLAGS.IF.  Assume it was set in usermode. */ | 
|---|
| 367 | regs->flags |= X86_EFLAGS_IF; | 
|---|
| 368 |  | 
|---|
| 369 | return do_fast_syscall_32(regs); | 
|---|
| 370 | } | 
|---|
| 371 |  | 
|---|