| 1 | // SPDX-License-Identifier: GPL-2.0-only | 
|---|
| 2 | /* 64-bit system call dispatch */ | 
|---|
| 3 |  | 
|---|
| 4 | #include <linux/linkage.h> | 
|---|
| 5 | #include <linux/sys.h> | 
|---|
| 6 | #include <linux/cache.h> | 
|---|
| 7 | #include <linux/syscalls.h> | 
|---|
| 8 | #include <linux/entry-common.h> | 
|---|
| 9 | #include <linux/nospec.h> | 
|---|
| 10 | #include <asm/syscall.h> | 
|---|
| 11 |  | 
|---|
/*
 * Forward-declare every syscall entry stub emitted by the generated
 * tables: each __SYSCALL(nr, sym) expands to
 *   extern long __x64_<sym>(const struct pt_regs *);
 * __SYSCALL_NORETURN additionally tags the stub __noreturn.
 */
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
#ifdef CONFIG_X86_X32_ABI
#include <asm/syscalls_x32.h>
#endif
#undef  __SYSCALL

/*
 * For the table/dispatch expansions below, noreturn syscalls need no
 * special treatment — alias __SYSCALL_NORETURN to plain __SYSCALL.
 */
#undef  __SYSCALL_NORETURN
#define __SYSCALL_NORETURN __SYSCALL
|---|
| 22 |  | 
|---|
/*
 * The sys_call_table[] is no longer used for system calls, but
 * kernel/trace/trace_syscalls.c still wants to know the system
 * call address.
 */
/* Each table entry becomes one function-pointer initializer. */
#define __SYSCALL(nr, sym) __x64_##sym,
const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
#undef  __SYSCALL
|---|
| 33 |  | 
|---|
/*
 * Dispatch a native 64-bit system call: each generated table entry
 * expands to "case nr: return __x64_<sym>(regs);".  Numbers with no
 * matching case fall through to ni_syscall.
 *
 * Note: this __SYSCALL definition is intentionally left defined so the
 * x32 dispatcher below can reuse it.
 */
#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
#include <asm/syscalls_64.h>
	default: return __x64_sys_ni_syscall(regs);
	}
}
|---|
| 42 |  | 
|---|
#ifdef CONFIG_X86_X32_ABI
/*
 * Dispatch an x32 ABI system call.  @nr is the table index (the caller
 * has already subtracted __X32_SYSCALL_BIT); unknown numbers return
 * -ENOSYS via ni_syscall.  Reuses the case-generating __SYSCALL macro
 * defined for x64_sys_call() above.
 */
long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
#include <asm/syscalls_x32.h>
	default: return __x64_sys_ni_syscall(regs);
	}
}
#endif
|---|
| 52 |  | 
|---|
| 53 | static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) | 
|---|
| 54 | { | 
|---|
| 55 | /* | 
|---|
| 56 | * Convert negative numbers to very high and thus out of range | 
|---|
| 57 | * numbers for comparisons. | 
|---|
| 58 | */ | 
|---|
| 59 | unsigned int unr = nr; | 
|---|
| 60 |  | 
|---|
| 61 | if (likely(unr < NR_syscalls)) { | 
|---|
| 62 | unr = array_index_nospec(unr, NR_syscalls); | 
|---|
| 63 | regs->ax = x64_sys_call(regs, nr: unr); | 
|---|
| 64 | return true; | 
|---|
| 65 | } | 
|---|
| 66 | return false; | 
|---|
| 67 | } | 
|---|
| 68 |  | 
|---|
| 69 | static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) | 
|---|
| 70 | { | 
|---|
| 71 | /* | 
|---|
| 72 | * Adjust the starting offset of the table, and convert numbers | 
|---|
| 73 | * < __X32_SYSCALL_BIT to very high and thus out of range | 
|---|
| 74 | * numbers for comparisons. | 
|---|
| 75 | */ | 
|---|
| 76 | unsigned int xnr = nr - __X32_SYSCALL_BIT; | 
|---|
| 77 |  | 
|---|
| 78 | if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { | 
|---|
| 79 | xnr = array_index_nospec(xnr, X32_NR_syscalls); | 
|---|
| 80 | regs->ax = x32_sys_call(regs, nr: xnr); | 
|---|
| 81 | return true; | 
|---|
| 82 | } | 
|---|
| 83 | return false; | 
|---|
| 84 | } | 
|---|
| 85 |  | 
|---|
| 86 | /* Returns true to return using SYSRET, or false to use IRET */ | 
|---|
| 87 | __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) | 
|---|
| 88 | { | 
|---|
| 89 | add_random_kstack_offset(); | 
|---|
| 90 | nr = syscall_enter_from_user_mode(regs, syscall: nr); | 
|---|
| 91 |  | 
|---|
| 92 | instrumentation_begin(); | 
|---|
| 93 |  | 
|---|
| 94 | if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { | 
|---|
| 95 | /* Invalid system call, but still a system call. */ | 
|---|
| 96 | regs->ax = __x64_sys_ni_syscall(regs); | 
|---|
| 97 | } | 
|---|
| 98 |  | 
|---|
| 99 | instrumentation_end(); | 
|---|
| 100 | syscall_exit_to_user_mode(regs); | 
|---|
| 101 |  | 
|---|
| 102 | /* | 
|---|
| 103 | * Check that the register state is valid for using SYSRET to exit | 
|---|
| 104 | * to userspace.  Otherwise use the slower but fully capable IRET | 
|---|
| 105 | * exit path. | 
|---|
| 106 | */ | 
|---|
| 107 |  | 
|---|
| 108 | /* XEN PV guests always use the IRET path */ | 
|---|
| 109 | if (cpu_feature_enabled(X86_FEATURE_XENPV)) | 
|---|
| 110 | return false; | 
|---|
| 111 |  | 
|---|
| 112 | /* SYSRET requires RCX == RIP and R11 == EFLAGS */ | 
|---|
| 113 | if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) | 
|---|
| 114 | return false; | 
|---|
| 115 |  | 
|---|
| 116 | /* CS and SS must match the values set in MSR_STAR */ | 
|---|
| 117 | if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) | 
|---|
| 118 | return false; | 
|---|
| 119 |  | 
|---|
| 120 | /* | 
|---|
| 121 | * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP | 
|---|
| 122 | * in kernel space.  This essentially lets the user take over | 
|---|
| 123 | * the kernel, since userspace controls RSP. | 
|---|
| 124 | * | 
|---|
| 125 | * TASK_SIZE_MAX covers all user-accessible addresses other than | 
|---|
| 126 | * the deprecated vsyscall page. | 
|---|
| 127 | */ | 
|---|
| 128 | if (unlikely(regs->ip >= TASK_SIZE_MAX)) | 
|---|
| 129 | return false; | 
|---|
| 130 |  | 
|---|
| 131 | /* | 
|---|
| 132 | * SYSRET cannot restore RF.  It can restore TF, but unlike IRET, | 
|---|
| 133 | * restoring TF results in a trap from userspace immediately after | 
|---|
| 134 | * SYSRET. | 
|---|
| 135 | */ | 
|---|
| 136 | if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) | 
|---|
| 137 | return false; | 
|---|
| 138 |  | 
|---|
| 139 | /* Use SYSRET to exit to userspace */ | 
|---|
| 140 | return true; | 
|---|
| 141 | } | 
|---|
| 142 |  | 
|---|