/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  linux/arch/x86_64/entry.S
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 * Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.rst
 *
 * A note on terminology:
 * - iret frame:	Architecture defined interrupt frame from SS to RIP
 *			at the top of the kernel process stack.
 *
 * Some macro usage:
 * - SYM_FUNC_START/END:	Define functions in the symbol table.
 * - idtentry:			Define exception entry points.
 */
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/frame.h>
#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
#include <asm/fsgsbase.h>
#include <linux/err.h>

#include "calling.h"

.code64
.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT_XXL
SYM_CODE_START(native_usergs_sysret64)
	UNWIND_HINT_EMPTY
	swapgs
	sysretq
SYM_CODE_END(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT_XXL */

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls.  The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries.  There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When the user can change pt_regs->foo, always force IRET. That is
 * because IRET deals with non-canonical addresses better. SYSRET has
 * trouble with them due to bugs in both AMD and Intel CPUs.
 */
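
/*
 * For illustration only (not part of this file): a minimal user-space
 * sequence following the register convention above, write(1, msg, 3)
 * with __NR_write == 1 on x86-64, might look like:
 *
 *	movq	$1, %rax		# system call number
 *	movq	$1, %rdi		# arg0: fd
 *	leaq	msg(%rip), %rsi		# arg1: buf
 *	movq	$3, %rdx		# arg2: count
 *	syscall				# rcx := rip, r11 := rflags
 */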

SYM_CODE_START(entry_SYSCALL_64)
	UNWIND_HINT_EMPTY

	swapgs
	/* tss.sp2 is scratch space. */
	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS				/* pt_regs->ss */
	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
	pushq	%r11					/* pt_regs->flags */
	pushq	$__USER_CS				/* pt_regs->cs */
	pushq	%rcx					/* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
	pushq	%rax					/* pt_regs->orig_ax */

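	/*
	 * pt_regs->ax is preset to -ENOSYS below so that an invalid
	 * syscall number (or a syscall suppressed by tracing) already
	 * has the right return value in place.
	 */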
	PUSH_AND_CLEAR_REGS rax=$-ENOSYS

	/* IRQs are off. */
	movq	%rax, %rdi
	movq	%rsp, %rsi
	call	do_syscall_64		/* returns with IRQs disabled */
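	/*
	 * A sketch of the C side, assumed from the argument setup above
	 * (see do_syscall_64() in arch/x86/entry/common.c):
	 *
	 *	void do_syscall_64(unsigned long nr, struct pt_regs *regs);
	 */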

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.  If we're not,
	 * go to the slow exit path.
	 */
	movq	RCX(%rsp), %rcx
	movq	RIP(%rsp), %r11

	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
	jne	swapgs_restore_regs_and_return_to_usermode

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space.  This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * If width of "canonical tail" ever becomes variable, this will need
	 * to be updated to remain correct on both old and new CPUs.
	 *
	 * Change top bits to match most significant bit (47th or 56th bit
	 * depending on paging mode) in the address.
	 */
#ifdef CONFIG_X86_5LEVEL
	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
		"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
#else
	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
#endif
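
	/*
	 * Worked example for the 48-bit (4-level) case, where the pair is
	 * shl/sar by (64 - 48) == 16, i.e. sign-extension from bit 47:
	 *
	 *	canonical:     0x00007fffffffffff -> unchanged
	 *	non-canonical: 0x0000800000000000 -> 0xffff800000000000
	 *
	 * A non-canonical RCX thus no longer equals the RIP saved in R11
	 * and the compare below forces the IRET path.
	 */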

	/* If this changed %rcx, it was not canonical */
	cmpq	%rcx, %r11
	jne	swapgs_restore_regs_and_return_to_usermode

	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
	jne	swapgs_restore_regs_and_return_to_usermode

	movq	R11(%rsp), %r11
	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
	jne	swapgs_restore_regs_and_return_to_usermode

	/*
	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
	 * restore RF properly. If the slowpath sets it for whatever reason, we
	 * need to restore it correctly.
	 *
	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
	 * trap from userspace immediately after SYSRET.  This would cause an
	 * infinite loop whenever #DB happens with register state that satisfies
	 * the opportunistic SYSRET conditions.  For example, single-stepping
	 * this user code:
	 *
	 *           movq	$stuck_here, %rcx
	 *           pushfq
	 *           popq %r11
	 *   stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz	swapgs_restore_regs_and_return_to_usermode

	/* nothing to check for RSP */

	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
	jne	swapgs_restore_regs_and_return_to_usermode

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	POP_REGS pop_rdi=0 skip_r11rcx=1

	/*
	 * Now all regs are restored except RSP and RDI.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_EMPTY

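	/*
	 * %rdi points at the pt_regs RDI slot on the old stack (POP_REGS
	 * left RDI in place), so the displacement RSP-RDI, the difference
	 * of the two asm-offsets constants, makes RSP-RDI(%rdi) address
	 * pt_regs->sp.
	 */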
	pushq	RSP-RDI(%rdi)	/* RSP */
	pushq	(%rdi)		/* RDI */

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	popq	%rdi
	popq	%rsp
	USERGS_SYSRET64
SYM_CODE_END(entry_SYSCALL_64)

/*
 * %rdi: prev task
 * %rsi: next task
 */
.pushsection .text, "ax"
SYM_FUNC_START(__switch_to_asm)
	/*
	 * Save callee-saved registers
	 * This must match the order in inactive_task_frame
	 */
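	/*
	 * Frame layout after these pushes, from low to high address
	 * (cf. struct inactive_task_frame in asm/switch_to.h):
	 *
	 *	r15, r14, r13, r12, rbx, rbp, return address
	 */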
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* switch stack */
	movq	%rsp, TASK_threadsp(%rdi)
	movq	TASK_threadsp(%rsi), %rsp

#ifdef CONFIG_STACKPROTECTOR
	movq	TASK_stack_canary(%rsi), %rbx
	movq	%rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
#endif

#ifdef CONFIG_RETPOLINE
	/*
	 * When switching from a shallower to a deeper call stack
	 * the RSB may either underflow or use entries populated
	 * with userspace addresses. On CPUs where those concerns
	 * exist, overwrite the RSB with entries which capture
	 * speculative execution to prevent attack.
	 */
	FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
#endif

	/* restore callee-saved registers */
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp

	jmp	__switch_to
SYM_FUNC_END(__switch_to_asm)
.popsection

/*
 * A newly forked process directly context switches into this address.
 *
 * rax: prev task we switched from
 * rbx: kernel thread func (NULL for user thread)
 * r12: kernel thread arg
 */
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork)
	UNWIND_HINT_EMPTY
	movq	%rax, %rdi
	call	schedule_tail			/* rdi: 'prev' task parameter */

	testq	%rbx, %rbx			/* from kernel_thread? */
	jnz	1f				/* kernel threads are uncommon */

2:
	UNWIND_HINT_REGS
	movq	%rsp, %rdi
	call	syscall_exit_to_user_mode	/* returns with IRQs disabled */
	jmp	swapgs_restore_regs_and_return_to_usermode

1:
	/* kernel thread */
	UNWIND_HINT_EMPTY
	movq	%r12, %rdi
	CALL_NOSPEC rbx
	/*
	 * A kernel thread is allowed to return here after successfully
	 * calling kernel_execve().  Exit to userspace to complete the execve()
	 * syscall.
	 */
	movq	$0, RAX(%rsp)
	jmp	2b
SYM_CODE_END(ret_from_fork)
.popsection

.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
	pushq	%rax
	SAVE_FLAGS(CLBR_RAX)
	testl	$X86_EFLAGS_IF, %eax
	jz	.Lokay_\@
	ud2
.Lokay_\@:
	popq	%rax
#endif
.endm

/**
 * idtentry_body - Macro to emit code calling the C function
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 */
.macro idtentry_body cfunc has_error_code:req

	call	error_entry
	UNWIND_HINT_REGS

	movq	%rsp, %rdi			/* pt_regs pointer into 1st argument */

	.if \has_error_code == 1
		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument */
		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
	.endif

	call	\cfunc

	jmp	error_return
.endm
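
/*
 * For example, "idtentry_body exc_divide_error has_error_code=0" expands
 * to (roughly):
 *
 *	call	error_entry
 *	movq	%rsp, %rdi
 *	call	exc_divide_error
 *	jmp	error_return
 */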

/**
 * idtentry - Macro to generate entry stubs for simple IDT entries
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 *
 * The macro emits code to set up the kernel context for straightforward
 * and simple IDT entries.  No IST stack, no paranoid entry checks.
 */
.macro idtentry vector asmsym cfunc has_error_code:req
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
	ASM_CLAC

	.if \has_error_code == 0
		pushq	$-1			/* ORIG_RAX: no syscall to restart */
	.endif

	.if \vector == X86_TRAP_BP
		/*
		 * If coming from kernel space, create a 6-word gap to allow the
		 * int3 handler to emulate a call instruction.
		 */
		testb	$3, CS-ORIG_RAX(%rsp)
		jnz	.Lfrom_usermode_no_gap_\@
		.rept	6
		pushq	5*8(%rsp)
		.endr
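		/*
		 * Each push above copies one word of the 6-word frame
		 * (ORIG_RAX plus the 5-word IRET frame), so the whole
		 * frame ends up replicated 6*8 bytes lower and the
		 * original copy becomes the writable gap.
		 */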
		UNWIND_HINT_IRET_REGS offset=8
.Lfrom_usermode_no_gap_\@:
	.endif

	idtentry_body \cfunc \has_error_code

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm
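
/*
 * Typical use, via the DECLARE_IDTENTRY() wrapper in asm/idtentry.h,
 * e.g. for the divide error:
 *
 *	idtentry X86_TRAP_DE asm_exc_divide_error exc_divide_error has_error_code=0
 */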

/*
 * Interrupt entry/exit.
 *
 * The interrupt stubs push (vector) onto the stack, which is the error_code
 * position of idtentry exceptions, and jump to one of the two idtentry points
 * (common/spurious).
 *
 * common_interrupt is a hotpath, align it to a cache line
 */
.macro idtentry_irq vector cfunc
	.p2align CONFIG_X86_L1_CACHE_SHIFT
	idtentry \vector asm_\cfunc \cfunc has_error_code=1
.endm

/*
 * System vectors which invoke their handlers directly and are not
 * going through the regular common device interrupt handling code.
 */
.macro idtentry_sysvec vector cfunc
	idtentry \vector asm_\cfunc \cfunc has_error_code=0
.endm

/**
 * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
 * @vector:	Vector number
 * @asmsym:	ASM symbol for the entry point
 * @cfunc:	C function to be called
 *
 * The macro emits code to set up the kernel context for #MC and #DB
 *
 * If the entry comes from user space it uses the normal entry path
 * including the return to user space work and preemption checks on
 * exit.
 *
 * If it hits in kernel mode, it needs to go through the paranoid
 * entry as the exception can hit any random state. No preemption
 * check on exit to keep the paranoid path simple.
 */
.macro idtentry_mce_db vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_REGS
	ASM_CLAC

	pushq	$-1			/* ORIG_RAX: no syscall to restart */

	/*
	 * If the entry is from userspace, switch stacks and treat it as
	 * a normal entry.
	 */
	testb	$3, CS-ORIG_RAX(%rsp)
	jnz	.Lfrom_usermode_switch_stack_\@

	/* paranoid_entry returns GS information for paranoid_exit in EBX. */
	call	paranoid_entry

	UNWIND_HINT_REGS

	movq	%rsp, %rdi		/* pt_regs pointer */

	call	\cfunc

	jmp	paranoid_exit

	/* Switch to the regular task stack and use the noist entry point */
.Lfrom_usermode_switch_stack_\@:
	idtentry_body noist_\cfunc, has_error_code=0

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

#ifdef CONFIG_AMD_MEM_ENCRYPT
/**
 * idtentry_vc - Macro to generate entry stub for #VC
 * @vector:	Vector number
 * @asmsym:	ASM symbol for the entry point
 * @cfunc:	C function to be called
 *
 * The macro emits code to set up the kernel context for #VC. The #VC handler
 * runs on an IST stack and needs to be able to cause nested #VC exceptions.
 *
 * To make this work the #VC entry code tries its best to pretend it doesn't
 * use an IST stack by switching to the task stack if coming from user-space
 * (which includes the early SYSCALL entry path) or back to the stack in the
 * IRET frame if entered from kernel-mode.
 *
 * If entered from kernel-mode the return stack is validated first, and if it
 * is not safe to use (e.g. because it points to the entry stack) the #VC
 * handler will switch to a fall-back stack (VC2) and call a special handler
 * function.
 *
 * The macro is only used for one vector, but it is planned to be extended in
 * the future for the #HV exception.
 */
.macro idtentry_vc vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_REGS
	ASM_CLAC

	/*
	 * If the entry is from userspace, switch stacks and treat it as
	 * a normal entry.
	 */
	testb	$3, CS-ORIG_RAX(%rsp)
	jnz	.Lfrom_usermode_switch_stack_\@

	/*
	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
	 */
	call	paranoid_entry

	UNWIND_HINT_REGS

	/*
	 * Switch off the IST stack to make it free for nested exceptions. The
	 * vc_switch_off_ist() function will switch back to the interrupted
	 * stack if it is safe to do so. If not it switches to the VC fall-back
	 * stack.
	 */
	movq	%rsp, %rdi		/* pt_regs pointer */
	call	vc_switch_off_ist
	movq	%rax, %rsp		/* Switch to new stack */

	UNWIND_HINT_REGS

	/* Update pt_regs */
	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument */
	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */

	movq	%rsp, %rdi		/* pt_regs pointer */

	call	kernel_\cfunc

	/*
	 * No need to switch back to the IST stack. The current stack is either
	 * identical to the stack in the IRET frame or the VC fall-back stack,
	 * so it is definitely mapped even with PTI enabled.
	 */
	jmp	paranoid_exit

	/* Switch to the regular task stack */
.Lfrom_usermode_switch_stack_\@:
	idtentry_body user_\cfunc, has_error_code=1

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm
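
/*
 * E.g., assuming the DECLARE_IDTENTRY_VC() wiring in asm/idtentry.h:
 *
 *	idtentry_vc X86_TRAP_VC asm_exc_vmm_communication exc_vmm_communication
 *
 * which expects kernel_exc_vmm_communication and user_exc_vmm_communication
 * as the C handlers.
 */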
#endif

/*
 * Double fault entry. Straight paranoid. No checks from which context
 * this comes because for the espfix induced #DF this would do the wrong
 * thing.
 */
.macro idtentry_df vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_REGS offset=8
	ASM_CLAC

	/* paranoid_entry returns GS information for paranoid_exit in EBX. */
	call	paranoid_entry
	UNWIND_HINT_REGS

	movq	%rsp, %rdi		/* pt_regs pointer into first argument */
	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument */
	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
	call	\cfunc

	jmp	paranoid_exit

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

/*
 * Include the defines which emit the idt entries which are shared
 * between 32 and 64 bit and emit the __irqentry_text_* markers
 * so the stacktrace boundary checks work.
 */
	.align 16
	.globl __irqentry_text_start
__irqentry_text_start:

#include <asm/idtentry.h>

	.align 16
	.globl __irqentry_text_end
__irqentry_text_end:

SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
	/* Assert that pt_regs indicates user mode. */
	testb	$3, CS(%rsp)
	jnz	1f
	ud2
1:
#endif
#ifdef CONFIG_XEN_PV
	ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif

	POP_REGS pop_rdi=0

	/*
	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_EMPTY

	/* Copy the IRET frame to the trampoline stack. */
	pushq	6*8(%rdi)	/* SS */
	pushq	5*8(%rdi)	/* RSP */
	pushq	4*8(%rdi)	/* EFLAGS */
	pushq	3*8(%rdi)	/* CS */
	pushq	2*8(%rdi)	/* RIP */

	/* Push user RDI on the trampoline stack. */
	pushq	(%rdi)

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	/* Restore RDI. */
	popq	%rdi
	SWAPGS
	INTERRUPT_RETURN


SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
	/* Assert that pt_regs indicates kernel mode. */
	testb	$3, CS(%rsp)
	jz	1f
	ud2
1:
#endif
	POP_REGS
	addq	$8, %rsp	/* skip regs->orig_ax */
	/*
	 * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
	 * when returning from IPI handler.
	 */
	INTERRUPT_RETURN

SYM_INNER_LABEL_ALIGN(native_iret, SYM_L_GLOBAL)
	UNWIND_HINT_IRET_REGS
	/*
	 * Are we returning to a stack segment from the LDT?  Note: in
	 * 64-bit mode SS:RSP on the exception stack is always valid.
	 */
#ifdef CONFIG_X86_ESPFIX64
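	/* Bit 2 (TI) in the saved SS selector means it refers to the LDT. */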
	testb	$4, (SS-RIP)(%rsp)
	jnz	native_irq_return_ldt
#endif

SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
	/*
	 * This may fault.  Non-paranoid faults on return to userspace are
	 * handled by fixup_bad_iret.  These include #SS, #GP, and #NP.
	 * Double-faults due to espfix64 are handled in exc_double_fault.
	 * Other faults here are fatal.
	 */
	iretq

#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
	/*
	 * We are running with user GSBASE.  All GPRs contain their user
	 * values.  We have a percpu ESPFIX stack that is eight slots
	 * long (see ESPFIX_STACK_SIZE).  espfix_waddr points to the bottom
	 * of the ESPFIX stack.
	 *
	 * We clobber RAX and RDI in this code.  We stash RDI on the
	 * normal stack and RAX on the ESPFIX stack.
	 *
	 * The ESPFIX stack layout we set up looks like this:
	 *
	 * --- top of ESPFIX stack ---
	 * SS
	 * RSP
	 * RFLAGS
	 * CS
	 * RIP  <-- RSP points here when we're done
	 * RAX  <-- espfix_waddr points here
	 * --- bottom of ESPFIX stack ---
	 */

	pushq	%rdi				/* Stash user RDI */
	swapgs					/* to kernel GS */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */

	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* user RAX */
	movq	(1*8)(%rsp), %rax		/* user RIP */
	movq	%rax, (1*8)(%rdi)
	movq	(2*8)(%rsp), %rax		/* user CS */
	movq	%rax, (2*8)(%rdi)
	movq	(3*8)(%rsp), %rax		/* user RFLAGS */
	movq	%rax, (3*8)(%rdi)
	movq	(5*8)(%rsp), %rax		/* user SS */
	movq	%rax, (5*8)(%rdi)
	movq	(4*8)(%rsp), %rax		/* user RSP */
	movq	%rax, (4*8)(%rdi)
	/* Now RAX == RSP. */

	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */

	/*
	 * espfix_stack[31:16] == 0.  The page tables are set up such that
	 * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
	 * espfix_waddr for any X.  That is, there are 65536 RO aliases of
	 * the same page.  Set up RSP so that RSP[31:16] contains the
	 * respective 16 bits of the /userspace/ RSP and RSP nonetheless
	 * still points to an RO alias of the ESPFIX stack.
	 */
	orq	PER_CPU_VAR(espfix_stack), %rax
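
	/*
	 * Worked example with a hypothetical user RSP of 0x00007ffd12345678:
	 * the andl left RAX = 0x12340000 and the orq merged in espfix_stack,
	 * whose bits 31:16 are zero, so RAX keeps the user's RSP[31:16]
	 * while pointing into the RO alias region.
	 */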

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
	swapgs					/* to user GS */
	popq	%rdi				/* Restore user RDI */

	movq	%rax, %rsp
	UNWIND_HINT_IRET_REGS offset=8

	/*
	 * At this point, we cannot write to the stack any more, but we can
	 * still read.
	 */
	popq	%rax				/* Restore user RAX */

	/*
	 * RSP now points to an ordinary IRET frame, except that the page
	 * is read-only and RSP[31:16] are preloaded with the userspace
	 * values.  We can now IRET back to userspace.
	 */
	jmp	native_irq_return_iret
#endif
SYM_CODE_END(common_interrupt_return)
_ASM_NOKPROBE(common_interrupt_return)

/*
 * Reload gs selector with exception handling
 * edi:  new selector
 *
 * Is in entry.text as it shouldn't be instrumented.
 */
SYM_FUNC_START(asm_load_gs_index)
	FRAME_BEGIN
	swapgs
.Lgs_change:
	movl	%edi, %gs
2:	ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
	swapgs
	FRAME_END
	ret
SYM_FUNC_END(asm_load_gs_index)
EXPORT_SYMBOL(asm_load_gs_index)

	_ASM_EXTABLE(.Lgs_change, .Lbad_gs)
	.section .fixup, "ax"

	/* running with kernelgs */
SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
	swapgs					/* switch back to user gs */
.macro ZAP_GS
	/* This can't be a string because the preprocessor needs to see it. */
	movl	$__USER_DS, %eax
	movl	%eax, %gs
.endm
	ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
	xorl	%eax, %eax
	movl	%eax, %gs
	jmp	2b
SYM_CODE_END(.Lbad_gs)
	.previous

/*
 * rdi: New stack pointer points to the top word of the stack
 * rsi: Function pointer
 * rdx: Function argument (can be NULL if none)
 */
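/*
 * A sketch of the matching C declaration, assumed from the register
 * roles above (cf. asm/irq_stack.h):
 *
 *	void asm_call_on_stack(void *sp, void (*func)(void), void *arg);
 */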
SYM_FUNC_START(asm_call_on_stack)
SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL)
SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL)
	/*
	 * Save the frame pointer unconditionally. This allows the ORC
	 * unwinder to handle the stack switch.
	 */
	pushq	%rbp
	mov	%rsp, %rbp

	/*
	 * The unwinder relies on the word at the top of the new stack
	 * page linking back to the previous RSP.
	 */
	mov	%rsp, (%rdi)
	mov	%rdi, %rsp
	/* Move the argument to the right place */
	mov	%rdx, %rdi

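	/*
	 * The .discard.instr_begin/.discard.instr_end annotations below
	 * mark the indirect call, for objtool's noinstr validation, as a
	 * place where instrumentable code may legitimately be invoked.
	 */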
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) 1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) .pushsection .discard.instr_begin
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) .long 1b - .
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) .popsection
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) CALL_NOSPEC rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) 2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) .pushsection .discard.instr_end
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) .long 2b - .
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) .popsection
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) /* Restore the previous stack pointer from RBP. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) leaveq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) SYM_FUNC_END(asm_call_on_stack)
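/*
 * Sketch of the expected C-side view of the helper above; the real
 * declarations live in the C headers and may differ in detail
 * (demo() and irq_stack_top are hypothetical):
 *
 *	void asm_call_on_stack(void *stack, void (*func)(void), void *arg);
 *
 *	static void demo(struct pt_regs *regs)
 *	{
 *		// Runs handler(regs) with RSP moved to irq_stack_top;
 *		// the top word of that stack gets the old RSP so the
 *		// ORC unwinder can cross the switch.
 *		asm_call_on_stack(irq_stack_top,
 *				  (void (*)(void))handler, regs);
 *	}
 */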
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) #ifdef CONFIG_XEN_PV
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) * A note on the "critical region" in our callback handler.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) * We want to avoid stacking callback handlers due to events occurring
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) * during handling of the last event. To do this, we keep events disabled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) * until we've done all processing. HOWEVER, we must enable events before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) * popping the stack frame (can't be done atomically) and so it would still
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) * be possible to get enough handler activations to overflow the stack.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) * Although unlikely, bugs of that kind are hard to track down, so we'd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) * like to avoid the possibility.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) * So, on entry to the handler we detect whether we interrupted an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) * existing activation in its critical region -- if so, we pop the current
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) * activation and restart the handler using the previous one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *regs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) * Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) * see the correct pointer to the pt_regs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) UNWIND_HINT_FUNC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) movq %rdi, %rsp /* we don't return, adjust the stack frame */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) UNWIND_HINT_REGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) call xen_pv_evtchn_do_upcall
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) jmp error_return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) SYM_CODE_END(exc_xen_hypervisor_callback)
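/*
 * Pseudo-C sketch of the "critical region" rule described above
 * (names are illustrative, not the Xen interface):
 *
 *	void upcall_sketch(struct pt_regs *regs)
 *	{
 *		vcpu->upcall_mask = 1;		// events stay disabled
 *		do {
 *			handle_pending_events(regs);
 *		} while (events_pending(vcpu));	// drain before unmasking
 *		vcpu->upcall_mask = 0;		// no stacked activations
 *	}
 */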
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) * Hypervisor uses this for application faults while it executes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) * We get here for two reasons:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) * 1. Fault while reloading DS, ES, FS or GS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) * 2. Fault while executing IRET
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) * Category 1 we do not need to fix up as Xen has already reloaded all segment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) * registers that could be reloaded and zeroed the others.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) * Category 2 we fix up by killing the current process. We cannot use the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) * normal Linux return path in this case because if we use the IRET hypercall
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) * We distinguish between categories by comparing each saved segment register
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) * with its current contents: any discrepancy means we are in category 1.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) SYM_CODE_START(xen_failsafe_callback)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) UNWIND_HINT_EMPTY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) movl %ds, %ecx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) cmpw %cx, 0x10(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) jne 1f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) movl %es, %ecx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) cmpw %cx, 0x18(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) jne 1f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) movl %fs, %ecx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) cmpw %cx, 0x20(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) jne 1f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) movl %gs, %ecx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) cmpw %cx, 0x28(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) jne 1f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) /* All segments match their saved values => Category 2 (Bad IRET). */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) movq (%rsp), %rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) movq 8(%rsp), %r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) addq $0x30, %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) pushq $0 /* RIP */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) UNWIND_HINT_IRET_REGS offset=8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) jmp asm_exc_general_protection
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) movq (%rsp), %rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) movq 8(%rsp), %r11
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) addq $0x30, %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) UNWIND_HINT_IRET_REGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) pushq $-1 /* orig_ax = -1 => not a system call */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) PUSH_AND_CLEAR_REGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) ENCODE_FRAME_POINTER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) jmp error_return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) SYM_CODE_END(xen_failsafe_callback)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) #endif /* CONFIG_XEN_PV */
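/*
 * The category check in xen_failsafe_callback, restated as a C sketch
 * (saved_* stand for the segment slots Xen pushed at 0x10..0x28(%rsp);
 * helper names are illustrative):
 *
 *	if (saved_ds != ds() || saved_es != es() ||
 *	    saved_fs != fs() || saved_gs != gs())
 *		resume_normal_return();		// category 1: Xen zeroed a segment
 *	else
 *		do_general_protection();	// category 2: IRET itself faulted
 */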
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) * Save all registers in pt_regs. Return GSBASE-related information
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) * in EBX depending on the availability of the FSGSBASE instructions:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) * FSGSBASE R/EBX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) * N 0 -> SWAPGS on exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) * 1 -> no SWAPGS on exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) * Y GSBASE value at entry, must be restored in paranoid_exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) SYM_CODE_START_LOCAL(paranoid_entry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) UNWIND_HINT_FUNC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) cld
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) PUSH_AND_CLEAR_REGS save_ret=1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) ENCODE_FRAME_POINTER 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) * Always stash CR3 in %r14. This value will be restored,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) * verbatim, at exit. Needed if paranoid_entry interrupted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) * another entry that already switched to the user CR3 value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) * but has not yet returned to userspace.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) * This is also why CS (stashed in the "iret frame" by the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) * hardware at entry) can not be used: this may be a return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) * to kernel code, but with a user CR3 value.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) * Switching CR3 does not depend on kernel GSBASE so it can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) * be done before switching to the kernel GSBASE. This is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) * required for FSGSBASE because the kernel GSBASE has to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) * be retrieved from a kernel internal table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) * Handling GSBASE depends on the availability of FSGSBASE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) * Without FSGSBASE the kernel enforces that negative GSBASE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) * values indicate kernel GSBASE. With FSGSBASE no assumptions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) * can be made about the GSBASE value when entering from user
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) * space.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) * Read the current GSBASE and store it in %rbx unconditionally,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) * then retrieve and set the current CPU's kernel GSBASE. The stored
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) * value has to be restored in paranoid_exit unconditionally.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) * The unconditional write to GS base below ensures that no subsequent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) * loads based on a mispredicted GS base can happen, therefore no LFENCE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) * is needed here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) .Lparanoid_entry_checkgs:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) /* EBX = 1 -> kernel GSBASE active, no restore required */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) movl $1, %ebx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) * The kernel-enforced convention is that a negative GSBASE indicates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) * a kernel value. No SWAPGS is needed on entry or exit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) movl $MSR_GS_BASE, %ecx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) rdmsr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) testl %edx, %edx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) js .Lparanoid_kernel_gsbase
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) /* EBX = 0 -> SWAPGS required on exit */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) xorl %ebx, %ebx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) swapgs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) .Lparanoid_kernel_gsbase:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) FENCE_SWAPGS_KERNEL_ENTRY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) SYM_CODE_END(paranoid_entry)
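/*
 * The entry-side GSBASE contract above, restated as a C sketch
 * (illustrative helpers; rdgsbase/wrgsbase mirror the instructions):
 *
 *	if (cpu_has_fsgsbase) {
 *		rbx = rdgsbase();		// restored verbatim on exit
 *		wrgsbase(this_cpu_kernel_gsbase());
 *	} else if ((long)rdmsr(MSR_GS_BASE) < 0) {
 *		ebx = 1;			// kernel GSBASE already live
 *	} else {
 *		ebx = 0;			// paranoid_exit must SWAPGS
 *		swapgs();
 *	}
 */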
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) * "Paranoid" exit path from exception stack. This is invoked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) * only on return from non-NMI IST interrupts that came
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) * from kernel space.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) * We may be returning to very strange contexts (e.g. very early
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) * in syscall entry), so checking for preemption here would
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) * be complicated. Fortunately, there's no good reason to try
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) * to handle preemption here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) * R/EBX contains the GSBASE-related information depending on the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) * availability of the FSGSBASE instructions:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) * FSGSBASE R/EBX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) * N 0 -> SWAPGS on exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) * 1 -> no SWAPGS on exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) * Y User space GSBASE, must be restored unconditionally
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) SYM_CODE_START_LOCAL(paranoid_exit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) UNWIND_HINT_REGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) * The order of operations is important. RESTORE_CR3 requires
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) * kernel GSBASE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) * NB to anyone tempted to optimize this code: this code does
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) * not execute at all for exceptions from user mode. Those
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) * exceptions go through error_exit instead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) RESTORE_CR3 scratch_reg=%rax save_reg=%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) /* Handle the three GSBASE cases */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) /* With FSGSBASE enabled, unconditionally restore GSBASE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) wrgsbase %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) jmp restore_regs_and_return_to_kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) .Lparanoid_exit_checkgs:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) /* On non-FSGSBASE systems, conditionally do SWAPGS */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) testl %ebx, %ebx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) jnz restore_regs_and_return_to_kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) /* We are returning to a context with user GSBASE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) swapgs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) jmp restore_regs_and_return_to_kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) SYM_CODE_END(paranoid_exit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) * Save all registers in pt_regs, and switch GS if needed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) SYM_CODE_START_LOCAL(error_entry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) UNWIND_HINT_FUNC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) cld
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) PUSH_AND_CLEAR_REGS save_ret=1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) ENCODE_FRAME_POINTER 8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) testb $3, CS+8(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) jz .Lerror_kernelspace
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) * We entered from user mode or we're pretending to have entered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) * from user mode due to an IRET fault.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) SWAPGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) FENCE_SWAPGS_USER_ENTRY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) /* We have user CR3. Change to kernel CR3. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) .Lerror_entry_from_usermode_after_swapgs:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) /* Put us onto the real thread stack. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) popq %r12 /* save return addr in %r12 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) movq %rsp, %rdi /* arg0 = pt_regs pointer */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) call sync_regs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) movq %rax, %rsp /* switch stack */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) ENCODE_FRAME_POINTER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) pushq %r12
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) * There are two places in the kernel that can potentially fault with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) * usergs. Handle them here. B stepping K8s sometimes report a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) * truncated RIP for IRET exceptions returning to compat mode. Check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) * for these here too.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) .Lerror_kernelspace:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) leaq native_irq_return_iret(%rip), %rcx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) cmpq %rcx, RIP+8(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) je .Lerror_bad_iret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) movl %ecx, %eax /* zero extend */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) cmpq %rax, RIP+8(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) je .Lbstep_iret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) cmpq $.Lgs_change, RIP+8(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) jne .Lerror_entry_done_lfence
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) * hack: .Lgs_change can fail with user gsbase. If this happens, fix up
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) * gsbase and proceed. We'll fix up the exception and land in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) * .Lgs_change's error handler with kernel gsbase.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) SWAPGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) * kernel or user gsbase.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) .Lerror_entry_done_lfence:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) FENCE_SWAPGS_KERNEL_ENTRY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) ret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) .Lbstep_iret:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) /* Fix truncated RIP */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) movq %rcx, RIP+8(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) /* fall through */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) .Lerror_bad_iret:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) * We came from an IRET to user mode, so we have user
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) * gsbase and CR3. Switch to kernel gsbase and CR3:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) SWAPGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) FENCE_SWAPGS_USER_ENTRY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) * Pretend that the exception came from user mode: set up pt_regs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) * as if we faulted immediately after IRET.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) mov %rsp, %rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) call fixup_bad_iret
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) mov %rax, %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) jmp .Lerror_entry_from_usermode_after_swapgs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) SYM_CODE_END(error_entry)
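/*
 * The two stack switches in error_entry correspond to C helpers of
 * roughly this shape (sketch; see the traps code for the real ones):
 *
 *	// Copy pt_regs from the entry stack to the thread stack and
 *	// return the new location; the asm then moves RSP there.
 *	struct pt_regs *sync_regs(struct pt_regs *eregs);
 *
 *	// Build a user-mode-looking pt_regs for a faulting IRET so the
 *	// rest of the kernel sees an ordinary user-space exception.
 *	struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs);
 */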
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) SYM_CODE_START_LOCAL(error_return)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) UNWIND_HINT_REGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) DEBUG_ENTRY_ASSERT_IRQS_OFF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) testb $3, CS(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) jz restore_regs_and_return_to_kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) jmp swapgs_restore_regs_and_return_to_usermode
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) SYM_CODE_END(error_return)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) * Runs on exception stack. Xen PV does not go through this path at all,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) * so we can use real assembly here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) * Registers:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) * %r14: Used to save/restore the CR3 of the interrupted context
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) SYM_CODE_START(asm_exc_nmi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) UNWIND_HINT_IRET_REGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) * We allow breakpoints in NMIs. If a breakpoint occurs, then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) * the iretq it performs will take us out of NMI context.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) * This means that we can have nested NMIs where the next
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) * NMI is using the top of the stack of the previous NMI. We
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) * can't let it execute because the nested NMI will corrupt the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) * stack of the previous NMI. NMI handlers are not re-entrant
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) * anyway.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) * To handle this case we do the following:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) * Check a special location on the stack that contains
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) * a variable that is set when NMIs are executing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) * The interrupted task's stack is also checked to see if it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) * is an NMI stack.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) * If the variable is not set and the stack is not the NMI
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) * stack then:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) * o Set the special variable on the stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) * o Copy the interrupt frame into an "outermost" location on the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) * stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) * o Copy the interrupt frame into an "iret" location on the stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) * o Continue processing the NMI
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) * If the variable is set or the previous stack is the NMI stack:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) * o Modify the "iret" location to jump to repeat_nmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) * o Return to the first NMI
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) * Now on exit of the first NMI, we first clear the stack variable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) * The NMI stack will tell any nested NMIs at that point that it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) * nested. Then we pop the stack normally with iret, and if there was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) * a nested NMI that updated the copy of the interrupt stack frame, a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) * jump will be made to the repeat_nmi code that will handle the second
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) * NMI.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) * However, espfix prevents us from directly returning to userspace
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) * with a single IRET instruction. Similarly, IRET to user mode
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) * can fault. We therefore handle NMIs from user space like
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) * other IST entries.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) */
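/*
 * The strategy above, condensed into a C-like sketch (illustrative
 * only; the labels refer to the asm below):
 *
 *	if (!nmi_executing && !on_nmi_stack(prev_rsp)) {	// first_nmi
 *		nmi_executing = 1;
 *		copy_hw_frame_to_outermost();
 *		copy_outermost_to_iret();
 *		exc_nmi(regs);			// then iretq via "iret" frame
 *	} else {				// nested_nmi
 *		point_iret_frame_at(repeat_nmi);
 *		iretq();			// resume the outer NMI
 *	}
 */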
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) ASM_CLAC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) /* Use %rdx as our temp variable throughout */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) pushq %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) testb $3, CS-RIP+8(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) jz .Lnmi_from_kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) * NMI from user mode. We need to run on the thread stack, but we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) * can't go through the normal entry paths: NMIs are masked, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) * we don't want to enable interrupts, because then we'll end
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) * up in an awkward situation in which IRQs are on but NMIs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) * are off.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) * We also must not push anything to the stack before switching
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) * stacks lest we corrupt the "NMI executing" variable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) swapgs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) cld
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) FENCE_SWAPGS_USER_ENTRY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) movq %rsp, %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) UNWIND_HINT_IRET_REGS base=%rdx offset=8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) pushq 5*8(%rdx) /* pt_regs->ss */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) pushq 4*8(%rdx) /* pt_regs->rsp */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) pushq 3*8(%rdx) /* pt_regs->flags */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) pushq 2*8(%rdx) /* pt_regs->cs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) pushq 1*8(%rdx) /* pt_regs->rip */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) UNWIND_HINT_IRET_REGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) pushq $-1 /* pt_regs->orig_ax */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) PUSH_AND_CLEAR_REGS rdx=(%rdx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) ENCODE_FRAME_POINTER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) * At this point we no longer need to worry about stack damage
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) * due to nesting -- we're on the normal thread stack and we're
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) * done with the NMI stack.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) movq %rsp, %rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) movq $-1, %rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) call exc_nmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) * Return back to user mode. We must *not* do the normal exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) * work, because we don't want to enable interrupts.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) jmp swapgs_restore_regs_and_return_to_usermode
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) .Lnmi_from_kernel:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) * Here's what our stack frame will look like:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) * +---------------------------------------------------------+
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) * | original SS |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) * | original Return RSP |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) * | original RFLAGS |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) * | original CS |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) * | original RIP |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) * +---------------------------------------------------------+
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) * | temp storage for rdx |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) * +---------------------------------------------------------+
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) * | "NMI executing" variable |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) * +---------------------------------------------------------+
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) * | iret SS } Copied from "outermost" frame |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) * | iret Return RSP } on each loop iteration; overwritten |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) * | iret RFLAGS } by a nested NMI to force another |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) * | iret CS } iteration if needed. |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) * | iret RIP } |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) * +---------------------------------------------------------+
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) * | outermost SS } initialized in first_nmi; |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) * | outermost Return RSP } will not be changed before |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) * | outermost RFLAGS } NMI processing is done. |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) * | outermost CS } Copied to "iret" frame on each |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) * | outermost RIP } iteration. |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) * +---------------------------------------------------------+
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) * | pt_regs |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) * +---------------------------------------------------------+
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) * The "original" frame is used by hardware. Before re-enabling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) * NMIs, we need to be done with it, and we need to leave enough
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) * space for the asm code here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) * We return by executing IRET while RSP points to the "iret" frame.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) * That will either return for real or it will loop back into NMI
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) * processing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) * The "outermost" frame is copied to the "iret" frame on each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) * iteration of the loop, so each iteration starts with the "iret"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) * frame pointing to the final return target.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) * Determine whether we're a nested NMI.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) * If we interrupted kernel code between repeat_nmi and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) * end_repeat_nmi, then we are a nested NMI. We must not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) * modify the "iret" frame because it's being written by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) * the outer NMI. That's okay; the outer NMI handler is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) * about to call exc_nmi() anyway, so we can just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) * resume the outer NMI.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) movq $repeat_nmi, %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) cmpq 8(%rsp), %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) ja 1f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) movq $end_repeat_nmi, %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) cmpq 8(%rsp), %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) ja nested_nmi_out
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) 1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) * Now check "NMI executing". If it's set, then we're nested.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) * This will not detect if we interrupted an outer NMI just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) * before IRET.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) cmpl $1, -8(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) je nested_nmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) * Now test if the previous stack was an NMI stack. This covers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) * the case where we interrupt an outer NMI after it clears
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) * "NMI executing" but before IRET. We need to be careful, though:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) * there is one case in which RSP could point to the NMI stack
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) * despite there being no NMI active: naughty userspace controls
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) * RSP at the very beginning of the SYSCALL targets. We can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) * pull a fast one on naughty userspace, though: we program
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) * SYSCALL to mask DF, so userspace cannot cause DF to be set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) * if it controls the kernel's RSP. We set DF before we clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) * "NMI executing".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) */
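/*
 * Why DF is trustworthy here, sketched: syscall_init() programs the
 * SYSCALL flag mask so DF is always cleared on SYSCALL entry, roughly
 * (illustrative subset of the real mask):
 *
 *	wrmsrl(MSR_SYSCALL_MASK, X86_EFLAGS_DF | X86_EFLAGS_IF | ...);
 *
 * so DF=1 while RSP lies inside the NMI stack can only be the outer
 * NMI's exit window, never user-controlled SYSCALL entry.
 */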
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) lea 6*8(%rsp), %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) cmpq %rdx, 4*8(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) /* If the stack pointer is above the NMI stack, this is a normal NMI */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) ja first_nmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) subq $EXCEPTION_STKSZ, %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) cmpq %rdx, 4*8(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) /* If it is below the NMI stack, it is a normal NMI */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) jb first_nmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) /* Ah, it is within the NMI stack. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp) /* DF = bit 10: test byte 1 of the saved RFLAGS */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) jz first_nmi /* RSP was user controlled. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) /* This is a nested NMI. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) nested_nmi:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) * Modify the "iret" frame to point to repeat_nmi, forcing another
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) * iteration of NMI handling.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) subq $8, %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) leaq -10*8(%rsp), %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) pushq $__KERNEL_DS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) pushq %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) pushfq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) pushq $__KERNEL_CS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) pushq $repeat_nmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) /* Put stack back */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) addq $(6*8), %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) nested_nmi_out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) popq %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) /* We are returning to kernel mode, so this cannot result in a fault. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) iretq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) first_nmi:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) /* Restore rdx. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) movq (%rsp), %rdx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) /* Make room for "NMI executing". */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) pushq $0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) /* Leave room for the "iret" frame */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) subq $(5*8), %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) /* Copy the "original" frame to the "outermost" frame */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) .rept 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) pushq 11*8(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) .endr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) UNWIND_HINT_IRET_REGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) /* Everything up to here is safe from nested NMIs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) #ifdef CONFIG_DEBUG_ENTRY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) * For ease of testing, unmask NMIs right away. Disabled by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) * default because IRET is very expensive.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) pushq $0 /* SS */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) pushq %rsp /* RSP (minus 8 because of the previous push) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) addq $8, (%rsp) /* Fix up RSP */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) pushfq /* RFLAGS */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) pushq $__KERNEL_CS /* CS */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) pushq $1f /* RIP */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) iretq /* continues at repeat_nmi below */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) UNWIND_HINT_IRET_REGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) 1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) repeat_nmi:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) * If there was a nested NMI, the first NMI's iret will return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) * here. But NMIs are still enabled and we can take another
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) * nested NMI. The nested NMI checks the interrupted RIP to see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) * if it is between repeat_nmi and end_repeat_nmi, and if so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) * it will just return, as we are about to repeat an NMI anyway.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) * This makes it safe to copy to the stack frame that a nested
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) * NMI will update.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) * we're repeating an NMI, gsbase has the same value that it had on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) * the first iteration. paranoid_entry will load the kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) * gsbase if needed before we call exc_nmi(). "NMI executing"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) * is zero.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) movq $1, 10*8(%rsp) /* Set "NMI executing". */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) * Copy the "outermost" frame to the "iret" frame. NMIs that nest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) * here must not modify the "iret" frame while we're writing to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) * it or it will end up containing garbage.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) addq $(10*8), %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) .rept 5
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) pushq -6*8(%rsp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) .endr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) subq $(5*8), %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) end_repeat_nmi:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) * Everything below this point can be preempted by a nested NMI.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) * If this happens, then the inner NMI will change the "iret"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) * frame to point back to repeat_nmi.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) pushq $-1 /* ORIG_RAX: no syscall to restart */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) * as we should not be calling schedule in NMI context, even with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) * normal interrupts enabled. An NMI should not be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) * setting NEED_RESCHED or anything that normal interrupts and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) * exceptions might do.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) call paranoid_entry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) UNWIND_HINT_REGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) movq %rsp, %rdi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) movq $-1, %rsi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) call exc_nmi
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) /* Always restore stashed CR3 value (see paranoid_entry) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) * The above invocation of paranoid_entry stored the GSBASE-related
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) * information in R/EBX depending on the availability
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) * of FSGSBASE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) * If FSGSBASE is enabled, restore the saved GSBASE value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) * unconditionally, otherwise take the conditional SWAPGS path.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) wrgsbase %rbx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) jmp nmi_restore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) nmi_no_fsgsbase:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) /* EBX == 0 -> invoke SWAPGS */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) testl %ebx, %ebx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) jnz nmi_restore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) nmi_swapgs:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) swapgs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) nmi_restore:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) POP_REGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) * frame.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) addq $6*8, %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) * Clear "NMI executing". Set DF first so that we can easily
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) * distinguish the remaining code between here and IRET from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) * the SYSCALL entry and exit paths.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) * We arguably should just inspect RIP instead, but I (Andy) wrote
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) * this code when I had the misapprehension that Xen PV supported
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) * NMIs, and Xen PV would break that approach.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) std
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) movq $0, 5*8(%rsp) /* clear "NMI executing" */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) * iretq reads the "iret" frame and exits the NMI stack in a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) * single instruction. We are returning to kernel mode, so this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) * cannot result in a fault. Similarly, we don't need to worry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) * about espfix64 on the way back to kernel mode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) iretq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) SYM_CODE_END(asm_exc_nmi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) #ifndef CONFIG_IA32_EMULATION
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) * This handles SYSCALL from 32-bit code. There is no way to program
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) * MSRs to fully disable 32-bit SYSCALL.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) SYM_CODE_START(ignore_sysret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) UNWIND_HINT_EMPTY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) mov $-ENOSYS, %eax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) sysretl
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) SYM_CODE_END(ignore_sysret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) #endif
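/*
 * Context sketch: without IA32 emulation, boot code aims the CSTAR
 * MSR (the 32-bit SYSCALL entry point) at ignore_sysret, roughly:
 *
 *	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
 *
 * so a 32-bit SYSCALL just gets -ENOSYS via the sysretl above. See
 * syscall_init() for the real setup.
 */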
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) .pushsection .text, "ax"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) SYM_CODE_START(rewind_stack_do_exit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) UNWIND_HINT_FUNC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) /* Prevent any naive code from trying to unwind to our caller. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) xorl %ebp, %ebp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) leaq -PTREGS_SIZE(%rax), %rsp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) UNWIND_HINT_REGS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) call do_exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) SYM_CODE_END(rewind_stack_do_exit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) .popsection