// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors. The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/entry-common.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/export.h>
#include <linux/nospec.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
#include <xen/events.h>
#endif

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
#include <asm/irq_stack.h>

#ifdef CONFIG_X86_64
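/*
 * 64-bit syscall entry, reached from the entry_SYSCALL_64 assembly stub
 * with the syscall number in @nr and the user register state in @regs.
 * syscall_enter_from_user_mode() establishes kernel context (RCU,
 * lockdep, tracing) and runs the syscall entry work (ptrace, seccomp,
 * tracepoints), which may rewrite the syscall number.
 */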
__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();
	if (likely(nr < NR_syscalls)) {
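		/*
		 * Clamp the index under speculation so a mispredicted
		 * bounds check above cannot be used to read beyond
		 * sys_call_table (Spectre v1).
		 */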
		nr = array_index_nospec(nr, NR_syscalls);
		regs->ax = sys_call_table[nr](regs);
#ifdef CONFIG_X86_X32_ABI
	} else if (likely((nr & __X32_SYSCALL_BIT) &&
			  (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
		nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
					X32_NR_syscalls);
		regs->ax = x32_sys_call_table[nr](regs);
#endif
	}
	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
{
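	/*
	 * Flag the task as being in a compat syscall so that
	 * in_compat_syscall() and the ptrace/seccomp machinery report
	 * the 32-bit ABI for the duration of this syscall.
	 */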
	if (IS_ENABLED(CONFIG_IA32_EMULATION))
		current_thread_info()->status |= TS_COMPAT;

	return (unsigned int)regs->orig_ax;
}

/*
 * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
 */
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs,
						  unsigned int nr)
{
	if (likely(nr < IA32_NR_syscalls)) {
		nr = array_index_nospec(nr, IA32_NR_syscalls);
		regs->ax = ia32_sys_call_table[nr](regs);
	}
}

/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	unsigned int nr = syscall_32_enter(regs);

	/*
	 * Subtlety here: if ptrace pokes something larger than 2^32-1 into
	 * orig_ax, the unsigned int return value truncates it. This may
	 * or may not be necessary, but it matches the old asm behavior.
	 */
	nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
	instrumentation_begin();

	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}

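/*
 * Common body of the SYSENTER/SYSCALL32 fast path. Returns true if the
 * syscall was invoked normally, or false if fetching the sixth argument
 * from the user stack faulted, in which case the exit-to-user work has
 * already been done and the caller must return via IRET.
 */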
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
	unsigned int nr = syscall_32_enter(regs);
	int res;

	/*
	 * This cannot use syscall_enter_from_user_mode() as it has to
	 * fetch EBP before invoking any of the syscall entry work
	 * functions.
	 */
	syscall_enter_from_user_mode_prepare(regs);

	instrumentation_begin();
	/* Fetch EBP from where the vDSO stashed it. */
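	/*
	 * The 32-bit fast-syscall ABI passes the sixth argument in EBP,
	 * but the vDSO's __kernel_vsyscall needs EBP to carry the user
	 * stack pointer across SYSENTER, so it pushes the real EBP and
	 * the kernel reloads it from the user stack here.
	 */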
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
				 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
			       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;

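		/*
		 * irqentry_exit_to_user_mode() must be called with
		 * interrupts disabled; they were enabled by
		 * syscall_enter_from_user_mode_prepare() above, so turn
		 * them back off before bailing out.
		 */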
		local_irq_disable();
		instrumentation_end();
		irqentry_exit_to_user_mode(regs);
		return false;
	}

	/* This truncates any ptrace-induced syscall nr > 2^32 - 1, like int80. */
	nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
	return true;
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention. Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
				    vdso_image_32.sym_int80_landing_pad;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
	if (!__do_fast_syscall_32(regs))
		return 0;

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 */
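	/*
	 * As with 64-bit opportunistic SYSRET, anything the fast exit
	 * cannot faithfully restore forces the IRET path: non-standard
	 * CS/SS, an IP that something (ptrace, a signal) moved off the
	 * landing pad, RF (which SYSRET can't restore), or TF (which
	 * would trap immediately after the SYSRET).
	 */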
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
	       regs->ip == landing_pad &&
	       (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
	       regs->cs == __USER_CS && regs->ss == __USER_DS &&
	       regs->ip == landing_pad &&
	       (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
{
	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
	regs->sp = regs->bp;

	/* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
	regs->flags |= X86_EFLAGS_IF;

	return do_fast_syscall_32(regs);
}
#endif

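/*
 * Catch-all for holes in the syscall tables: unimplemented or removed
 * syscall numbers are wired up to this stub and fail with -ENOSYS.
 */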
SYSCALL_DEFINE0(ni_syscall)
{
	return -ENOSYS;
}

#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
 * Some hypercalls issued by the toolstack can take many tens of
 * seconds. Allow tasks running hypercalls via the privcmd driver to
 * be voluntarily preempted even if full kernel preemption is
 * disabled.
 *
 * Such preemptible hypercalls are bracketed by
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
 * calls.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);

/*
 * If the hypercall is preempted and schedules, the per-CPU flag must
 * be cleared before scheduling and restored after returning from
 * schedule, because the task may resume on a different CPU.
 */
static __always_inline bool get_and_clear_inhcall(void)
{
	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);

	__this_cpu_write(xen_in_preemptible_hcall, false);
	return inhcall;
}

static __always_inline void restore_inhcall(bool inhcall)
{
	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
}
#else
static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif

static void __xen_pv_evtchn_do_upcall(void)
{
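	/*
	 * Account the event-channel callback like a hardware interrupt:
	 * irq_enter_rcu()/irq_exit_rcu() manage the hardirq preempt
	 * count and run pending softirqs on exit; the surrounding
	 * irqentry_enter()/irqentry_exit() already handled RCU.
	 */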
	irq_enter_rcu();
	inc_irq_stat(irq_hv_callback_count);

	xen_hvm_evtchn_do_upcall();

	irq_exit_rcu();
}

__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	struct pt_regs *old_regs;
	bool inhcall;
	irqentry_state_t state;

	state = irqentry_enter(regs);
	old_regs = set_irq_regs(regs);

	instrumentation_begin();
	run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
	instrumentation_end();

	set_irq_regs(old_regs);

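	/*
	 * If the upcall interrupted a preemptible hypercall, offer a
	 * reschedule point now. That is only legal if irqentry_enter()
	 * did not have to rescue RCU (state.exit_rcu set would mean we
	 * came from an RCU-idle context, where scheduling is a bug,
	 * hence the WARN); otherwise take the normal exit path.
	 */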
	inhcall = get_and_clear_inhcall();
	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
		instrumentation_begin();
		irqentry_exit_cond_resched();
		instrumentation_end();
		restore_inhcall(inhcall);
	} else {
		irqentry_exit(regs, state);
	}
}
#endif /* CONFIG_XEN_PV */