// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/sched.h>		/* test_thread_flag(), ... */
#include <linux/sched/task_stack.h>	/* task_stack_*(), ... */
#include <linux/kdebug.h>		/* oops_begin/end, ... */
#include <linux/extable.h>		/* search_exception_tables */
#include <linux/memblock.h>		/* max_low_pfn */
#include <linux/kfence.h>		/* kfence_handle_page_fault */
#include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h>		/* kmmio_handler, ... */
#include <linux/perf_event.h>		/* perf_sw_event */
#include <linux/hugetlb.h>		/* hstate_index_to_shift */
#include <linux/prefetch.h>		/* prefetchw */
#include <linux/context_tracking.h>	/* exception_enter(), ... */
#include <linux/uaccess.h>		/* faulthandler_disabled() */
#include <linux/efi.h>			/* efi_recover_from_page_fault() */
#include <linux/mm_types.h>

#include <asm/cpufeature.h>		/* boot_cpu_has, ... */
#include <asm/traps.h>			/* dotraplinkage, ... */
#include <asm/fixmap.h>			/* VSYSCALL_ADDR */
#include <asm/vsyscall.h>		/* emulate_vsyscall */
#include <asm/vm86.h>			/* struct vm86 */
#include <asm/mmu_context.h>		/* vma_pkey() */
#include <asm/efi.h>			/* efi_recover_from_page_fault() */
#include <asm/desc.h>			/* store_idt(), ... */
#include <asm/cpu_entry_area.h>		/* exception stack */
#include <asm/pgtable_areas.h>		/* VMALLOC_START, ... */
#include <asm/kvm_para.h>		/* kvm_handle_async_pf */

#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>

/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
 */
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
	return 0;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it. This is AMD erratum #91.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
		 * In X86_64 long mode, the CPU will signal invalid
		 * opcode if some of these prefixes are present, so
		 * X86_64 will never get here anyway.
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In 64-bit mode 0x40..0x4F are valid REX prefixes.
		 */
		return (!user_mode(regs) || user_64bit_mode(regs));
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo >> 1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (get_kernel_nofault(opcode, instr))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
	unsigned char *max_instr;
	unsigned char *instr;
	int prefetch = 0;

	/*
	 * If it was an exec (instruction fetch) fault on an NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & X86_PF_INSTR)
		return 0;

	instr = (void *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	/*
	 * This code has historically always bailed out if IP points to a
	 * not-present page (e.g. due to a race). No one has ever
	 * complained about this.
	 */
	pagefault_disable();

	while (instr < max_instr) {
		unsigned char opcode;

		if (user_mode(regs)) {
			if (get_user(opcode, instr))
				break;
		} else {
			if (get_kernel_nofault(opcode, instr))
				break;
		}

		instr++;

		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
			break;
	}

	pagefault_enable();
	return prefetch;
}

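/*
 * All page-table pages whose kernel mappings may need to be kept in sync
 * with init_mm are kept on pgd_list; pgd_lock protects the list.
 */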
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
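/*
 * Copy the kernel PMD entry covering @address from the reference page table
 * (init_mm) into the page table rooted at @pgd.  Returns the reference PMD,
 * or NULL if the address is not mapped in the reference page table.
 */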
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	p4d_t *p4d, *p4d_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_p4d/set_pud.
	 */
	p4d = p4d_offset(pgd, address);
	p4d_k = p4d_offset(pgd_k, address);
	if (!p4d_present(*p4d_k))
		return NULL;

	pud = pud_offset(p4d, address);
	pud_k = pud_offset(p4d_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);

	if (pmd_present(*pmd) != pmd_present(*pmd_k))
		set_pmd(pmd, *pmd_k);

	if (!pmd_present(*pmd_k))
		return NULL;
	else
		BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));

	return pmd_k;
}

/*
 * Handle a fault on the vmalloc or module mapping area.
 *
 * This is needed because there is a race window between the time when the
 * vmalloc mapping code updates the PMD and the time when it synchronizes
 * this update with the other page-tables in the system.
 *
 * In this race window another thread/CPU can map an area on the same PMD,
 * find it already present and not synchronize it with the rest of the
 * system yet. As a result v[mz]alloc might return areas which are not
 * mapped in every page-table in the system, causing an unhandled page-fault
 * when they are accessed.
 */
static noinline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3_pa();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;

	if (pmd_large(*pmd_k))
		return 0;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;

	return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);

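/*
 * Propagate kernel mapping updates in the range [start, end] into every
 * page table on pgd_list, one PMD at a time.
 */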
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start & PMD_MASK;
	     addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
	     addr += PMD_SIZE) {
		struct page *page;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			spinlock_t *pgt_lock;

			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;

			spin_lock(pgt_lock);
			vmalloc_sync_one(page_address(page), addr);
			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * Did it hit the DOS screen memory VA from vm86 mode?
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
#ifdef CONFIG_VM86
	unsigned long bit;

	if (!v8086_mode(regs) || !tsk->thread.vm86)
		return;

	bit = (address - 0xA0000) >> PAGE_SHIFT;
	if (bit < 32)
		tsk->thread.vm86->screen_bitmap |= 1 << bit;
#endif
}

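/*
 * Pages below max_low_pfn are in lowmem and can be accessed through the
 * direct mapping; anything above may live in highmem.
 */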
static bool low_pfn(unsigned long pfn)
{
	return pfn < max_low_pfn;
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = &base[pgd_index(address)];
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

#ifdef CONFIG_X86_PAE
	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
		goto out;
#define pr_pde pr_cont
#else
#define pr_pde pr_info
#endif
	p4d = p4d_offset(pgd, address);
	pud = pud_offset(p4d, address);
	pmd = pmd_offset(pud, address);
	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already:
	 */
	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	pr_cont("\n");
}

#else /* CONFIG_X86_64: */

#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * No vm86 mode in 64-bit mode:
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
}

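/* Returns non-zero if the page-table entry at @p cannot be read safely. */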
static int bad_address(void *p)
{
	unsigned long dummy;

	return get_kernel_nofault(dummy, (unsigned long *)p);
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = base + pgd_index(address);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (bad_address(pgd))
		goto bad;

	pr_info("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (bad_address(p4d))
		goto bad;

	pr_cont("P4D %lx ", p4d_val(*p4d));
	if (!p4d_present(*p4d) || p4d_large(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (bad_address(pud))
		goto bad;

	pr_cont("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	pr_cont("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	pr_cont("PTE %lx", pte_val(*pte));
out:
	pr_cont("\n");
	return;
bad:
	pr_info("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 and buggy BIOSes.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64-bit RIP register on C-stepping K8.
 *
 * A lot of BIOSes that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32 bits of RIP cleared.
 * Try to work around it here.
 *
 * Note that we only handle faults in kernel mode here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
	    || boot_cpu_data.x86 != 0xf)
		return 0;

	if (address != regs->ip)
		return 0;

	if ((address >> 32) != 0)
		return 0;

	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk_once(errata93_warning);
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return. Any code
 * segment in the LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
		return 1;
#endif
	return 0;
}

/* Pentium F0 0F C7 C8 bug workaround: */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) {
		handle_invalid_op(regs);
		return 1;
	}
#endif
	return 0;
}

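/*
 * Decode an LDT or TSS selector against the given GDT and print the
 * descriptor's base and limit (or why it could not be read).
 */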
static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
{
	u32 offset = (index >> 3) * sizeof(struct desc_struct);
	unsigned long addr;
	struct ldttss_desc desc;

	if (index == 0) {
		pr_alert("%s: NULL\n", name);
		return;
	}

	if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
		pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
		return;
	}

	if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
				     sizeof(struct ldttss_desc))) {
		pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
			 name, index);
		return;
	}

	addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
#ifdef CONFIG_X86_64
	addr |= ((u64)desc.base3 << 32);
#endif
	pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
		 name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
}

static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	if (!oops_may_print())
		return;

	if (error_code & X86_PF_INSTR) {
		unsigned int level;
		pgd_t *pgd;
		pte_t *pte;

		pgd = __va(read_cr3_pa());
		pgd += pgd_index(address);

		pte = lookup_address_in_pgd(pgd, address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
		if (pte && pte_present(*pte) && pte_exec(*pte) &&
				(pgd_flags(*pgd) & _PAGE_USER) &&
				(__read_cr4() & X86_CR4_SMEP))
			pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
	}

	if (address < PAGE_SIZE && !user_mode(regs))
		pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
			 (void *)address);
	else
		pr_alert("BUG: unable to handle page fault for address: %px\n",
			 (void *)address);

	pr_alert("#PF: %s %s in %s mode\n",
		 (error_code & X86_PF_USER)  ? "user" : "supervisor",
		 (error_code & X86_PF_INSTR) ? "instruction fetch" :
		 (error_code & X86_PF_WRITE) ? "write access" :
					       "read access",
			     user_mode(regs) ? "user" : "kernel");
	pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
		 !(error_code & X86_PF_PROT) ? "not-present page" :
		 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
		 (error_code & X86_PF_PK)    ? "protection keys violation" :
					       "permissions violation");

	if (!(error_code & X86_PF_USER) && user_mode(regs)) {
		struct desc_ptr idt, gdt;
		u16 ldtr, tr;

		/*
		 * This can happen for quite a few reasons.  The more obvious
		 * ones are faults accessing the GDT, or LDT.  Perhaps
		 * surprisingly, if the CPU tries to deliver a benign or
		 * contributory exception from user code and gets a page fault
		 * during delivery, the page fault can be delivered as though
		 * it originated directly from user code.  This could happen
		 * due to wrong permissions on the IDT, GDT, LDT, TSS, or
		 * kernel or IST stack.
		 */
		store_idt(&idt);

		/* Usable even on Xen PV -- it's just slow. */
		native_store_gdt(&gdt);

		pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
			 idt.address, idt.size, gdt.address, gdt.size);

		store_ldt(ldtr);
		show_ldttss(&gdt, "LDTR", ldtr);

		store_tr(tr);
		show_ldttss(&gdt, "TR", tr);
	}

	dump_pagetable(address);
}

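/*
 * The page tables themselves are corrupted (e.g. a reserved bit is set in
 * an entry): dump them and oops.
 */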
static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
	    unsigned long address)
{
	struct task_struct *tsk;
	unsigned long flags;
	int sig;

	flags = oops_begin();
	tsk = current;
	sig = SIGKILL;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	if (__die("Bad pagetable", regs, error_code))
		sig = 0;

	oops_end(flags, regs, sig);
}

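/*
 * Record the fault details (trap number, error code, CR2) in the thread
 * struct so that signal delivery and ptrace can report them.
 */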
static void set_signal_archinfo(unsigned long address,
				unsigned long error_code)
{
	struct task_struct *tsk = current;

	/*
	 * To avoid leaking information about the kernel page
	 * table layout, pretend that user-mode accesses to
	 * kernel addresses are always protection faults.
	 *
	 * NB: This means that failed vsyscalls with vsyscall=none
	 * will have the PROT bit.  This doesn't leak any
	 * information and does not appear to cause any problems.
	 */
	if (address >= TASK_SIZE_MAX)
		error_code |= X86_PF_PROT;

	tsk->thread.trap_nr = X86_TRAP_PF;
	tsk->thread.error_code = error_code | X86_PF_USER;
	tsk->thread.cr2 = address;
}

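/*
 * A fault in kernel context that nothing else claimed: try exception
 * fixups and known errata workarounds, otherwise oops.
 */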
static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, int signal, int si_code)
{
	struct task_struct *tsk = current;
	unsigned long flags;
	int sig;

	if (user_mode(regs)) {
		/*
		 * This is an implicit supervisor-mode access from user
		 * mode.  Bypass all the kernel-mode recovery code and just
		 * OOPS.
		 */
		goto oops;
	}

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
		/*
		 * Any interrupt that takes a fault gets the fixup. This makes
		 * the below recursive fault logic only apply to faults from
		 * task context.
		 */
		if (in_interrupt())
			return;

		/*
		 * Per the above we're !in_interrupt(), aka. task context.
		 *
		 * In this case we need to make sure we're not recursively
		 * faulting through the emulate_vsyscall() logic.
		 */
		if (current->thread.sig_on_uaccess_err && signal) {
			set_signal_archinfo(address, error_code);

			/* XXX: hwpoison faults will set the wrong code. */
			force_sig_fault(signal, si_code, (void __user *)address);
		}

		/*
		 * Barring that, we can do the fixup and be happy.
		 */
		return;
	}

#ifdef CONFIG_VMAP_STACK
	/*
	 * Stack overflow?  During boot, we can fault near the initial
	 * stack in the direct map, but that's not an overflow -- check
	 * that we're in vmalloc space to avoid this.
	 */
	if (is_vmalloc_addr((void *)address) &&
	    (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
	     address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
		unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
		/*
		 * We're likely to be running with very little stack space
		 * left.  It's plausible that we'd hit this condition but
		 * double-fault even before we get this far, in which case
		 * we're fine: the double-fault handler will deal with it.
		 *
		 * We don't want to make it all the way into the oops code
		 * and then double-fault, though, because we're likely to
		 * break the console driver and lose most of the stack dump.
		 */
		asm volatile ("movq %[stack], %%rsp\n\t"
			      "call handle_stack_overflow\n\t"
			      "1: jmp 1b"
			      : ASM_CALL_CONSTRAINT
			      : "D" ("kernel stack overflow (page fault)"),
				"S" (regs), "d" (address),
				[stack] "rm" (stack));
		unreachable();
	}
#endif

	/*
	 * 32-bit:
	 *
	 *   Valid to do another page fault here, because if this fault
	 *   had been triggered by is_prefetch(), fixup_exception() would
	 *   have handled it.
	 *
	 * 64-bit:
	 *
	 *   Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Buggy firmware could access regions which might page fault; try to
	 * recover from such faults.
	 */
	if (IS_ENABLED(CONFIG_EFI))
		efi_recover_from_page_fault(address);

	/* Only not-present faults should be handled by KFENCE. */
	if (!(error_code & X86_PF_PROT) &&
	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
		return;

oops:
	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	if (task_stack_end_corrupted(tsk))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_DEFAULT "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
		unsigned long address, struct task_struct *tsk)
{
	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;

	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
		loglvl, tsk->comm, task_pid_nr(tsk), address,
		(void *)regs->ip, (void *)regs->sp, error_code);

	print_vma_addr(KERN_CONT " in ", regs->ip);

	printk(KERN_CONT "\n");

	show_opcodes(regs, loglvl);
}

/*
 * The (legacy) vsyscall page is the only page in the kernel portion
 * of the address space that has user-accessible permissions.
 */
static bool is_vsyscall_vaddr(unsigned long vaddr)
{
	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
}

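/*
 * Handle a fault without consulting the VMA tree (mmap_lock is not held):
 * user-mode faults get a SIGSEGV (or SEGV_PKUERR), kernel-mode faults fall
 * through to no_context().
 */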
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) unsigned long address, u32 pkey, int si_code)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) struct task_struct *tsk = current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) /* User mode accesses just cause a SIGSEGV */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) if (user_mode(regs) && (error_code & X86_PF_USER)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) * It's possible to have interrupts off here:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) local_irq_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) * Valid to do another page fault here because this one came
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) * from user space:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) if (is_prefetch(regs, error_code, address))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) if (is_errata100(regs, address))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) * To avoid leaking information about the kernel page table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) * layout, pretend that user-mode accesses to kernel addresses
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) * are always protection faults.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) if (address >= TASK_SIZE_MAX)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) error_code |= X86_PF_PROT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) if (likely(show_unhandled_signals))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) show_signal_msg(regs, error_code, address, tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) set_signal_archinfo(address, error_code);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834)
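		/*
		 * Deliver exactly one signal: pkey faults carry the pkey in
		 * their siginfo, everything else gets a plain SIGSEGV.
		 */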
		if (si_code == SEGV_PKUERR)
			force_sig_pkuerr((void __user *)address, pkey);
		else
			force_sig_fault(SIGSEGV, si_code, (void __user *)address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) local_irq_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) if (is_f00f_bug(regs, address))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) no_context(regs, error_code, address, SIGSEGV, si_code);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850)
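/* Report a SEGV_MAPERR for a fault handled without holding the mmap_lock. */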
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) static noinline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857)
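/*
 * Drop the mmap_lock held by the caller for the VMA search, then
 * deliver the signal via __bad_area_nosemaphore().
 */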
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) __bad_area(struct pt_regs *regs, unsigned long error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) unsigned long address, u32 pkey, int si_code)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) struct mm_struct *mm = current->mm;
	/*
	 * Something tried to access memory that isn't in our memory map.
	 * Fix it, but check whether it's a kernel or user access first.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) static noinline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) static inline bool bad_area_access_from_pkeys(unsigned long error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) /* This code is always called on the current mm */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) bool foreign = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) if (!boot_cpu_has(X86_FEATURE_OSPKE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) if (error_code & X86_PF_PK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) /* this checks permission keys on the VMA: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) (error_code & X86_PF_INSTR), foreign))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) static noinline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) unsigned long address, struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) * This OSPKE check is not strictly necessary at runtime.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) * But, doing it this way allows compiler optimizations
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) * if pkeys are compiled out.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) if (bad_area_access_from_pkeys(error_code, vma)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) * A protection key fault means that the PKRU value did not allow
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) * access to some PTE. Userspace can figure out what PKRU was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) * from the XSAVE state. This function captures the pkey from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) * the vma and passes it to userspace so userspace can discover
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) * which protection key was set on the PTE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) *
		 * If we get here, we know that the hardware signaled an X86_PF_PK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) * fault and that there was a VMA once we got in the fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) * handler. It does *not* guarantee that the VMA we find here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) * was the one that we faulted on.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) * 2. T1 : set PKRU to deny access to pkey=4, touches page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) * 3. T1 : faults...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) * 5. T1 : enters fault handler, takes mmap_lock, etc...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) * faulted on a pte with its pkey=4.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) u32 pkey = vma_pkey(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932)
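/*
 * The core VM resolved the fault to a SIGBUS condition: either a plain
 * bus error, or a hardware-poisoned page when CONFIG_MEMORY_FAILURE is set.
 */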
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) vm_fault_t fault)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) /* Kernel mode? Handle exceptions or die: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) if (!(error_code & X86_PF_USER)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) /* User-space => ok to do another page fault: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) if (is_prefetch(regs, error_code, address))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) set_signal_archinfo(address, error_code);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) #ifdef CONFIG_MEMORY_FAILURE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) struct task_struct *tsk = current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) unsigned lsb = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) pr_err(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) tsk->comm, tsk->pid, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) if (fault & VM_FAULT_HWPOISON_LARGE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) if (fault & VM_FAULT_HWPOISON)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) lsb = PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967)
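/*
 * Dispatch on the VM_FAULT_* error bits from handle_mm_fault(): OOM,
 * SIGBUS/hwpoison and SIGSEGV conditions each get their own handling.
 */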
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) static noinline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) mm_fault_error(struct pt_regs *regs, unsigned long error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) unsigned long address, vm_fault_t fault)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) no_context(regs, error_code, address, 0, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) if (fault & VM_FAULT_OOM) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) /* Kernel mode? Handle exceptions or die: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) if (!(error_code & X86_PF_USER)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) no_context(regs, error_code, address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) SIGSEGV, SEGV_MAPERR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) /*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we got
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) * oom-killed):
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) pagefault_out_of_memory();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) VM_FAULT_HWPOISON_LARGE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) do_sigbus(regs, error_code, address, fault);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) else if (fault & VM_FAULT_SIGSEGV)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) bad_area_nosemaphore(regs, error_code, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001)
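/*
 * Check whether the entry at @pte permits the access described by
 * @error_code: returns 1 if it does (so the fault may have been
 * spurious), 0 otherwise.
 */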
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) * Handle a spurious fault caused by a stale TLB entry.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) * This allows us to lazily refresh the TLB when increasing the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) * permissions of a kernel page (RO -> RW or NX -> X). Doing it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) * eagerly is very expensive since that implies doing a full
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) * cross-processor TLB flush, even if no stale TLB entries exist
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) * on other processors.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) * Spurious faults may only occur if the TLB contains an entry with
 * fewer permissions than the page table entry. Non-present (P = 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) * and reserved bit (R = 1) faults are never spurious.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) * There are no security implications to leaving a stale TLB when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) * increasing the permissions on a page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) * Returns non-zero if a spurious fault was handled, zero otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) * (Optional Invalidation).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) */
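/*
 * Example: a kernel page is made writable (RO -> RW) without an eager
 * cross-CPU TLB flush. Another CPU that still caches the stale RO
 * translation faults on its first write; the check below sees that the
 * current page tables already permit the write and simply returns, and
 * the retried access succeeds.
 */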
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) static noinline int
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) spurious_kernel_fault(unsigned long error_code, unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) pgd_t *pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) p4d_t *p4d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) pud_t *pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) pmd_t *pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) pte_t *pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) * Only writes to RO or instruction fetches from NX may cause
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) * spurious faults.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) * These could be from user or supervisor accesses but the TLB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) * is only lazily flushed after a kernel mapping protection
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) * change, so user accesses are not expected to cause spurious
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) * faults.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) error_code != (X86_PF_INSTR | X86_PF_PROT))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) pgd = init_mm.pgd + pgd_index(address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) if (!pgd_present(*pgd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) p4d = p4d_offset(pgd, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) if (!p4d_present(*p4d))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) if (p4d_large(*p4d))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) pud = pud_offset(p4d, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) if (!pud_present(*pud))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) if (pud_large(*pud))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) return spurious_kernel_fault_check(error_code, (pte_t *) pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) pmd = pmd_offset(pud, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) if (!pmd_present(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) if (pmd_large(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) pte = pte_offset_kernel(pmd, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) if (!pte_present(*pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) ret = spurious_kernel_fault_check(error_code, pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) * Make sure we have permissions in PMD.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) * If not, then there's a bug in the page tables:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) NOKPROBE_SYMBOL(spurious_kernel_fault);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100)
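/* Gate for show_signal_msg() above; typically toggled via the debug.exception-trace sysctl. */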
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) int show_unhandled_signals = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) static inline int
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) access_error(unsigned long error_code, struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) /* This is only called for the current mm, so: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) bool foreign = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) * Read or write was blocked by protection keys. This is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) * always an unconditional error and can never result in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) * a follow-up action to resolve the fault, like a COW.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) if (error_code & X86_PF_PK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) * Make sure to check the VMA so that we do not perform
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) * faults just to hit a X86_PF_PK as soon as we fill in a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) * page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) (error_code & X86_PF_INSTR), foreign))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) if (error_code & X86_PF_WRITE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) /* write, present and write, not present: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) if (unlikely(!(vma->vm_flags & VM_WRITE)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) /* read, present: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) if (unlikely(error_code & X86_PF_PROT))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) /* read, not present: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) if (unlikely(!vma_is_accessible(vma)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143)
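/* Does @address fall in the kernel half of the virtual address space? */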
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) bool fault_in_kernel_space(unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) * On 64-bit systems, the vsyscall page is at an address above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) * TASK_SIZE_MAX, but is not considered part of the kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) * address space.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) return address >= TASK_SIZE_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) * Called for all faults where 'address' is part of the kernel address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) * space. Might get called for faults that originate from *code* that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) * ran in userspace or the kernel.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) * Protection keys exceptions only happen on user pages. We
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) * have no user pages in the kernel portion of the address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) * space, so do not expect them here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) WARN_ON_ONCE(hw_error_code & X86_PF_PK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) #ifdef CONFIG_X86_32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) * We can fault-in kernel-space virtual memory on-demand. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) * 'reference' page table is init_mm.pgd.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) * NOTE! We MUST NOT take any locks for this case. We may
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) * be in an interrupt or a critical region, and should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) * only copy the information from the master page table,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) * nothing more.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) * Before doing this on-demand faulting, ensure that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) * fault is not any of the following:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) * 1. A fault on a PTE with a reserved bit set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) * 2. A fault caused by a user-mode access. (Do not demand-
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) * fault kernel memory due to user-mode accesses).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) * 3. A fault caused by a page-level protection violation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) * (A demand fault would be on a non-present page which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) * would have X86_PF_PROT==0).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) * This is only needed to close a race condition on x86-32 in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) * the vmalloc mapping/unmapping code. See the comment above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) * vmalloc_fault() for details. On x86-64 the race does not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) * exist as the vmalloc mappings don't need to be synchronized
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) * there.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) if (vmalloc_fault(address) >= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) /* Was the fault spurious, caused by lazy TLB invalidation? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) if (spurious_kernel_fault(hw_error_code, address))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) /* kprobes don't want to hook the spurious faults: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) if (kprobe_page_fault(regs, X86_TRAP_PF))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) * Note, despite being a "bad area", there are quite a few
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) * acceptable reasons to get here, such as erratum fixups
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) * and handling kernel code that can fault, like get_user().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) * Don't take the mm semaphore here. If we fixup a prefetch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) * fault we could otherwise deadlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) bad_area_nosemaphore(regs, hw_error_code, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) NOKPROBE_SYMBOL(do_kern_addr_fault);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) /* Handle faults in the user portion of the address space */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) static inline
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) void do_user_addr_fault(struct pt_regs *regs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) unsigned long hw_error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) struct vm_area_struct *vma = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) struct task_struct *tsk;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) struct mm_struct *mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) vm_fault_t fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) unsigned int flags = FAULT_FLAG_DEFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) tsk = current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) mm = tsk->mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) /* kprobes don't want to hook the spurious faults: */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) * Reserved bits are never expected to be set on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) * entries in the user portion of the page tables.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) if (unlikely(hw_error_code & X86_PF_RSVD))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) pgtable_bad(regs, hw_error_code, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) * If SMAP is on, check for invalid kernel (supervisor) access to user
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) * pages in the user address space. The odd case here is WRUSS,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) * which, according to the preliminary documentation, does not respect
	 * SMAP but does report the fault with the USER bit set; so, in all
	 * cases, SMAP enforcement is consistent with checking the USER bit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) */
	if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
		     !(hw_error_code & X86_PF_USER) &&
		     !(regs->flags & X86_EFLAGS_AC))) {
		bad_area_nosemaphore(regs, hw_error_code, address);
		return;
	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) * If we're in an interrupt, have no user context or are running
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) * in a region with pagefaults disabled then we must not take the fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) if (unlikely(faulthandler_disabled() || !mm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) bad_area_nosemaphore(regs, hw_error_code, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) * It's safe to allow irq's after cr2 has been saved and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) * vmalloc fault has been handled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) * User-mode registers count as a user access even for any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) * potential system fault or CPU buglet:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) if (user_mode(regs)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) local_irq_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) flags |= FAULT_FLAG_USER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) if (regs->flags & X86_EFLAGS_IF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) local_irq_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) if (hw_error_code & X86_PF_WRITE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) flags |= FAULT_FLAG_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) if (hw_error_code & X86_PF_INSTR)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) flags |= FAULT_FLAG_INSTRUCTION;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) #ifdef CONFIG_X86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) * Faults in the vsyscall page might need emulation. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) * vsyscall page is at a high address (>PAGE_OFFSET), but is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) * considered to be part of the user address space.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) * The vsyscall page does not have a "real" VMA, so do this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) * emulation before we go searching for VMAs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) * PKRU never rejects instruction fetches, so we don't need
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) * to consider the PF_PK bit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) if (is_vsyscall_vaddr(address)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) if (emulate_vsyscall(hw_error_code, regs, address))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) * Do not try to do a speculative page fault if the fault was due to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) * protection keys since it can't be resolved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) if (!(hw_error_code & X86_PF_PK)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) fault = handle_speculative_fault(mm, address, flags, &vma, regs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) if (fault != VM_FAULT_RETRY)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) * Kernel-mode access to the user address space should only occur
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) * on well-defined single instructions listed in the exception
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) * tables. But, an erroneous kernel fault occurring outside one of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) * those areas which also holds mmap_lock might deadlock attempting
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) * to validate the fault against the address space.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) * Only do the expensive exception table search when we might be at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) * risk of a deadlock. This happens if we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) * 1. Failed to acquire mmap_lock, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) * 2. The access did not originate in userspace.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) if (unlikely(!mmap_read_trylock(mm))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) /*
			 * Fault from kernel code that we do
			 * not expect to fault.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) bad_area_nosemaphore(regs, hw_error_code, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) mmap_read_lock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) /*
		 * The above mmap_read_trylock() might have succeeded in
		 * which case we'll have missed the might_sleep() from
		 * mmap_read_lock():
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) might_sleep();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) if (!vma || !can_reuse_spf_vma(vma, address))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) vma = find_vma(mm, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) if (unlikely(!vma)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) bad_area(regs, hw_error_code, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) if (likely(vma->vm_start <= address))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) goto good_area;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) bad_area(regs, hw_error_code, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) if (unlikely(expand_stack(vma, address))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) bad_area(regs, hw_error_code, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) * Ok, we have a good vm_area for this memory access, so
	 * we can handle it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) good_area:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) if (unlikely(access_error(hw_error_code, vma))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) bad_area_access_error(regs, hw_error_code, address, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) * If for any reason at all we couldn't handle the fault,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) * make sure we exit gracefully rather than endlessly redo
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) * Note that handle_userfault() may also release and reacquire mmap_lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) * (and not return with VM_FAULT_RETRY), when returning to userland to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) * repeat the page fault later with a VM_FAULT_NOPAGE retval
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) * (potentially after handling any pending signal during the return to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) * userland). The return to userland is identified whenever
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) fault = handle_mm_fault(vma, address, flags, regs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) /* Quick path to respond to signals */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) if (fault_signal_pending(fault, regs)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) if (!user_mode(regs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) no_context(regs, hw_error_code, address, SIGBUS,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) BUS_ADRERR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) * If we need to retry the mmap_lock has already been released,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) * and if there is a fatal signal pending there is no guarantee
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) * that we made any progress. Handle this case first.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) if (unlikely((fault & VM_FAULT_RETRY) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) (flags & FAULT_FLAG_ALLOW_RETRY))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) flags |= FAULT_FLAG_TRIED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) /*
		 * Do not reuse this vma across the retry: the mmap_lock
		 * has been dropped, so the vma must be looked up again.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) vma = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) if (unlikely(fault & VM_FAULT_ERROR)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) mm_fault_error(regs, hw_error_code, address, fault);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) check_v8086_mode(regs, address, tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) NOKPROBE_SYMBOL(do_user_addr_fault);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435)
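/* Emit the user/kernel page fault tracepoints when they are enabled. */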
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) static __always_inline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) if (!trace_pagefault_enabled())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) if (user_mode(regs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) trace_page_fault_user(address, regs, error_code);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) trace_page_fault_kernel(address, regs, error_code);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448)
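/*
 * Common #PF dispatch: give mmiotrace a chance to claim the fault,
 * then route kernel-address and user-address faults to their handlers.
 */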
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) static __always_inline void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) handle_page_fault(struct pt_regs *regs, unsigned long error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) trace_page_fault_entries(regs, error_code, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) if (unlikely(kmmio_fault(regs, address)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) /* Was the fault on kernel-controlled part of the address space? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) if (unlikely(fault_in_kernel_space(address))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) do_kern_addr_fault(regs, error_code, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) do_user_addr_fault(regs, error_code, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) * User address page fault handling might have reenabled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) * interrupts. Fixing up all potential exit points of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) * do_user_addr_fault() and its leaf functions is just not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) * doable w/o creating an unholy mess or turning the code
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) * upside down.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) local_irq_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) unsigned long address = read_cr2();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) irqentry_state_t state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478)
	prefetchw(&current->mm->mmap_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) * KVM uses #PF vector to deliver 'page not present' events to guests
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) * (asynchronous page fault mechanism). The event happens when a
 * userspace task is trying to access some valid (from the guest's point
 * of view) memory which is not currently mapped by the host (e.g. the
 * memory is swapped out). Note, the corresponding "page ready" event,
 * which is injected when the memory becomes available, is delivered via
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) * an interrupt mechanism and not a #PF exception
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) * We are relying on the interrupted context being sane (valid RSP,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) * relevant locks not held, etc.), which is fine as long as the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) * interrupted context had IF=1. We are also relying on the KVM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) * async pf type field and CR2 being read consistently instead of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) * getting values from real and async page faults mixed up.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) * Fingers crossed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) * The async #PF handling code takes care of idtentry handling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) * itself.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) if (kvm_handle_async_pf(regs, (u32)address))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) * Entry handling for valid #PF from kernel mode is slightly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) * different: RCU is already watching and rcu_irq_enter() must not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) * be invoked because a kernel fault on a user space address might
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) * sleep.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) *
 * In case the fault hit an RCU-idle region, the conditional entry
 * code re-enables RCU to avoid subsequent wreckage, which helps
 * debuggability.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) state = irqentry_enter(regs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) instrumentation_begin();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) handle_page_fault(regs, error_code, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) instrumentation_end();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) irqentry_exit(regs, state);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) }