^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0-only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * Kernel-based Virtual Machine driver for Linux
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * This module enables machines with Intel VT-x extensions to run virtual
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * machines without emulation or binary translation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) * MMU support
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) * Copyright (C) 2006 Qumranet, Inc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) * Copyright 2010 Red Hat, Inc. and/or its affiliates.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) * Authors:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) * Yaniv Kamay <yaniv@qumranet.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) * Avi Kivity <avi@qumranet.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) #include "irq.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) #include "ioapic.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) #include "mmu.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) #include "mmu_internal.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) #include "tdp_mmu.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) #include "x86.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) #include "kvm_cache_regs.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #include "kvm_emulate.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) #include "cpuid.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) #include "spte.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) #include <linux/kvm_host.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) #include <linux/types.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) #include <linux/string.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) #include <linux/mm.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) #include <linux/highmem.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) #include <linux/moduleparam.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) #include <linux/export.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) #include <linux/swap.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) #include <linux/hugetlb.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) #include <linux/compiler.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) #include <linux/srcu.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) #include <linux/slab.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) #include <linux/sched/signal.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) #include <linux/uaccess.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) #include <linux/hash.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) #include <linux/kern_levels.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) #include <linux/kthread.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) #include <asm/page.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) #include <asm/memtype.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) #include <asm/cmpxchg.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) #include <asm/io.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) #include <asm/vmx.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) #include <asm/kvm_page_track.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) #include "trace.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) #include "paging.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) extern bool itlb_multihit_kvm_mitigation;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) static int __read_mostly nx_huge_pages = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) #ifdef CONFIG_PREEMPT_RT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) static const struct kernel_param_ops nx_huge_pages_ops = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) .set = set_nx_huge_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) .get = param_get_bool,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) .set = set_nx_huge_pages_recovery_ratio,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) .get = param_get_uint,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) __MODULE_PARM_TYPE(nx_huge_pages, "bool");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) &nx_huge_pages_recovery_ratio, 0644);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) static bool __read_mostly force_flush_and_sync_on_reuse;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) * When setting this variable to true it enables Two-Dimensional-Paging
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) * where the hardware walks 2 page tables:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) * 1. the guest-virtual to guest-physical
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) * 2. while doing 1. it walks guest-physical to host-physical
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) * If the hardware supports that we don't need to do shadow paging.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) bool tdp_enabled = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) static int max_huge_page_level __read_mostly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) static int max_tdp_level __read_mostly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100)
/* Audit points, numbered consecutively from zero. */
enum {
	AUDIT_PRE_PAGE_FAULT	= 0,
	AUDIT_POST_PAGE_FAULT	= 1,
	AUDIT_PRE_PTE_WRITE	= 2,
	AUDIT_POST_PTE_WRITE	= 3,
	AUDIT_PRE_SYNC		= 4,
	AUDIT_POST_SYNC		= 5,
};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109)
#ifdef MMU_DEBUG
/* Debug switch, off by default; only built when MMU_DEBUG is defined. */
bool dbg = false;
module_param(dbg, bool, 0644);
#endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114)
#define PTE_PREFETCH_NUM		8

/* Number of index bits consumed per level by the PT32_* helpers below. */
#define PT32_LEVEL_BITS 10

/*
 * Bit position where the index for @level starts (level 1 is PAGE_SHIFT).
 * @level is parenthesized so that any expression (e.g. a conditional or a
 * bitwise-or) can be passed without changing the arithmetic.
 */
#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + ((level) - 1) * PT32_LEVEL_BITS)

/* Page-aligned bits of an address that lie below the start of @level. */
#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

/* Table index selected by @address at @level. */
#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT32_BASE_ADDR_MASK PAGE_MASK
/* PAGE_MASK with the additional PT32_LEVEL_BITS above PAGE_SHIFT cleared. */
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
/* PAGE_MASK with every bit below the start of @level cleared. */
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) #include <trace/events/kvm.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) /* make pte_list_desc fit well in cache line */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) #define PTE_LIST_EXT 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) struct pte_list_desc {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) u64 *sptes[PTE_LIST_EXT];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) struct pte_list_desc *more;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) struct kvm_shadow_walk_iterator {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) u64 addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) hpa_t shadow_addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) int level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) unsigned index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154)
/*
 * Shadow page-table walkers.  Each macro expands to a for-loop header that
 * initialises @_walker, keeps iterating while shadow_walk_okay() holds, and
 * advances with shadow_walk_next() / __shadow_walk_next().
 *
 * Every macro argument is parenthesized at each use so that arbitrary
 * expressions (for example "*iter_ptr" as the walker) expand correctly.
 */

/* Walk starting from an explicitly supplied @_root. */
#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)	\
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),		\
					 (_root), (_addr));		\
	     shadow_walk_okay(&(_walker));				\
	     shadow_walk_next(&(_walker)))

/* Walk initialised through shadow_walk_init(). */
#define for_each_shadow_entry(_vcpu, _addr, _walker)			\
	for (shadow_walk_init(&(_walker), (_vcpu), (_addr));		\
	     shadow_walk_okay(&(_walker));				\
	     shadow_walk_next(&(_walker)))

/*
 * Like for_each_shadow_entry(), but on every iteration the current spte is
 * read with mmu_spte_get_lockless() into @spte, and that value is handed to
 * __shadow_walk_next() to advance.
 */
#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), (_vcpu), (_addr));		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ (spte) = mmu_spte_get_lockless((_walker).sptep); 1; }); \
	     __shadow_walk_next(&(_walker), (spte)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) static struct kmem_cache *pte_list_desc_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) struct kmem_cache *mmu_page_header_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) static struct percpu_counter kvm_total_used_mmu_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) static void mmu_spte_set(u64 *sptep, u64 spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) static union kvm_mmu_page_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) #define CREATE_TRACE_POINTS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) #include "mmutrace.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) static inline bool kvm_available_flush_tlb_with_range(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) return kvm_x86_ops.tlb_remote_flush_with_range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) struct kvm_tlb_range *range)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) int ret = -ENOTSUPP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) if (range && kvm_x86_ops.tlb_remote_flush_with_range)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) kvm_flush_remote_tlbs(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) u64 start_gfn, u64 pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) struct kvm_tlb_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) range.start_gfn = start_gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) range.pages = pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209) kvm_flush_remote_tlbs_with_range(kvm, &range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) bool is_nx_huge_page_enabled(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) return READ_ONCE(nx_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) unsigned int access)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) u64 mask = make_mmio_spte(vcpu, gfn, access);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) trace_mark_mmio_spte(sptep, gfn, mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) mmu_spte_set(sptep, mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) static gfn_t get_mmio_spte_gfn(u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) & shadow_nonpresent_or_rsvd_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) return gpa >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) static unsigned get_mmio_spte_access(u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) return spte & shadow_mmio_access_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) kvm_pfn_t pfn, unsigned int access)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) if (unlikely(is_noslot_pfn(pfn))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) mark_mmio_spte(vcpu, sptep, gfn, access);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) u64 kvm_gen, spte_gen, gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) gen = kvm_vcpu_memslots(vcpu)->generation;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) kvm_gen = gen & MMIO_SPTE_GEN_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) spte_gen = get_mmio_spte_generation(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) trace_check_mmio_spte(spte, kvm_gen, spte_gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) return likely(kvm_gen == spte_gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) struct x86_exception *exception)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) return gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272)
/* Always returns 1. */
static int is_cpuid_PSE36(void)
{
	const int pse36_supported = 1;

	return pse36_supported;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) static int is_nx(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) return vcpu->arch.efer & EFER_NX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) static gfn_t pse36_gfn_delta(u32 gpte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) return (gpte & PT32_DIR_PSE36_MASK) << shift;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) #ifdef CONFIG_X86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) static void __set_spte(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) WRITE_ONCE(*sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) static void __update_clear_spte_fast(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) WRITE_ONCE(*sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) return xchg(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) static u64 __get_spte_lockless(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) return READ_ONCE(*sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) union split_spte {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) struct {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) u32 spte_low;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) u32 spte_high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) u64 spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318)
/*
 * Increment clear_spte_count on the kvm_mmu_page returned by sptep_to_sp()
 * when the value just written (@spte) is not shadow-present.  Nothing is done
 * for a shadow-present @spte.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) static void count_spte_clear(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) struct kvm_mmu_page *sp = sptep_to_sp(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322)
/* Only writes of a non-present value are counted. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) if (is_shadow_present_pte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) /* Ensure the spte is completely set before we increase the count */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) smp_wmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) sp->clear_spte_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330)
/*
 * Non-64-bit build: write @spte to *@sptep as two 32-bit stores.  The high
 * half is stored first, then a write barrier, then the low half with
 * WRITE_ONCE().
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) static void __set_spte(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) union split_spte *ssptep, sspte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) ssptep = (union split_spte *)sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336) sspte = (union split_spte)spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) ssptep->spte_high = sspte.spte_high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341) * If we map the spte from nonpresent to present, We should store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) * the high bits firstly, then set present bit, so cpu can not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) * fetch this spte while we are setting the spte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) smp_wmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349)
/*
 * Non-64-bit build: overwrite *@sptep with @spte without returning the old
 * value.  The low half is stored first with WRITE_ONCE(), then a write
 * barrier, then the high half; count_spte_clear() is called afterwards.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) static void __update_clear_spte_fast(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) union split_spte *ssptep, sspte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) ssptep = (union split_spte *)sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) sspte = (union split_spte)spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) * If we map the spte from present to nonpresent, we should clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) * present bit firstly to avoid vcpu fetch the old high bits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) smp_wmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) ssptep->spte_high = sspte.spte_high;
/* Record the update in clear_spte_count if @spte is non-present. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) count_spte_clear(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368)
/*
 * Non-64-bit build: replace *@sptep with @spte and return the previous value.
 * The low half is swapped with xchg(); the old high half is then read and the
 * new high half stored with plain accesses, and count_spte_clear() is called.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) union split_spte *ssptep, sspte, orig;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) ssptep = (union split_spte *)sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) sspte = (union split_spte)spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) /* xchg acts as a barrier before the setting of the high bits */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
/* Capture the old high half before overwriting it. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) orig.spte_high = ssptep->spte_high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) ssptep->spte_high = sspte.spte_high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) count_spte_clear(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) return orig.spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) * The idea using the light way get the spte on x86_32 guest is from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) * gup_get_pte (mm/gup.c).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) * An spte tlb flush may be pending, because kvm_set_pte_rmapp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) * coalesces them and we are running out of the MMU lock. Therefore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) * we need to protect against in-progress updates of the spte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) * Reading the spte while an update is in progress may get the old value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) * for the high part of the spte. The race is fine for a present->non-present
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) * change (because the high part of the spte is ignored for non-present spte),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) * but for a present->present change we must reread the spte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) * All such changes are done in two steps (present->non-present and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) * non-present->present), hence it is enough to count the number of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) * present->non-present updates: if it changed while reading the spte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) * we might have hit the race. This is done using clear_spte_count.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) static u64 __get_spte_lockless(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) struct kvm_mmu_page *sp = sptep_to_sp(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) union split_spte spte, *orig = (union split_spte *)sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) retry:
/* Snapshot the clear counter before reading either half. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) count = sp->clear_spte_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) smp_rmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412)
/* Read the low half, then the high half, with a read barrier after each. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) spte.spte_low = orig->spte_low;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) smp_rmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) spte.spte_high = orig->spte_high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) smp_rmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418)
/* Start over if the low half or the clear counter changed meanwhile. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) if (unlikely(spte.spte_low != orig->spte_low ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) count != sp->clear_spte_count))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) return spte.spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) static bool spte_has_volatile_bits(u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) if (!is_shadow_present_pte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) * Always atomically update spte if it can be updated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) * out of mmu-lock, it can ensure dirty bit is not lost,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) * also, it can help us to get a stable is_writable_pte()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) * to ensure tlb flush is not missed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) if (spte_can_locklessly_be_made_writable(spte) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) is_access_track_spte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) if (spte_ad_enabled(spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) if ((spte & shadow_accessed_mask) == 0 ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) /* Rules for using mmu_spte_set:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) * Set the sptep from nonpresent to present.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) * Note: the sptep being assigned *must* be either not present
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) * or in a state where the hardware will not attempt to update
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) * the spte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) static void mmu_spte_set(u64 *sptep, u64 new_spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) WARN_ON(is_shadow_present_pte(*sptep));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) __set_spte(sptep, new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) * Update the SPTE (excluding the PFN), but do not track changes in its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) * accessed/dirty status.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) u64 old_spte = *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) WARN_ON(!is_shadow_present_pte(new_spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) if (!is_shadow_present_pte(old_spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) mmu_spte_set(sptep, new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) return old_spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) if (!spte_has_volatile_bits(old_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) __update_clear_spte_fast(sptep, new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) old_spte = __update_clear_spte_slow(sptep, new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) return old_spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) /* Rules for using mmu_spte_update:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) * Update the state bits, it means the mapped pfn is not changed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) * Whenever we overwrite a writable spte with a read-only one we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) * should flush remote TLBs. Otherwise rmap_write_protect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) * will find a read-only spte, even though the writable spte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) * might be cached on a CPU's TLB, the return value indicates this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) * case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) * Returns true if the TLB needs to be flushed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) static bool mmu_spte_update(u64 *sptep, u64 new_spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) if (!is_shadow_present_pte(old_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) * For the spte updated out of mmu-lock is safe, since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) * we always atomically update it, see the comments in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) * spte_has_volatile_bits().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) if (spte_can_locklessly_be_made_writable(old_spte) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) !is_writable_pte(new_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) * Flush TLB when accessed/dirty states are changed in the page tables,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) * to guarantee consistency between TLB and page tables.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) kvm_set_pfn_accessed(spte_to_pfn(old_spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) kvm_set_pfn_dirty(spte_to_pfn(old_spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) return flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) * Rules for using mmu_spte_clear_track_bits:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) * It sets the sptep from present to nonpresent, and track the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) * state bits, it is used to clear the last level sptep.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) * Returns non-zero if the PTE was previously valid.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) static int mmu_spte_clear_track_bits(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) kvm_pfn_t pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) u64 old_spte = *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) if (!spte_has_volatile_bits(old_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) __update_clear_spte_fast(sptep, 0ull);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) old_spte = __update_clear_spte_slow(sptep, 0ull);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) if (!is_shadow_present_pte(old_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) pfn = spte_to_pfn(old_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) * KVM does not hold the refcount of the page used by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) * kvm mmu, before reclaiming the page, we should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) * unmap it from mmu first.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) if (is_accessed_spte(old_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) kvm_set_pfn_accessed(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) if (is_dirty_spte(old_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) kvm_set_pfn_dirty(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) * Rules for using mmu_spte_clear_no_track:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) * Directly clear spte without caring the state bits of sptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) * it is used to set the upper level spte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) static void mmu_spte_clear_no_track(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) __update_clear_spte_fast(sptep, 0ull);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) static u64 mmu_spte_get_lockless(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) return __get_spte_lockless(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) /* Restore an acc-track PTE back to a regular PTE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) static u64 restore_acc_track_spte(u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) u64 new_spte = spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) WARN_ON_ONCE(spte_ad_enabled(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) WARN_ON_ONCE(!is_access_track_spte(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) new_spte &= ~shadow_acc_track_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) new_spte |= saved_bits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) return new_spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) /* Returns the Accessed status of the PTE and resets it at the same time. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) static bool mmu_spte_age(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) u64 spte = mmu_spte_get_lockless(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) if (!is_accessed_spte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) if (spte_ad_enabled(spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) clear_bit((ffs(shadow_accessed_mask) - 1),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) (unsigned long *)sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) * Capture the dirty status of the page, so that it doesn't get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) * lost when the SPTE is marked for access tracking.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) if (is_writable_pte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) kvm_set_pfn_dirty(spte_to_pfn(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) spte = mark_spte_for_access_track(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) mmu_spte_update_no_track(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) * Prevent page table teardown by making any free-er wait during
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) * kvm_flush_remote_tlbs() IPI to all active vcpus.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) local_irq_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) * Make sure a following spte read is not reordered ahead of the write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) * to vcpu->mode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) * Make sure the write to vcpu->mode is not reordered in front of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) local_irq_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) PT64_ROOT_MAX_LEVEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) if (maybe_indirect) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) PT64_ROOT_MAX_LEVEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) PT64_ROOT_MAX_LEVEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) kmem_cache_free(pte_list_desc_cache, pte_list_desc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) if (!sp->role.direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) return sp->gfns[index];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) if (!sp->role.direct) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) sp->gfns[index] = gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) pr_err_ratelimited("gfn mismatch under direct page %llx "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) "(expected %llx, got %llx)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) sp->gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) kvm_mmu_page_get_gfn(sp, index), gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) * Return the pointer to the large page information for a given gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) * handling slots that are not large page aligned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) int level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) unsigned long idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) idx = gfn_to_index(gfn, slot->base_gfn, level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) return &slot->arch.lpage_info[level - 2][idx];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) gfn_t gfn, int count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) struct kvm_lpage_info *linfo;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) linfo = lpage_info_slot(gfn, slot, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) linfo->disallow_lpage += count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) WARN_ON(linfo->disallow_lpage < 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) update_gfn_disallow_lpage_count(slot, gfn, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) update_gfn_disallow_lpage_count(slot, gfn, -1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) struct kvm_memslots *slots;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) gfn_t gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) kvm->arch.indirect_shadow_pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) gfn = sp->gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) slots = kvm_memslots_for_spte_role(kvm, sp->role);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) slot = __gfn_to_memslot(slots, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) /* the non-leaf shadow pages are keeping readonly. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) if (sp->role.level > PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) return kvm_slot_page_track_add_page(kvm, slot, gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) KVM_PAGE_TRACK_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) kvm_mmu_gfn_disallow_lpage(slot, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) if (sp->lpage_disallowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) ++kvm->stat.nx_lpage_splits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) list_add_tail(&sp->lpage_disallowed_link,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) &kvm->arch.lpage_disallowed_mmu_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) sp->lpage_disallowed = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) struct kvm_memslots *slots;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) gfn_t gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) kvm->arch.indirect_shadow_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) gfn = sp->gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) slots = kvm_memslots_for_spte_role(kvm, sp->role);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) slot = __gfn_to_memslot(slots, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) if (sp->role.level > PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) return kvm_slot_page_track_remove_page(kvm, slot, gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) KVM_PAGE_TRACK_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) kvm_mmu_gfn_allow_lpage(slot, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) --kvm->stat.nx_lpage_splits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) sp->lpage_disallowed = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) list_del(&sp->lpage_disallowed_link);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) static struct kvm_memory_slot *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) bool no_dirty_log)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) if (no_dirty_log && slot->dirty_bitmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) return slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) * About rmap_head encoding:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) * If the bit zero of rmap_head->val is clear, then it points to the only spte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) * pte_list_desc containing more mappings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) * Returns the number of pointers in the rmap chain, not counting the new one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) struct pte_list_desc *desc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) int i, count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) if (!rmap_head->val) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) rmap_head->val = (unsigned long)spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) } else if (!(rmap_head->val & 1)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) desc = mmu_alloc_pte_list_desc(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) desc->sptes[0] = (u64 *)rmap_head->val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) desc->sptes[1] = spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) rmap_head->val = (unsigned long)desc | 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) ++count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) while (desc->sptes[PTE_LIST_EXT-1]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) count += PTE_LIST_EXT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) if (!desc->more) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) desc->more = mmu_alloc_pte_list_desc(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) desc = desc->more;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) desc = desc->more;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) for (i = 0; desc->sptes[i]; ++i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) ++count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) desc->sptes[i] = spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) return count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) struct pte_list_desc *desc, int i,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) struct pte_list_desc *prev_desc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) int j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) ;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) desc->sptes[i] = desc->sptes[j];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) desc->sptes[j] = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) if (j != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) if (!prev_desc && !desc->more)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) rmap_head->val = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) if (prev_desc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) prev_desc->more = desc->more;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) rmap_head->val = (unsigned long)desc->more | 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) mmu_free_pte_list_desc(desc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) struct pte_list_desc *desc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) struct pte_list_desc *prev_desc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) if (!rmap_head->val) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) pr_err("%s: %p 0->BUG\n", __func__, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) } else if (!(rmap_head->val & 1)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) rmap_printk("%s: %p 1->0\n", __func__, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) if ((u64 *)rmap_head->val != spte) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) pr_err("%s: %p 1->BUG\n", __func__, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) rmap_head->val = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) rmap_printk("%s: %p many->many\n", __func__, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) prev_desc = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) while (desc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) if (desc->sptes[i] == spte) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) pte_list_desc_remove_entry(rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) desc, i, prev_desc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) prev_desc = desc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) desc = desc->more;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) pr_err("%s: %p many->many\n", __func__, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) mmu_spte_clear_track_bits(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) __pte_list_remove(sptep, rmap_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) struct kvm_memory_slot *slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) unsigned long idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) idx = gfn_to_index(gfn, slot->base_gfn, level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) struct kvm_memslots *slots;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) slots = kvm_memslots_for_spte_role(kvm, sp->role);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) slot = __gfn_to_memslot(slots, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) return __gfn_to_rmap(gfn, sp->role.level, slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) static bool rmap_can_add(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) struct kvm_mmu_memory_cache *mc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) mc = &vcpu->arch.mmu_pte_list_desc_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) return kvm_mmu_memory_cache_nr_free_objects(mc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) struct kvm_rmap_head *rmap_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) sp = sptep_to_sp(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) return pte_list_add(vcpu, spte, rmap_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) static void rmap_remove(struct kvm *kvm, u64 *spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) gfn_t gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) struct kvm_rmap_head *rmap_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) sp = sptep_to_sp(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) rmap_head = gfn_to_rmap(kvm, gfn, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) __pte_list_remove(spte, rmap_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987)
/*
 * Cursor used by rmap_get_first()/rmap_get_next() to step through the sptes
 * linked by one rmap.  Every field is private to those helpers and must not
 * be relied upon by other code.
 */
struct rmap_iterator {
	/* private fields */

	/* descriptor holding the current sptep; NULL if no descriptor is in use */
	struct pte_list_desc *desc;

	/* index of the current sptep inside desc->sptes[] */
	int pos;
};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) * Iteration must be started by this function. This should also be used after
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) * removing/dropping sptes from the rmap link because in such cases the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) * information in the iterator may not be valid.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) * Returns sptep if found, NULL otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) struct rmap_iterator *iter)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) if (!rmap_head->val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) if (!(rmap_head->val & 1)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) iter->desc = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) sptep = (u64 *)rmap_head->val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) iter->pos = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) sptep = iter->desc->sptes[iter->pos];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) BUG_ON(!is_shadow_present_pte(*sptep));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) return sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) * Must be used with a valid iterator: e.g. after rmap_get_first().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) * Returns sptep if found, NULL otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) static u64 *rmap_get_next(struct rmap_iterator *iter)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) if (iter->desc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) if (iter->pos < PTE_LIST_EXT - 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) ++iter->pos;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) sptep = iter->desc->sptes[iter->pos];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) if (sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) iter->desc = iter->desc->more;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) if (iter->desc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) iter->pos = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) /* desc->sptes[0] cannot be NULL */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) sptep = iter->desc->sptes[iter->pos];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) BUG_ON(!is_shadow_present_pte(*sptep));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) return sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059)
/*
 * Visit every sptep linked to @_rmap_head_, using @_iter_ (a pointer to a
 * struct rmap_iterator) as the cursor.  @_spte_ receives each sptep in turn;
 * the loop stops when rmap_get_first()/rmap_get_next() returns NULL.
 */
#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
	for ((_spte_) = rmap_get_first((_rmap_head_), (_iter_));	\
	     (_spte_); (_spte_) = rmap_get_next((_iter_)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) static void drop_spte(struct kvm *kvm, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) if (mmu_spte_clear_track_bits(sptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) rmap_remove(kvm, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) if (is_large_pte(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) drop_spte(kvm, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) --kvm->stat.lpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) if (__drop_large_spte(vcpu->kvm, sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) struct kvm_mmu_page *sp = sptep_to_sp(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) KVM_PAGES_PER_HPAGE(sp->role.level));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) * Write-protect on the specified @sptep, @pt_protect indicates whether
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) * spte write-protection is caused by protecting shadow page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) * Note: write protection is difference between dirty logging and spte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) * protection:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) * - for dirty logging, the spte can be set to writable at anytime if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) * its dirty bitmap is properly set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) * - for spte protection, the spte can be writable only after unsync-ing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) * shadow page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) * Return true if tlb need be flushed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) static bool spte_write_protect(u64 *sptep, bool pt_protect)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) u64 spte = *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) if (!is_writable_pte(spte) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) if (pt_protect)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) spte &= ~SPTE_MMU_WRITEABLE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) spte = spte & ~PT_WRITABLE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) return mmu_spte_update(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) static bool __rmap_write_protect(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) bool pt_protect)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) for_each_rmap_spte(rmap_head, &iter, sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) flush |= spte_write_protect(sptep, pt_protect);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) return flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) static bool spte_clear_dirty(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) u64 spte = *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) MMU_WARN_ON(!spte_ad_enabled(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) spte &= ~shadow_dirty_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) return mmu_spte_update(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) static bool spte_wrprot_for_clear_dirty(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) (unsigned long *)sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) if (was_writable && !spte_ad_enabled(*sptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) kvm_set_pfn_dirty(spte_to_pfn(*sptep));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) return was_writable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) * Gets the GFN ready for another round of dirty logging by clearing the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) * - D bit on ad-enabled SPTEs, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) * - W bit on ad-disabled SPTEs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) * Returns true iff any D or W bits were cleared.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) for_each_rmap_spte(rmap_head, &iter, sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) if (spte_ad_need_write_protect(*sptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) flush |= spte_wrprot_for_clear_dirty(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) flush |= spte_clear_dirty(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) return flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) static bool spte_set_dirty(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) u64 spte = *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) * Similar to the !kvm_x86_ops.slot_disable_log_dirty case,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) * do not bother adding back write access to pages marked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) * SPTE_AD_WRPROT_ONLY_MASK.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) spte |= shadow_dirty_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) return mmu_spte_update(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) for_each_rmap_spte(rmap_head, &iter, sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) if (spte_ad_enabled(*sptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) flush |= spte_set_dirty(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) return flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) * @kvm: kvm instance
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) * @slot: slot to protect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) * @gfn_offset: start of the BITS_PER_LONG pages we care about
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) * @mask: indicates which pages we should protect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) * Used when we do not need to care about huge page mappings: e.g. during dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) * logging we do not have any such mappings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) gfn_t gfn_offset, unsigned long mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) struct kvm_rmap_head *rmap_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) slot->base_gfn + gfn_offset, mask, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) while (mask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) PG_LEVEL_4K, slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) __rmap_write_protect(kvm, rmap_head, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) /* clear the first set bit */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) mask &= mask - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) * protect the page if the D-bit isn't supported.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) * @kvm: kvm instance
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) * @slot: slot to clear D-bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) * @gfn_offset: start of the BITS_PER_LONG pages we care about
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) * @mask: indicates which pages we should clear D-bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) gfn_t gfn_offset, unsigned long mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) struct kvm_rmap_head *rmap_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) slot->base_gfn + gfn_offset, mask, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) while (mask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) PG_LEVEL_4K, slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) __rmap_clear_dirty(kvm, rmap_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) /* clear the first set bit */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) mask &= mask - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) * PT level pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) * enable dirty logging for them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) * Used when we do not need to care about huge page mappings: e.g. during dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) * logging we do not have any such mappings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) gfn_t gfn_offset, unsigned long mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) if (kvm_x86_ops.enable_log_dirty_pt_masked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) kvm_x86_ops.enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) struct kvm_memory_slot *slot, u64 gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) struct kvm_rmap_head *rmap_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) bool write_protected = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) rmap_head = __gfn_to_rmap(gfn, i, slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) write_protected |= __rmap_write_protect(kvm, rmap_head, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) write_protected |=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) return write_protected;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) while ((sptep = rmap_get_first(rmap_head, &iter))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) pte_list_remove(rmap_head, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) return flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) struct kvm_memory_slot *slot, gfn_t gfn, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) unsigned long data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) return kvm_zap_rmapp(kvm, rmap_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) struct kvm_memory_slot *slot, gfn_t gfn, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) unsigned long data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) int need_flush = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) u64 new_spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) pte_t *ptep = (pte_t *)data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) kvm_pfn_t new_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) WARN_ON(pte_huge(*ptep));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) new_pfn = pte_pfn(*ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) for_each_rmap_spte(rmap_head, &iter, sptep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) sptep, *sptep, gfn, level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) need_flush = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) if (pte_write(*ptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) pte_list_remove(rmap_head, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) new_spte = kvm_mmu_changed_pte_notifier_make_spte(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) *sptep, new_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) mmu_spte_clear_track_bits(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) mmu_spte_set(sptep, new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) if (need_flush && kvm_available_flush_tlb_with_range()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) return need_flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) struct slot_rmap_walk_iterator {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) /* input fields. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) gfn_t start_gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) gfn_t end_gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) int start_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) int end_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) /* output fields. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) gfn_t gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) struct kvm_rmap_head *rmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) int level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) /* private field. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) struct kvm_rmap_head *end_rmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) iterator->level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) iterator->gfn = iterator->start_gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) iterator->slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) struct kvm_memory_slot *slot, int start_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) int end_level, gfn_t start_gfn, gfn_t end_gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) iterator->slot = slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) iterator->start_level = start_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) iterator->end_level = end_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) iterator->start_gfn = start_gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) iterator->end_gfn = end_gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) rmap_walk_init_level(iterator, iterator->start_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) return !!iterator->rmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) if (++iterator->rmap <= iterator->end_rmap) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) if (++iterator->level > iterator->end_level) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) iterator->rmap = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) rmap_walk_init_level(iterator, iterator->level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439)
/*
 * Iterate over every rmap head of @_slot_ for the gfn range
 * [@_start_gfn, @_end_gfn] (both inclusive), at each level from
 * @_start_level_ up to @_end_level_.  @_iter_ points to a
 * struct slot_rmap_walk_iterator that exposes the current position.
 */
#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
				 _start_gfn, _end_gfn, _iter_)		\
	for (slot_rmap_walk_init((_iter_), (_slot_), (_start_level_),	\
				 (_end_level_), (_start_gfn),		\
				 (_end_gfn));				\
	     slot_rmap_walk_okay((_iter_));				\
	     slot_rmap_walk_next((_iter_)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) static int kvm_handle_hva_range(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) unsigned long start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) unsigned long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) unsigned long data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) int (*handler)(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) unsigned long data))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) struct kvm_memslots *slots;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) struct kvm_memory_slot *memslot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) struct slot_rmap_walk_iterator iterator;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) slots = __kvm_memslots(kvm, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) kvm_for_each_memslot(memslot, slots) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) unsigned long hva_start, hva_end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) gfn_t gfn_start, gfn_end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) hva_start = max(start, memslot->userspace_addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) hva_end = min(end, memslot->userspace_addr +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) (memslot->npages << PAGE_SHIFT));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) if (hva_start >= hva_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) * {gfn(page) | page intersects with [hva_start, hva_end)} =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) * {gfn_start, gfn_start+1, ..., gfn_end-1}.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) gfn_start = hva_to_gfn_memslot(hva_start, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) KVM_MAX_HUGEPAGE_LEVEL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) gfn_start, gfn_end - 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) &iterator)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) ret |= handler(kvm, iterator.rmap, memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) iterator.gfn, iterator.level, data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) unsigned long data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) int (*handler)(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) gfn_t gfn, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) unsigned long data))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) unsigned flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) struct kvm_memory_slot *slot, gfn_t gfn, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) unsigned long data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) int young = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) for_each_rmap_spte(rmap_head, &iter, sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) young |= mmu_spte_age(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) trace_kvm_age_page(gfn, level, slot, young);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) return young;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) struct kvm_memory_slot *slot, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) int level, unsigned long data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) for_each_rmap_spte(rmap_head, &iter, sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) if (is_accessed_spte(*sptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) #define RMAP_RECYCLE_THRESHOLD 1000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) struct kvm_rmap_head *rmap_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) sp = sptep_to_sp(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) KVM_PAGES_PER_HPAGE(sp->role.level));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) int young = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) return young;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) int young = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) return young;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595)
#ifdef MMU_DEBUG
/*
 * Debug helper: scan the whole page of sptes at @spt.  Returns 1 if none of
 * them is shadow-present; otherwise logs the first present entry and
 * returns 0.
 */
static int is_empty_shadow_page(u64 *spt)
{
	unsigned long i;

	for (i = 0; i < PAGE_SIZE / sizeof(u64); i++) {
		if (!is_shadow_present_pte(spt[i]))
			continue;

		printk(KERN_ERR "%s: %p %llx\n", __func__,
		       &spt[i], spt[i]);
		return 0;
	}

	return 1;
}
#endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) * This value is the sum of all of the kvm instances's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) * kvm->arch.n_used_mmu_pages values. We need a global,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) * aggregate version in order to make the slab shrinker
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) * faster
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) kvm->arch.n_used_mmu_pages += nr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) percpu_counter_add(&kvm_total_used_mmu_pages, nr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) hlist_del(&sp->hash_link);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) list_del(&sp->link);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) free_page((unsigned long)sp->spt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) if (!sp->role.direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) free_page((unsigned long)sp->gfns);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) kmem_cache_free(mmu_page_header_cache, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) static unsigned kvm_page_table_hashfn(gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) return hash_64(gfn, KVM_MMU_HASH_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) struct kvm_mmu_page *sp, u64 *parent_pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) if (!parent_pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) u64 *parent_pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) __pte_list_remove(parent_pte, &sp->parent_ptes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) static void drop_parent_pte(struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) u64 *parent_pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) mmu_page_remove_parent_pte(sp, parent_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) mmu_spte_clear_no_track(parent_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) if (!direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) * depends on valid pages being added to the head of the list. See
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) * comments in kvm_zap_obsolete_pages().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) kvm_mod_used_mmu_pages(vcpu->kvm, +1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) return sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) static void mark_unsync(u64 *spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) mark_unsync(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) static void mark_unsync(u64 *spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) unsigned int index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) sp = sptep_to_sp(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) index = spte - sp->spt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) if (__test_and_set_bit(index, sp->unsync_child_bitmap))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) if (sp->unsync_children++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) kvm_mmu_mark_parents_unsync(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707)
/*
 * sync_page implementation that does no work: both arguments are ignored
 * and the result is always 0.
 */
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
			       struct kvm_mmu_page *sp)
{
	return 0;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713)
/* Capacity of the fixed-size page vector below. */
#define KVM_PAGE_ARRAY_NR 16

/*
 * Small fixed-capacity vector of shadow pages, filled by mmu_pages_add().
 * Each entry pairs a shadow page with an index supplied by the caller.
 */
struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;	/* number of entries of page[] in use */
};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) int idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) if (sp->unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) for (i=0; i < pvec->nr; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) if (pvec->page[i].sp == sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) pvec->page[pvec->nr].sp = sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) pvec->page[pvec->nr].idx = idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) pvec->nr++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) return (pvec->nr == KVM_PAGE_ARRAY_NR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) --sp->unsync_children;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) WARN_ON((int)sp->unsync_children < 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) __clear_bit(idx, sp->unsync_child_bitmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) struct kvm_mmu_pages *pvec)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) int i, ret, nr_unsync_leaf = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) struct kvm_mmu_page *child;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) u64 ent = sp->spt[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) clear_unsync_child_bit(sp, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) if (child->unsync_children) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) if (mmu_pages_add(pvec, child, i))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) ret = __mmu_unsync_walk(child, pvec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) if (!ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) clear_unsync_child_bit(sp, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) } else if (ret > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) nr_unsync_leaf += ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) } else if (child->unsync) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) nr_unsync_leaf++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) if (mmu_pages_add(pvec, child, i))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) clear_unsync_child_bit(sp, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) return nr_unsync_leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) #define INVALID_INDEX (-1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) static int mmu_unsync_walk(struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) struct kvm_mmu_pages *pvec)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) pvec->nr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) if (!sp->unsync_children)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) mmu_pages_add(pvec, sp, INVALID_INDEX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) return __mmu_unsync_walk(sp, pvec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) WARN_ON(!sp->unsync);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) trace_kvm_mmu_sync_page(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) sp->unsync = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) --kvm->stat.mmu_unsync;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) struct list_head *invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) static void kvm_mmu_commit_zap_page(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) struct list_head *invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811)
/*
 * Walk the hash chain @_list and run the loop body only for shadow pages
 * that are not obsolete.  The empty-if/else form lets the macro be followed
 * by a single statement or block like an ordinary loop header.
 */
#define for_each_valid_sp(_kvm, _sp, _list)				\
	hlist_for_each_entry(_sp, _list, hash_link)			\
		if (is_obsolete_sp((_kvm), (_sp))) {			\
		} else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816)
/*
 * Visit every non-obsolete, non-direct shadow page whose gfn equals @_gfn.
 * Only the hash bucket selected by kvm_page_table_hashfn(@_gfn) is scanned.
 */
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
	for_each_valid_sp(_kvm, _sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) static inline bool is_ept_sp(struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) return sp->role.cr0_wp && sp->role.smap_andnot_wp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) /* @sp->gfn should be write-protected at the call site */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) struct list_head *invalid_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) bool remote_flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) if (!remote_flush && list_empty(invalid_list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) if (!list_empty(invalid_list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) kvm_mmu_commit_zap_page(kvm, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) kvm_flush_remote_tlbs(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) struct list_head *invalid_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) bool remote_flush, bool local_flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) if (local_flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864)
/*
 * MMU auditing: without CONFIG_KVM_MMU_AUDIT the audit hooks are empty
 * stubs, otherwise the implementation is pulled in from mmu_audit.c.
 */
#ifndef CONFIG_KVM_MMU_AUDIT
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
{
}

static void mmu_audit_disable(void)
{
}
#else
#include "mmu_audit.c"
#endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) return sp->role.invalid ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) kvm_unlink_unsync_page(vcpu->kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) return __kvm_sync_page(vcpu, sp, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) /* @gfn should be write-protected at the call site */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) struct kvm_mmu_page *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) bool ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) if (!s->unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) WARN_ON(s->role.level != PG_LEVEL_4K);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) ret |= kvm_sync_page(vcpu, s, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) struct mmu_page_path {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) unsigned int idx[PT64_ROOT_MAX_LEVEL];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907)
/*
 * Iterate over the entries of @pvec selected by mmu_pages_first() and
 * mmu_pages_next(), loading each entry's page into @sp and keeping @parents
 * up to date.  @i is the caller-provided position variable.
 */
#define for_each_sp(pvec, sp, parents, i)				\
	for ((i) = mmu_pages_first(&(pvec), &(parents));		\
	     (i) < (pvec).nr && ((sp) = (pvec).page[i].sp, 1);		\
	     (i) = mmu_pages_next(&(pvec), &(parents), (i)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) static int mmu_pages_next(struct kvm_mmu_pages *pvec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) struct mmu_page_path *parents,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) int i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) int n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) for (n = i+1; n < pvec->nr; n++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) struct kvm_mmu_page *sp = pvec->page[n].sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) unsigned idx = pvec->page[n].idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) int level = sp->role.level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) parents->idx[level-1] = idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) if (level == PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) parents->parent[level-2] = sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) return n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) static int mmu_pages_first(struct kvm_mmu_pages *pvec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) struct mmu_page_path *parents)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) int level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) if (pvec->nr == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) WARN_ON(pvec->page[0].idx != INVALID_INDEX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) sp = pvec->page[0].sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) level = sp->role.level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) WARN_ON(level == PG_LEVEL_4K);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) parents->parent[level-2] = sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) /* Also set up a sentinel. Further entries in pvec are all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) * children of sp, so this element is never overwritten.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) parents->parent[level-1] = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) return mmu_pages_next(pvec, parents, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) static void mmu_pages_clear_parents(struct mmu_page_path *parents)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) unsigned int level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) unsigned int idx = parents->idx[level];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) sp = parents->parent[level];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) if (!sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) WARN_ON(idx == INVALID_INDEX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) clear_unsync_child_bit(sp, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) level++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) } while (!sp->unsync_children);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) static void mmu_sync_children(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) struct kvm_mmu_page *parent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) struct mmu_page_path parents;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) struct kvm_mmu_pages pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) while (mmu_unsync_walk(parent, &pages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) bool protected = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) for_each_sp(pages, sp, parents, i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) protected |= rmap_write_protect(vcpu, sp->gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) if (protected) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) kvm_flush_remote_tlbs(vcpu->kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) for_each_sp(pages, sp, parents, i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) flush |= kvm_sync_page(vcpu, sp, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) mmu_pages_clear_parents(&parents);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) cond_resched_lock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) atomic_set(&sp->write_flooding_count, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) static void clear_sp_write_flooding_count(u64 *spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) __clear_sp_write_flooding_count(sptep_to_sp(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) gva_t gaddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) unsigned level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) int direct,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) unsigned int access)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) bool direct_mmu = vcpu->arch.mmu->direct_map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) union kvm_mmu_page_role role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) struct hlist_head *sp_list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) unsigned quadrant;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) bool need_sync = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) int collisions = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) role = vcpu->arch.mmu->mmu_role.base;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) role.level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) role.direct = direct;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) if (role.direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) role.gpte_is_8_bytes = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) role.access = access;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) role.quadrant = quadrant;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) for_each_valid_sp(vcpu->kvm, sp, sp_list) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) if (sp->gfn != gfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) collisions++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) if (!need_sync && sp->unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) need_sync = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) if (sp->role.word != role.word)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) if (direct_mmu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) goto trace_get_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) if (sp->unsync) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) /* The page is good, but __kvm_sync_page might still end
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) * up zapping it. If so, break in order to rebuild it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) if (!__kvm_sync_page(vcpu, sp, &invalid_list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) WARN_ON(!list_empty(&invalid_list));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) if (sp->unsync_children)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) __clear_sp_write_flooding_count(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) trace_get_page:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) trace_kvm_mmu_get_page(sp, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) ++vcpu->kvm->stat.mmu_cache_miss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) sp = kvm_mmu_alloc_page(vcpu, direct);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) sp->gfn = gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) sp->role = role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) hlist_add_head(&sp->hash_link, sp_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) if (!direct) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) * we should do write protection before syncing pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) * otherwise the content of the synced shadow page may
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) * be inconsistent with guest page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) account_shadowed(vcpu->kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) if (level > PG_LEVEL_4K && need_sync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) trace_kvm_mmu_get_page(sp, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108) kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111) vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) return sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) struct kvm_vcpu *vcpu, hpa_t root,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) u64 addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) iterator->addr = addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) iterator->shadow_addr = root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) iterator->level = vcpu->arch.mmu->shadow_root_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) if (iterator->level == PT64_ROOT_4LEVEL &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) !vcpu->arch.mmu->direct_map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) --iterator->level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) if (iterator->level == PT32E_ROOT_LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) * prev_root is currently only used for 64-bit hosts. So only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) * the active root_hpa is valid here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) BUG_ON(root != vcpu->arch.mmu->root_hpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) iterator->shadow_addr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) --iterator->level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) if (!iterator->shadow_addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) iterator->level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) struct kvm_vcpu *vcpu, u64 addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) if (iterator->level < PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) if (is_last_spte(spte, iterator->level)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) iterator->level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) --iterator->level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) __shadow_walk_next(iterator, *iterator->sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) u64 spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) mmu_spte_set(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) mmu_page_add_parent_pte(vcpu, sp, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) if (sp->unsync_children || sp->unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) mark_unsync(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) unsigned direct_access)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) struct kvm_mmu_page *child;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) * For the direct sp, if the guest pte's dirty bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) * changed form clean to dirty, it will corrupt the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) * sp's access: allow writable in the read-only sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) * so we should update the spte at this point to get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) * a new sp with the correct access.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) if (child->role.access == direct_access)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) drop_parent_pte(child, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) /* Returns the number of zapped non-leaf child shadow pages. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) u64 *spte, struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) u64 pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) struct kvm_mmu_page *child;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) pte = *spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) if (is_shadow_present_pte(pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) if (is_last_spte(pte, sp->role.level)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) drop_spte(kvm, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) if (is_large_pte(pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) --kvm->stat.lpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) drop_parent_pte(child, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) * Recursively zap nested TDP SPs, parentless SPs are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) * unlikely to be used again in the near future. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) * avoids retaining a large number of stale nested SPs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) if (tdp_enabled && invalid_list &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) child->role.guest_mode && !child->parent_ptes.val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) return kvm_mmu_prepare_zap_page(kvm, child,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) } else if (is_mmio_spte(pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) mmu_spte_clear_no_track(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) static int kvm_mmu_page_unlink_children(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) int zapped = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) unsigned i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) return zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) drop_parent_pte(sp, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) static int mmu_zap_unsync_children(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) struct kvm_mmu_page *parent,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) int i, zapped = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) struct mmu_page_path parents;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) struct kvm_mmu_pages pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) if (parent->role.level == PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) while (mmu_unsync_walk(parent, &pages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) for_each_sp(pages, sp, parents, i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) mmu_pages_clear_parents(&parents);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) zapped++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) return zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) struct list_head *invalid_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) int *nr_zapped)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) bool list_unstable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) trace_kvm_mmu_prepare_zap_page(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) ++kvm->stat.mmu_shadow_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) kvm_mmu_unlink_parents(kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) /* Zapping children means active_mmu_pages has become unstable. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) list_unstable = *nr_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) if (!sp->role.invalid && !sp->role.direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) unaccount_shadowed(kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) if (sp->unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) kvm_unlink_unsync_page(kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) if (!sp->root_count) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) /* Count self */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) (*nr_zapped)++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) * Already invalid pages (previously active roots) are not on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) * the active page list. See list_del() in the "else" case of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) * !sp->root_count.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) if (sp->role.invalid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) list_add(&sp->link, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) list_move(&sp->link, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) kvm_mod_used_mmu_pages(kvm, -1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) * Remove the active root from the active page list, the root
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) * will be explicitly freed when the root_count hits zero.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) list_del(&sp->link);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) * Obsolete pages cannot be used on any vCPUs, see the comment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) * treats invalid shadow pages as being obsolete.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) if (!is_obsolete_sp(kvm, sp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) kvm_reload_remote_mmus(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) if (sp->lpage_disallowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) unaccount_huge_nx_page(kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) sp->role.invalid = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) return list_unstable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) int nr_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) return nr_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) static void kvm_mmu_commit_zap_page(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) struct kvm_mmu_page *sp, *nsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) if (list_empty(invalid_list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) * We need to make sure everyone sees our modifications to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) * the page tables and see changes to vcpu->mode here. The barrier
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) * in the kvm_flush_remote_tlbs() achieves this. This pairs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) * guest mode and/or lockless shadow page table walks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) kvm_flush_remote_tlbs(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) list_for_each_entry_safe(sp, nsp, invalid_list, link) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) WARN_ON(!sp->role.invalid || sp->root_count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384) kvm_mmu_free_page(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) unsigned long nr_to_zap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) unsigned long total_zapped = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) struct kvm_mmu_page *sp, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393) LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) bool unstable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) int nr_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) if (list_empty(&kvm->arch.active_mmu_pages))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401) list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) * Don't zap active root pages, the page itself can't be freed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) * and zapping it will just force vCPUs to realloc and reload.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) if (sp->root_count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) &nr_zapped);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411) total_zapped += nr_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) if (total_zapped >= nr_to_zap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) if (unstable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) kvm_mmu_commit_zap_page(kvm, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) kvm->stat.mmu_recycled += total_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) return total_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) return kvm->arch.n_max_mmu_pages -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) kvm->arch.n_used_mmu_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) if (!kvm_mmu_available_pages(vcpu->kvm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) * Changing the number of mmu pages allocated to the vm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) * Note: if goal_nr_mmu_pages is too small, you will get dead lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452) void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) goal_nr_mmu_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) r = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479) sp->role.word);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) r = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) kvm_mmu_commit_zap_page(kvm, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) trace_kvm_mmu_unsync_page(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) ++vcpu->kvm->stat.mmu_unsync;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) sp->unsync = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) kvm_mmu_mark_parents_unsync(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) bool can_unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) if (!can_unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) if (sp->unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) WARN_ON(sp->role.level != PG_LEVEL_4K);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) kvm_unsync_page(vcpu, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) * We need to ensure that the marking of unsync pages is visible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) * before the SPTE is updated to allow writes because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) * kvm_mmu_sync_roots() checks the unsync flags without holding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) * the MMU lock and so can race with this. If the SPTE was updated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) * before the page had been marked as unsync-ed, something like the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) * following could happen:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526) * CPU 1 CPU 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) * ---------------------------------------------------------------------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) * 1.2 Host updates SPTE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) * to be writable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) * 2.1 Guest writes a GPTE for GVA X.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) * (GPTE being in the guest page table shadowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) * by the SP from CPU 1.)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) * This reads SPTE during the page table walk.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) * Since SPTE.W is read as 1, there is no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) * fault.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) * 2.2 Guest issues TLB flush.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) * That causes a VM Exit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) * 2.3 kvm_mmu_sync_pages() reads sp->unsync.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) * Since it is false, so it just returns.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543) * 2.4 Guest accesses GVA X.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) * Since the mapping in the SP was not updated,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) * so the old mapping for GVA X incorrectly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) * gets used.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) * 1.1 Host marks SP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) * as unsync
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549) * (sp->unsync = true)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551) * The write barrier below ensures that 1.1 happens before 1.2 and thus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) * the situation in 2.4 does not arise. The implicit barrier in 2.2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) * pairs with this write barrier.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) smp_wmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561) unsigned int pte_access, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) gfn_t gfn, kvm_pfn_t pfn, bool speculative,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) bool can_unsync, bool host_writable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565) u64 spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572) sp = sptep_to_sp(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574) ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) can_unsync, host_writable, sp_ad_disabled(sp), &spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) if (spte & PT_WRITABLE_MASK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) kvm_vcpu_mark_page_dirty(vcpu, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) if (*sptep == spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) ret |= SET_SPTE_SPURIOUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) else if (mmu_spte_update(sptep, spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) unsigned int pte_access, bool write_fault, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) gfn_t gfn, kvm_pfn_t pfn, bool speculative,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) bool host_writable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) int was_rmapped = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) int rmap_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594) int set_spte_ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595) int ret = RET_PF_FIXED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2597)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2598) pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599) *sptep, write_fault, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601) if (is_shadow_present_pte(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) * If we overwrite a PTE page pointer with a 2MB PMD, unlink
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) * the parent of the now unreachable PTE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) struct kvm_mmu_page *child;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) u64 pte = *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) drop_parent_pte(child, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612) flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) } else if (pfn != spte_to_pfn(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) pgprintk("hfn old %llx new %llx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) spte_to_pfn(*sptep), pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616) drop_spte(vcpu->kvm, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617) flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619) was_rmapped = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) speculative, true, host_writable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) if (write_fault)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626) ret = RET_PF_EMULATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) KVM_PAGES_PER_HPAGE(level));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) if (unlikely(is_mmio_spte(*sptep)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635) ret = RET_PF_EMULATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638) * The fault is fully spurious if and only if the new SPTE and old SPTE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) * are identical, and emulation is not required.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642) WARN_ON_ONCE(!was_rmapped);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) return RET_PF_SPURIOUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) pgprintk("%s: setting spte %llx\n", __func__, *sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) trace_kvm_mmu_set_spte(level, gfn, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648) if (!was_rmapped && is_large_pte(*sptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649) ++vcpu->kvm->stat.lpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) if (is_shadow_present_pte(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652) if (!was_rmapped) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) rmap_count = rmap_add(vcpu, sptep, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) if (rmap_count > RMAP_RECYCLE_THRESHOLD)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655) rmap_recycle(vcpu, sptep, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) bool no_dirty_log)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) if (!slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) return KVM_PFN_ERR_FAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) return gfn_to_pfn_memslot_atomic(slot, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676) u64 *start, u64 *end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) struct page *pages[PTE_PREFETCH_NUM];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) unsigned int access = sp->role.access;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) int i, ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) gfn_t gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686) if (!slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690) if (ret <= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693) for (i = 0; i < ret; i++, gfn++, start++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694) mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) page_to_pfn(pages[i]), true, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) put_page(pages[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703) struct kvm_mmu_page *sp, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) u64 *spte, *start = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) WARN_ON(!sp->role.direct);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) spte = sp->spt + i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714) if (is_shadow_present_pte(*spte) || spte == sptep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) if (!start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719) start = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) } else if (!start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) start = spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) sp = sptep_to_sp(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) * Without accessed bits, there's no way to distinguish between
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) * actually accessed translations and prefetched, so disable pte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) * prefetch if accessed bits aren't available.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) if (sp_ad_disabled(sp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) if (sp->role.level > PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) __direct_pte_prefetch(vcpu, sp, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) kvm_pfn_t pfn, struct kvm_memory_slot *slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) unsigned long hva;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) pte_t *pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) int level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757) * is not solely for performance, it's also necessary to avoid the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) * "writable" check in __gfn_to_hva_many(), which will always fail on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) * read-only memslots due to gfn_to_hva() assuming writes. Earlier
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) * page fault steps have already verified the guest isn't writing a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) * read-only memslot.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) hva = __gfn_to_hva_memslot(slot, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) if (unlikely(!pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) return level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) int max_level, kvm_pfn_t *pfnp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774) bool huge_page_disallowed, int *req_level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776) struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777) struct kvm_lpage_info *linfo;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) kvm_pfn_t pfn = *pfnp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) kvm_pfn_t mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780) int level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) *req_level = PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784) if (unlikely(max_level == PG_LEVEL_4K))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787) if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788) return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790) slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791) if (!slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792) return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) max_level = min(max_level, max_huge_page_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795) for ( ; max_level > PG_LEVEL_4K; max_level--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796) linfo = lpage_info_slot(gfn, slot, max_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) if (!linfo->disallow_lpage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) if (max_level == PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802) return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) level = host_pfn_mapping_level(vcpu, gfn, pfn, slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) if (level == PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) return level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808) *req_level = level = min(level, max_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) * Enforce the iTLB multihit workaround after capturing the requested
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812) * level, which will be used to do precise, accurate accounting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) if (huge_page_disallowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) * mmu_notifier_retry() was successful and mmu_lock is held, so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819) * the pmd can't be split from under us.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) mask = KVM_PAGES_PER_HPAGE(level) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) VM_BUG_ON((gfn & mask) != (pfn & mask));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) *pfnp = pfn & ~mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) return level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829) kvm_pfn_t *pfnp, int *goal_levelp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) int level = *goal_levelp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) if (cur_level == level && level > PG_LEVEL_4K &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) is_shadow_present_pte(spte) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) !is_large_pte(spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) * A small SPTE exists for this pfn, but FNAME(fetch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) * and __direct_map would like to create a large PTE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) * instead: just force them to go down another level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) * patching back for them into pfn the next 9 bits of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) * the address.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) u64 page_mask = KVM_PAGES_PER_HPAGE(level) -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) KVM_PAGES_PER_HPAGE(level - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) *pfnp |= gfn & page_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) (*goal_levelp)--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851) int map_writable, int max_level, kvm_pfn_t pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) bool prefault, bool is_tdp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) bool write = error_code & PFERR_WRITE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) bool exec = error_code & PFERR_FETCH_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857) bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) struct kvm_shadow_walk_iterator it;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) int level, req_level, ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861) gfn_t gfn = gpa >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) gfn_t base_gfn = gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864) if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865) return RET_PF_RETRY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) huge_page_disallowed, &req_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) trace_kvm_mmu_spte_requested(gpa, level, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) for_each_shadow_entry(vcpu, gpa, it) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) * We cannot overwrite existing page tables with an NX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) * large page, as the leaf could be executable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876) if (nx_huge_page_workaround_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) disallowed_hugepage_adjust(*it.sptep, gfn, it.level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878) &pfn, &level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880) base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) if (it.level == level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884) drop_large_spte(vcpu, it.sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2885) if (!is_shadow_present_pte(*it.sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2886) sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2887) it.level - 1, true, ACC_ALL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2888)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889) link_shadow_page(vcpu, it.sptep, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) if (is_tdp && huge_page_disallowed &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) req_level >= it.level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) account_huge_nx_page(vcpu->kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897) write, level, base_gfn, pfn, prefault,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) map_writable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) if (ret == RET_PF_SPURIOUS)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902) direct_pte_prefetch(vcpu, it.sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2903) ++vcpu->stat.pf_fixed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907) static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) * Do not cache the mmio info caused by writing the readonly gfn
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) * into the spte otherwise read access on readonly gfn also can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917) * caused mmio page fault and treat it as mmio access.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) if (pfn == KVM_PFN_ERR_RO_FAULT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920) return RET_PF_EMULATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) if (pfn == KVM_PFN_ERR_HWPOISON) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924) return RET_PF_RETRY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) kvm_pfn_t pfn, unsigned int access,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932) int *ret_val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934) /* The pfn is invalid, report the error! */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) if (unlikely(is_error_pfn(pfn))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936) *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) if (unlikely(is_noslot_pfn(pfn)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941) vcpu_cache_mmio_info(vcpu, gva, gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) access & shadow_mmio_access_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) static bool page_fault_can_be_fast(u32 error_code)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) * Do not fix the mmio spte with invalid generation number which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) * need to be updated by slow page fault path.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) if (unlikely(error_code & PFERR_RSVD_MASK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) /* See if the page fault is due to an NX violation */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957) if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962) * #PF can be fast if:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) * 1. The shadow page table entry is not present, which could mean that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964) * the fault is potentially caused by access tracking (if enabled).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) * 2. The shadow page table entry is present and the fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) * is caused by write-protect, that means we just need change the W
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) * bit of the spte which can be done out of mmu-lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) * However, if access tracking is disabled we know that a non-present
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970) * page must be a genuine page fault where we have to create a new SPTE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) * So, if access tracking is disabled, we return true only for write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) * accesses to a present page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) return shadow_acc_track_mask != 0 ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976) ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981) * Returns true if the SPTE was fixed successfully. Otherwise,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982) * someone else modified the SPTE from its original value.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984) static bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985) fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986) u64 *sptep, u64 old_spte, u64 new_spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) gfn_t gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990) WARN_ON(!sp->role.direct);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) * Theoretically we could also set dirty bit (and flush TLB) here in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994) * order to eliminate unnecessary PML logging. See comments in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) * set_spte. But fast_page_fault is very unlikely to happen with PML
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) * enabled, so we do not do this. This might result in the same GPA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997) * to be logged in PML buffer again when the write really happens, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) * eventually to be called by mark_page_dirty twice. But it's also no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) * harm. This also avoids the TLB flush needed after setting dirty bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) * so non-PML cases won't be impacted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) * Compare with set_spte where instead shadow_dirty_mask is set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007) if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) * The gfn of direct spte is stable since it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010) * calculated by sp->gfn.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012) gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013) kvm_vcpu_mark_page_dirty(vcpu, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3016) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3017) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3018)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3019) static bool is_access_allowed(u32 fault_err_code, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3020) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3021) if (fault_err_code & PFERR_FETCH_MASK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3022) return is_executable_pte(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3023)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3024) if (fault_err_code & PFERR_WRITE_MASK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3025) return is_writable_pte(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3027) /* Fault was on Read access */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3028) return spte & PT_PRESENT_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032) * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034) static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035) u32 error_code)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037) struct kvm_shadow_walk_iterator iterator;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039) int ret = RET_PF_INVALID;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040) u64 spte = 0ull;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041) uint retry_count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043) if (!page_fault_can_be_fast(error_code))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046) walk_shadow_page_lockless_begin(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049) u64 new_spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051) for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052) if (!is_shadow_present_pte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055) sp = sptep_to_sp(iterator.sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056) if (!is_last_spte(spte, sp->role.level))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060) * Check whether the memory access that caused the fault would
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061) * still cause it if it were to be performed right now. If not,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062) * then this is a spurious fault caused by TLB lazily flushed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063) * or some other CPU has already fixed the PTE after the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064) * current CPU took the fault.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) * Need not check the access of upper level table entries since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3067) * they are always ACC_ALL.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3068) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3069) if (is_access_allowed(error_code, spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3070) ret = RET_PF_SPURIOUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3071) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3072) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3073)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3074) new_spte = spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3076) if (is_access_track_spte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3077) new_spte = restore_acc_track_spte(new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3078)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3079) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3080) * Currently, to simplify the code, write-protection can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3081) * be removed in the fast path only if the SPTE was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3082) * write-protected for dirty-logging or access tracking.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3083) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3084) if ((error_code & PFERR_WRITE_MASK) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3085) spte_can_locklessly_be_made_writable(spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3086) new_spte |= PT_WRITABLE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3087)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3088) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3089) * Do not fix write-permission on the large spte. Since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3090) * we only dirty the first page into the dirty-bitmap in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3091) * fast_pf_fix_direct_spte(), other pages are missed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3092) * if its slot has dirty logging enabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3093) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3094) * Instead, we let the slow page fault path create a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3095) * normal spte to fix the access.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3096) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3097) * See the comments in kvm_arch_commit_memory_region().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3098) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3099) if (sp->role.level > PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3100) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3101) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3102)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3103) /* Verify that the fault can be handled in the fast path */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3104) if (new_spte == spte ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3105) !is_access_allowed(error_code, new_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3106) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3108) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3109) * Currently, fast page fault only works for direct mapping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3110) * since the gfn is not stable for indirect shadow page. See
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3111) * Documentation/virt/kvm/locking.rst to get more detail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3112) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3113) if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3114) new_spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3115) ret = RET_PF_FIXED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3116) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3117) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3118)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3119) if (++retry_count > 4) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3120) printk_once(KERN_WARNING
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3121) "kvm: Fast #PF retrying more than 4 times.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3122) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3123) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3125) } while (true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3126)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3127) trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3128) spte, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3129) walk_shadow_page_lockless_end(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3130)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3131) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3132) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3134) static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3135) struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3136) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3137) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3138)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3139) if (!VALID_PAGE(*root_hpa))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3140) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3142) sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3144) if (kvm_mmu_put_root(kvm, sp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3145) if (sp->tdp_mmu_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3146) kvm_tdp_mmu_free_root(kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3147) else if (sp->role.invalid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3148) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3149) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3151) *root_hpa = INVALID_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3152) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3153)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3154) /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3155) void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3156) ulong roots_to_free)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3157) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3158) struct kvm *kvm = vcpu->kvm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3159) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3160) LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3161) bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3162)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3163) BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3164)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3165) /* Before acquiring the MMU lock, see if we need to do any real work. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3166) if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3167) for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3168) if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3169) VALID_PAGE(mmu->prev_roots[i].hpa))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3170) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3172) if (i == KVM_MMU_NUM_PREV_ROOTS)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3173) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3174) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3176) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3178) for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3179) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3180) mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3181) &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3183) if (free_active_root) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3184) if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3185) (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3186) mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3187) } else if (mmu->pae_root) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3188) for (i = 0; i < 4; ++i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3189) if (mmu->pae_root[i] != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3190) mmu_free_root_page(kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3191) &mmu->pae_root[i],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3192) &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3193) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3194) mmu->root_hpa = INVALID_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3195) mmu->root_pgd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3196) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3198) kvm_mmu_commit_zap_page(kvm, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3199) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3200) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3201) EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3203) static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3204) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3205) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3207) if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3208) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3209) ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3210) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3212) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3213) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3215) static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3216) u8 level, bool direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3217) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3218) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3219)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3220) spin_lock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3222) if (make_mmu_pages_available(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3223) spin_unlock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3224) return INVALID_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3225) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3226) sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3227) ++sp->root_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3228)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3229) spin_unlock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3230) return __pa(sp->spt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3231) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3232)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3233) static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3234) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3235) u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3236) hpa_t root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3237) unsigned i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3238)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3239) if (vcpu->kvm->arch.tdp_mmu_enabled) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3240) root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3242) if (!VALID_PAGE(root))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3243) return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3244) vcpu->arch.mmu->root_hpa = root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3245) } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3246) root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3247) true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3248)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3249) if (!VALID_PAGE(root))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3250) return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3251) vcpu->arch.mmu->root_hpa = root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3252) } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3253) for (i = 0; i < 4; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3254) MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3255)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3256) root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3257) i << 30, PT32_ROOT_LEVEL, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3258) if (!VALID_PAGE(root))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3259) return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3260) vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3261) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3262) vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3263) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3264) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3266) /* root_pgd is ignored for direct MMUs. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3267) vcpu->arch.mmu->root_pgd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3268)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3269) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3270) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3271)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3272) static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3273) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3274) u64 pdptr, pm_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3275) gfn_t root_gfn, root_pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3276) hpa_t root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3277) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3279) root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3280) root_gfn = root_pgd >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3281)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3282) if (mmu_check_root(vcpu, root_gfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3283) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3284)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3285) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3286) * Do we shadow a long mode page table? If so we need to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3287) * write-protect the guests page table root.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3288) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3289) if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3290) MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3291)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3292) root = mmu_alloc_root(vcpu, root_gfn, 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3293) vcpu->arch.mmu->shadow_root_level, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3294) if (!VALID_PAGE(root))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3295) return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3296) vcpu->arch.mmu->root_hpa = root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3297) goto set_root_pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3298) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3299)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3300) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3301) * We shadow a 32 bit page table. This may be a legacy 2-level
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3302) * or a PAE 3-level page table. In either case we need to be aware that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3303) * the shadow page table may be a PAE or a long mode page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3304) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3305) pm_mask = PT_PRESENT_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3306) if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3307) pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3309) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3310) * Allocate the page for the PDPTEs when shadowing 32-bit NPT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3311) * with 64-bit only when needed. Unlike 32-bit NPT, it doesn't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3312) * need to be in low mem. See also lm_root below.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3313) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3314) if (!vcpu->arch.mmu->pae_root) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3315) WARN_ON_ONCE(!tdp_enabled);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3317) vcpu->arch.mmu->pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3318) if (!vcpu->arch.mmu->pae_root)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3319) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3320) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3321) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3322)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3323) for (i = 0; i < 4; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3324) MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3325) if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3326) pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3327) if (!(pdptr & PT_PRESENT_MASK)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3328) vcpu->arch.mmu->pae_root[i] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3329) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3330) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3331) root_gfn = pdptr >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3332) if (mmu_check_root(vcpu, root_gfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3333) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3334) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3335)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3336) root = mmu_alloc_root(vcpu, root_gfn, i << 30,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3337) PT32_ROOT_LEVEL, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3338) if (!VALID_PAGE(root))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3339) return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3340) vcpu->arch.mmu->pae_root[i] = root | pm_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3341) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3342) vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3343)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3344) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3345) * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3346) * tables are allocated and initialized at MMU creation as there is no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3347) * equivalent level in the guest's NPT to shadow. Allocate the tables
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3348) * on demand, as running a 32-bit L1 VMM is very rare. The PDP is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3349) * handled above (to share logic with PAE), deal with the PML4 here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3350) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3351) if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3352) if (vcpu->arch.mmu->lm_root == NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3353) u64 *lm_root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3354)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3355) lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3356) if (!lm_root)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3357) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3359) lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3361) vcpu->arch.mmu->lm_root = lm_root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3362) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3363)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3364) vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3365) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3366)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3367) set_root_pgd:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3368) vcpu->arch.mmu->root_pgd = root_pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3369)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3370) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3371) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3373) static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3374) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3375) if (vcpu->arch.mmu->direct_map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3376) return mmu_alloc_direct_roots(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3377) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3378) return mmu_alloc_shadow_roots(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3379) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3380)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3381) void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3382) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3383) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3384) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3385)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3386) if (vcpu->arch.mmu->direct_map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3387) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3388)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3389) if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3390) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3391)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3392) vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3393)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3394) if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3395) hpa_t root = vcpu->arch.mmu->root_hpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3396) sp = to_shadow_page(root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3397)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3398) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3399) * Even if another CPU was marking the SP as unsync-ed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3400) * simultaneously, any guest page table changes are not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3401) * guaranteed to be visible anyway until this VCPU issues a TLB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3402) * flush strictly after those changes are made. We only need to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3403) * ensure that the other CPU sets these flags before any actual
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3404) * changes to the page tables are made. The comments in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3405) * mmu_need_write_protect() describe what could go wrong if this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3406) * requirement isn't satisfied.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3407) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3408) if (!smp_load_acquire(&sp->unsync) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3409) !smp_load_acquire(&sp->unsync_children))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3410) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3412) spin_lock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3413) kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3414)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3415) mmu_sync_children(vcpu, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3416)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3417) kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3418) spin_unlock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3419) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3420) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3421)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3422) spin_lock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3423) kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3424)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3425) for (i = 0; i < 4; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3426) hpa_t root = vcpu->arch.mmu->pae_root[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3427)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3428) if (root && VALID_PAGE(root)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3429) root &= PT64_BASE_ADDR_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3430) sp = to_shadow_page(root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3431) mmu_sync_children(vcpu, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3432) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3433) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3435) kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3436) spin_unlock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3437) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3438) EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3439)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3440) static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3441) u32 access, struct x86_exception *exception)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3442) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3443) if (exception)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3444) exception->error_code = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3445) return vaddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3446) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3448) static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3449) u32 access,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3450) struct x86_exception *exception)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3451) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3452) if (exception)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3453) exception->error_code = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3454) return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3455) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3456)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3457) static bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3458) __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3459) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3460) int bit7 = (pte >> 7) & 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3461)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3462) return pte & rsvd_check->rsvd_bits_mask[bit7][level-1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3463) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3464)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3465) static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3466) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3467) return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3468) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3469)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3470) static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3471) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3472) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3473) * A nested guest cannot use the MMIO cache if it is using nested
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3474) * page tables, because cr2 is a nGPA while the cache stores GPAs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3475) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3476) if (mmu_is_nested(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3477) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3479) if (direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3480) return vcpu_match_mmio_gpa(vcpu, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3481)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3482) return vcpu_match_mmio_gva(vcpu, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3483) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3484)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3485) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3486) * Return the level of the lowest level SPTE added to sptes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3487) * That SPTE may be non-present.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3488) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3489) static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3490) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3491) struct kvm_shadow_walk_iterator iterator;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3492) int leaf = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3493) u64 spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3494)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3495) walk_shadow_page_lockless_begin(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3497) for (shadow_walk_init(&iterator, vcpu, addr),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3498) *root_level = iterator.level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3499) shadow_walk_okay(&iterator);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3500) __shadow_walk_next(&iterator, spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3501) leaf = iterator.level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3502) spte = mmu_spte_get_lockless(iterator.sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3504) sptes[leaf - 1] = spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3506) if (!is_shadow_present_pte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3507) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3508) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3509)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3510) walk_shadow_page_lockless_end(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3512) return leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3513) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3515) /* return true if reserved bit is detected on spte. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3516) static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3517) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3518) u64 sptes[PT64_ROOT_MAX_LEVEL];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3519) struct rsvd_bits_validate *rsvd_check;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3520) int root, leaf, level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3521) bool reserved = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3522)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3523) if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3524) *sptep = 0ull;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3525) return reserved;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3526) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3527)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3528) if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3529) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3530) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3531) leaf = get_walk(vcpu, addr, sptes, &root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3532)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3533) if (unlikely(leaf < 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3534) *sptep = 0ull;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3535) return reserved;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3536) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3537)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3538) rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3540) for (level = root; level >= leaf; level--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3541) if (!is_shadow_present_pte(sptes[level - 1]))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3542) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3543) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3544) * Use a bitwise-OR instead of a logical-OR to aggregate the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3545) * reserved bit and EPT's invalid memtype/XWR checks to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3546) * adding a Jcc in the loop.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3547) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3548) reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3549) __is_rsvd_bits_set(rsvd_check, sptes[level - 1],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3550) level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3551) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3552)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3553) if (reserved) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3554) pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3555) __func__, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3556) for (level = root; level >= leaf; level--)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3557) pr_err("------ spte 0x%llx level %d.\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3558) sptes[level - 1], level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3559) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3560)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3561) *sptep = sptes[leaf - 1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3562)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3563) return reserved;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3564) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3565)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3566) static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3567) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3568) u64 spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3569) bool reserved;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3570)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3571) if (mmio_info_in_cache(vcpu, addr, direct))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3572) return RET_PF_EMULATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3573)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3574) reserved = get_mmio_spte(vcpu, addr, &spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3575) if (WARN_ON(reserved))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3576) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3577)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3578) if (is_mmio_spte(spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3579) gfn_t gfn = get_mmio_spte_gfn(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3580) unsigned int access = get_mmio_spte_access(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3581)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3582) if (!check_mmio_spte(vcpu, spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3583) return RET_PF_INVALID;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3584)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3585) if (direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3586) addr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3587)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3588) trace_handle_mmio_page_fault(addr, gfn, access);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3589) vcpu_cache_mmio_info(vcpu, addr, gfn, access);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3590) return RET_PF_EMULATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3591) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3593) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3594) * If the page table is zapped by other cpus, let CPU fault again on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3595) * the address.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3596) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3597) return RET_PF_RETRY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3598) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3599)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3600) static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3601) u32 error_code, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3602) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3603) if (unlikely(error_code & PFERR_RSVD_MASK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3604) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3605)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3606) if (!(error_code & PFERR_PRESENT_MASK) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3607) !(error_code & PFERR_WRITE_MASK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3608) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3609)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3610) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3611) * guest is writing the page which is write tracked which can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3612) * not be fixed by page fault handler.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3613) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3614) if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3615) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3616)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3617) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3618) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3619)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3620) static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3621) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3622) struct kvm_shadow_walk_iterator iterator;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3623) u64 spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3625) walk_shadow_page_lockless_begin(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3626) for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3627) clear_sp_write_flooding_count(iterator.sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3628) if (!is_shadow_present_pte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3629) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3630) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3631) walk_shadow_page_lockless_end(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3632) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3633)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3634) static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3635) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3636) /* make sure the token value is not 0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3637) u32 id = vcpu->arch.apf.id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3638)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3639) if (id << 12 == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3640) vcpu->arch.apf.id = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3641)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3642) return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3643) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3644)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3645) static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3646) gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3647) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3648) struct kvm_arch_async_pf arch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3649)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3650) arch.token = alloc_apf_token(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3651) arch.gfn = gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3652) arch.direct_map = vcpu->arch.mmu->direct_map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3653) arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3654)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3655) return kvm_setup_async_pf(vcpu, cr2_or_gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3656) kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3657) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3658)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3659) static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3660) gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3661) bool *writable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3662) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3663) struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3664) bool async;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3665)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3666) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3667) * Retry the page fault if the gfn hit a memslot that is being deleted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3668) * or moved. This ensures any existing SPTEs for the old memslot will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3669) * be zapped before KVM inserts a new MMIO SPTE for the gfn.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3670) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3671) if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3672) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3673)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3674) /* Don't expose private memslots to L2. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3675) if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3676) *pfn = KVM_PFN_NOSLOT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3677) *writable = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3678) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3679) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3680)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3681) async = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3682) *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3683) if (!async)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3684) return false; /* *pfn has correct page already */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3685)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3686) if (!prefault && kvm_can_do_async_pf(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3687) trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3688) if (kvm_find_async_pf_gfn(vcpu, gfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3689) trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3690) kvm_make_request(KVM_REQ_APF_HALT, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3691) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3692) } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3693) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3694) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3695)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3696) *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3697) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3698) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3699)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3700) static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3701) bool prefault, int max_level, bool is_tdp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3702) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3703) bool write = error_code & PFERR_WRITE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3704) bool map_writable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3705)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3706) gfn_t gfn = gpa >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3707) unsigned long mmu_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3708) kvm_pfn_t pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3709) int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3710)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3711) if (page_fault_handle_page_track(vcpu, error_code, gfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3712) return RET_PF_EMULATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3713)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3714) if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3715) r = fast_page_fault(vcpu, gpa, error_code);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3716) if (r != RET_PF_INVALID)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3717) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3718) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3719)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3720) r = mmu_topup_memory_caches(vcpu, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3721) if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3722) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3723)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3724) mmu_seq = vcpu->kvm->mmu_notifier_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3725) smp_rmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3726)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3727) if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3728) return RET_PF_RETRY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3729)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3730) if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3731) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3732)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3733) r = RET_PF_RETRY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3734) spin_lock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3735) if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3736) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3737) r = make_mmu_pages_available(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3738) if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3739) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3740)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3741) if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3742) r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3743) pfn, prefault);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3744) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3745) r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3746) prefault, is_tdp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3747)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3748) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3749) spin_unlock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3750) kvm_release_pfn_clean(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3751) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3752) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3753)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3754) static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3755) u32 error_code, bool prefault)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3756) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3757) pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3758)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3759) /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3760) return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3761) PG_LEVEL_2M, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3762) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3763)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3764) int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3765) u64 fault_address, char *insn, int insn_len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3766) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3767) int r = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3768) u32 flags = vcpu->arch.apf.host_apf_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3769)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3770) #ifndef CONFIG_X86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3771) /* A 64-bit CR2 should be impossible on 32-bit KVM. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3772) if (WARN_ON_ONCE(fault_address >> 32))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3773) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3774) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3775)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3776) vcpu->arch.l1tf_flush_l1d = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3777) if (!flags) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3778) trace_kvm_page_fault(fault_address, error_code);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3779)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3780) if (kvm_event_needs_reinjection(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3781) kvm_mmu_unprotect_page_virt(vcpu, fault_address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3782) r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3783) insn_len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3784) } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3785) vcpu->arch.apf.host_apf_flags = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3786) local_irq_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3787) kvm_async_pf_task_wait_schedule(fault_address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3788) local_irq_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3789) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3790) WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3791) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3792)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3793) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3794) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3795) EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3796)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3797) int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3798) bool prefault)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3799) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3800) int max_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3801)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3802) for (max_level = KVM_MAX_HUGEPAGE_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3803) max_level > PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3804) max_level--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3805) int page_num = KVM_PAGES_PER_HPAGE(max_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3806) gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3808) if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3809) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3810) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3811)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3812) return direct_page_fault(vcpu, gpa, error_code, prefault,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3813) max_level, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3814) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3815)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3816) static void nonpaging_init_context(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3817) struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3818) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3819) context->page_fault = nonpaging_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3820) context->gva_to_gpa = nonpaging_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3821) context->sync_page = nonpaging_sync_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3822) context->invlpg = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3823) context->root_level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3824) context->shadow_root_level = PT32E_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3825) context->direct_map = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3826) context->nx = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3827) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3828)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3829) static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3830) union kvm_mmu_page_role role)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3831) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3832) return (role.direct || pgd == root->pgd) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3833) VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3834) role.word == to_shadow_page(root->hpa)->role.word;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3835) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3837) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3838) * Find out if a previously cached root matching the new pgd/role is available.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3839) * The current root is also inserted into the cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3840) * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3841) * returned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3842) * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3843) * false is returned. This root should now be freed by the caller.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3844) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3845) static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3846) union kvm_mmu_page_role new_role)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3847) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3848) uint i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3849) struct kvm_mmu_root_info root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3850) struct kvm_mmu *mmu = vcpu->arch.mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3851)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3852) root.pgd = mmu->root_pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3853) root.hpa = mmu->root_hpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3854)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3855) if (is_root_usable(&root, new_pgd, new_role))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3856) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3857)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3858) for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3859) swap(root, mmu->prev_roots[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3860)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3861) if (is_root_usable(&root, new_pgd, new_role))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3862) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3863) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3864)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3865) mmu->root_hpa = root.hpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3866) mmu->root_pgd = root.pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3868) return i < KVM_MMU_NUM_PREV_ROOTS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3869) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3870)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3871) static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3872) union kvm_mmu_page_role new_role)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3873) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3874) struct kvm_mmu *mmu = vcpu->arch.mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3876) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3877) * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3878) * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3879) * later if necessary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3880) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3881) if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3882) mmu->root_level >= PT64_ROOT_4LEVEL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3883) return cached_root_available(vcpu, new_pgd, new_role);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3884)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3885) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3886) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3887)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3888) static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3889) union kvm_mmu_page_role new_role,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3890) bool skip_tlb_flush, bool skip_mmu_sync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3891) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3892) if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3893) kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3894) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3895) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3896)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3897) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3898) * It's possible that the cached previous root page is obsolete because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3899) * of a change in the MMU generation number. However, changing the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3900) * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3901) * free the root set here and allocate a new one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3902) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3903) kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3904)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3905) if (!skip_mmu_sync || force_flush_and_sync_on_reuse)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3906) kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3907) if (!skip_tlb_flush || force_flush_and_sync_on_reuse)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3908) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3909)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3910) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3911) * The last MMIO access's GVA and GPA are cached in the VCPU. When
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3912) * switching to a new CR3, that GVA->GPA mapping may no longer be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3913) * valid. So clear any cached MMIO info even when we don't need to sync
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3914) * the shadow page tables.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3915) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3916) vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3917)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3918) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3919) * If this is a direct root page, it doesn't have a write flooding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3920) * count. Otherwise, clear the write flooding count.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3921) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3922) if (!new_role.direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3923) __clear_sp_write_flooding_count(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3924) to_shadow_page(vcpu->arch.mmu->root_hpa));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3925) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3926)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3927) void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3928) bool skip_mmu_sync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3929) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3930) __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3931) skip_tlb_flush, skip_mmu_sync);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3932) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3933) EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3934)
/* Return the vCPU's CR3 as reported by kvm_read_cr3(). */
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
	unsigned long cr3 = kvm_read_cr3(vcpu);

	return cr3;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3940) static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3941) unsigned int access, int *nr_present)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3942) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3943) if (unlikely(is_mmio_spte(*sptep))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3944) if (gfn != get_mmio_spte_gfn(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3945) mmu_spte_clear_no_track(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3946) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3947) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3948)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3949) (*nr_present)++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3950) mark_mmio_spte(vcpu, sptep, gfn, access);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3951) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3952) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3953)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3954) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3955) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3956)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3957) static inline bool is_last_gpte(struct kvm_mmu *mmu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3958) unsigned level, unsigned gpte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3959) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3960) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3961) * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3962) * If it is clear, there are no large pages at this level, so clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3963) * PT_PAGE_SIZE_MASK in gpte if that is the case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3964) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3965) gpte &= level - mmu->last_nonleaf_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3966)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3967) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3968) * PG_LEVEL_4K always terminates. The RHS has bit 7 set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3969) * iff level <= PG_LEVEL_4K, which for our purpose means
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3970) * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3971) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3972) gpte |= level - PG_LEVEL_4K - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3973)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3974) return gpte & PT_PAGE_SIZE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3975) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3976)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3977) #define PTTYPE_EPT 18 /* arbitrary */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3978) #define PTTYPE PTTYPE_EPT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3979) #include "paging_tmpl.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3980) #undef PTTYPE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3982) #define PTTYPE 64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3983) #include "paging_tmpl.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3984) #undef PTTYPE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3986) #define PTTYPE 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3987) #include "paging_tmpl.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3988) #undef PTTYPE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3989)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3990) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3991) __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3992) struct rsvd_bits_validate *rsvd_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3993) int maxphyaddr, int level, bool nx, bool gbpages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3994) bool pse, bool amd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3995) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3996) u64 exb_bit_rsvd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3997) u64 gbpages_bit_rsvd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3998) u64 nonleaf_bit8_rsvd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3999)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4000) rsvd_check->bad_mt_xwr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4002) if (!nx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4003) exb_bit_rsvd = rsvd_bits(63, 63);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4004) if (!gbpages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4005) gbpages_bit_rsvd = rsvd_bits(7, 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4006)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4007) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4008) * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4009) * leaf entries) on AMD CPUs only.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4010) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4011) if (amd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4012) nonleaf_bit8_rsvd = rsvd_bits(8, 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4013)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4014) switch (level) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4015) case PT32_ROOT_LEVEL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4016) /* no rsvd bits for 2 level 4K page table entries */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4017) rsvd_check->rsvd_bits_mask[0][1] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4018) rsvd_check->rsvd_bits_mask[0][0] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4019) rsvd_check->rsvd_bits_mask[1][0] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4020) rsvd_check->rsvd_bits_mask[0][0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4021)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4022) if (!pse) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4023) rsvd_check->rsvd_bits_mask[1][1] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4024) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4025) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4027) if (is_cpuid_PSE36())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4028) /* 36bits PSE 4MB page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4029) rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4030) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4031) /* 32 bits PSE 4MB page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4032) rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4033) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4034) case PT32E_ROOT_LEVEL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4035) rsvd_check->rsvd_bits_mask[0][2] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4036) rsvd_bits(maxphyaddr, 63) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4037) rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4038) rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4039) rsvd_bits(maxphyaddr, 62); /* PDE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4040) rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4041) rsvd_bits(maxphyaddr, 62); /* PTE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4042) rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4043) rsvd_bits(maxphyaddr, 62) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4044) rsvd_bits(13, 20); /* large page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4045) rsvd_check->rsvd_bits_mask[1][0] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4046) rsvd_check->rsvd_bits_mask[0][0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4047) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4048) case PT64_ROOT_5LEVEL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4049) rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4050) nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4051) rsvd_bits(maxphyaddr, 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4052) rsvd_check->rsvd_bits_mask[1][4] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4053) rsvd_check->rsvd_bits_mask[0][4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4054) fallthrough;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4055) case PT64_ROOT_4LEVEL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4056) rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4057) nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4058) rsvd_bits(maxphyaddr, 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4059) rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4060) gbpages_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4061) rsvd_bits(maxphyaddr, 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4062) rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4063) rsvd_bits(maxphyaddr, 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4064) rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4065) rsvd_bits(maxphyaddr, 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4066) rsvd_check->rsvd_bits_mask[1][3] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4067) rsvd_check->rsvd_bits_mask[0][3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4068) rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4069) gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4070) rsvd_bits(13, 29);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4071) rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4072) rsvd_bits(maxphyaddr, 51) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4073) rsvd_bits(13, 20); /* large page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4074) rsvd_check->rsvd_bits_mask[1][0] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4075) rsvd_check->rsvd_bits_mask[0][0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4076) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4077) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4078) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4079)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4080) static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4081) struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4082) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4083) __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4084) cpuid_maxphyaddr(vcpu), context->root_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4085) context->nx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4086) guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4087) is_pse(vcpu),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4088) guest_cpuid_is_amd_or_hygon(vcpu));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4089) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4090)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4091) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4092) __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4093) int maxphyaddr, bool execonly)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4094) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4095) u64 bad_mt_xwr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4096)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4097) rsvd_check->rsvd_bits_mask[0][4] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4098) rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4099) rsvd_check->rsvd_bits_mask[0][3] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4100) rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4101) rsvd_check->rsvd_bits_mask[0][2] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4102) rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4103) rsvd_check->rsvd_bits_mask[0][1] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4104) rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4105) rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4106)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4107) /* large page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4108) rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4109) rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4110) rsvd_check->rsvd_bits_mask[1][2] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4111) rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4112) rsvd_check->rsvd_bits_mask[1][1] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4113) rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4114) rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4116) bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4117) bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4118) bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4119) bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4120) bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4121) if (!execonly) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4122) /* bits 0..2 must not be 100 unless VMX capabilities allow it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4123) bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4124) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4125) rsvd_check->bad_mt_xwr = bad_mt_xwr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4126) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4127)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4128) static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4129) struct kvm_mmu *context, bool execonly)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4130) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4131) __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4132) cpuid_maxphyaddr(vcpu), execonly);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4133) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4135) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4136) * the page table on host is the shadow page table for the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4137) * table in guest or amd nested guest, its mmu features completely
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4138) * follow the features in guest.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4139) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4140) void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4141) reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4142) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4143) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4144) * KVM uses NX when TDP is disabled to handle a variety of scenarios,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4145) * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4146) * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4147) * The iTLB multi-hit workaround can be toggled at any time, so assume
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4148) * NX can be used by any non-nested shadow MMU to avoid having to reset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4149) * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4150) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4151) bool uses_nx = context->nx || !tdp_enabled ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4152) context->mmu_role.base.smep_andnot_wp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4153) struct rsvd_bits_validate *shadow_zero_check;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4154) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4155)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4156) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4157) * Passing "true" to the last argument is okay; it adds a check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4158) * on bit 8 of the SPTEs which KVM doesn't use anyway.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4159) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4160) shadow_zero_check = &context->shadow_zero_check;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4161) __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4162) shadow_phys_bits,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4163) context->shadow_root_level, uses_nx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4164) guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4165) is_pse(vcpu), true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4167) if (!shadow_me_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4168) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4170) for (i = context->shadow_root_level; --i >= 0;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4171) shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4172) shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4173) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4175) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4176) EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4178) static inline bool boot_cpu_is_amd(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4179) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4180) WARN_ON_ONCE(!tdp_enabled);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4181) return shadow_x_mask == 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4182) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4183)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4184) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4185) * the direct page table on host, use as much mmu features as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4186) * possible, however, kvm currently does not do execution-protection.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4187) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4188) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4189) reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4190) struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4191) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4192) struct rsvd_bits_validate *shadow_zero_check;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4193) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4195) shadow_zero_check = &context->shadow_zero_check;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4196)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4197) if (boot_cpu_is_amd())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4198) __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4199) shadow_phys_bits,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4200) context->shadow_root_level, false,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4201) boot_cpu_has(X86_FEATURE_GBPAGES),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4202) true, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4203) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4204) __reset_rsvds_bits_mask_ept(shadow_zero_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4205) shadow_phys_bits,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4206) false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4207)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4208) if (!shadow_me_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4209) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4210)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4211) for (i = context->shadow_root_level; --i >= 0;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4212) shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4213) shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4214) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4217) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4218) * as the comments in reset_shadow_zero_bits_mask() except it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4219) * is the shadow page table for intel nested guest.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4220) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4221) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4222) reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4223) struct kvm_mmu *context, bool execonly)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4224) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4225) __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4226) shadow_phys_bits, execonly);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4227) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4228)
/*
 * Build an 8-bit map over the values 1..7: bit i of the result is set when
 * (i & access) is non-zero.  Bit 0 is never set.
 */
#define BYTE_MASK(access)			\
	((((access) & 1) ? 0x02 : 0) |		\
	 (((access) & 2) ? 0x04 : 0) |		\
	 (((access) & 3) ? 0x08 : 0) |		\
	 (((access) & 4) ? 0x10 : 0) |		\
	 (((access) & 5) ? 0x20 : 0) |		\
	 (((access) & 6) ? 0x40 : 0) |		\
	 (((access) & 7) ? 0x80 : 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4238)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4239) static void update_permission_bitmask(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4240) struct kvm_mmu *mmu, bool ept)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4241) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4242) unsigned byte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4244) const u8 x = BYTE_MASK(ACC_EXEC_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4245) const u8 w = BYTE_MASK(ACC_WRITE_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4246) const u8 u = BYTE_MASK(ACC_USER_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4247)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4248) bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4249) bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4250) bool cr0_wp = is_write_protection(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4251)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4252) for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4253) unsigned pfec = byte << 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4254)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4255) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4256) * Each "*f" variable has a 1 bit for each UWX value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4257) * that causes a fault with the given PFEC.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4258) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4260) /* Faults from writes to non-writable pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4261) u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4262) /* Faults from user mode accesses to supervisor pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4263) u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4264) /* Faults from fetches of non-executable pages*/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4265) u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4266) /* Faults from kernel mode fetches of user pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4267) u8 smepf = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4268) /* Faults from kernel mode accesses of user pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4269) u8 smapf = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4270)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4271) if (!ept) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4272) /* Faults from kernel mode accesses to user pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4273) u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4274)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4275) /* Not really needed: !nx will cause pte.nx to fault */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4276) if (!mmu->nx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4277) ff = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4279) /* Allow supervisor writes if !cr0.wp */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4280) if (!cr0_wp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4281) wf = (pfec & PFERR_USER_MASK) ? wf : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4282)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4283) /* Disallow supervisor fetches of user code if cr4.smep */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4284) if (cr4_smep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4285) smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4286)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4287) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4288) * SMAP:kernel-mode data accesses from user-mode
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4289) * mappings should fault. A fault is considered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4290) * as a SMAP violation if all of the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4291) * conditions are true:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4292) * - X86_CR4_SMAP is set in CR4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4293) * - A user page is accessed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4294) * - The access is not a fetch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4295) * - Page fault in kernel mode
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4296) * - if CPL = 3 or X86_EFLAGS_AC is clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4297) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4298) * Here, we cover the first three conditions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4299) * The fourth is computed dynamically in permission_fault();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4300) * PFERR_RSVD_MASK bit will be set in PFEC if the access is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4301) * *not* subject to SMAP restrictions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4302) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4303) if (cr4_smap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4304) smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4305) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4306)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4307) mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4308) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4309) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4310)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4311) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4312) * PKU is an additional mechanism by which the paging controls access to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4313) * user-mode addresses based on the value in the PKRU register. Protection
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4314) * key violations are reported through a bit in the page fault error code.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4315) * Unlike other bits of the error code, the PK bit is not known at the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4316) * call site of e.g. gva_to_gpa; it must be computed directly in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4317) * permission_fault based on two bits of PKRU, on some machine state (CR4,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4318) * CR0, EFER, CPL), and on other bits of the error code and the page tables.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4319) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4320) * In particular the following conditions come from the error code, the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4321) * page tables and the machine state:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4322) * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4323) * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4324) * - PK is always zero if U=0 in the page tables
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4325) * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4326) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4327) * The PKRU bitmask caches the result of these four conditions. The error
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4328) * code (minus the P bit) and the page table's U bit form an index into the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4329) * PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4330) * with the two bits of the PKRU register corresponding to the protection key.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4331) * For the first three conditions above the bits will be 00, thus masking
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4332) * away both AD and WD. For all reads or if the last condition holds, WD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4333) * only will be masked away.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4334) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4335) static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4336) bool ept)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4337) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4338) unsigned bit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4339) bool wp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4341) if (ept) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4342) mmu->pkru_mask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4343) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4344) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4346) /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4347) if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4348) mmu->pkru_mask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4349) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4350) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4352) wp = is_write_protection(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4354) for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4355) unsigned pfec, pkey_bits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4356) bool check_pkey, check_write, ff, uf, wf, pte_user;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4357)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4358) pfec = bit << 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4359) ff = pfec & PFERR_FETCH_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4360) uf = pfec & PFERR_USER_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4361) wf = pfec & PFERR_WRITE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4362)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4363) /* PFEC.RSVD is replaced by ACC_USER_MASK. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4364) pte_user = pfec & PFERR_RSVD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4365)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4366) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4367) * Only need to check the access which is not an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4368) * instruction fetch and is to a user page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4369) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4370) check_pkey = (!ff && pte_user);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4371) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4372) * write access is controlled by PKRU if it is a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4373) * user access or CR0.WP = 1.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4374) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4375) check_write = check_pkey && wf && (uf || wp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4376)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4377) /* PKRU.AD stops both read and write access. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4378) pkey_bits = !!check_pkey;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4379) /* PKRU.WD stops write access. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4380) pkey_bits |= (!!check_write) << 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4381)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4382) mmu->pkru_mask |= (pkey_bits & 3) << pfec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4383) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4384) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4385)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4386) static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4387) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4388) unsigned root_level = mmu->root_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4389)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4390) mmu->last_nonleaf_level = root_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4391) if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4392) mmu->last_nonleaf_level++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4393) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4394)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4395) static void paging64_init_context_common(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4396) struct kvm_mmu *context,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4397) int level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4398) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4399) context->nx = is_nx(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4400) context->root_level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4402) reset_rsvds_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4403) update_permission_bitmask(vcpu, context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4404) update_pkru_bitmask(vcpu, context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4405) update_last_nonleaf_level(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4406)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4407) MMU_WARN_ON(!is_pae(vcpu));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4408) context->page_fault = paging64_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4409) context->gva_to_gpa = paging64_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4410) context->sync_page = paging64_sync_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4411) context->invlpg = paging64_invlpg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4412) context->shadow_root_level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4413) context->direct_map = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4414) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4415)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4416) static void paging64_init_context(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4417) struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4418) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4419) int root_level = is_la57_mode(vcpu) ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4420) PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4421)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4422) paging64_init_context_common(vcpu, context, root_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4423) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4424)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4425) static void paging32_init_context(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4426) struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4427) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4428) context->nx = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4429) context->root_level = PT32_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4430)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4431) reset_rsvds_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4432) update_permission_bitmask(vcpu, context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4433) update_pkru_bitmask(vcpu, context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4434) update_last_nonleaf_level(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4435)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4436) context->page_fault = paging32_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4437) context->gva_to_gpa = paging32_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4438) context->sync_page = paging32_sync_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4439) context->invlpg = paging32_invlpg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4440) context->shadow_root_level = PT32E_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4441) context->direct_map = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4442) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4443)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4444) static void paging32E_init_context(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4445) struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4446) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4447) paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4448) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4449)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4450) static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4451) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4452) union kvm_mmu_extended_role ext = {0};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4453)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4454) ext.cr0_pg = !!is_paging(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4455) ext.cr4_pae = !!is_pae(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4456) ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4457) ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4458) ext.cr4_pse = !!is_pse(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4459) ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4460) ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4461) ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4463) ext.valid = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4464)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4465) return ext;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4466) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4467)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4468) static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4469) bool base_only)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4470) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4471) union kvm_mmu_role role = {0};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4472)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4473) role.base.access = ACC_ALL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4474) role.base.nxe = !!is_nx(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4475) role.base.cr0_wp = is_write_protection(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4476) role.base.smm = is_smm(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4477) role.base.guest_mode = is_guest_mode(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4479) if (base_only)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4480) return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4481)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4482) role.ext = kvm_calc_mmu_role_ext(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4483)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4484) return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4485) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4486)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4487) static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4488) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4489) /* Use 5-level TDP if and only if it's useful/necessary. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4490) if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4491) return 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4492)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4493) return max_tdp_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4494) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4495)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4496) static union kvm_mmu_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4497) kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4498) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4499) union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4500)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4501) role.base.ad_disabled = (shadow_accessed_mask == 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4502) role.base.level = kvm_mmu_get_tdp_level(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4503) role.base.direct = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4504) role.base.gpte_is_8_bytes = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4506) return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4507) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4509) static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4510) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4511) struct kvm_mmu *context = &vcpu->arch.root_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4512) union kvm_mmu_role new_role =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4513) kvm_calc_tdp_mmu_root_page_role(vcpu, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4515) if (new_role.as_u64 == context->mmu_role.as_u64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4516) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4517)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4518) context->mmu_role.as_u64 = new_role.as_u64;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4519) context->page_fault = kvm_tdp_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4520) context->sync_page = nonpaging_sync_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4521) context->invlpg = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4522) context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4523) context->direct_map = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4524) context->get_guest_pgd = get_cr3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4525) context->get_pdptr = kvm_pdptr_read;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4526) context->inject_page_fault = kvm_inject_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4527)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4528) if (!is_paging(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4529) context->nx = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4530) context->gva_to_gpa = nonpaging_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4531) context->root_level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4532) } else if (is_long_mode(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4533) context->nx = is_nx(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4534) context->root_level = is_la57_mode(vcpu) ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4535) PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4536) reset_rsvds_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4537) context->gva_to_gpa = paging64_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4538) } else if (is_pae(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4539) context->nx = is_nx(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4540) context->root_level = PT32E_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4541) reset_rsvds_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4542) context->gva_to_gpa = paging64_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4543) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4544) context->nx = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4545) context->root_level = PT32_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4546) reset_rsvds_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4547) context->gva_to_gpa = paging32_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4548) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4550) update_permission_bitmask(vcpu, context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4551) update_pkru_bitmask(vcpu, context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4552) update_last_nonleaf_level(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4553) reset_tdp_shadow_zero_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4554) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4555)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4556) static union kvm_mmu_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4557) kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4558) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4559) union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4560)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4561) role.base.smep_andnot_wp = role.ext.cr4_smep &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4562) !is_write_protection(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4563) role.base.smap_andnot_wp = role.ext.cr4_smap &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4564) !is_write_protection(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4565) role.base.gpte_is_8_bytes = !!is_pae(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4567) return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4568) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4569)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4570) static union kvm_mmu_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4571) kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4572) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4573) union kvm_mmu_role role =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4574) kvm_calc_shadow_root_page_role_common(vcpu, base_only);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4575)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4576) role.base.direct = !is_paging(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4577)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4578) if (!is_long_mode(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4579) role.base.level = PT32E_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4580) else if (is_la57_mode(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4581) role.base.level = PT64_ROOT_5LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4582) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4583) role.base.level = PT64_ROOT_4LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4584)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4585) return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4586) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4587)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4588) static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4589) u32 cr0, u32 cr4, u32 efer,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4590) union kvm_mmu_role new_role)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4591) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4592) if (!(cr0 & X86_CR0_PG))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4593) nonpaging_init_context(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4594) else if (efer & EFER_LMA)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4595) paging64_init_context(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4596) else if (cr4 & X86_CR4_PAE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4597) paging32E_init_context(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4598) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4599) paging32_init_context(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4600)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4601) context->mmu_role.as_u64 = new_role.as_u64;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4602) reset_shadow_zero_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4603) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4604)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4605) static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4606) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4607) struct kvm_mmu *context = &vcpu->arch.root_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4608) union kvm_mmu_role new_role =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4609) kvm_calc_shadow_mmu_root_page_role(vcpu, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4610)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4611) if (new_role.as_u64 != context->mmu_role.as_u64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4612) shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4613) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4614)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4615) static union kvm_mmu_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4616) kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4617) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4618) union kvm_mmu_role role =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4619) kvm_calc_shadow_root_page_role_common(vcpu, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4620)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4621) role.base.direct = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4622) role.base.level = kvm_mmu_get_tdp_level(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4623)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4624) return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4625) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4626)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4627) void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4628) gpa_t nested_cr3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4629) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4630) struct kvm_mmu *context = &vcpu->arch.guest_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4631) union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4632)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4633) __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4634)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4635) if (new_role.as_u64 != context->mmu_role.as_u64) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4636) shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4637)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4638) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4639) * Override the level set by the common init helper, nested TDP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4640) * always uses the host's TDP configuration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4641) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4642) context->shadow_root_level = new_role.base.level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4643) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4644) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4645) EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4646)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4647) static union kvm_mmu_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4648) kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4649) bool execonly, u8 level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4650) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4651) union kvm_mmu_role role = {0};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4652)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4653) /* SMM flag is inherited from root_mmu */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4654) role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4656) role.base.level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4657) role.base.gpte_is_8_bytes = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4658) role.base.direct = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4659) role.base.ad_disabled = !accessed_dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4660) role.base.guest_mode = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4661) role.base.access = ACC_ALL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4662)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4663) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4664) * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4665) * SMAP variation to denote shadow EPT entries.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4666) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4667) role.base.cr0_wp = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4668) role.base.smap_andnot_wp = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4669)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4670) role.ext = kvm_calc_mmu_role_ext(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4671) role.ext.execonly = execonly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4672)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4673) return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4674) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4675)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4676) void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4677) bool accessed_dirty, gpa_t new_eptp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4678) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4679) struct kvm_mmu *context = &vcpu->arch.guest_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4680) u8 level = vmx_eptp_page_walk_level(new_eptp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4681) union kvm_mmu_role new_role =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4682) kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4683) execonly, level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4684)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4685) __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4687) if (new_role.as_u64 == context->mmu_role.as_u64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4688) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4689)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4690) context->shadow_root_level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4692) context->nx = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4693) context->ept_ad = accessed_dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4694) context->page_fault = ept_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4695) context->gva_to_gpa = ept_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4696) context->sync_page = ept_sync_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4697) context->invlpg = ept_invlpg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4698) context->root_level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4699) context->direct_map = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4700) context->mmu_role.as_u64 = new_role.as_u64;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4701)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4702) update_permission_bitmask(vcpu, context, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4703) update_pkru_bitmask(vcpu, context, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4704) update_last_nonleaf_level(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4705) reset_rsvds_bits_mask_ept(vcpu, context, execonly);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4706) reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4707) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4708) EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4709)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4710) static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4711) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4712) struct kvm_mmu *context = &vcpu->arch.root_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4713)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4714) kvm_init_shadow_mmu(vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4715) kvm_read_cr0_bits(vcpu, X86_CR0_PG),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4716) kvm_read_cr4_bits(vcpu, X86_CR4_PAE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4717) vcpu->arch.efer);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4718)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4719) context->get_guest_pgd = get_cr3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4720) context->get_pdptr = kvm_pdptr_read;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4721) context->inject_page_fault = kvm_inject_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4722) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4723)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4724) static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4725) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4726) union kvm_mmu_role role = kvm_calc_shadow_root_page_role_common(vcpu, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4727)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4728) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4729) * Nested MMUs are used only for walking L2's gva->gpa, they never have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4730) * shadow pages of their own and so "direct" has no meaning. Set it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4731) * to "true" to try to detect bogus usage of the nested MMU.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4732) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4733) role.base.direct = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4734)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4735) if (!is_paging(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4736) role.base.level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4737) else if (is_long_mode(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4738) role.base.level = is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4739) PT64_ROOT_4LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4740) else if (is_pae(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4741) role.base.level = PT32E_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4742) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4743) role.base.level = PT32_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4744)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4745) return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4746) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4747)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4748) static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4749) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4750) union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4751) struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4752)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4753) if (new_role.as_u64 == g_context->mmu_role.as_u64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4754) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4755)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4756) g_context->mmu_role.as_u64 = new_role.as_u64;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4757) g_context->get_guest_pgd = get_cr3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4758) g_context->get_pdptr = kvm_pdptr_read;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4759) g_context->inject_page_fault = kvm_inject_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4760)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4761) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4762) * L2 page tables are never shadowed, so there is no need to sync
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4763) * SPTEs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4764) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4765) g_context->invlpg = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4766)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4767) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4768) * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4769) * L1's nested page tables (e.g. EPT12). The nested translation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4770) * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4771) * L2's page tables as the first level of translation and L1's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4772) * nested page tables as the second level of translation. Basically
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4773) * the gva_to_gpa functions between mmu and nested_mmu are swapped.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4774) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4775) if (!is_paging(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4776) g_context->nx = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4777) g_context->root_level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4778) g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4779) } else if (is_long_mode(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4780) g_context->nx = is_nx(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4781) g_context->root_level = is_la57_mode(vcpu) ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4782) PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4783) reset_rsvds_bits_mask(vcpu, g_context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4784) g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4785) } else if (is_pae(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4786) g_context->nx = is_nx(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4787) g_context->root_level = PT32E_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4788) reset_rsvds_bits_mask(vcpu, g_context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4789) g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4790) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4791) g_context->nx = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4792) g_context->root_level = PT32_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4793) reset_rsvds_bits_mask(vcpu, g_context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4794) g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4795) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4796)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4797) update_permission_bitmask(vcpu, g_context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4798) update_pkru_bitmask(vcpu, g_context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4799) update_last_nonleaf_level(vcpu, g_context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4800) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4801)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4802) void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4803) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4804) if (reset_roots) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4805) uint i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4806)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4807) vcpu->arch.mmu->root_hpa = INVALID_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4808)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4809) for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4810) vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4811) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4812)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4813) if (mmu_is_nested(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4814) init_kvm_nested_mmu(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4815) else if (tdp_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4816) init_kvm_tdp_mmu(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4817) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4818) init_kvm_softmmu(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4819) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4820) EXPORT_SYMBOL_GPL(kvm_init_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4822) static union kvm_mmu_page_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4823) kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4824) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4825) union kvm_mmu_role role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4826)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4827) if (tdp_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4828) role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4829) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4830) role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4831)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4832) return role.base;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4833) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4834)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4835) void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4836) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4837) kvm_mmu_unload(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4838) kvm_init_mmu(vcpu, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4839) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4840) EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4842) int kvm_mmu_load(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4843) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4844) int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4845)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4846) r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4847) if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4848) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4849) r = mmu_alloc_roots(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4850) kvm_mmu_sync_roots(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4851) if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4852) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4853) kvm_mmu_load_pgd(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4854) kvm_x86_ops.tlb_flush_current(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4855) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4856) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4857) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4858) EXPORT_SYMBOL_GPL(kvm_mmu_load);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4859)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4860) void kvm_mmu_unload(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4861) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4862) kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4863) WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4864) kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4865) WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4866) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4867) EXPORT_SYMBOL_GPL(kvm_mmu_unload);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4868)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4869) static bool need_remote_flush(u64 old, u64 new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4870) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4871) if (!is_shadow_present_pte(old))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4872) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4873) if (!is_shadow_present_pte(new))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4874) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4875) if ((old ^ new) & PT64_BASE_ADDR_MASK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4876) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4877) old ^= shadow_nx_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4878) new ^= shadow_nx_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4879) return (old & ~new & PT64_PERM_MASK) != 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4880) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4881)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4882) static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4883) int *bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4884) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4885) u64 gentry = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4886) int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4887)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4888) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4889) * Assume that the pte write on a page table of the same type
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4890) * as the current vcpu paging mode since we update the sptes only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4891) * when they have the same mode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4892) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4893) if (is_pae(vcpu) && *bytes == 4) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4894) /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4895) *gpa &= ~(gpa_t)7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4896) *bytes = 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4897) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4898)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4899) if (*bytes == 4 || *bytes == 8) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4900) r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4901) if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4902) gentry = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4903) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4904)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4905) return gentry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4906) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4907)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4908) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4909) * If we're seeing too many writes to a page, it may no longer be a page table,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4910) * or we may be forking, in which case it is better to unmap the page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4911) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4912) static bool detect_write_flooding(struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4913) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4914) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4915) * Skip write-flooding detected for the sp whose level is 1, because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4916) * it can become unsync, then the guest page is not write-protected.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4917) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4918) if (sp->role.level == PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4919) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4921) atomic_inc(&sp->write_flooding_count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4922) return atomic_read(&sp->write_flooding_count) >= 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4923) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4924)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4925) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4926) * Misaligned accesses are too much trouble to fix up; also, they usually
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4927) * indicate a page is not used as a page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4928) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4929) static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4930) int bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4931) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4932) unsigned offset, pte_size, misaligned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4934) pgprintk("misaligned: gpa %llx bytes %d role %x\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4935) gpa, bytes, sp->role.word);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4936)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4937) offset = offset_in_page(gpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4938) pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4940) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4941) * Sometimes, the OS only writes the last one bytes to update status
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4942) * bits, for example, in linux, andb instruction is used in clear_bit().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4943) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4944) if (!(offset & (pte_size - 1)) && bytes == 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4945) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4946)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4947) misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4948) misaligned |= bytes < 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4950) return misaligned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4951) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4952)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4953) static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4954) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4955) unsigned page_offset, quadrant;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4956) u64 *spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4957) int level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4958)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4959) page_offset = offset_in_page(gpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4960) level = sp->role.level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4961) *nspte = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4962) if (!sp->role.gpte_is_8_bytes) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4963) page_offset <<= 1; /* 32->64 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4964) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4965) * A 32-bit pde maps 4MB while the shadow pdes map
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4966) * only 2MB. So we need to double the offset again
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4967) * and zap two pdes instead of one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4968) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4969) if (level == PT32_ROOT_LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4970) page_offset &= ~7; /* kill rounding error */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4971) page_offset <<= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4972) *nspte = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4973) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4974) quadrant = page_offset >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4975) page_offset &= ~PAGE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4976) if (quadrant != sp->role.quadrant)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4977) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4978) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4979)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4980) spte = &sp->spt[page_offset / sizeof(*spte)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4981) return spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4982) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4983)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4984) static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4985) const u8 *new, int bytes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4986) struct kvm_page_track_notifier_node *node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4987) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4988) gfn_t gfn = gpa >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4989) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4990) LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4991) u64 entry, gentry, *spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4992) int npte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4993) bool remote_flush, local_flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4994)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4995) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4996) * If we don't have indirect shadow pages, it means no page is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4997) * write-protected, so we can exit simply.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4998) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4999) if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5000) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5002) remote_flush = local_flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5004) pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5005)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5006) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5007) * No need to care whether allocation memory is successful
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5008) * or not since pte prefetch is skiped if it does not have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5009) * enough objects in the cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5010) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5011) mmu_topup_memory_caches(vcpu, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5012)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5013) spin_lock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5015) gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5016)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5017) ++vcpu->kvm->stat.mmu_pte_write;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5018) kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5020) for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5021) if (detect_write_misaligned(sp, gpa, bytes) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5022) detect_write_flooding(sp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5023) kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5024) ++vcpu->kvm->stat.mmu_flooded;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5025) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5026) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5027)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5028) spte = get_written_sptes(sp, gpa, &npte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5029) if (!spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5030) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5031)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5032) local_flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5033) while (npte--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5034) entry = *spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5035) mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5036) if (gentry && sp->role.level != PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5037) ++vcpu->kvm->stat.mmu_pde_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5038) if (need_remote_flush(entry, *spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5039) remote_flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5040) ++spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5041) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5042) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5043) kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5044) kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5045) spin_unlock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5046) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5047)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5048) int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5049) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5050) gpa_t gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5051) int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5052)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5053) if (vcpu->arch.mmu->direct_map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5054) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5055)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5056) gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5057)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5058) r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5059)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5060) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5061) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5062) EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5063)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5064) int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5065) void *insn, int insn_len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5066) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5067) int r, emulation_type = EMULTYPE_PF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5068) bool direct = vcpu->arch.mmu->direct_map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5069)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5070) if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5071) return RET_PF_RETRY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5072)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5073) r = RET_PF_INVALID;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5074) if (unlikely(error_code & PFERR_RSVD_MASK)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5075) r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5076) if (r == RET_PF_EMULATE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5077) goto emulate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5078) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5079)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5080) if (r == RET_PF_INVALID) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5081) r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5082) lower_32_bits(error_code), false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5083) if (WARN_ON_ONCE(r == RET_PF_INVALID))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5084) return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5085) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5086)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5087) if (r < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5088) return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5089) if (r != RET_PF_EMULATE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5090) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5092) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5093) * Before emulating the instruction, check if the error code
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5094) * was due to a RO violation while translating the guest page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5095) * This can occur when using nested virtualization with nested
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5096) * paging in both guests. If true, we simply unprotect the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5097) * and resume the guest.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5098) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5099) if (vcpu->arch.mmu->direct_map &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5100) (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5101) kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5102) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5103) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5104)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5105) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5106) * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5107) * optimistically try to just unprotect the page and let the processor
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5108) * re-execute the instruction that caused the page fault. Do not allow
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5109) * retrying MMIO emulation, as it's not only pointless but could also
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5110) * cause us to enter an infinite loop because the processor will keep
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5111) * faulting on the non-existent MMIO address. Retrying an instruction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5112) * from a nested guest is also pointless and dangerous as we are only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5113) * explicitly shadowing L1's page tables, i.e. unprotecting something
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5114) * for L1 isn't going to magically fix whatever issue cause L2 to fail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5115) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5116) if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5117) emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5118) emulate:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5119) return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5120) insn_len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5121) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5122) EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5123)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5124) void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5125) gva_t gva, hpa_t root_hpa)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5126) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5127) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5129) /* It's actually a GPA for vcpu->arch.guest_mmu. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5130) if (mmu != &vcpu->arch.guest_mmu) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5131) /* INVLPG on a non-canonical address is a NOP according to the SDM. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5132) if (is_noncanonical_address(gva, vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5133) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5135) kvm_x86_ops.tlb_flush_gva(vcpu, gva);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5136) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5137)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5138) if (!mmu->invlpg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5139) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5140)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5141) if (root_hpa == INVALID_PAGE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5142) mmu->invlpg(vcpu, gva, mmu->root_hpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5144) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5145) * INVLPG is required to invalidate any global mappings for the VA,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5146) * irrespective of PCID. Since it would take us roughly similar amount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5147) * of work to determine whether any of the prev_root mappings of the VA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5148) * is marked global, or to just sync it blindly, so we might as well
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5149) * just always sync it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5150) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5151) * Mappings not reachable via the current cr3 or the prev_roots will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5152) * synced when switching to that cr3, so nothing needs to be done here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5153) * for them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5154) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5155) for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5156) if (VALID_PAGE(mmu->prev_roots[i].hpa))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5157) mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5158) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5159) mmu->invlpg(vcpu, gva, root_hpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5160) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5161) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5162) EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5163)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5164) void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5165) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5166) kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5167) ++vcpu->stat.invlpg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5168) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5169) EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5170)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5172) void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5173) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5174) struct kvm_mmu *mmu = vcpu->arch.mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5175) bool tlb_flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5176) uint i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5178) if (pcid == kvm_get_active_pcid(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5179) mmu->invlpg(vcpu, gva, mmu->root_hpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5180) tlb_flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5181) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5183) for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5184) if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5185) pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5186) mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5187) tlb_flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5188) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5189) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5190)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5191) if (tlb_flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5192) kvm_x86_ops.tlb_flush_gva(vcpu, gva);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5194) ++vcpu->stat.invlpg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5195)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5196) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5197) * Mappings not reachable via the current cr3 or the prev_roots will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5198) * synced when switching to that cr3, so nothing needs to be done here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5199) * for them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5200) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5201) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5202) EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5203)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5204) void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5205) int tdp_huge_page_level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5206) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5207) tdp_enabled = enable_tdp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5208) max_tdp_level = tdp_max_root_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5209)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5210) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5211) * max_huge_page_level reflects KVM's MMU capabilities irrespective
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5212) * of kernel support, e.g. KVM may be capable of using 1GB pages when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5213) * the kernel is not. But, KVM never creates a page size greater than
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5214) * what is used by the kernel for any given HVA, i.e. the kernel's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5215) * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5216) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5217) if (tdp_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5218) max_huge_page_level = tdp_huge_page_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5219) else if (boot_cpu_has(X86_FEATURE_GBPAGES))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5220) max_huge_page_level = PG_LEVEL_1G;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5221) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5222) max_huge_page_level = PG_LEVEL_2M;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5223) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5224) EXPORT_SYMBOL_GPL(kvm_configure_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5225)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5226) /* The return value indicates if tlb flush on all vcpus is needed. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5227) typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5228)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5229) /* The caller should hold mmu-lock before calling this function. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5230) static __always_inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5231) slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5232) slot_level_handler fn, int start_level, int end_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5233) gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5234) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5235) struct slot_rmap_walk_iterator iterator;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5236) bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5238) for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5239) end_gfn, &iterator) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5240) if (iterator.rmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5241) flush |= fn(kvm, iterator.rmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5242)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5243) if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5244) if (flush && lock_flush_tlb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5245) kvm_flush_remote_tlbs_with_address(kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5246) start_gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5247) iterator.gfn - start_gfn + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5248) flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5249) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5250) cond_resched_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5251) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5252) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5253)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5254) if (flush && lock_flush_tlb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5255) kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5256) end_gfn - start_gfn + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5257) flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5258) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5260) return flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5261) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5262)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5263) static __always_inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5264) slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5265) slot_level_handler fn, int start_level, int end_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5266) bool lock_flush_tlb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5267) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5268) return slot_handle_level_range(kvm, memslot, fn, start_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5269) end_level, memslot->base_gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5270) memslot->base_gfn + memslot->npages - 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5271) lock_flush_tlb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5272) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5274) static __always_inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5275) slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5276) slot_level_handler fn, bool lock_flush_tlb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5277) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5278) return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5279) KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5280) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5281)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5282) static __always_inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5283) slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5284) slot_level_handler fn, bool lock_flush_tlb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5285) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5286) return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5287) KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5288) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5289)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5290) static __always_inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5291) slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5292) slot_level_handler fn, bool lock_flush_tlb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5293) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5294) return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5295) PG_LEVEL_4K, lock_flush_tlb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5296) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5298) static void free_mmu_pages(struct kvm_mmu *mmu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5299) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5300) free_page((unsigned long)mmu->pae_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5301) free_page((unsigned long)mmu->lm_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5302) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5303)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5304) static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5305) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5306) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5307) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5309) mmu->root_hpa = INVALID_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5310) mmu->root_pgd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5311) mmu->translate_gpa = translate_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5312) for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5313) mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5315) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5316) * When using PAE paging, the four PDPTEs are treated as 'root' pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5317) * while the PDP table is a per-vCPU construct that's allocated at MMU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5318) * creation. When emulating 32-bit mode, cr3 is only 32 bits even on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5319) * x86_64. Therefore we need to allocate the PDP table in the first
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5320) * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5321) * generally doesn't use PAE paging and can skip allocating the PDP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5322) * table. The main exception, handled here, is SVM's 32-bit NPT. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5323) * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5324) * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5325) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5326) if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5327) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5328)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5329) page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5330) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5331) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5332)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5333) mmu->pae_root = page_address(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5334) for (i = 0; i < 4; ++i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5335) mmu->pae_root[i] = INVALID_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5336)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5337) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5338) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5339)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5340) int kvm_mmu_create(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5341) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5342) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5343)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5344) vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5345) vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5346)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5347) vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5348) vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5349)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5350) vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5352) vcpu->arch.mmu = &vcpu->arch.root_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5353) vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5354)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5355) vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5357) ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5358) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5359) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5361) ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5362) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5363) goto fail_allocate_root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5364)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5365) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5366) fail_allocate_root:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5367) free_mmu_pages(&vcpu->arch.guest_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5368) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5369) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5370)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5371) #define BATCH_ZAP_PAGES 10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5372) static void kvm_zap_obsolete_pages(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5373) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5374) struct kvm_mmu_page *sp, *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5375) int nr_zapped, batch = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5376)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5377) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5378) list_for_each_entry_safe_reverse(sp, node,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5379) &kvm->arch.active_mmu_pages, link) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5380) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5381) * No obsolete valid page exists before a newly created page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5382) * since active_mmu_pages is a FIFO list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5383) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5384) if (!is_obsolete_sp(kvm, sp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5385) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5386)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5387) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5388) * Invalid pages should never land back on the list of active
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5389) * pages. Skip the bogus page, otherwise we'll get stuck in an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5390) * infinite loop if the page gets put back on the list (again).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5391) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5392) if (WARN_ON(sp->role.invalid))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5393) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5394)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5395) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5396) * No need to flush the TLB since we're only zapping shadow
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5397) * pages with an obsolete generation number and all vCPUS have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5398) * loaded a new root, i.e. the shadow pages being zapped cannot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5399) * be in active use by the guest.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5400) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5401) if (batch >= BATCH_ZAP_PAGES &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5402) cond_resched_lock(&kvm->mmu_lock)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5403) batch = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5404) goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5405) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5406)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5407) if (__kvm_mmu_prepare_zap_page(kvm, sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5408) &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5409) batch += nr_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5410) goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5411) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5412) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5413)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5414) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5415) * Trigger a remote TLB flush before freeing the page tables to ensure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5416) * KVM is not in the middle of a lockless shadow page table walk, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5417) * may reference the pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5418) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5419) kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5420) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5421)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5422) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5423) * Fast invalidate all shadow pages and use lock-break technique
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5424) * to zap obsolete pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5425) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5426) * It's required when memslot is being deleted or VM is being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5427) * destroyed, in these cases, we should ensure that KVM MMU does
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5428) * not use any resource of the being-deleted slot or all slots
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5429) * after calling the function.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5430) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5431) static void kvm_mmu_zap_all_fast(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5432) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5433) lockdep_assert_held(&kvm->slots_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5435) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5436) trace_kvm_mmu_zap_all_fast(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5437)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5438) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5439) * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5440) * held for the entire duration of zapping obsolete pages, it's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5441) * impossible for there to be multiple invalid generations associated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5442) * with *valid* shadow pages at any given time, i.e. there is exactly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5443) * one valid generation and (at most) one invalid generation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5444) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5445) kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5446)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5447) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5448) * Notify all vcpus to reload its shadow page table and flush TLB.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5449) * Then all vcpus will switch to new shadow page table with the new
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5450) * mmu_valid_gen.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5451) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5452) * Note: we need to do this under the protection of mmu_lock,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5453) * otherwise, vcpu would purge shadow page but miss tlb flush.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5454) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5455) kvm_reload_remote_mmus(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5456)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5457) kvm_zap_obsolete_pages(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5458)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5459) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5460) kvm_tdp_mmu_zap_all(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5461)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5462) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5463) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5464)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5465) static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5466) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5467) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5468) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5469)
/*
 * Page-track "flush slot" callback (installed by kvm_mmu_init_vm()).
 *
 * @slot and @node are part of the callback signature but are not used: the
 * whole MMU is invalidated through kvm_mmu_zap_all_fast() regardless of
 * which slot triggered the notification.
 */
static void
kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
					struct kvm_memory_slot *slot,
					struct kvm_page_track_notifier_node *node)
{
	kvm_mmu_zap_all_fast(kvm);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5476)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5477) void kvm_mmu_init_vm(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5478) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5479) struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5480)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5481) kvm_mmu_init_tdp_mmu(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5482)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5483) node->track_write = kvm_mmu_pte_write;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5484) node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5485) kvm_page_track_register_notifier(kvm, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5486) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5487)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5488) void kvm_mmu_uninit_vm(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5489) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5490) struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5492) kvm_page_track_unregister_notifier(kvm, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5493)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5494) kvm_mmu_uninit_tdp_mmu(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5495) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5497) void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5498) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5499) struct kvm_memslots *slots;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5500) struct kvm_memory_slot *memslot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5501) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5502) bool flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5504) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5505) for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5506) slots = __kvm_memslots(kvm, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5507) kvm_for_each_memslot(memslot, slots) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5508) gfn_t start, end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5509)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5510) start = max(gfn_start, memslot->base_gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5511) end = min(gfn_end, memslot->base_gfn + memslot->npages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5512) if (start >= end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5513) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5515) slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5516) PG_LEVEL_4K,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5517) KVM_MAX_HUGEPAGE_LEVEL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5518) start, end - 1, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5519) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5520) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5521)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5522) if (kvm->arch.tdp_mmu_enabled) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5523) flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5524) if (flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5525) kvm_flush_remote_tlbs(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5526) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5527)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5528) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5529) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5530)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5531) static bool slot_rmap_write_protect(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5532) struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5533) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5534) return __rmap_write_protect(kvm, rmap_head, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5535) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5536)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5537) void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5538) struct kvm_memory_slot *memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5539) int start_level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5540) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5541) bool flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5542)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5543) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5544) flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5545) start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5546) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5547) flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5548) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5550) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5551) * We can flush all the TLBs out of the mmu lock without TLB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5552) * corruption since we just change the spte from writable to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5553) * readonly so that we only need to care the case of changing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5554) * spte from present to present (changing the spte from present
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5555) * to nonpresent will flush all the TLBs immediately), in other
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5556) * words, the only case we care is mmu_spte_update() where we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5557) * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5558) * instead of PT_WRITABLE_MASK, that means it does not depend
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5559) * on PT_WRITABLE_MASK anymore.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5560) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5561) if (flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5562) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5563) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5564)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5565) static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5566) struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5567) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5568) u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5569) struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5570) int need_tlb_flush = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5571) kvm_pfn_t pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5572) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5573)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5574) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5575) for_each_rmap_spte(rmap_head, &iter, sptep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5576) sp = sptep_to_sp(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5577) pfn = spte_to_pfn(*sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5578)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5579) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5580) * We cannot do huge page mapping for indirect shadow pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5581) * which are found on the last rmap (level = 1) when not using
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5582) * tdp; such shadow pages are synced with the page table in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5583) * the guest, and the guest page table is using 4K page size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5584) * mapping if the indirect sp has level = 1.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5585) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5586) if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5587) (kvm_is_zone_device_pfn(pfn) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5588) PageCompound(pfn_to_page(pfn)))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5589) pte_list_remove(rmap_head, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5590)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5591) if (kvm_available_flush_tlb_with_range())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5592) kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5593) KVM_PAGES_PER_HPAGE(sp->role.level));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5594) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5595) need_tlb_flush = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5597) goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5598) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5599) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5600)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5601) return need_tlb_flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5602) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5603)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5604) void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5605) const struct kvm_memory_slot *memslot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5606) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5607) /* FIXME: const-ify all uses of struct kvm_memory_slot. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5608) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5609) slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5610) kvm_mmu_zap_collapsible_spte, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5612) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5613) kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5614) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5615) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5616)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5617) void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5618) struct kvm_memory_slot *memslot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5619) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5620) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5621) * All current use cases for flushing the TLBs for a specific memslot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5622) * are related to dirty logging, and do the TLB flush out of mmu_lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5623) * The interaction between the various operations on memslot must be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5624) * serialized by slots_locks to ensure the TLB flush from one operation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5625) * is observed by any other operation on the same memslot.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5626) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5627) lockdep_assert_held(&kvm->slots_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5628) kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5629) memslot->npages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5630) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5631)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5632) void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5633) struct kvm_memory_slot *memslot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5634) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5635) bool flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5636)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5637) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5638) flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5639) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5640) flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5641) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5642)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5643) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5644) * It's also safe to flush TLBs out of mmu lock here as currently this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5645) * function is only used for dirty logging, in which case flushing TLB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5646) * out of mmu lock also guarantees no dirty pages will be lost in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5647) * dirty_bitmap.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5648) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5649) if (flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5650) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5651) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5652) EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5653)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5654) void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5655) struct kvm_memory_slot *memslot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5656) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5657) bool flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5658)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5659) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5660) flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5661) false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5662) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5663) flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5664) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5665)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5666) if (flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5667) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5668) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5669) EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5670)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5671) void kvm_mmu_slot_set_dirty(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5672) struct kvm_memory_slot *memslot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5673) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5674) bool flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5675)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5676) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5677) flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5678) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5679) flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5680) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5681)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5682) if (flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5683) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5684) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5685) EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5687) void kvm_mmu_zap_all(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5688) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5689) struct kvm_mmu_page *sp, *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5690) LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5691) int ign;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5693) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5694) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5695) list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5696) if (WARN_ON(sp->role.invalid))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5697) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5698) if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5699) goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5700) if (cond_resched_lock(&kvm->mmu_lock))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5701) goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5702) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5703)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5704) kvm_mmu_commit_zap_page(kvm, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5705)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5706) if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5707) kvm_tdp_mmu_zap_all(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5708)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5709) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5710) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5711)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5712) void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5713) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5714) WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5715)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5716) gen &= MMIO_SPTE_GEN_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5717)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5718) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5719) * Generation numbers are incremented in multiples of the number of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5720) * address spaces in order to provide unique generations across all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5721) * address spaces. Strip what is effectively the address space
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5722) * modifier prior to checking for a wrap of the MMIO generation so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5723) * that a wrap in any address space is detected.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5724) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5725) gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5726)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5727) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5728) * The very rare case: if the MMIO generation number has wrapped,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5729) * zap all shadow pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5730) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5731) if (unlikely(gen == 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5732) kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5733) kvm_mmu_zap_all_fast(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5734) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5735) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5736)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5737) static unsigned long
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5738) mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5739) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5740) struct kvm *kvm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5741) int nr_to_scan = sc->nr_to_scan;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5742) unsigned long freed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5743)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5744) mutex_lock(&kvm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5746) list_for_each_entry(kvm, &vm_list, vm_list) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5747) int idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5748) LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5749)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5750) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5751) * Never scan more than sc->nr_to_scan VM instances.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5752) * Will not hit this condition practically since we do not try
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5753) * to shrink more than one VM and it is very unlikely to see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5754) * !n_used_mmu_pages so many times.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5755) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5756) if (!nr_to_scan--)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5757) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5758) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5759) * n_used_mmu_pages is accessed without holding kvm->mmu_lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5760) * here. We may skip a VM instance errorneosly, but we do not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5761) * want to shrink a VM that only started to populate its MMU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5762) * anyway.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5763) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5764) if (!kvm->arch.n_used_mmu_pages &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5765) !kvm_has_zapped_obsolete_pages(kvm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5766) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5767)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5768) idx = srcu_read_lock(&kvm->srcu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5769) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5770)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5771) if (kvm_has_zapped_obsolete_pages(kvm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5772) kvm_mmu_commit_zap_page(kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5773) &kvm->arch.zapped_obsolete_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5774) goto unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5775) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5776)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5777) freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5778)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5779) unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5780) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5781) srcu_read_unlock(&kvm->srcu, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5782)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5783) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5784) * unfair on small ones
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5785) * per-vm shrinkers cry out
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5786) * sadness comes quickly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5787) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5788) list_move_tail(&kvm->vm_list, &vm_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5789) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5790) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5792) mutex_unlock(&kvm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5793) return freed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5794) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5795)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5796) static unsigned long
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5797) mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5798) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5799) return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5800) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5801)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5802) static struct shrinker mmu_shrinker = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5803) .count_objects = mmu_shrink_count,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5804) .scan_objects = mmu_shrink_scan,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5805) .seeks = DEFAULT_SEEKS * 10,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5806) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5808) static void mmu_destroy_caches(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5809) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5810) kmem_cache_destroy(pte_list_desc_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5811) kmem_cache_destroy(mmu_page_header_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5812) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5813)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5814) static void kvm_set_mmio_spte_mask(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5815) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5816) u64 mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5817)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5818) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5819) * Set a reserved PA bit in MMIO SPTEs to generate page faults with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5820) * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5821) * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5822) * 52-bit physical addresses then there are no reserved PA bits in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5823) * PTEs and so the reserved PA approach must be disabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5824) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5825) if (shadow_phys_bits < 52)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5826) mask = BIT_ULL(51) | PT_PRESENT_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5827) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5828) mask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5829)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5830) kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5831) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5833) static bool get_nx_auto_mode(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5834) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5835) /* Return true when CPU has the bug, and mitigations are ON */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5836) return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5837) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5839) static void __set_nx_huge_pages(bool val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5840) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5841) nx_huge_pages = itlb_multihit_kvm_mitigation = val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5842) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5843)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5844) static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5845) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5846) bool old_val = nx_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5847) bool new_val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5848)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5849) /* In "auto" mode deploy workaround only if CPU has the bug. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5850) if (sysfs_streq(val, "off"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5851) new_val = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5852) else if (sysfs_streq(val, "force"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5853) new_val = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5854) else if (sysfs_streq(val, "auto"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5855) new_val = get_nx_auto_mode();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5856) else if (strtobool(val, &new_val) < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5857) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5858)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5859) __set_nx_huge_pages(new_val);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5860)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5861) if (new_val != old_val) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5862) struct kvm *kvm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5863)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5864) mutex_lock(&kvm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5866) list_for_each_entry(kvm, &vm_list, vm_list) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5867) mutex_lock(&kvm->slots_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5868) kvm_mmu_zap_all_fast(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5869) mutex_unlock(&kvm->slots_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5870)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5871) wake_up_process(kvm->arch.nx_lpage_recovery_thread);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5872) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5873) mutex_unlock(&kvm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5874) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5876) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5877) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5878)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5879) int kvm_mmu_module_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5880) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5881) int ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5882)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5883) if (nx_huge_pages == -1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5884) __set_nx_huge_pages(get_nx_auto_mode());
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5885)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5886) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5887) * MMU roles use union aliasing which is, generally speaking, an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5888) * undefined behavior. However, we supposedly know how compilers behave
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5889) * and the current status quo is unlikely to change. Guardians below are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5890) * supposed to let us know if the assumption becomes false.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5891) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5892) BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5893) BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5894) BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5896) kvm_mmu_reset_all_pte_masks();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5898) kvm_set_mmio_spte_mask();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5900) pte_list_desc_cache = kmem_cache_create("pte_list_desc",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5901) sizeof(struct pte_list_desc),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5902) 0, SLAB_ACCOUNT, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5903) if (!pte_list_desc_cache)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5904) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5905)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5906) mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5907) sizeof(struct kvm_mmu_page),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5908) 0, SLAB_ACCOUNT, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5909) if (!mmu_page_header_cache)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5910) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5911)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5912) if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5913) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5914)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5915) ret = register_shrinker(&mmu_shrinker);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5916) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5917) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5919) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5921) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5922) mmu_destroy_caches();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5923) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5924) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5925)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5926) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5927) * Calculate mmu pages needed for kvm.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5928) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5929) unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5930) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5931) unsigned long nr_mmu_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5932) unsigned long nr_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5933) struct kvm_memslots *slots;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5934) struct kvm_memory_slot *memslot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5935) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5936)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5937) for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5938) slots = __kvm_memslots(kvm, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5940) kvm_for_each_memslot(memslot, slots)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5941) nr_pages += memslot->npages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5942) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5944) nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5945) nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5946)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5947) return nr_mmu_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5948) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5950) void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5951) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5952) kvm_mmu_unload(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5953) free_mmu_pages(&vcpu->arch.root_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5954) free_mmu_pages(&vcpu->arch.guest_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5955) mmu_free_memory_caches(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5956) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5958) void kvm_mmu_module_exit(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5959) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5960) mmu_destroy_caches();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5961) percpu_counter_destroy(&kvm_total_used_mmu_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5962) unregister_shrinker(&mmu_shrinker);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5963) mmu_audit_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5964) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5965)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5966) static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5967) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5968) unsigned int old_val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5969) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5971) old_val = nx_huge_pages_recovery_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5972) err = param_set_uint(val, kp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5973) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5974) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5975)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5976) if (READ_ONCE(nx_huge_pages) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5977) !old_val && nx_huge_pages_recovery_ratio) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5978) struct kvm *kvm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5979)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5980) mutex_lock(&kvm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5982) list_for_each_entry(kvm, &vm_list, vm_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5983) wake_up_process(kvm->arch.nx_lpage_recovery_thread);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5984)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5985) mutex_unlock(&kvm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5986) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5987)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5988) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5989) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5990)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5991) static void kvm_recover_nx_lpages(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5992) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5993) int rcu_idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5994) struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5995) unsigned int ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5996) LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5997) bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5998) ulong to_zap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5999)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6000) rcu_idx = srcu_read_lock(&kvm->srcu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6001) spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6002)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6003) ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6004) to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6005) for ( ; to_zap; --to_zap) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6006) if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6007) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6008)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6009) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6010) * We use a separate list instead of just using active_mmu_pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6011) * because the number of lpage_disallowed pages is expected to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6012) * be relatively small compared to the total.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6013) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6014) sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6015) struct kvm_mmu_page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6016) lpage_disallowed_link);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6017) WARN_ON_ONCE(!sp->lpage_disallowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6018) if (sp->tdp_mmu_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6019) flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6020) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6021) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6022) WARN_ON_ONCE(sp->lpage_disallowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6023) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6024)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6025) if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6026) kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6027) cond_resched_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6028) flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6029) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6030) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6031) kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6032)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6033) spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6034) srcu_read_unlock(&kvm->srcu, rcu_idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6035) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6036)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6037) static long get_nx_lpage_recovery_timeout(u64 start_time)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6038) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6039) return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6040) ? start_time + 60 * HZ - get_jiffies_64()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6041) : MAX_SCHEDULE_TIMEOUT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6042) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6043)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6044) static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6045) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6046) u64 start_time;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6047) long remaining_time;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6049) while (true) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6050) start_time = get_jiffies_64();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6051) remaining_time = get_nx_lpage_recovery_timeout(start_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6052)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6053) set_current_state(TASK_INTERRUPTIBLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6054) while (!kthread_should_stop() && remaining_time > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6055) schedule_timeout(remaining_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6056) remaining_time = get_nx_lpage_recovery_timeout(start_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6057) set_current_state(TASK_INTERRUPTIBLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6058) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6059)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6060) set_current_state(TASK_RUNNING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6062) if (kthread_should_stop())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6063) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6065) kvm_recover_nx_lpages(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6066) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6067) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6069) int kvm_mmu_post_init_vm(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6070) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6071) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6072)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6073) err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6074) "kvm-nx-lpage-recovery",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6075) &kvm->arch.nx_lpage_recovery_thread);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6076) if (!err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6077) kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6078)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6079) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6080) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6081)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6082) void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6083) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6084) if (kvm->arch.nx_lpage_recovery_thread)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6085) kthread_stop(kvm->arch.nx_lpage_recovery_thread);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6086) }