Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    1) // SPDX-License-Identifier: GPL-2.0-only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    3)  * Kernel-based Virtual Machine driver for Linux
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    4)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    5)  * This module enables machines with Intel VT-x extensions to run virtual
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    6)  * machines without emulation or binary translation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    7)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    8)  * MMU support
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    9)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   10)  * Copyright (C) 2006 Qumranet, Inc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   11)  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   12)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   13)  * Authors:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   14)  *   Yaniv Kamay  <yaniv@qumranet.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   15)  *   Avi Kivity   <avi@qumranet.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   16)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   17) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   18) #include "irq.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   19) #include "ioapic.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   20) #include "mmu.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   21) #include "mmu_internal.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   22) #include "tdp_mmu.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   23) #include "x86.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   24) #include "kvm_cache_regs.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   25) #include "kvm_emulate.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   26) #include "cpuid.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   27) #include "spte.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   28) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   29) #include <linux/kvm_host.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   30) #include <linux/types.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   31) #include <linux/string.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   32) #include <linux/mm.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   33) #include <linux/highmem.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   34) #include <linux/moduleparam.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   35) #include <linux/export.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   36) #include <linux/swap.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   37) #include <linux/hugetlb.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   38) #include <linux/compiler.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   39) #include <linux/srcu.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   40) #include <linux/slab.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   41) #include <linux/sched/signal.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   42) #include <linux/uaccess.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   43) #include <linux/hash.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   44) #include <linux/kern_levels.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   45) #include <linux/kthread.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   46) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   47) #include <asm/page.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   48) #include <asm/memtype.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   49) #include <asm/cmpxchg.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   50) #include <asm/io.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   51) #include <asm/vmx.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   52) #include <asm/kvm_page_track.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   53) #include "trace.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   54) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   55) #include "paging.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   56) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   57) extern bool itlb_multihit_kvm_mitigation;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   58) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   59) static int __read_mostly nx_huge_pages = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   60) #ifdef CONFIG_PREEMPT_RT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   61) /* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   62) static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   63) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   64) static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   65) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   66) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   67) static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   68) static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   69) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   70) static const struct kernel_param_ops nx_huge_pages_ops = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   71) 	.set = set_nx_huge_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   72) 	.get = param_get_bool,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   73) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   74) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   75) static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   76) 	.set = set_nx_huge_pages_recovery_ratio,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   77) 	.get = param_get_uint,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   78) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   79) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   80) module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   81) __MODULE_PARM_TYPE(nx_huge_pages, "bool");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   82) module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   83) 		&nx_huge_pages_recovery_ratio, 0644);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   84) __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   85) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   86) static bool __read_mostly force_flush_and_sync_on_reuse;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   87) module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   88) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   89) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   90)  * When setting this variable to true it enables Two-Dimensional-Paging
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   91)  * where the hardware walks 2 page tables:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   92)  * 1. the guest-virtual to guest-physical
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   93)  * 2. while doing 1. it walks guest-physical to host-physical
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   94)  * If the hardware supports that we don't need to do shadow paging.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   95)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   96) bool tdp_enabled = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   97) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   98) static int max_huge_page_level __read_mostly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   99) static int max_tdp_level __read_mostly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  100) 
/*
 * Audit point identifiers; values are assigned sequentially from 0.
 * NOTE(review): presumably consumed by the MMU audit code -- no user is
 * visible in this part of the file.
 */
enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC,
};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  109) 
#ifdef MMU_DEBUG
/* Debug switch, off by default; only built when MMU_DEBUG is defined. */
bool dbg;
module_param(dbg, bool, 0644);
#endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  114) 
#define PTE_PREFETCH_NUM		8

/* Each level of a 32-bit page table resolves this many address bits. */
#define PT32_LEVEL_BITS 10

/*
 * Bit position at which the index for @level starts (level 1 is the leaf).
 * The argument is fully parenthesized so that an expression argument such
 * as "cond ? a : b" or "x << y" is decremented as a whole.
 */
#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + ((level) - 1) * PT32_LEVEL_BITS)

#define PT32_BASE_ADDR_MASK PAGE_MASK

/* Page-aligned offset bits that lie below the region mapped at @level. */
#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << PT32_LEVEL_SHIFT(level)) - 1))

/* Index of @address within the page table at @level. */
#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))

/* Base address bits of the region mapped by one second-level entry. */
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))

/* Base address bits of the region mapped by one entry at @level. */
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << PT32_LEVEL_SHIFT(level)) - 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  136) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  137) #include <trace/events/kvm.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  138) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  139) /* make pte_list_desc fit well in cache line */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  140) #define PTE_LIST_EXT 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  141) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  142) struct pte_list_desc {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  143) 	u64 *sptes[PTE_LIST_EXT];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  144) 	struct pte_list_desc *more;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  145) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  146) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  147) struct kvm_shadow_walk_iterator {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  148) 	u64 addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  149) 	hpa_t shadow_addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  150) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  151) 	int level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  152) 	unsigned index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  153) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  154) 
/*
 * Walk the shadow page table for @_addr starting from an explicit @_root.
 * @_walker is an lvalue walker that is re-initialised by the macro.
 */
#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
					 (_root), (_addr));                \
	     shadow_walk_okay(&(_walker));			           \
	     shadow_walk_next(&(_walker)))

/* As above, but the starting point is chosen by shadow_walk_init(). */
#define for_each_shadow_entry(_vcpu, _addr, _walker)            \
	for (shadow_walk_init(&(_walker), (_vcpu), (_addr));	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

/*
 * Variant that, on every iteration, loads the current entry into @spte via
 * mmu_spte_get_lockless() and advances using that snapshot.  @_walker is
 * parenthesized before member access so that any lvalue expression
 * (for example "*ptr") may be passed.
 */
#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), (_vcpu), (_addr));		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless((_walker).sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  171) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  172) static struct kmem_cache *pte_list_desc_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  173) struct kmem_cache *mmu_page_header_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  174) static struct percpu_counter kvm_total_used_mmu_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  175) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  176) static void mmu_spte_set(u64 *sptep, u64 spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  177) static union kvm_mmu_page_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  178) kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  179) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  180) #define CREATE_TRACE_POINTS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  181) #include "mmutrace.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  182) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  183) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  184) static inline bool kvm_available_flush_tlb_with_range(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  185) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  186) 	return kvm_x86_ops.tlb_remote_flush_with_range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  187) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  188) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  189) static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  190) 		struct kvm_tlb_range *range)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  191) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  192) 	int ret = -ENOTSUPP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  193) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  194) 	if (range && kvm_x86_ops.tlb_remote_flush_with_range)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  195) 		ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  196) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  197) 	if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  198) 		kvm_flush_remote_tlbs(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  199) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  200) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  201) void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  202) 		u64 start_gfn, u64 pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  203) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  204) 	struct kvm_tlb_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  205) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  206) 	range.start_gfn = start_gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  207) 	range.pages = pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  208) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  209) 	kvm_flush_remote_tlbs_with_range(kvm, &range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  210) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  211) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  212) bool is_nx_huge_page_enabled(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  213) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  214) 	return READ_ONCE(nx_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  216) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  217) static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  218) 			   unsigned int access)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  219) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  220) 	u64 mask = make_mmio_spte(vcpu, gfn, access);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  221) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  222) 	trace_mark_mmio_spte(sptep, gfn, mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  223) 	mmu_spte_set(sptep, mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  224) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  225) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  226) static gfn_t get_mmio_spte_gfn(u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  227) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  228) 	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  229) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  230) 	gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  231) 	       & shadow_nonpresent_or_rsvd_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  232) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  233) 	return gpa >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  234) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  235) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  236) static unsigned get_mmio_spte_access(u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  237) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  238) 	return spte & shadow_mmio_access_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  239) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  240) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  241) static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  242) 			  kvm_pfn_t pfn, unsigned int access)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  243) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  244) 	if (unlikely(is_noslot_pfn(pfn))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  245) 		mark_mmio_spte(vcpu, sptep, gfn, access);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  246) 		return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  247) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  248) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  249) 	return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  250) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  251) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  252) static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  253) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  254) 	u64 kvm_gen, spte_gen, gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  255) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  256) 	gen = kvm_vcpu_memslots(vcpu)->generation;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  257) 	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  258) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  259) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  260) 	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  261) 	spte_gen = get_mmio_spte_generation(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  262) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  263) 	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  264) 	return likely(kvm_gen == spte_gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  265) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  266) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  267) static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  268)                                   struct x86_exception *exception)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  269) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  270)         return gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  271) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  272) 
/* Unconditionally returns 1. */
static int is_cpuid_PSE36(void)
{
	return 1;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  277) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  278) static int is_nx(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  279) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  280) 	return vcpu->arch.efer & EFER_NX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  281) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  282) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  283) static gfn_t pse36_gfn_delta(u32 gpte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  284) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  285) 	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  286) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  287) 	return (gpte & PT32_DIR_PSE36_MASK) << shift;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  288) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  289) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  290) #ifdef CONFIG_X86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  291) static void __set_spte(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  292) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  293) 	WRITE_ONCE(*sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  294) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  295) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  296) static void __update_clear_spte_fast(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  297) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  298) 	WRITE_ONCE(*sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  299) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  300) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  301) static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  302) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  303) 	return xchg(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  304) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  305) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  306) static u64 __get_spte_lockless(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  307) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  308) 	return READ_ONCE(*sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  309) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  310) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  311) union split_spte {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  312) 	struct {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  313) 		u32 spte_low;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  314) 		u32 spte_high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  315) 	};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  316) 	u64 spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  317) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  318) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  319) static void count_spte_clear(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  320) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  321) 	struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  322) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  323) 	if (is_shadow_present_pte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  324) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  325) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  326) 	/* Ensure the spte is completely set before we increase the count */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  327) 	smp_wmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  328) 	sp->clear_spte_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  329) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  330) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  331) static void __set_spte(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  332) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  333) 	union split_spte *ssptep, sspte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  334) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  335) 	ssptep = (union split_spte *)sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  336) 	sspte = (union split_spte)spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  337) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  338) 	ssptep->spte_high = sspte.spte_high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  339) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  340) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  341) 	 * If we map the spte from nonpresent to present, We should store
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  342) 	 * the high bits firstly, then set present bit, so cpu can not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  343) 	 * fetch this spte while we are setting the spte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  344) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  345) 	smp_wmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  346) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  347) 	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  348) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  349) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  350) static void __update_clear_spte_fast(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  351) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  352) 	union split_spte *ssptep, sspte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  353) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  354) 	ssptep = (union split_spte *)sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  355) 	sspte = (union split_spte)spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  356) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  357) 	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  358) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  359) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  360) 	 * If we map the spte from present to nonpresent, we should clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  361) 	 * present bit firstly to avoid vcpu fetch the old high bits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  362) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  363) 	smp_wmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  364) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  365) 	ssptep->spte_high = sspte.spte_high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  366) 	count_spte_clear(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  367) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  368) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  369) static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  370) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  371) 	union split_spte *ssptep, sspte, orig;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  372) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  373) 	ssptep = (union split_spte *)sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  374) 	sspte = (union split_spte)spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  375) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  376) 	/* xchg acts as a barrier before the setting of the high bits */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  377) 	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  378) 	orig.spte_high = ssptep->spte_high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  379) 	ssptep->spte_high = sspte.spte_high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  380) 	count_spte_clear(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  381) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  382) 	return orig.spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  383) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  384) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  385) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  386)  * The idea using the light way get the spte on x86_32 guest is from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  387)  * gup_get_pte (mm/gup.c).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  388)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  389)  * An spte tlb flush may be pending, because kvm_set_pte_rmapp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  390)  * coalesces them and we are running out of the MMU lock.  Therefore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  391)  * we need to protect against in-progress updates of the spte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  392)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  393)  * Reading the spte while an update is in progress may get the old value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  394)  * for the high part of the spte.  The race is fine for a present->non-present
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  395)  * change (because the high part of the spte is ignored for non-present spte),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  396)  * but for a present->present change we must reread the spte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  397)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  398)  * All such changes are done in two steps (present->non-present and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  399)  * non-present->present), hence it is enough to count the number of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  400)  * present->non-present updates: if it changed while reading the spte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  401)  * we might have hit the race.  This is done using clear_spte_count.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  402)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  403) static u64 __get_spte_lockless(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  404) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  405) 	struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  406) 	union split_spte spte, *orig = (union split_spte *)sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  407) 	int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  408) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  409) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  410) 	count = sp->clear_spte_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  411) 	smp_rmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  412) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  413) 	spte.spte_low = orig->spte_low;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  414) 	smp_rmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  415) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  416) 	spte.spte_high = orig->spte_high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  417) 	smp_rmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  418) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  419) 	if (unlikely(spte.spte_low != orig->spte_low ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  420) 	      count != sp->clear_spte_count))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  421) 		goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  422) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  423) 	return spte.spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  424) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  425) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  426) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  427) static bool spte_has_volatile_bits(u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  428) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  429) 	if (!is_shadow_present_pte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  430) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  431) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  432) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  433) 	 * Always atomically update spte if it can be updated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  434) 	 * out of mmu-lock, it can ensure dirty bit is not lost,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  435) 	 * also, it can help us to get a stable is_writable_pte()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  436) 	 * to ensure tlb flush is not missed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  437) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  438) 	if (spte_can_locklessly_be_made_writable(spte) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  439) 	    is_access_track_spte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  440) 		return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  441) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  442) 	if (spte_ad_enabled(spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  443) 		if ((spte & shadow_accessed_mask) == 0 ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  444) 	    	    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  445) 			return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  446) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  447) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  448) 	return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  449) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  450) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  451) /* Rules for using mmu_spte_set:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  452)  * Set the sptep from nonpresent to present.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  453)  * Note: the sptep being assigned *must* be either not present
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  454)  * or in a state where the hardware will not attempt to update
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  455)  * the spte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  456)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  457) static void mmu_spte_set(u64 *sptep, u64 new_spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  458) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  459) 	WARN_ON(is_shadow_present_pte(*sptep));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  460) 	__set_spte(sptep, new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  461) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  462) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  463) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  464)  * Update the SPTE (excluding the PFN), but do not track changes in its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  465)  * accessed/dirty status.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  466)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  467) static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  468) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  469) 	u64 old_spte = *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  470) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  471) 	WARN_ON(!is_shadow_present_pte(new_spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  472) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  473) 	if (!is_shadow_present_pte(old_spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  474) 		mmu_spte_set(sptep, new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  475) 		return old_spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  476) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  477) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  478) 	if (!spte_has_volatile_bits(old_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  479) 		__update_clear_spte_fast(sptep, new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  480) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  481) 		old_spte = __update_clear_spte_slow(sptep, new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  482) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  483) 	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  484) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  485) 	return old_spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  486) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  487) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  488) /* Rules for using mmu_spte_update:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  489)  * Update the state bits, it means the mapped pfn is not changed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  490)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  491)  * Whenever we overwrite a writable spte with a read-only one we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  492)  * should flush remote TLBs. Otherwise rmap_write_protect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  493)  * will find a read-only spte, even though the writable spte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  494)  * might be cached on a CPU's TLB, the return value indicates this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  495)  * case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  496)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  497)  * Returns true if the TLB needs to be flushed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  498)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  499) static bool mmu_spte_update(u64 *sptep, u64 new_spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  500) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  501) 	bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  502) 	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  503) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  504) 	if (!is_shadow_present_pte(old_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  505) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  506) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  507) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  508) 	 * For the spte updated out of mmu-lock is safe, since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  509) 	 * we always atomically update it, see the comments in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  510) 	 * spte_has_volatile_bits().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  511) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  512) 	if (spte_can_locklessly_be_made_writable(old_spte) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  513) 	      !is_writable_pte(new_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  514) 		flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  515) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  516) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  517) 	 * Flush TLB when accessed/dirty states are changed in the page tables,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  518) 	 * to guarantee consistency between TLB and page tables.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  519) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  520) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  521) 	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  522) 		flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  523) 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  524) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  525) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  526) 	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  527) 		flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  528) 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  529) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  530) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  531) 	return flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  532) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  533) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  534) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  535)  * Rules for using mmu_spte_clear_track_bits:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  536)  * It sets the sptep from present to nonpresent, and track the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  537)  * state bits, it is used to clear the last level sptep.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  538)  * Returns non-zero if the PTE was previously valid.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  539)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  540) static int mmu_spte_clear_track_bits(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  541) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  542) 	kvm_pfn_t pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  543) 	u64 old_spte = *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  544) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  545) 	if (!spte_has_volatile_bits(old_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  546) 		__update_clear_spte_fast(sptep, 0ull);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  547) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  548) 		old_spte = __update_clear_spte_slow(sptep, 0ull);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  549) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  550) 	if (!is_shadow_present_pte(old_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  551) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  552) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  553) 	pfn = spte_to_pfn(old_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  554) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  555) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  556) 	 * KVM does not hold the refcount of the page used by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  557) 	 * kvm mmu, before reclaiming the page, we should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  558) 	 * unmap it from mmu first.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  559) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  560) 	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  561) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  562) 	if (is_accessed_spte(old_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  563) 		kvm_set_pfn_accessed(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  564) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  565) 	if (is_dirty_spte(old_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  566) 		kvm_set_pfn_dirty(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  567) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  568) 	return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  569) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  570) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  571) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  572)  * Rules for using mmu_spte_clear_no_track:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  573)  * Directly clear spte without caring the state bits of sptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  574)  * it is used to set the upper level spte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  575)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  576) static void mmu_spte_clear_no_track(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  577) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  578) 	__update_clear_spte_fast(sptep, 0ull);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  579) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  580) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  581) static u64 mmu_spte_get_lockless(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  582) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  583) 	return __get_spte_lockless(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  584) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  585) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  586) /* Restore an acc-track PTE back to a regular PTE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  587) static u64 restore_acc_track_spte(u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  588) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  589) 	u64 new_spte = spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  590) 	u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  591) 			 & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  592) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  593) 	WARN_ON_ONCE(spte_ad_enabled(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  594) 	WARN_ON_ONCE(!is_access_track_spte(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  595) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  596) 	new_spte &= ~shadow_acc_track_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  597) 	new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  598) 		      SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  599) 	new_spte |= saved_bits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  600) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  601) 	return new_spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  602) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  603) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  604) /* Returns the Accessed status of the PTE and resets it at the same time. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  605) static bool mmu_spte_age(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  606) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  607) 	u64 spte = mmu_spte_get_lockless(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  608) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  609) 	if (!is_accessed_spte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  610) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  611) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  612) 	if (spte_ad_enabled(spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  613) 		clear_bit((ffs(shadow_accessed_mask) - 1),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  614) 			  (unsigned long *)sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  615) 	} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  616) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  617) 		 * Capture the dirty status of the page, so that it doesn't get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  618) 		 * lost when the SPTE is marked for access tracking.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  619) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  620) 		if (is_writable_pte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  621) 			kvm_set_pfn_dirty(spte_to_pfn(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  622) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  623) 		spte = mark_spte_for_access_track(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  624) 		mmu_spte_update_no_track(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  625) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  626) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  627) 	return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  628) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  629) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  630) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  631) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  632) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  633) 	 * Prevent page table teardown by making any free-er wait during
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  634) 	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  635) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  636) 	local_irq_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  637) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  638) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  639) 	 * Make sure a following spte read is not reordered ahead of the write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  640) 	 * to vcpu->mode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  641) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  642) 	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  643) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  644) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  645) static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  646) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  647) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  648) 	 * Make sure the write to vcpu->mode is not reordered in front of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  649) 	 * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  650) 	 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  651) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  652) 	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  653) 	local_irq_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  654) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  655) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  656) static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  657) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  658) 	int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  659) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  660) 	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  661) 	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  662) 				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  663) 	if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  664) 		return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  665) 	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  666) 				       PT64_ROOT_MAX_LEVEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  667) 	if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  668) 		return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  669) 	if (maybe_indirect) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  670) 		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  671) 					       PT64_ROOT_MAX_LEVEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  672) 		if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  673) 			return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  674) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  675) 	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  676) 					  PT64_ROOT_MAX_LEVEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  677) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  678) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  679) static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  680) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  681) 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  682) 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  683) 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  684) 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  685) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  686) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  687) static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  688) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  689) 	return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  690) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  691) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  692) static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  693) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  694) 	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  695) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  696) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  697) static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  698) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  699) 	if (!sp->role.direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  700) 		return sp->gfns[index];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  701) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  702) 	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  703) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  704) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  705) static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  706) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  707) 	if (!sp->role.direct) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  708) 		sp->gfns[index] = gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  709) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  710) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  711) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  712) 	if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  713) 		pr_err_ratelimited("gfn mismatch under direct page %llx "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  714) 				   "(expected %llx, got %llx)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  715) 				   sp->gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  716) 				   kvm_mmu_page_get_gfn(sp, index), gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  717) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  718) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  719) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  720)  * Return the pointer to the large page information for a given gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  721)  * handling slots that are not large page aligned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  722)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  723) static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  724) 					      struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  725) 					      int level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  726) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  727) 	unsigned long idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  728) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  729) 	idx = gfn_to_index(gfn, slot->base_gfn, level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  730) 	return &slot->arch.lpage_info[level - 2][idx];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  731) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  732) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  733) static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  734) 					    gfn_t gfn, int count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  735) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  736) 	struct kvm_lpage_info *linfo;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  737) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  738) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  739) 	for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  740) 		linfo = lpage_info_slot(gfn, slot, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  741) 		linfo->disallow_lpage += count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  742) 		WARN_ON(linfo->disallow_lpage < 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  743) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  744) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  745) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  746) void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  747) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  748) 	update_gfn_disallow_lpage_count(slot, gfn, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  749) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  750) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  751) void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  752) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  753) 	update_gfn_disallow_lpage_count(slot, gfn, -1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  754) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  755) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  756) static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  757) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  758) 	struct kvm_memslots *slots;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  759) 	struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  760) 	gfn_t gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  761) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  762) 	kvm->arch.indirect_shadow_pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  763) 	gfn = sp->gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  764) 	slots = kvm_memslots_for_spte_role(kvm, sp->role);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  765) 	slot = __gfn_to_memslot(slots, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  766) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  767) 	/* the non-leaf shadow pages are keeping readonly. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  768) 	if (sp->role.level > PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  769) 		return kvm_slot_page_track_add_page(kvm, slot, gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  770) 						    KVM_PAGE_TRACK_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  771) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  772) 	kvm_mmu_gfn_disallow_lpage(slot, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  773) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  774) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  775) void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  776) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  777) 	if (sp->lpage_disallowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  778) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  779) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  780) 	++kvm->stat.nx_lpage_splits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  781) 	list_add_tail(&sp->lpage_disallowed_link,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  782) 		      &kvm->arch.lpage_disallowed_mmu_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  783) 	sp->lpage_disallowed = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  784) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  785) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  786) static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  787) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  788) 	struct kvm_memslots *slots;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  789) 	struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  790) 	gfn_t gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  791) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  792) 	kvm->arch.indirect_shadow_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  793) 	gfn = sp->gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  794) 	slots = kvm_memslots_for_spte_role(kvm, sp->role);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  795) 	slot = __gfn_to_memslot(slots, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  796) 	if (sp->role.level > PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  797) 		return kvm_slot_page_track_remove_page(kvm, slot, gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  798) 						       KVM_PAGE_TRACK_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  799) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  800) 	kvm_mmu_gfn_allow_lpage(slot, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  801) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  802) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  803) void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  804) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  805) 	--kvm->stat.nx_lpage_splits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  806) 	sp->lpage_disallowed = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  807) 	list_del(&sp->lpage_disallowed_link);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  808) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  809) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  810) static struct kvm_memory_slot *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  811) gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  812) 			    bool no_dirty_log)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  813) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  814) 	struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  815) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  816) 	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  817) 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  818) 		return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  819) 	if (no_dirty_log && slot->dirty_bitmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  820) 		return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  821) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  822) 	return slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  823) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  824) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  825) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  826)  * About rmap_head encoding:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  827)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  828)  * If the bit zero of rmap_head->val is clear, then it points to the only spte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  829)  * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  830)  * pte_list_desc containing more mappings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  831)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  832) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  833) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  834)  * Returns the number of pointers in the rmap chain, not counting the new one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  835)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  836) static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  837) 			struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  838) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  839) 	struct pte_list_desc *desc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  840) 	int i, count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  841) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  842) 	if (!rmap_head->val) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  843) 		rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  844) 		rmap_head->val = (unsigned long)spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  845) 	} else if (!(rmap_head->val & 1)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  846) 		rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  847) 		desc = mmu_alloc_pte_list_desc(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  848) 		desc->sptes[0] = (u64 *)rmap_head->val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  849) 		desc->sptes[1] = spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  850) 		rmap_head->val = (unsigned long)desc | 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  851) 		++count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  852) 	} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  853) 		rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  854) 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  855) 		while (desc->sptes[PTE_LIST_EXT-1]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  856) 			count += PTE_LIST_EXT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  857) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  858) 			if (!desc->more) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  859) 				desc->more = mmu_alloc_pte_list_desc(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  860) 				desc = desc->more;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  861) 				break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  862) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  863) 			desc = desc->more;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  864) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  865) 		for (i = 0; desc->sptes[i]; ++i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  866) 			++count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  867) 		desc->sptes[i] = spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  868) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  869) 	return count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  870) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  871) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  872) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  873) pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  874) 			   struct pte_list_desc *desc, int i,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  875) 			   struct pte_list_desc *prev_desc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  876) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  877) 	int j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  878) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  879) 	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  880) 		;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  881) 	desc->sptes[i] = desc->sptes[j];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  882) 	desc->sptes[j] = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  883) 	if (j != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  884) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  885) 	if (!prev_desc && !desc->more)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  886) 		rmap_head->val = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  887) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  888) 		if (prev_desc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  889) 			prev_desc->more = desc->more;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  890) 		else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  891) 			rmap_head->val = (unsigned long)desc->more | 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  892) 	mmu_free_pte_list_desc(desc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  893) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  894) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  895) static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  896) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  897) 	struct pte_list_desc *desc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  898) 	struct pte_list_desc *prev_desc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  899) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  900) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  901) 	if (!rmap_head->val) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  902) 		pr_err("%s: %p 0->BUG\n", __func__, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  903) 		BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  904) 	} else if (!(rmap_head->val & 1)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  905) 		rmap_printk("%s:  %p 1->0\n", __func__, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  906) 		if ((u64 *)rmap_head->val != spte) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  907) 			pr_err("%s:  %p 1->BUG\n", __func__, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  908) 			BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  909) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  910) 		rmap_head->val = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  911) 	} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  912) 		rmap_printk("%s:  %p many->many\n", __func__, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  913) 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  914) 		prev_desc = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  915) 		while (desc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  916) 			for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  917) 				if (desc->sptes[i] == spte) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  918) 					pte_list_desc_remove_entry(rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  919) 							desc, i, prev_desc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  920) 					return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  921) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  922) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  923) 			prev_desc = desc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  924) 			desc = desc->more;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  925) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  926) 		pr_err("%s: %p many->many\n", __func__, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  927) 		BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  928) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  929) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  930) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  931) static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  932) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  933) 	mmu_spte_clear_track_bits(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  934) 	__pte_list_remove(sptep, rmap_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  935) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  936) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  937) static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  938) 					   struct kvm_memory_slot *slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  939) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  940) 	unsigned long idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  941) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  942) 	idx = gfn_to_index(gfn, slot->base_gfn, level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  943) 	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  944) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  945) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  946) static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  947) 					 struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  948) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  949) 	struct kvm_memslots *slots;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  950) 	struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  951) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  952) 	slots = kvm_memslots_for_spte_role(kvm, sp->role);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  953) 	slot = __gfn_to_memslot(slots, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  954) 	return __gfn_to_rmap(gfn, sp->role.level, slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  955) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  956) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  957) static bool rmap_can_add(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  958) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  959) 	struct kvm_mmu_memory_cache *mc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  960) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  961) 	mc = &vcpu->arch.mmu_pte_list_desc_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  962) 	return kvm_mmu_memory_cache_nr_free_objects(mc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  963) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  964) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  965) static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  966) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  967) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  968) 	struct kvm_rmap_head *rmap_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  969) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  970) 	sp = sptep_to_sp(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  971) 	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  972) 	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  973) 	return pte_list_add(vcpu, spte, rmap_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  974) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  975) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  976) static void rmap_remove(struct kvm *kvm, u64 *spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  977) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  978) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  979) 	gfn_t gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  980) 	struct kvm_rmap_head *rmap_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  981) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  982) 	sp = sptep_to_sp(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  983) 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  984) 	rmap_head = gfn_to_rmap(kvm, gfn, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  985) 	__pte_list_remove(spte, rmap_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  986) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  987) 
/*
 * Cursor used by rmap_get_first()/rmap_get_next() to walk the sptes linked
 * from a rmap.  The fields are private to those helpers and must not be
 * relied upon elsewhere.
 */
struct rmap_iterator {
	/* private fields */

	/* descriptor holding the current sptep; NULL for a single-spte rmap */
	struct pte_list_desc *desc;
	/* index of the current sptep within desc */
	int pos;
};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  997) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  998) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  999)  * Iteration must be started by this function.  This should also be used after
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000)  * removing/dropping sptes from the rmap link because in such cases the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001)  * information in the iterator may not be valid.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003)  * Returns sptep if found, NULL otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) 			   struct rmap_iterator *iter)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) 	if (!rmap_head->val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) 		return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) 	if (!(rmap_head->val & 1)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) 		iter->desc = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) 		sptep = (u64 *)rmap_head->val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) 		goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) 	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) 	iter->pos = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) 	sptep = iter->desc->sptes[iter->pos];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) 	BUG_ON(!is_shadow_present_pte(*sptep));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) 	return sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028)  * Must be used with a valid iterator: e.g. after rmap_get_first().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030)  * Returns sptep if found, NULL otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) static u64 *rmap_get_next(struct rmap_iterator *iter)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) 	if (iter->desc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) 		if (iter->pos < PTE_LIST_EXT - 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) 			++iter->pos;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) 			sptep = iter->desc->sptes[iter->pos];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) 			if (sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) 				goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) 		iter->desc = iter->desc->more;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) 		if (iter->desc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) 			iter->pos = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) 			/* desc->sptes[0] cannot be NULL */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) 			sptep = iter->desc->sptes[iter->pos];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) 			goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) 	return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) 	BUG_ON(!is_shadow_present_pte(*sptep));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) 	return sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) 
/*
 * Visit every sptep linked from @_head, using the struct rmap_iterator
 * pointed to by @_it as the cursor; @_sptep receives each sptep in turn.
 * The walk is built from rmap_get_first() and rmap_get_next().
 */
#define for_each_rmap_spte(_head, _it, _sptep)				\
	for (_sptep = rmap_get_first(_head, _it);			\
	     _sptep; _sptep = rmap_get_next(_it))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) static void drop_spte(struct kvm *kvm, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) 	if (mmu_spte_clear_track_bits(sptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) 		rmap_remove(kvm, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) 	if (is_large_pte(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) 		WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) 		drop_spte(kvm, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) 		--kvm->stat.lpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) 		return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) 	return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) 	if (__drop_large_spte(vcpu->kvm, sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) 		struct kvm_mmu_page *sp = sptep_to_sp(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) 		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) 			KVM_PAGES_PER_HPAGE(sp->role.level));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094)  * Write-protect on the specified @sptep, @pt_protect indicates whether
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095)  * spte write-protection is caused by protecting shadow page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097)  * Note: write protection is difference between dirty logging and spte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098)  * protection:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099)  * - for dirty logging, the spte can be set to writable at anytime if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100)  *   its dirty bitmap is properly set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101)  * - for spte protection, the spte can be writable only after unsync-ing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102)  *   shadow page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104)  * Return true if tlb need be flushed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) static bool spte_write_protect(u64 *sptep, bool pt_protect)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) 	u64 spte = *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) 	if (!is_writable_pte(spte) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) 	      !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) 	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) 	if (pt_protect)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) 		spte &= ~SPTE_MMU_WRITEABLE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) 	spte = spte & ~PT_WRITABLE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) 	return mmu_spte_update(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) static bool __rmap_write_protect(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) 				 struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) 				 bool pt_protect)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) 	struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) 	bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) 	for_each_rmap_spte(rmap_head, &iter, sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) 		flush |= spte_write_protect(sptep, pt_protect);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) 	return flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) static bool spte_clear_dirty(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) 	u64 spte = *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) 	rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) 	MMU_WARN_ON(!spte_ad_enabled(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) 	spte &= ~shadow_dirty_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) 	return mmu_spte_update(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) static bool spte_wrprot_for_clear_dirty(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) 	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) 					       (unsigned long *)sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) 	if (was_writable && !spte_ad_enabled(*sptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) 		kvm_set_pfn_dirty(spte_to_pfn(*sptep));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) 	return was_writable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159)  * Gets the GFN ready for another round of dirty logging by clearing the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160)  *	- D bit on ad-enabled SPTEs, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161)  *	- W bit on ad-disabled SPTEs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162)  * Returns true iff any D or W bits were cleared.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) 	struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) 	bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) 	for_each_rmap_spte(rmap_head, &iter, sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) 		if (spte_ad_need_write_protect(*sptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) 			flush |= spte_wrprot_for_clear_dirty(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) 		else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) 			flush |= spte_clear_dirty(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) 	return flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) static bool spte_set_dirty(u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) 	u64 spte = *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) 	rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) 	 * Similar to the !kvm_x86_ops.slot_disable_log_dirty case,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) 	 * do not bother adding back write access to pages marked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) 	 * SPTE_AD_WRPROT_ONLY_MASK.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) 	spte |= shadow_dirty_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) 	return mmu_spte_update(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) 	struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) 	bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) 	for_each_rmap_spte(rmap_head, &iter, sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) 		if (spte_ad_enabled(*sptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) 			flush |= spte_set_dirty(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) 	return flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209)  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210)  * @kvm: kvm instance
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211)  * @slot: slot to protect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212)  * @gfn_offset: start of the BITS_PER_LONG pages we care about
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213)  * @mask: indicates which pages we should protect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215)  * Used when we do not need to care about huge page mappings: e.g. during dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216)  * logging we do not have any such mappings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) 				     struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) 				     gfn_t gfn_offset, unsigned long mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) 	struct kvm_rmap_head *rmap_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) 		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) 				slot->base_gfn + gfn_offset, mask, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) 	while (mask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) 		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) 					  PG_LEVEL_4K, slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) 		__rmap_write_protect(kvm, rmap_head, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) 		/* clear the first set bit */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) 		mask &= mask - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238)  * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239)  * protect the page if the D-bit isn't supported.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240)  * @kvm: kvm instance
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241)  * @slot: slot to clear D-bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242)  * @gfn_offset: start of the BITS_PER_LONG pages we care about
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243)  * @mask: indicates which pages we should clear D-bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245)  * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) 				     struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) 				     gfn_t gfn_offset, unsigned long mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) 	struct kvm_rmap_head *rmap_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) 		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) 				slot->base_gfn + gfn_offset, mask, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) 	while (mask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) 		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) 					  PG_LEVEL_4K, slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) 		__rmap_clear_dirty(kvm, rmap_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) 		/* clear the first set bit */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) 		mask &= mask - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268)  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269)  * PT level pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271)  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272)  * enable dirty logging for them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274)  * Used when we do not need to care about huge page mappings: e.g. during dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275)  * logging we do not have any such mappings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) 				struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) 				gfn_t gfn_offset, unsigned long mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) 	if (kvm_x86_ops.enable_log_dirty_pt_masked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) 		kvm_x86_ops.enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) 				mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) 		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) 				    struct kvm_memory_slot *slot, u64 gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) 	struct kvm_rmap_head *rmap_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) 	bool write_protected = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) 	for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) 		rmap_head = __gfn_to_rmap(gfn, i, slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) 		write_protected |= __rmap_write_protect(kvm, rmap_head, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) 		write_protected |=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) 			kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) 	return write_protected;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) 	struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) 	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) 	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) 	struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) 	bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) 	while ((sptep = rmap_get_first(rmap_head, &iter))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) 		rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) 		pte_list_remove(rmap_head, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) 		flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) 	return flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) 			   struct kvm_memory_slot *slot, gfn_t gfn, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) 			   unsigned long data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) 	return kvm_zap_rmapp(kvm, rmap_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) 			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) 			     unsigned long data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) 	struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) 	int need_flush = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) 	u64 new_spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) 	pte_t *ptep = (pte_t *)data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) 	kvm_pfn_t new_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) 	WARN_ON(pte_huge(*ptep));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) 	new_pfn = pte_pfn(*ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) 	for_each_rmap_spte(rmap_head, &iter, sptep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) 			    sptep, *sptep, gfn, level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) 		need_flush = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) 		if (pte_write(*ptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) 			pte_list_remove(rmap_head, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) 			goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) 			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) 					*sptep, new_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) 			mmu_spte_clear_track_bits(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) 			mmu_spte_set(sptep, new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) 	if (need_flush && kvm_available_flush_tlb_with_range()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) 		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) 	return need_flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) struct slot_rmap_walk_iterator {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) 	/* input fields. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) 	struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) 	gfn_t start_gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) 	gfn_t end_gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) 	int start_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) 	int end_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) 	/* output fields. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) 	gfn_t gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) 	struct kvm_rmap_head *rmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) 	int level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) 	/* private field. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) 	struct kvm_rmap_head *end_rmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) 	iterator->level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) 	iterator->gfn = iterator->start_gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) 	iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) 	iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) 					   iterator->slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) 		    struct kvm_memory_slot *slot, int start_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) 		    int end_level, gfn_t start_gfn, gfn_t end_gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) 	iterator->slot = slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) 	iterator->start_level = start_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) 	iterator->end_level = end_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) 	iterator->start_gfn = start_gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) 	iterator->end_gfn = end_gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) 	rmap_walk_init_level(iterator, iterator->start_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) 	return !!iterator->rmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) 	if (++iterator->rmap <= iterator->end_rmap) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) 		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) 	if (++iterator->level > iterator->end_level) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) 		iterator->rmap = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) 	rmap_walk_init_level(iterator, iterator->level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) 
/*
 * Iterate over the rmap heads of @_slot_ for every level from
 * @_lo_level_ to @_hi_level_ and, within each level, from the head for
 * @_first_gfn_ up to and including the head for @_last_gfn_.
 * @_iter_ is a pointer to a struct slot_rmap_walk_iterator.
 */
#define for_each_slot_rmap_range(_slot_, _lo_level_, _hi_level_,	\
				 _first_gfn_, _last_gfn_, _iter_)	\
	for (slot_rmap_walk_init(_iter_, _slot_, _lo_level_,		\
				 _hi_level_, _first_gfn_, _last_gfn_);	\
	     slot_rmap_walk_okay(_iter_);				\
	     slot_rmap_walk_next(_iter_))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) static int kvm_handle_hva_range(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) 				unsigned long start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) 				unsigned long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) 				unsigned long data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) 				int (*handler)(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) 					       struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) 					       struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) 					       gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) 					       int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) 					       unsigned long data))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) 	struct kvm_memslots *slots;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) 	struct kvm_memory_slot *memslot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) 	struct slot_rmap_walk_iterator iterator;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) 	int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) 		slots = __kvm_memslots(kvm, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) 		kvm_for_each_memslot(memslot, slots) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) 			unsigned long hva_start, hva_end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) 			gfn_t gfn_start, gfn_end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) 			hva_start = max(start, memslot->userspace_addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) 			hva_end = min(end, memslot->userspace_addr +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) 				      (memslot->npages << PAGE_SHIFT));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) 			if (hva_start >= hva_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) 			/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) 			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) 			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) 			 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) 			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) 			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) 			for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) 						 KVM_MAX_HUGEPAGE_LEVEL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) 						 gfn_start, gfn_end - 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) 						 &iterator)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) 				ret |= handler(kvm, iterator.rmap, memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) 					       iterator.gfn, iterator.level, data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) 	return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) 			  unsigned long data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) 			  int (*handler)(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) 					 struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) 					 struct kvm_memory_slot *slot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) 					 gfn_t gfn, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) 					 unsigned long data))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) 	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) 			unsigned flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) 	int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) 	r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) 		r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) 	return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) 	int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) 	r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) 		r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) 	return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) 			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) 			 unsigned long data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) 	struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) 	int young = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) 	for_each_rmap_spte(rmap_head, &iter, sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) 		young |= mmu_spte_age(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) 	trace_kvm_age_page(gfn, level, slot, young);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) 	return young;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) 			      struct kvm_memory_slot *slot, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) 			      int level, unsigned long data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) 	struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) 	for_each_rmap_spte(rmap_head, &iter, sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) 		if (is_accessed_spte(*sptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) 			return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) #define RMAP_RECYCLE_THRESHOLD 1000
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) 	struct kvm_rmap_head *rmap_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) 	sp = sptep_to_sp(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) 	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) 	kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) 	kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) 			KVM_PAGES_PER_HPAGE(sp->role.level));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) 	int young = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) 	young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) 		young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) 	return young;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) 	int young = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) 	young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) 		young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) 	return young;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) 
#ifdef MMU_DEBUG
/*
 * Debug check: scan the whole page at @spt, treated as an array of u64
 * entries, for a shadow-present entry.
 *
 * Returns 1 if no entry is present.  Otherwise the first present entry is
 * logged at KERN_ERR and 0 is returned.
 */
static int is_empty_shadow_page(u64 *spt)
{
	unsigned long i;

	for (i = 0; i < PAGE_SIZE / sizeof(u64); i++) {
		u64 *pos = &spt[i];

		if (!is_shadow_present_pte(*pos))
			continue;

		printk(KERN_ERR "%s: %p %llx\n", __func__,
		       pos, *pos);
		return 0;
	}

	return 1;
}
#endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613)  * This value is the sum of all of the kvm instances's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614)  * kvm->arch.n_used_mmu_pages values.  We need a global,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615)  * aggregate version in order to make the slab shrinker
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616)  * faster
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) 	kvm->arch.n_used_mmu_pages += nr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) 	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) 	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) 	hlist_del(&sp->hash_link);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) 	list_del(&sp->link);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) 	free_page((unsigned long)sp->spt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) 	if (!sp->role.direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) 		free_page((unsigned long)sp->gfns);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) 	kmem_cache_free(mmu_page_header_cache, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) static unsigned kvm_page_table_hashfn(gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) 	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) 				    struct kvm_mmu_page *sp, u64 *parent_pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) 	if (!parent_pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) 	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) 				       u64 *parent_pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) 	__pte_list_remove(parent_pte, &sp->parent_ptes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) static void drop_parent_pte(struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) 			    u64 *parent_pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) 	mmu_page_remove_parent_pte(sp, parent_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) 	mmu_spte_clear_no_track(parent_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) 	if (!direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) 		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) 	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) 	 * depends on valid pages being added to the head of the list.  See
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) 	 * comments in kvm_zap_obsolete_pages().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) 	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) 	return sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) static void mark_unsync(u64 *spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) 	struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) 	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) 		mark_unsync(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) static void mark_unsync(u64 *spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) 	unsigned int index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) 	sp = sptep_to_sp(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) 	index = spte - sp->spt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) 	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) 	if (sp->unsync_children++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) 	kvm_mmu_mark_parents_unsync(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) 
/*
 * sync_page implementation that does nothing: both arguments are ignored
 * and 0 is always returned.
 */
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
			       struct kvm_mmu_page *sp)
{
	return 0;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) 
/* Capacity of the page vector in struct kvm_mmu_pages. */
#define KVM_PAGE_ARRAY_NR 16

/*
 * Fixed-size vector of shadow pages collected by the unsync walk.
 * Only the first @nr entries of @page are valid.
 */
struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;	/* collected shadow page */
		unsigned int idx;		/* index passed to mmu_pages_add() */
	} page[KVM_PAGE_ARRAY_NR];

	unsigned int nr;			/* number of entries in use */
};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) 			 int idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) 	if (sp->unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) 		for (i=0; i < pvec->nr; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) 			if (pvec->page[i].sp == sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) 				return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) 	pvec->page[pvec->nr].sp = sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) 	pvec->page[pvec->nr].idx = idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) 	pvec->nr++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) 	return (pvec->nr == KVM_PAGE_ARRAY_NR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) 	--sp->unsync_children;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) 	WARN_ON((int)sp->unsync_children < 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) 	__clear_bit(idx, sp->unsync_child_bitmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) 			   struct kvm_mmu_pages *pvec)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) 	int i, ret, nr_unsync_leaf = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) 	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) 		struct kvm_mmu_page *child;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) 		u64 ent = sp->spt[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) 		if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) 			clear_unsync_child_bit(sp, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) 		child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) 		if (child->unsync_children) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) 			if (mmu_pages_add(pvec, child, i))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) 				return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) 			ret = __mmu_unsync_walk(child, pvec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) 			if (!ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) 				clear_unsync_child_bit(sp, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) 			} else if (ret > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) 				nr_unsync_leaf += ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) 			} else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) 				return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) 		} else if (child->unsync) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) 			nr_unsync_leaf++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) 			if (mmu_pages_add(pvec, child, i))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) 				return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) 		} else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) 			clear_unsync_child_bit(sp, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) 	return nr_unsync_leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) #define INVALID_INDEX (-1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) static int mmu_unsync_walk(struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) 			   struct kvm_mmu_pages *pvec)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) 	pvec->nr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) 	if (!sp->unsync_children)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) 	mmu_pages_add(pvec, sp, INVALID_INDEX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) 	return __mmu_unsync_walk(sp, pvec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) 	WARN_ON(!sp->unsync);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) 	trace_kvm_mmu_sync_page(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) 	sp->unsync = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) 	--kvm->stat.mmu_unsync;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) 				     struct list_head *invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) static void kvm_mmu_commit_zap_page(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) 				    struct list_head *invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) 
/*
 * Iterate over the shadow pages on hash list @_list, skipping every entry
 * for which is_obsolete_sp() is true.  The statement following the macro
 * becomes the "else" arm, so it only runs for non-obsolete pages.
 */
#define for_each_valid_sp(_kvm, _sp, _list)				\
	hlist_for_each_entry(_sp, _list, hash_link)			\
		if (is_obsolete_sp((_kvm), (_sp))) {			\
			/* skip obsolete page */			\
		} else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) 
/*
 * Iterate over the non-obsolete shadow pages in the hash bucket of @_gfn,
 * visiting only those whose gfn equals @_gfn and whose role.direct is clear.
 */
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
	for_each_valid_sp(_kvm, _sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {	\
			/* other gfn, or a direct page: skip */		\
		} else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) static inline bool is_ept_sp(struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) 	return sp->role.cr0_wp && sp->role.smap_andnot_wp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) /* @sp->gfn should be write-protected at the call site */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) 			    struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) 	if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) 	    vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) 	return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) 					struct list_head *invalid_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) 					bool remote_flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) 	if (!remote_flush && list_empty(invalid_list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) 	if (!list_empty(invalid_list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) 		kvm_mmu_commit_zap_page(kvm, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) 		kvm_flush_remote_tlbs(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) 	return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) 				 struct list_head *invalid_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) 				 bool remote_flush, bool local_flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) 	if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) 	if (local_flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) 
#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
/* Empty stand-ins used when CONFIG_KVM_MMU_AUDIT is not set. */
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
{
}

static void mmu_audit_disable(void)
{
}
#endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) 	return sp->role.invalid ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) 	       unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) 			 struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) 	kvm_unlink_unsync_page(vcpu->kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) 	return __kvm_sync_page(vcpu, sp, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) /* @gfn should be write-protected at the call site */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) 			   struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) 	struct kvm_mmu_page *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) 	bool ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) 	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) 		if (!s->unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) 		WARN_ON(s->role.level != PG_LEVEL_4K);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) 		ret |= kvm_sync_page(vcpu, s, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) 	return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) struct mmu_page_path {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) 	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) 	unsigned int idx[PT64_ROOT_MAX_LEVEL];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) 
/*
 * Visit the entries of @pvec selected by mmu_pages_first()/mmu_pages_next(),
 * assigning each visited page to @sp and keeping @parents up to date.
 *
 * @pvec and @parents are objects (not pointers); @i is an integer lvalue used
 * as the cursor.  Every argument is parenthesized so that expressions such as
 * "*ptr" can be passed safely.
 */
#define for_each_sp(pvec, sp, parents, i)				\
		for ((i) = mmu_pages_first(&(pvec), &(parents));	\
			(i) < (pvec).nr &&				\
			({ (sp) = (pvec).page[(i)].sp; 1; });		\
			(i) = mmu_pages_next(&(pvec), &(parents), (i)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) static int mmu_pages_next(struct kvm_mmu_pages *pvec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) 			  struct mmu_page_path *parents,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) 			  int i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) 	int n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) 	for (n = i+1; n < pvec->nr; n++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) 		struct kvm_mmu_page *sp = pvec->page[n].sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) 		unsigned idx = pvec->page[n].idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) 		int level = sp->role.level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) 		parents->idx[level-1] = idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) 		if (level == PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) 		parents->parent[level-2] = sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) 	return n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) static int mmu_pages_first(struct kvm_mmu_pages *pvec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) 			   struct mmu_page_path *parents)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) 	int level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) 	if (pvec->nr == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) 	WARN_ON(pvec->page[0].idx != INVALID_INDEX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) 	sp = pvec->page[0].sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) 	level = sp->role.level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) 	WARN_ON(level == PG_LEVEL_4K);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) 	parents->parent[level-2] = sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) 	/* Also set up a sentinel.  Further entries in pvec are all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) 	 * children of sp, so this element is never overwritten.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) 	parents->parent[level-1] = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) 	return mmu_pages_next(pvec, parents, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) static void mmu_pages_clear_parents(struct mmu_page_path *parents)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) 	unsigned int level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) 	do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) 		unsigned int idx = parents->idx[level];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) 		sp = parents->parent[level];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) 		if (!sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) 			return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) 		WARN_ON(idx == INVALID_INDEX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) 		clear_unsync_child_bit(sp, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) 		level++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) 	} while (!sp->unsync_children);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) static void mmu_sync_children(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) 			      struct kvm_mmu_page *parent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) 	struct mmu_page_path parents;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) 	struct kvm_mmu_pages pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) 	LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) 	bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) 	while (mmu_unsync_walk(parent, &pages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) 		bool protected = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) 		for_each_sp(pages, sp, parents, i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) 			protected |= rmap_write_protect(vcpu, sp->gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) 		if (protected) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) 			kvm_flush_remote_tlbs(vcpu->kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) 			flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) 		for_each_sp(pages, sp, parents, i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) 			flush |= kvm_sync_page(vcpu, sp, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) 			mmu_pages_clear_parents(&parents);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) 		if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) 			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) 			cond_resched_lock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) 			flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) 	atomic_set(&sp->write_flooding_count,  0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) static void clear_sp_write_flooding_count(u64 *spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) 	__clear_sp_write_flooding_count(sptep_to_sp(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) 					     gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) 					     gva_t gaddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) 					     unsigned level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) 					     int direct,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) 					     unsigned int access)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) 	bool direct_mmu = vcpu->arch.mmu->direct_map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) 	union kvm_mmu_page_role role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) 	struct hlist_head *sp_list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) 	unsigned quadrant;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) 	bool need_sync = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) 	bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) 	int collisions = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) 	LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) 	role = vcpu->arch.mmu->mmu_role.base;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) 	role.level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) 	role.direct = direct;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) 	if (role.direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) 		role.gpte_is_8_bytes = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) 	role.access = access;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) 	if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) 		role.quadrant = quadrant;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) 	sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) 	for_each_valid_sp(vcpu->kvm, sp, sp_list) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) 		if (sp->gfn != gfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) 			collisions++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) 		if (!need_sync && sp->unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) 			need_sync = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) 		if (sp->role.word != role.word)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) 		if (direct_mmu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) 			goto trace_get_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) 		if (sp->unsync) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) 			/* The page is good, but __kvm_sync_page might still end
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) 			 * up zapping it.  If so, break in order to rebuild it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) 			 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) 			if (!__kvm_sync_page(vcpu, sp, &invalid_list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) 				break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) 			WARN_ON(!list_empty(&invalid_list));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) 			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) 		if (sp->unsync_children)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) 			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) 		__clear_sp_write_flooding_count(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) trace_get_page:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) 		trace_kvm_mmu_get_page(sp, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) 		goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) 	++vcpu->kvm->stat.mmu_cache_miss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) 	sp = kvm_mmu_alloc_page(vcpu, direct);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) 	sp->gfn = gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) 	sp->role = role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) 	hlist_add_head(&sp->hash_link, sp_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) 	if (!direct) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) 		 * we should do write protection before syncing pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) 		 * otherwise the content of the synced shadow page may
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) 		 * be inconsistent with guest page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) 		account_shadowed(vcpu->kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) 		if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) 			kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) 		if (level > PG_LEVEL_4K && need_sync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104) 			flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) 	trace_kvm_mmu_get_page(sp, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108) 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) 	if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111) 		vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) 	return sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) 					struct kvm_vcpu *vcpu, hpa_t root,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) 					u64 addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) 	iterator->addr = addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) 	iterator->shadow_addr = root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) 	iterator->level = vcpu->arch.mmu->shadow_root_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) 	if (iterator->level == PT64_ROOT_4LEVEL &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) 	    vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) 	    !vcpu->arch.mmu->direct_map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) 		--iterator->level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) 	if (iterator->level == PT32E_ROOT_LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) 		 * prev_root is currently only used for 64-bit hosts. So only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) 		 * the active root_hpa is valid here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) 		BUG_ON(root != vcpu->arch.mmu->root_hpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) 		iterator->shadow_addr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) 			= vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) 		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) 		--iterator->level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) 		if (!iterator->shadow_addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) 			iterator->level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) 			     struct kvm_vcpu *vcpu, u64 addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) 	shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) 				    addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) 	if (iterator->level < PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) 	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) 	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) 	return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) 			       u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) 	if (is_last_spte(spte, iterator->level)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) 		iterator->level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) 	iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) 	--iterator->level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) 	__shadow_walk_next(iterator, *iterator->sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) 			     struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) 	u64 spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) 	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) 	spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) 	mmu_spte_set(sptep, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) 	mmu_page_add_parent_pte(vcpu, sp, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) 	if (sp->unsync_children || sp->unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) 		mark_unsync(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) 				   unsigned direct_access)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) 	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) 		struct kvm_mmu_page *child;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) 		 * For the direct sp, if the guest pte's dirty bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) 		 * changed form clean to dirty, it will corrupt the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) 		 * sp's access: allow writable in the read-only sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) 		 * so we should update the spte at this point to get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) 		 * a new sp with the correct access.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) 		child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) 		if (child->role.access == direct_access)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) 			return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) 		drop_parent_pte(child, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) 		kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) /* Returns the number of zapped non-leaf child shadow pages. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) 			    u64 *spte, struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) 	u64 pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) 	struct kvm_mmu_page *child;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) 	pte = *spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) 	if (is_shadow_present_pte(pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) 		if (is_last_spte(pte, sp->role.level)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) 			drop_spte(kvm, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) 			if (is_large_pte(pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) 				--kvm->stat.lpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) 			child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) 			drop_parent_pte(child, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) 			/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) 			 * Recursively zap nested TDP SPs, parentless SPs are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) 			 * unlikely to be used again in the near future.  This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) 			 * avoids retaining a large number of stale nested SPs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) 			 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) 			if (tdp_enabled && invalid_list &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) 			    child->role.guest_mode && !child->parent_ptes.val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) 				return kvm_mmu_prepare_zap_page(kvm, child,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) 								invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) 	} else if (is_mmio_spte(pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) 		mmu_spte_clear_no_track(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) static int kvm_mmu_page_unlink_children(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) 					struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) 					struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) 	int zapped = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) 	unsigned i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) 		zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) 	return zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) 	struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) 	while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) 		drop_parent_pte(sp, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) static int mmu_zap_unsync_children(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) 				   struct kvm_mmu_page *parent,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) 				   struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) 	int i, zapped = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) 	struct mmu_page_path parents;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) 	struct kvm_mmu_pages pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) 	if (parent->role.level == PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) 	while (mmu_unsync_walk(parent, &pages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) 		struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) 		for_each_sp(pages, sp, parents, i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) 			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) 			mmu_pages_clear_parents(&parents);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) 			zapped++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) 	return zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) 				       struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) 				       struct list_head *invalid_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) 				       int *nr_zapped)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) 	bool list_unstable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) 	trace_kvm_mmu_prepare_zap_page(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) 	++kvm->stat.mmu_shadow_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) 	*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) 	*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) 	kvm_mmu_unlink_parents(kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) 	/* Zapping children means active_mmu_pages has become unstable. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) 	list_unstable = *nr_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) 	if (!sp->role.invalid && !sp->role.direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) 		unaccount_shadowed(kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) 	if (sp->unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) 		kvm_unlink_unsync_page(kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) 	if (!sp->root_count) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) 		/* Count self */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) 		(*nr_zapped)++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) 		 * Already invalid pages (previously active roots) are not on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) 		 * the active page list.  See list_del() in the "else" case of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) 		 * !sp->root_count.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) 		if (sp->role.invalid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) 			list_add(&sp->link, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) 		else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) 			list_move(&sp->link, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) 		kvm_mod_used_mmu_pages(kvm, -1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) 	} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) 		 * Remove the active root from the active page list, the root
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) 		 * will be explicitly freed when the root_count hits zero.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) 		list_del(&sp->link);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) 		 * Obsolete pages cannot be used on any vCPUs, see the comment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) 		 * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) 		 * treats invalid shadow pages as being obsolete.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) 		if (!is_obsolete_sp(kvm, sp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) 			kvm_reload_remote_mmus(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) 	if (sp->lpage_disallowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) 		unaccount_huge_nx_page(kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) 	sp->role.invalid = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) 	return list_unstable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) 				     struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) 	int nr_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) 	__kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) 	return nr_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) static void kvm_mmu_commit_zap_page(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) 				    struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) 	struct kvm_mmu_page *sp, *nsp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) 	if (list_empty(invalid_list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) 	 * We need to make sure everyone sees our modifications to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) 	 * the page tables and see changes to vcpu->mode here. The barrier
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) 	 * in the kvm_flush_remote_tlbs() achieves this. This pairs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) 	 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) 	 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) 	 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) 	 * guest mode and/or lockless shadow page table walks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) 	kvm_flush_remote_tlbs(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) 	list_for_each_entry_safe(sp, nsp, invalid_list, link) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) 		WARN_ON(!sp->role.invalid || sp->root_count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384) 		kvm_mmu_free_page(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) 						  unsigned long nr_to_zap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) 	unsigned long total_zapped = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) 	struct kvm_mmu_page *sp, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393) 	LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) 	bool unstable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) 	int nr_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) 	if (list_empty(&kvm->arch.active_mmu_pages))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401) 	list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) 		 * Don't zap active root pages, the page itself can't be freed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) 		 * and zapping it will just force vCPUs to realloc and reload.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) 		if (sp->root_count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) 		unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) 						      &nr_zapped);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411) 		total_zapped += nr_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) 		if (total_zapped >= nr_to_zap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) 		if (unstable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) 			goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) 	kvm->stat.mmu_recycled += total_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) 	return total_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) 	if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) 		return kvm->arch.n_max_mmu_pages -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) 			kvm->arch.n_used_mmu_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) 	unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) 	if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) 	kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) 	if (!kvm_mmu_available_pages(vcpu->kvm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) 		return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449)  * Changing the number of mmu pages allocated to the vm
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450)  * Note: if goal_nr_mmu_pages is too small, you will get dead lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452) void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) 	spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) 	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) 		kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) 						  goal_nr_mmu_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) 		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) 	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) 	spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) 	LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) 	int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) 	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) 	r = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) 	spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) 		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479) 			 sp->role.word);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) 		r = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481) 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) 	spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) 	return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) 	trace_kvm_mmu_unsync_page(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) 	++vcpu->kvm->stat.mmu_unsync;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) 	sp->unsync = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) 	kvm_mmu_mark_parents_unsync(sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) 			    bool can_unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) 	if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505) 		return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) 		if (!can_unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) 			return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) 		if (sp->unsync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) 		WARN_ON(sp->role.level != PG_LEVEL_4K);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) 		kvm_unsync_page(vcpu, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) 	 * We need to ensure that the marking of unsync pages is visible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) 	 * before the SPTE is updated to allow writes because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) 	 * kvm_mmu_sync_roots() checks the unsync flags without holding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) 	 * the MMU lock and so can race with this. If the SPTE was updated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) 	 * before the page had been marked as unsync-ed, something like the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) 	 * following could happen:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) 	 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526) 	 * CPU 1                    CPU 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) 	 * ---------------------------------------------------------------------
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) 	 * 1.2 Host updates SPTE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) 	 *     to be writable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) 	 *                      2.1 Guest writes a GPTE for GVA X.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) 	 *                          (GPTE being in the guest page table shadowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) 	 *                           by the SP from CPU 1.)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) 	 *                          This reads SPTE during the page table walk.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) 	 *                          Since SPTE.W is read as 1, there is no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) 	 *                          fault.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) 	 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) 	 *                      2.2 Guest issues TLB flush.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) 	 *                          That causes a VM Exit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539) 	 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) 	 *                      2.3 kvm_mmu_sync_pages() reads sp->unsync.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) 	 *                          Since it is false, so it just returns.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) 	 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543) 	 *                      2.4 Guest accesses GVA X.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) 	 *                          Since the mapping in the SP was not updated,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) 	 *                          so the old mapping for GVA X incorrectly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) 	 *                          gets used.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) 	 * 1.1 Host marks SP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) 	 *     as unsync
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549) 	 *     (sp->unsync = true)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) 	 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551) 	 * The write barrier below ensures that 1.1 happens before 1.2 and thus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) 	 * the situation in 2.4 does not arise. The implicit barrier in 2.2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) 	 * pairs with this write barrier.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) 	smp_wmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557) 	return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561) 		    unsigned int pte_access, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) 		    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) 		    bool can_unsync, bool host_writable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565) 	u64 spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) 	int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) 	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572) 	sp = sptep_to_sp(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574) 	ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) 			can_unsync, host_writable, sp_ad_disabled(sp), &spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) 	if (spte & PT_WRITABLE_MASK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) 	if (*sptep == spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) 		ret |= SET_SPTE_SPURIOUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) 	else if (mmu_spte_update(sptep, spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) 		ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) 	return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) 			unsigned int pte_access, bool write_fault, int level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) 			gfn_t gfn, kvm_pfn_t pfn, bool speculative,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) 			bool host_writable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) 	int was_rmapped = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) 	int rmap_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594) 	int set_spte_ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595) 	int ret = RET_PF_FIXED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) 	bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2597) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2598) 	pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599) 		 *sptep, write_fault, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601) 	if (is_shadow_present_pte(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) 		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) 		 * the parent of the now unreachable PTE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) 		if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) 			struct kvm_mmu_page *child;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) 			u64 pte = *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) 			child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) 			drop_parent_pte(child, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612) 			flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) 		} else if (pfn != spte_to_pfn(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) 			pgprintk("hfn old %llx new %llx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) 				 spte_to_pfn(*sptep), pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616) 			drop_spte(vcpu->kvm, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617) 			flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618) 		} else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619) 			was_rmapped = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) 	set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) 				speculative, true, host_writable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) 	if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) 		if (write_fault)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626) 			ret = RET_PF_EMULATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627) 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) 	if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) 		kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) 				KVM_PAGES_PER_HPAGE(level));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) 	if (unlikely(is_mmio_spte(*sptep)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635) 		ret = RET_PF_EMULATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638) 	 * The fault is fully spurious if and only if the new SPTE and old SPTE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) 	 * are identical, and emulation is not required.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) 	if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642) 		WARN_ON_ONCE(!was_rmapped);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) 		return RET_PF_SPURIOUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) 	trace_kvm_mmu_set_spte(level, gfn, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648) 	if (!was_rmapped && is_large_pte(*sptep))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649) 		++vcpu->kvm->stat.lpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) 	if (is_shadow_present_pte(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652) 		if (!was_rmapped) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) 			rmap_count = rmap_add(vcpu, sptep, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) 			if (rmap_count > RMAP_RECYCLE_THRESHOLD)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655) 				rmap_recycle(vcpu, sptep, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659) 	return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) 				     bool no_dirty_log)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) 	struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) 	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) 	if (!slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) 		return KVM_PFN_ERR_FAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) 	return gfn_to_pfn_memslot_atomic(slot, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) 				    struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676) 				    u64 *start, u64 *end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) 	struct page *pages[PTE_PREFETCH_NUM];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) 	struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) 	unsigned int access = sp->role.access;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) 	int i, ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) 	gfn_t gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) 	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) 	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686) 	if (!slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) 	ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690) 	if (ret <= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693) 	for (i = 0; i < ret; i++, gfn++, start++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694) 		mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) 			     page_to_pfn(pages[i]), true, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) 		put_page(pages[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703) 				  struct kvm_mmu_page *sp, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) 	u64 *spte, *start = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) 	WARN_ON(!sp->role.direct);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) 	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) 	spte = sp->spt + i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) 	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714) 		if (is_shadow_present_pte(*spte) || spte == sptep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) 			if (!start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) 			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) 				break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719) 			start = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) 		} else if (!start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) 			start = spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) 	sp = sptep_to_sp(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) 	 * Without accessed bits, there's no way to distinguish between
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) 	 * actually accessed translations and prefetched, so disable pte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) 	 * prefetch if accessed bits aren't available.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) 	if (sp_ad_disabled(sp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) 	if (sp->role.level > PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) 	__direct_pte_prefetch(vcpu, sp, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) 				  kvm_pfn_t pfn, struct kvm_memory_slot *slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) 	unsigned long hva;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) 	pte_t *pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) 	int level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) 	if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) 		return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) 	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757) 	 * is not solely for performance, it's also necessary to avoid the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) 	 * "writable" check in __gfn_to_hva_many(), which will always fail on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) 	 * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) 	 * page fault steps have already verified the guest isn't writing a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) 	 * read-only memslot.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) 	hva = __gfn_to_hva_memslot(slot, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) 	pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) 	if (unlikely(!pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) 		return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) 	return level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) 			    int max_level, kvm_pfn_t *pfnp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774) 			    bool huge_page_disallowed, int *req_level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776) 	struct kvm_memory_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777) 	struct kvm_lpage_info *linfo;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) 	kvm_pfn_t pfn = *pfnp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) 	kvm_pfn_t mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780) 	int level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) 	*req_level = PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784) 	if (unlikely(max_level == PG_LEVEL_4K))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) 		return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787) 	if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788) 		return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790) 	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791) 	if (!slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792) 		return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) 	max_level = min(max_level, max_huge_page_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795) 	for ( ; max_level > PG_LEVEL_4K; max_level--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796) 		linfo = lpage_info_slot(gfn, slot, max_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) 		if (!linfo->disallow_lpage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) 	if (max_level == PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802) 		return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) 	level = host_pfn_mapping_level(vcpu, gfn, pfn, slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) 	if (level == PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) 		return level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808) 	*req_level = level = min(level, max_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) 	 * Enforce the iTLB multihit workaround after capturing the requested
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812) 	 * level, which will be used to do precise, accurate accounting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) 	if (huge_page_disallowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) 		return PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) 	 * mmu_notifier_retry() was successful and mmu_lock is held, so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819) 	 * the pmd can't be split from under us.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) 	mask = KVM_PAGES_PER_HPAGE(level) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) 	VM_BUG_ON((gfn & mask) != (pfn & mask));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) 	*pfnp = pfn & ~mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) 	return level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829) 				kvm_pfn_t *pfnp, int *goal_levelp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) 	int level = *goal_levelp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) 	if (cur_level == level && level > PG_LEVEL_4K &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) 	    is_shadow_present_pte(spte) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) 	    !is_large_pte(spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) 		 * A small SPTE exists for this pfn, but FNAME(fetch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) 		 * and __direct_map would like to create a large PTE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) 		 * instead: just force them to go down another level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) 		 * patching back for them into pfn the next 9 bits of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) 		 * the address.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) 		u64 page_mask = KVM_PAGES_PER_HPAGE(level) -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) 				KVM_PAGES_PER_HPAGE(level - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) 		*pfnp |= gfn & page_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) 		(*goal_levelp)--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851) 			int map_writable, int max_level, kvm_pfn_t pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) 			bool prefault, bool is_tdp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) 	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) 	bool write = error_code & PFERR_WRITE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) 	bool exec = error_code & PFERR_FETCH_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857) 	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) 	struct kvm_shadow_walk_iterator it;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) 	int level, req_level, ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861) 	gfn_t gfn = gpa >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) 	gfn_t base_gfn = gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864) 	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865) 		return RET_PF_RETRY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) 	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) 					huge_page_disallowed, &req_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) 	for_each_shadow_entry(vcpu, gpa, it) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) 		 * We cannot overwrite existing page tables with an NX
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) 		 * large page, as the leaf could be executable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876) 		if (nx_huge_page_workaround_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) 			disallowed_hugepage_adjust(*it.sptep, gfn, it.level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878) 						   &pfn, &level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880) 		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) 		if (it.level == level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884) 		drop_large_spte(vcpu, it.sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2885) 		if (!is_shadow_present_pte(*it.sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2886) 			sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2887) 					      it.level - 1, true, ACC_ALL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2888) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889) 			link_shadow_page(vcpu, it.sptep, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) 			if (is_tdp && huge_page_disallowed &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) 			    req_level >= it.level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) 				account_huge_nx_page(vcpu->kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) 	ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897) 			   write, level, base_gfn, pfn, prefault,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) 			   map_writable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) 	if (ret == RET_PF_SPURIOUS)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900) 		return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902) 	direct_pte_prefetch(vcpu, it.sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2903) 	++vcpu->stat.pf_fixed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) 	return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907) static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) 	 * Do not cache the mmio info caused by writing the readonly gfn
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) 	 * into the spte otherwise read access on readonly gfn also can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917) 	 * caused mmio page fault and treat it as mmio access.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) 	if (pfn == KVM_PFN_ERR_RO_FAULT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920) 		return RET_PF_EMULATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) 	if (pfn == KVM_PFN_ERR_HWPOISON) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) 		kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924) 		return RET_PF_RETRY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) 	return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) 				kvm_pfn_t pfn, unsigned int access,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932) 				int *ret_val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934) 	/* The pfn is invalid, report the error! */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) 	if (unlikely(is_error_pfn(pfn))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936) 		*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937) 		return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) 	if (unlikely(is_noslot_pfn(pfn)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941) 		vcpu_cache_mmio_info(vcpu, gva, gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) 				     access & shadow_mmio_access_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) 	return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) static bool page_fault_can_be_fast(u32 error_code)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) 	 * Do not fix the mmio spte with invalid generation number which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) 	 * need to be updated by slow page fault path.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) 	if (unlikely(error_code & PFERR_RSVD_MASK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) 	/* See if the page fault is due to an NX violation */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957) 	if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) 		      == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962) 	 * #PF can be fast if:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) 	 * 1. The shadow page table entry is not present, which could mean that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964) 	 *    the fault is potentially caused by access tracking (if enabled).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) 	 * 2. The shadow page table entry is present and the fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) 	 *    is caused by write-protect, that means we just need change the W
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) 	 *    bit of the spte which can be done out of mmu-lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968) 	 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) 	 * However, if access tracking is disabled we know that a non-present
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970) 	 * page must be a genuine page fault where we have to create a new SPTE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) 	 * So, if access tracking is disabled, we return true only for write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) 	 * accesses to a present page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) 	return shadow_acc_track_mask != 0 ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976) 	       ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) 		== (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981)  * Returns true if the SPTE was fixed successfully. Otherwise,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982)  * someone else modified the SPTE from its original value.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984) static bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985) fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986) 			u64 *sptep, u64 old_spte, u64 new_spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) 	gfn_t gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990) 	WARN_ON(!sp->role.direct);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) 	 * Theoretically we could also set dirty bit (and flush TLB) here in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994) 	 * order to eliminate unnecessary PML logging. See comments in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) 	 * set_spte. But fast_page_fault is very unlikely to happen with PML
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) 	 * enabled, so we do not do this. This might result in the same GPA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997) 	 * to be logged in PML buffer again when the write really happens, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) 	 * eventually to be called by mark_page_dirty twice. But it's also no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) 	 * harm. This also avoids the TLB flush needed after setting dirty bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) 	 * so non-PML cases won't be impacted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001) 	 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) 	 * Compare with set_spte where instead shadow_dirty_mask is set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) 	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007) 	if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) 		 * The gfn of direct spte is stable since it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010) 		 * calculated by sp->gfn.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012) 		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013) 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3016) 	return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3017) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3018) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3019) static bool is_access_allowed(u32 fault_err_code, u64 spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3020) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3021) 	if (fault_err_code & PFERR_FETCH_MASK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3022) 		return is_executable_pte(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3023) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3024) 	if (fault_err_code & PFERR_WRITE_MASK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3025) 		return is_writable_pte(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3026) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3027) 	/* Fault was on Read access */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3028) 	return spte & PT_PRESENT_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032)  * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034) static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035) 			   u32 error_code)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037) 	struct kvm_shadow_walk_iterator iterator;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039) 	int ret = RET_PF_INVALID;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040) 	u64 spte = 0ull;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041) 	uint retry_count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043) 	if (!page_fault_can_be_fast(error_code))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044) 		return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046) 	walk_shadow_page_lockless_begin(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048) 	do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049) 		u64 new_spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051) 		for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052) 			if (!is_shadow_present_pte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053) 				break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055) 		sp = sptep_to_sp(iterator.sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056) 		if (!is_last_spte(spte, sp->role.level))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060) 		 * Check whether the memory access that caused the fault would
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061) 		 * still cause it if it were to be performed right now. If not,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062) 		 * then this is a spurious fault caused by TLB lazily flushed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063) 		 * or some other CPU has already fixed the PTE after the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064) 		 * current CPU took the fault.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065) 		 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) 		 * Need not check the access of upper level table entries since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3067) 		 * they are always ACC_ALL.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3068) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3069) 		if (is_access_allowed(error_code, spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3070) 			ret = RET_PF_SPURIOUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3071) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3072) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3073) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3074) 		new_spte = spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3075) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3076) 		if (is_access_track_spte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3077) 			new_spte = restore_acc_track_spte(new_spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3078) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3079) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3080) 		 * Currently, to simplify the code, write-protection can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3081) 		 * be removed in the fast path only if the SPTE was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3082) 		 * write-protected for dirty-logging or access tracking.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3083) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3084) 		if ((error_code & PFERR_WRITE_MASK) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3085) 		    spte_can_locklessly_be_made_writable(spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3086) 			new_spte |= PT_WRITABLE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3087) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3088) 			/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3089) 			 * Do not fix write-permission on the large spte.  Since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3090) 			 * we only dirty the first page into the dirty-bitmap in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3091) 			 * fast_pf_fix_direct_spte(), other pages are missed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3092) 			 * if its slot has dirty logging enabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3093) 			 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3094) 			 * Instead, we let the slow page fault path create a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3095) 			 * normal spte to fix the access.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3096) 			 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3097) 			 * See the comments in kvm_arch_commit_memory_region().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3098) 			 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3099) 			if (sp->role.level > PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3100) 				break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3101) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3102) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3103) 		/* Verify that the fault can be handled in the fast path */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3104) 		if (new_spte == spte ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3105) 		    !is_access_allowed(error_code, new_spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3106) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3107) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3108) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3109) 		 * Currently, fast page fault only works for direct mapping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3110) 		 * since the gfn is not stable for indirect shadow page. See
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3111) 		 * Documentation/virt/kvm/locking.rst to get more detail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3112) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3113) 		if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3114) 					    new_spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3115) 			ret = RET_PF_FIXED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3116) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3117) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3118) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3119) 		if (++retry_count > 4) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3120) 			printk_once(KERN_WARNING
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3121) 				"kvm: Fast #PF retrying more than 4 times.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3122) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3123) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3124) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3125) 	} while (true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3126) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3127) 	trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3128) 			      spte, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3129) 	walk_shadow_page_lockless_end(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3130) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3131) 	return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3132) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3133) 
                                               /*
                                                * Drop the root reference held through *root_hpa and mark the slot as
                                                * INVALID_PAGE.  A slot that is already !VALID_PAGE is left untouched.
                                                *
                                                * *root_hpa may carry non-address bits (pae_root entries in this file are
                                                * stored ORed with PT_PRESENT_MASK or pm_mask), so it is masked with
                                                * PT64_BASE_ADDR_MASK before the shadow page is looked up.
                                                *
                                                * Pages that have to be zapped are only queued on @invalid_list; the
                                                * caller has to commit that list (kvm_mmu_free_roots() does so with
                                                * kvm_mmu_commit_zap_page() while holding kvm->mmu_lock).
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3134) static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3135) 			       struct list_head *invalid_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3136) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3137) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3138) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3139) 	if (!VALID_PAGE(*root_hpa))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3140) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3141) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3142) 	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3143) 
                                               	/*
                                               	 * NOTE(review): kvm_mmu_put_root() presumably returns true once the
                                               	 * last root reference is gone - confirm against its definition.
                                               	 * TDP MMU roots are then freed directly; any other root is queued
                                               	 * for zapping only if it has already been marked invalid.
                                               	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3144) 	if (kvm_mmu_put_root(kvm, sp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3145) 		if (sp->tdp_mmu_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3146) 			kvm_tdp_mmu_free_root(kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3147) 		else if (sp->role.invalid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3148) 			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3149) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3150) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3151) 	*root_hpa = INVALID_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3152) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3153) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3154) /*
                                                * Free the roots of @mmu selected by @roots_to_free, which must be some
                                                * combination of the KVM_MMU_ROOT_* flags.
                                                *
                                                * KVM_MMU_ROOT_CURRENT selects the active root (mmu->root_hpa, or the
                                                * pae_root[] entries behind it); KVM_MMU_ROOT_PREVIOUS(i) selects
                                                * mmu->prev_roots[i].  If none of the selected roots is valid the function
                                                * returns without taking kvm->mmu_lock.  Otherwise the selected roots are
                                                * released under the lock and the pages queued for zapping are committed
                                                * before the lock is dropped.
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3155) void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3156) 			ulong roots_to_free)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3157) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3158) 	struct kvm *kvm = vcpu->kvm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3159) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3160) 	LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3161) 	bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3162) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3163) 	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3164) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3165) 	/* Before acquiring the MMU lock, see if we need to do any real work. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3166) 	if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3167) 		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3168) 			if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3169) 			    VALID_PAGE(mmu->prev_roots[i].hpa))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3170) 				break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3171) 
                                               		/* The loop ran to completion: no selected previous root is valid. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3172) 		if (i == KVM_MMU_NUM_PREV_ROOTS)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3173) 			return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3174) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3175) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3176) 	spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3177) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3178) 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3179) 		if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3180) 			mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3181) 					   &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3182) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3183) 	if (free_active_root) {
                                               		/*
                                               		 * root_hpa names a shadow page only in the first case below.
                                               		 * Otherwise mmu_alloc_direct_roots()/mmu_alloc_shadow_roots()
                                               		 * set it to the address of pae_root (or lm_root) and the
                                               		 * actual roots are the individual pae_root[] entries.
                                               		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3184) 		if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3185) 		    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3186) 			mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3187) 		} else if (mmu->pae_root) {
                                               			/*
                                               			 * Entries for non-present PDPTEs are stored as 0 by
                                               			 * mmu_alloc_shadow_roots(); there is nothing to free
                                               			 * for those.
                                               			 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3188) 			for (i = 0; i < 4; ++i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3189) 				if (mmu->pae_root[i] != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3190) 					mmu_free_root_page(kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3191) 							   &mmu->pae_root[i],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3192) 							   &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3193) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3194) 		mmu->root_hpa = INVALID_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3195) 		mmu->root_pgd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3196) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3197) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3198) 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3199) 	spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3200) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3201) EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3202) 
                                               /*
                                                * Check that @root_gfn is visible to @vcpu (kvm_vcpu_is_visible_gfn()).
                                                *
                                                * Returns 0 if it is.  Otherwise requests KVM_REQ_TRIPLE_FAULT on @vcpu
                                                * and returns 1.
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3203) static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3204) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3205) 	int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3206) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3207) 	if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3208) 		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3209) 		ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3210) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3211) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3212) 	return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3213) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3214) 
                                               /*
                                                * Obtain a shadow page for (@gfn, @gva, @level, @direct) with ACC_ALL
                                                * access via kvm_mmu_get_page() and take a root reference on it.
                                                *
                                                * kvm->mmu_lock is taken and released internally.
                                                *
                                                * Returns the physical address of the page's spt, or INVALID_PAGE if
                                                * make_mmu_pages_available() reports failure.
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3215) static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3216) 			    u8 level, bool direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3217) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3218) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3219) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3220) 	spin_lock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3221) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3222) 	if (make_mmu_pages_available(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3223) 		spin_unlock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3224) 		return INVALID_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3225) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3226) 	sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
                                               	/*
                                               	 * NOTE(review): presumably balanced by kvm_mmu_put_root() in
                                               	 * mmu_free_root_page() - confirm.
                                               	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3227) 	++sp->root_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3228) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3229) 	spin_unlock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3230) 	return __pa(sp->spt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3231) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3232) 
                                               /*
                                                * Allocate the root(s) for a direct-map MMU and record the result in
                                                * vcpu->arch.mmu->root_hpa:
                                                *
                                                *  - TDP MMU enabled: the root comes from kvm_tdp_mmu_get_vcpu_root_hpa().
                                                *  - shadow_root_level >= PT64_ROOT_4LEVEL: a single root page at that
                                                *    level.
                                                *  - shadow_root_level == PT32E_ROOT_LEVEL: four PT32_ROOT_LEVEL roots,
                                                *    one per 1 << 30 bytes of address space, stored in pae_root[] with
                                                *    PT_PRESENT_MASK set; root_hpa then points at pae_root itself.
                                                *
                                                * Any other shadow_root_level hits BUG().
                                                *
                                                * Returns 0 on success or -ENOSPC if a root could not be obtained.
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3233) static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3234) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3235) 	u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3236) 	hpa_t root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3237) 	unsigned i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3238) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3239) 	if (vcpu->kvm->arch.tdp_mmu_enabled) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3240) 		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3241) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3242) 		if (!VALID_PAGE(root))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3243) 			return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3244) 		vcpu->arch.mmu->root_hpa = root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3245) 	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3246) 		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3247) 				      true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3248) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3249) 		if (!VALID_PAGE(root))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3250) 			return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3251) 		vcpu->arch.mmu->root_hpa = root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3252) 	} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3253) 		for (i = 0; i < 4; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3254) 			MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3255) 
                                               			/* Root i covers the i-th 1 << 30 byte region (gfn and address). */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3256) 			root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3257) 					      i << 30, PT32_ROOT_LEVEL, true);
                                               			/*
                                               			 * NOTE(review): roots installed by earlier iterations
                                               			 * stay in pae_root[] on this error path; presumably
                                               			 * the caller cleans them up - confirm.
                                               			 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3258) 			if (!VALID_PAGE(root))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3259) 				return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3260) 			vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3261) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3262) 		vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3263) 	} else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3264) 		BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3265) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3266) 	/* root_pgd is ignored for direct MMUs. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3267) 	vcpu->arch.mmu->root_pgd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3268) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3269) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3270) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3271) 
                                               /*
                                                * Allocate the root(s) for a shadow (non-direct) MMU, starting from the
                                                * guest page-table root returned by mmu->get_guest_pgd().
                                                *
                                                * On success root_hpa is set to the new root shadow page, to pae_root, or
                                                * to lm_root, depending on the guest and shadow root levels, and root_pgd
                                                * records the guest root that was shadowed.
                                                *
                                                * Returns 0 on success, 1 if a guest root gfn failed mmu_check_root()
                                                * (which requests a triple fault), -ENOSPC if a shadow root could not be
                                                * obtained, or -ENOMEM if allocating pae_root or lm_root failed.
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3272) static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3273) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3274) 	u64 pdptr, pm_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3275) 	gfn_t root_gfn, root_pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3276) 	hpa_t root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3277) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3278) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3279) 	root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3280) 	root_gfn = root_pgd >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3281) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3282) 	if (mmu_check_root(vcpu, root_gfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3283) 		return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3284) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3285) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3286) 	 * Do we shadow a long mode page table? If so we need to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3287) 	 * write-protect the guest's page table root.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3288) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3289) 	if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3290) 		MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3291) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3292) 		root = mmu_alloc_root(vcpu, root_gfn, 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3293) 				      vcpu->arch.mmu->shadow_root_level, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3294) 		if (!VALID_PAGE(root))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3295) 			return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3296) 		vcpu->arch.mmu->root_hpa = root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3297) 		goto set_root_pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3298) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3299) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3300) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3301) 	 * We shadow a 32 bit page table. This may be a legacy 2-level
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3302) 	 * or a PAE 3-level page table. In either case we need to be aware that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3303) 	 * the shadow page table may be a PAE or a long mode page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3304) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3305) 	pm_mask = PT_PRESENT_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3306) 	if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3307) 		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3308) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3309) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3310) 		 * Allocate the page for the PDPTEs when shadowing 32-bit NPT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3311) 		 * with 64-bit only when needed.  Unlike 32-bit NPT, it doesn't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3312) 		 * need to be in low mem.  See also lm_root below.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3313) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3314) 		if (!vcpu->arch.mmu->pae_root) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3315) 			WARN_ON_ONCE(!tdp_enabled);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3316) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3317) 			vcpu->arch.mmu->pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3318) 			if (!vcpu->arch.mmu->pae_root)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3319) 				return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3320) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3321) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3322) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3323) 	for (i = 0; i < 4; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3324) 		MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
                                               		/*
                                               		 * A PAE guest supplies one root per PDPTE; a non-present
                                               		 * PDPTE leaves the slot as 0.  For any other guest level,
                                               		 * root_gfn keeps the value derived from root_pgd above for
                                               		 * all four slots.
                                               		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3325) 		if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3326) 			pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3327) 			if (!(pdptr & PT_PRESENT_MASK)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3328) 				vcpu->arch.mmu->pae_root[i] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3329) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3330) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3331) 			root_gfn = pdptr >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3332) 			if (mmu_check_root(vcpu, root_gfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3333) 				return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3334) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3335) 
                                               		/*
                                               		 * NOTE(review): 'i' is a signed int here, so 'i << 30'
                                               		 * exceeds INT_MAX for i >= 2, which ISO C leaves undefined;
                                               		 * mmu_alloc_direct_roots() uses an unsigned index for the
                                               		 * same expression.  Presumably harmless with the kernel's
                                               		 * build flags - confirm, or cast to gva_t before shifting.
                                               		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3336) 		root = mmu_alloc_root(vcpu, root_gfn, i << 30,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3337) 				      PT32_ROOT_LEVEL, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3338) 		if (!VALID_PAGE(root))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3339) 			return -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3340) 		vcpu->arch.mmu->pae_root[i] = root | pm_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3341) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3342) 	vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3343) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3344) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3345) 	 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3346) 	 * tables have to be supplied by KVM itself as there is no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3347) 	 * equivalent level in the guest's NPT to shadow.  Allocate the tables
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3348) 	 * on demand, as running a 32-bit L1 VMM is very rare.  The PDP is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3349) 	 * handled above (to share logic with PAE), deal with the PML4 here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3350) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3351) 	if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3352) 		if (vcpu->arch.mmu->lm_root == NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3353) 			u64 *lm_root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3354) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3355) 			lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3356) 			if (!lm_root)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3357) 				return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3358) 
                                               			/* Entry 0 points at pae_root; it is written only on first allocation. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3359) 			lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3360) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3361) 			vcpu->arch.mmu->lm_root = lm_root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3362) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3363) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3364) 		vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3365) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3366) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3367) set_root_pgd:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3368) 	vcpu->arch.mmu->root_pgd = root_pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3369) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3370) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3371) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3372) 
                                               /*
                                                * Allocate the MMU root(s) for @vcpu, dispatching on whether the current
                                                * MMU is a direct map.  Returns whatever the selected helper returns.
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3373) static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3374) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3375) 	if (vcpu->arch.mmu->direct_map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3376) 		return mmu_alloc_direct_roots(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3377) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3378) 		return mmu_alloc_shadow_roots(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3379) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3380) 
                                               /*
                                                * Run mmu_sync_children() on the shadow page(s) backing the vCPU's current
                                                * root.
                                                *
                                                * Nothing is done for direct-map MMUs or while root_hpa is not valid.
                                                * Otherwise the vCPU's cached MMIO info is cleared first.  With a guest
                                                * root level of PT64_ROOT_4LEVEL or higher there is a single root shadow
                                                * page, and kvm->mmu_lock is only taken if that page is unsync or has
                                                * unsync children.  For lower guest root levels every populated
                                                * pae_root[] entry is synced under the lock.
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3381) void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3382) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3383) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3384) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3385) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3386) 	if (vcpu->arch.mmu->direct_map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3387) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3388) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3389) 	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3390) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3391) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3392) 	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3393) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3394) 	if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3395) 		hpa_t root = vcpu->arch.mmu->root_hpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3396) 		sp = to_shadow_page(root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3397) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3398) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3399) 		 * Even if another CPU was marking the SP as unsync-ed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3400) 		 * simultaneously, any guest page table changes are not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3401) 		 * guaranteed to be visible anyway until this VCPU issues a TLB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3402) 		 * flush strictly after those changes are made. We only need to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3403) 		 * ensure that the other CPU sets these flags before any actual
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3404) 		 * changes to the page tables are made. The comments in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3405) 		 * mmu_need_write_protect() describe what could go wrong if this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3406) 		 * requirement isn't satisfied.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3407) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3408) 		if (!smp_load_acquire(&sp->unsync) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3409) 		    !smp_load_acquire(&sp->unsync_children))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3410) 			return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3411) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3412) 		spin_lock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3413) 		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3414) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3415) 		mmu_sync_children(vcpu, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3416) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3417) 		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3418) 		spin_unlock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3419) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3420) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3421) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3422) 	spin_lock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3423) 	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3424) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3425) 	for (i = 0; i < 4; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3426) 		hpa_t root = vcpu->arch.mmu->pae_root[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3427) 
                                               		/*
                                               		 * Skip empty (0) and invalid slots.  Populated slots carry
                                               		 * the mask bits ORed in at allocation time, so strip them
                                               		 * before looking up the shadow page.
                                               		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3428) 		if (root && VALID_PAGE(root)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3429) 			root &= PT64_BASE_ADDR_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3430) 			sp = to_shadow_page(root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3431) 			mmu_sync_children(vcpu, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3432) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3433) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3434) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3435) 	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3436) 	spin_unlock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3437) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3438) EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3439) 
                                               /*
                                                * Address translation for the non-paging case: @vaddr is returned
                                                * unchanged.  If @exception is non-NULL its error_code is cleared.
                                                * @vcpu and @access are not used.
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3440) static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3441) 				  u32 access, struct x86_exception *exception)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3442) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3443) 	if (exception)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3444) 		exception->error_code = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3445) 	return vaddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3446) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3447) 
                                               /*
                                                * Non-paging address translation for the nested case: @vaddr is handed
                                                * to vcpu->arch.nested_mmu.translate_gpa() together with @access and
                                                * @exception, and that callback's result is returned.  If @exception is
                                                * non-NULL its error_code is cleared before the callback runs.
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3448) static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3449) 					 u32 access,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3450) 					 struct x86_exception *exception)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3451) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3452) 	if (exception)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3453) 		exception->error_code = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3454) 	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3455) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3456) 
                                               /*
                                                * Return true if @pte has any bit set in the reserved-bit mask that
                                                * applies to it.  The mask is rsvd_bits_mask[bit7][level - 1], where
                                                * bit7 is bit 7 of @pte (presumably the page-size bit - confirm) and
                                                * @level is 1-based.
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3457) static bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3458) __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3459) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3460) 	int bit7 = (pte >> 7) & 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3461) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3462) 	return pte & rsvd_check->rsvd_bits_mask[bit7][level-1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3463) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3464) 
                                               /*
                                                * Return true if the low six bits of @pte, used as a bit index, select a
                                                * set bit in rsvd_check->bad_mt_xwr.  The caller's comment in
                                                * get_mmio_spte() describes this as EPT's invalid memtype/XWR check.
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3465) static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3466) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3467) 	return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3468) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3469) 
                                               /*
                                                * Return true if @addr matches the vCPU's cached MMIO info.  @addr is
                                                * matched as a GPA when @direct is true and as a GVA otherwise.  Always
                                                * returns false while mmu_is_nested() is true.
                                                */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3470) static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3471) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3472) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3473) 	 * A nested guest cannot use the MMIO cache if it is using nested
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3474) 	 * page tables, because cr2 is a nGPA while the cache stores GPAs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3475) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3476) 	if (mmu_is_nested(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3477) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3478) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3479) 	if (direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3480) 		return vcpu_match_mmio_gpa(vcpu, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3481) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3482) 	return vcpu_match_mmio_gva(vcpu, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3483) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3484) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3485) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3486)  * Return the level of the lowest level SPTE added to sptes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3487)  * That SPTE may be non-present.
                                                *
                                                * The shadow page table is walked for @addr inside a
                                                * walk_shadow_page_lockless_begin()/end() section.  The SPTE read at each
                                                * level is stored in sptes[level - 1], and the walk stops after the first
                                                * SPTE that is not shadow-present.  *root_level is set to the level the
                                                * walk starts at.  Returns -1 if the walk visits no entry at all.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3488)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3489) static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3490) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3491) 	struct kvm_shadow_walk_iterator iterator;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3492) 	int leaf = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3493) 	u64 spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3494) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3495) 	walk_shadow_page_lockless_begin(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3496) 
                                               	/*
                                               	 * 'spte' is assigned in the loop body before the step expression
                                               	 * passes it to __shadow_walk_next(), so it is never read
                                               	 * uninitialized.
                                               	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3497) 	for (shadow_walk_init(&iterator, vcpu, addr),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3498) 	     *root_level = iterator.level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3499) 	     shadow_walk_okay(&iterator);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3500) 	     __shadow_walk_next(&iterator, spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3501) 		leaf = iterator.level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3502) 		spte = mmu_spte_get_lockless(iterator.sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3503) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3504) 		sptes[leaf - 1] = spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3505) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3506) 		if (!is_shadow_present_pte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3507) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3508) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3509) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3510) 	walk_shadow_page_lockless_end(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3511) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3512) 	return leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3513) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3514) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3515) /* return true if reserved bit is detected on spte. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3516) static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3517) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3518) 	u64 sptes[PT64_ROOT_MAX_LEVEL];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3519) 	struct rsvd_bits_validate *rsvd_check;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3520) 	int root, leaf, level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3521) 	bool reserved = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3522) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3523) 	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3524) 		*sptep = 0ull;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3525) 		return reserved;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3526) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3527) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3528) 	if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3529) 		leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3530) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3531) 		leaf = get_walk(vcpu, addr, sptes, &root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3532) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3533) 	if (unlikely(leaf < 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3534) 		*sptep = 0ull;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3535) 		return reserved;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3536) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3537) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3538) 	rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3539) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3540) 	for (level = root; level >= leaf; level--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3541) 		if (!is_shadow_present_pte(sptes[level - 1]))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3542) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3543) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3544) 		 * Use a bitwise-OR instead of a logical-OR to aggregate the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3545) 		 * reserved bit and EPT's invalid memtype/XWR checks to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3546) 		 * adding a Jcc in the loop.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3547) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3548) 		reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3549) 			    __is_rsvd_bits_set(rsvd_check, sptes[level - 1],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3550) 					       level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3551) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3552) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3553) 	if (reserved) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3554) 		pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3555) 		       __func__, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3556) 		for (level = root; level >= leaf; level--)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3557) 			pr_err("------ spte 0x%llx level %d.\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3558) 			       sptes[level - 1], level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3559) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3560) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3561) 	*sptep = sptes[leaf - 1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3562) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3563) 	return reserved;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3564) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3565) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3566) static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3567) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3568) 	u64 spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3569) 	bool reserved;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3570) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3571) 	if (mmio_info_in_cache(vcpu, addr, direct))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3572) 		return RET_PF_EMULATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3573) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3574) 	reserved = get_mmio_spte(vcpu, addr, &spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3575) 	if (WARN_ON(reserved))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3576) 		return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3577) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3578) 	if (is_mmio_spte(spte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3579) 		gfn_t gfn = get_mmio_spte_gfn(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3580) 		unsigned int access = get_mmio_spte_access(spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3581) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3582) 		if (!check_mmio_spte(vcpu, spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3583) 			return RET_PF_INVALID;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3584) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3585) 		if (direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3586) 			addr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3587) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3588) 		trace_handle_mmio_page_fault(addr, gfn, access);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3589) 		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3590) 		return RET_PF_EMULATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3591) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3592) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3593) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3594) 	 * If the page table is zapped by other cpus, let CPU fault again on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3595) 	 * the address.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3596) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3597) 	return RET_PF_RETRY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3598) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3599) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3600) static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3601) 					 u32 error_code, gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3602) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3603) 	if (unlikely(error_code & PFERR_RSVD_MASK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3604) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3605) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3606) 	if (!(error_code & PFERR_PRESENT_MASK) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3607) 	      !(error_code & PFERR_WRITE_MASK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3608) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3609) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3610) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3611) 	 * guest is writing the page which is write tracked which can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3612) 	 * not be fixed by page fault handler.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3613) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3614) 	if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3615) 		return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3616) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3617) 	return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3618) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3619) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3620) static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3621) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3622) 	struct kvm_shadow_walk_iterator iterator;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3623) 	u64 spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3624) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3625) 	walk_shadow_page_lockless_begin(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3626) 	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3627) 		clear_sp_write_flooding_count(iterator.sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3628) 		if (!is_shadow_present_pte(spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3629) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3630) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3631) 	walk_shadow_page_lockless_end(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3632) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3633) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3634) static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3635) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3636) 	/* make sure the token value is not 0 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3637) 	u32 id = vcpu->arch.apf.id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3638) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3639) 	if (id << 12 == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3640) 		vcpu->arch.apf.id = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3641) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3642) 	return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3643) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3644) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3645) static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3646) 				    gfn_t gfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3647) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3648) 	struct kvm_arch_async_pf arch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3649) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3650) 	arch.token = alloc_apf_token(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3651) 	arch.gfn = gfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3652) 	arch.direct_map = vcpu->arch.mmu->direct_map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3653) 	arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3654) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3655) 	return kvm_setup_async_pf(vcpu, cr2_or_gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3656) 				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3657) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3658) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3659) static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3660) 			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3661) 			 bool *writable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3662) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3663) 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3664) 	bool async;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3665) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3666) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3667) 	 * Retry the page fault if the gfn hit a memslot that is being deleted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3668) 	 * or moved.  This ensures any existing SPTEs for the old memslot will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3669) 	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3670) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3671) 	if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3672) 		return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3673) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3674) 	/* Don't expose private memslots to L2. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3675) 	if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3676) 		*pfn = KVM_PFN_NOSLOT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3677) 		*writable = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3678) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3679) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3680) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3681) 	async = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3682) 	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3683) 	if (!async)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3684) 		return false; /* *pfn has correct page already */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3685) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3686) 	if (!prefault && kvm_can_do_async_pf(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3687) 		trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3688) 		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3689) 			trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3690) 			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3691) 			return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3692) 		} else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3693) 			return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3694) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3695) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3696) 	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3697) 	return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3698) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3699) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3700) static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3701) 			     bool prefault, int max_level, bool is_tdp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3702) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3703) 	bool write = error_code & PFERR_WRITE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3704) 	bool map_writable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3705) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3706) 	gfn_t gfn = gpa >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3707) 	unsigned long mmu_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3708) 	kvm_pfn_t pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3709) 	int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3710) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3711) 	if (page_fault_handle_page_track(vcpu, error_code, gfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3712) 		return RET_PF_EMULATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3713) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3714) 	if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3715) 		r = fast_page_fault(vcpu, gpa, error_code);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3716) 		if (r != RET_PF_INVALID)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3717) 			return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3718) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3719) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3720) 	r = mmu_topup_memory_caches(vcpu, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3721) 	if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3722) 		return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3723) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3724) 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3725) 	smp_rmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3726) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3727) 	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3728) 		return RET_PF_RETRY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3729) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3730) 	if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3731) 		return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3732) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3733) 	r = RET_PF_RETRY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3734) 	spin_lock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3735) 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3736) 		goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3737) 	r = make_mmu_pages_available(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3738) 	if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3739) 		goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3740) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3741) 	if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3742) 		r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3743) 				    pfn, prefault);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3744) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3745) 		r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3746) 				 prefault, is_tdp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3747) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3748) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3749) 	spin_unlock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3750) 	kvm_release_pfn_clean(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3751) 	return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3752) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3753) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3754) static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3755) 				u32 error_code, bool prefault)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3756) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3757) 	pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3758) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3759) 	/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3760) 	return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3761) 				 PG_LEVEL_2M, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3762) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3763) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3764) int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3765) 				u64 fault_address, char *insn, int insn_len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3766) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3767) 	int r = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3768) 	u32 flags = vcpu->arch.apf.host_apf_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3769) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3770) #ifndef CONFIG_X86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3771) 	/* A 64-bit CR2 should be impossible on 32-bit KVM. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3772) 	if (WARN_ON_ONCE(fault_address >> 32))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3773) 		return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3774) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3775) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3776) 	vcpu->arch.l1tf_flush_l1d = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3777) 	if (!flags) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3778) 		trace_kvm_page_fault(fault_address, error_code);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3779) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3780) 		if (kvm_event_needs_reinjection(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3781) 			kvm_mmu_unprotect_page_virt(vcpu, fault_address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3782) 		r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3783) 				insn_len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3784) 	} else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3785) 		vcpu->arch.apf.host_apf_flags = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3786) 		local_irq_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3787) 		kvm_async_pf_task_wait_schedule(fault_address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3788) 		local_irq_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3789) 	} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3790) 		WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3791) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3792) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3793) 	return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3794) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3795) EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3796) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3797) int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3798) 		       bool prefault)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3799) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3800) 	int max_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3801) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3802) 	for (max_level = KVM_MAX_HUGEPAGE_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3803) 	     max_level > PG_LEVEL_4K;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3804) 	     max_level--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3805) 		int page_num = KVM_PAGES_PER_HPAGE(max_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3806) 		gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3807) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3808) 		if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3809) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3810) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3811) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3812) 	return direct_page_fault(vcpu, gpa, error_code, prefault,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3813) 				 max_level, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3814) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3815) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3816) static void nonpaging_init_context(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3817) 				   struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3818) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3819) 	context->page_fault = nonpaging_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3820) 	context->gva_to_gpa = nonpaging_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3821) 	context->sync_page = nonpaging_sync_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3822) 	context->invlpg = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3823) 	context->root_level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3824) 	context->shadow_root_level = PT32E_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3825) 	context->direct_map = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3826) 	context->nx = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3827) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3828) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3829) static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3830) 				  union kvm_mmu_page_role role)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3831) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3832) 	return (role.direct || pgd == root->pgd) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3833) 	       VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3834) 	       role.word == to_shadow_page(root->hpa)->role.word;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3835) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3836) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3837) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3838)  * Find out if a previously cached root matching the new pgd/role is available.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3839)  * The current root is also inserted into the cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3840)  * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3841)  * returned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3842)  * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3843)  * false is returned. This root should now be freed by the caller.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3844)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3845) static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3846) 				  union kvm_mmu_page_role new_role)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3847) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3848) 	uint i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3849) 	struct kvm_mmu_root_info root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3850) 	struct kvm_mmu *mmu = vcpu->arch.mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3851) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3852) 	root.pgd = mmu->root_pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3853) 	root.hpa = mmu->root_hpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3854) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3855) 	if (is_root_usable(&root, new_pgd, new_role))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3856) 		return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3857) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3858) 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3859) 		swap(root, mmu->prev_roots[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3860) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3861) 		if (is_root_usable(&root, new_pgd, new_role))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3862) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3863) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3864) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3865) 	mmu->root_hpa = root.hpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3866) 	mmu->root_pgd = root.pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3867) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3868) 	return i < KVM_MMU_NUM_PREV_ROOTS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3869) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3870) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3871) static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3872) 			    union kvm_mmu_page_role new_role)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3873) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3874) 	struct kvm_mmu *mmu = vcpu->arch.mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3875) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3876) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3877) 	 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3878) 	 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3879) 	 * later if necessary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3880) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3881) 	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3882) 	    mmu->root_level >= PT64_ROOT_4LEVEL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3883) 		return cached_root_available(vcpu, new_pgd, new_role);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3884) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3885) 	return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3886) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3887) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3888) static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3889) 			      union kvm_mmu_page_role new_role,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3890) 			      bool skip_tlb_flush, bool skip_mmu_sync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3891) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3892) 	if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3893) 		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3894) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3895) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3896) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3897) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3898) 	 * It's possible that the cached previous root page is obsolete because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3899) 	 * of a change in the MMU generation number. However, changing the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3900) 	 * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3901) 	 * free the root set here and allocate a new one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3902) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3903) 	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3904) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3905) 	if (!skip_mmu_sync || force_flush_and_sync_on_reuse)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3906) 		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3907) 	if (!skip_tlb_flush || force_flush_and_sync_on_reuse)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3908) 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3909) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3910) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3911) 	 * The last MMIO access's GVA and GPA are cached in the VCPU. When
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3912) 	 * switching to a new CR3, that GVA->GPA mapping may no longer be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3913) 	 * valid. So clear any cached MMIO info even when we don't need to sync
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3914) 	 * the shadow page tables.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3915) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3916) 	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3917) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3918) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3919) 	 * If this is a direct root page, it doesn't have a write flooding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3920) 	 * count. Otherwise, clear the write flooding count.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3921) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3922) 	if (!new_role.direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3923) 		__clear_sp_write_flooding_count(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3924) 				to_shadow_page(vcpu->arch.mmu->root_hpa));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3925) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3926) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3927) void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3928) 		     bool skip_mmu_sync)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3929) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3930) 	__kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3931) 			  skip_tlb_flush, skip_mmu_sync);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3932) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3933) EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3934) 
/* Return the value of kvm_read_cr3() for @vcpu. */
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
	unsigned long cr3 = kvm_read_cr3(vcpu);

	return cr3;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3939) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3940) static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3941) 			   unsigned int access, int *nr_present)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3942) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3943) 	if (unlikely(is_mmio_spte(*sptep))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3944) 		if (gfn != get_mmio_spte_gfn(*sptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3945) 			mmu_spte_clear_no_track(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3946) 			return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3947) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3948) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3949) 		(*nr_present)++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3950) 		mark_mmio_spte(vcpu, sptep, gfn, access);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3951) 		return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3952) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3953) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3954) 	return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3955) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3956) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3957) static inline bool is_last_gpte(struct kvm_mmu *mmu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3958) 				unsigned level, unsigned gpte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3959) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3960) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3961) 	 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3962) 	 * If it is clear, there are no large pages at this level, so clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3963) 	 * PT_PAGE_SIZE_MASK in gpte if that is the case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3964) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3965) 	gpte &= level - mmu->last_nonleaf_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3966) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3967) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3968) 	 * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3969) 	 * iff level <= PG_LEVEL_4K, which for our purpose means
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3970) 	 * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3971) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3972) 	gpte |= level - PG_LEVEL_4K - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3973) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3974) 	return gpte & PT_PAGE_SIZE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3975) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3976) 
/*
 * paging_tmpl.h is included three times, each time with a different PTTYPE
 * (PTTYPE_EPT, 64, 32), and PTTYPE is undefined again after every inclusion.
 * NOTE(review): presumably each inclusion emits a PTTYPE-specific variant of
 * the code in that header -- confirm in paging_tmpl.h.
 */
#define PTTYPE_EPT 18 /* arbitrary */
#define PTTYPE PTTYPE_EPT
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3989) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3990) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3991) __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3992) 			struct rsvd_bits_validate *rsvd_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3993) 			int maxphyaddr, int level, bool nx, bool gbpages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3994) 			bool pse, bool amd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3995) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3996) 	u64 exb_bit_rsvd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3997) 	u64 gbpages_bit_rsvd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3998) 	u64 nonleaf_bit8_rsvd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3999) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4000) 	rsvd_check->bad_mt_xwr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4001) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4002) 	if (!nx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4003) 		exb_bit_rsvd = rsvd_bits(63, 63);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4004) 	if (!gbpages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4005) 		gbpages_bit_rsvd = rsvd_bits(7, 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4006) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4007) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4008) 	 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4009) 	 * leaf entries) on AMD CPUs only.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4010) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4011) 	if (amd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4012) 		nonleaf_bit8_rsvd = rsvd_bits(8, 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4013) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4014) 	switch (level) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4015) 	case PT32_ROOT_LEVEL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4016) 		/* no rsvd bits for 2 level 4K page table entries */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4017) 		rsvd_check->rsvd_bits_mask[0][1] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4018) 		rsvd_check->rsvd_bits_mask[0][0] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4019) 		rsvd_check->rsvd_bits_mask[1][0] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4020) 			rsvd_check->rsvd_bits_mask[0][0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4021) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4022) 		if (!pse) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4023) 			rsvd_check->rsvd_bits_mask[1][1] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4024) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4025) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4026) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4027) 		if (is_cpuid_PSE36())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4028) 			/* 36bits PSE 4MB page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4029) 			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4030) 		else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4031) 			/* 32 bits PSE 4MB page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4032) 			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4033) 		break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4034) 	case PT32E_ROOT_LEVEL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4035) 		rsvd_check->rsvd_bits_mask[0][2] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4036) 			rsvd_bits(maxphyaddr, 63) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4037) 			rsvd_bits(5, 8) | rsvd_bits(1, 2);	/* PDPTE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4038) 		rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4039) 			rsvd_bits(maxphyaddr, 62);	/* PDE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4040) 		rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4041) 			rsvd_bits(maxphyaddr, 62); 	/* PTE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4042) 		rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4043) 			rsvd_bits(maxphyaddr, 62) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4044) 			rsvd_bits(13, 20);		/* large page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4045) 		rsvd_check->rsvd_bits_mask[1][0] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4046) 			rsvd_check->rsvd_bits_mask[0][0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4047) 		break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4048) 	case PT64_ROOT_5LEVEL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4049) 		rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4050) 			nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4051) 			rsvd_bits(maxphyaddr, 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4052) 		rsvd_check->rsvd_bits_mask[1][4] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4053) 			rsvd_check->rsvd_bits_mask[0][4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4054) 		fallthrough;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4055) 	case PT64_ROOT_4LEVEL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4056) 		rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4057) 			nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4058) 			rsvd_bits(maxphyaddr, 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4059) 		rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4060) 			gbpages_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4061) 			rsvd_bits(maxphyaddr, 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4062) 		rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4063) 			rsvd_bits(maxphyaddr, 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4064) 		rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4065) 			rsvd_bits(maxphyaddr, 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4066) 		rsvd_check->rsvd_bits_mask[1][3] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4067) 			rsvd_check->rsvd_bits_mask[0][3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4068) 		rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4069) 			gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4070) 			rsvd_bits(13, 29);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4071) 		rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4072) 			rsvd_bits(maxphyaddr, 51) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4073) 			rsvd_bits(13, 20);		/* large page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4074) 		rsvd_check->rsvd_bits_mask[1][0] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4075) 			rsvd_check->rsvd_bits_mask[0][0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4076) 		break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4077) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4078) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4079) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4080) static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4081) 				  struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4082) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4083) 	__reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4084) 				cpuid_maxphyaddr(vcpu), context->root_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4085) 				context->nx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4086) 				guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4087) 				is_pse(vcpu),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4088) 				guest_cpuid_is_amd_or_hygon(vcpu));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4089) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4090) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4091) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4092) __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4093) 			    int maxphyaddr, bool execonly)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4094) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4095) 	u64 bad_mt_xwr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4096) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4097) 	rsvd_check->rsvd_bits_mask[0][4] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4098) 		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4099) 	rsvd_check->rsvd_bits_mask[0][3] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4100) 		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4101) 	rsvd_check->rsvd_bits_mask[0][2] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4102) 		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4103) 	rsvd_check->rsvd_bits_mask[0][1] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4104) 		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4105) 	rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4106) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4107) 	/* large page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4108) 	rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4109) 	rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4110) 	rsvd_check->rsvd_bits_mask[1][2] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4111) 		rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4112) 	rsvd_check->rsvd_bits_mask[1][1] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4113) 		rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4114) 	rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4115) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4116) 	bad_mt_xwr = 0xFFull << (2 * 8);	/* bits 3..5 must not be 2 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4117) 	bad_mt_xwr |= 0xFFull << (3 * 8);	/* bits 3..5 must not be 3 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4118) 	bad_mt_xwr |= 0xFFull << (7 * 8);	/* bits 3..5 must not be 7 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4119) 	bad_mt_xwr |= REPEAT_BYTE(1ull << 2);	/* bits 0..2 must not be 010 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4120) 	bad_mt_xwr |= REPEAT_BYTE(1ull << 6);	/* bits 0..2 must not be 110 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4121) 	if (!execonly) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4122) 		/* bits 0..2 must not be 100 unless VMX capabilities allow it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4123) 		bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4124) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4125) 	rsvd_check->bad_mt_xwr = bad_mt_xwr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4126) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4127) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4128) static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4129) 		struct kvm_mmu *context, bool execonly)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4130) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4131) 	__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4132) 				    cpuid_maxphyaddr(vcpu), execonly);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4133) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4134) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4135) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4136)  * the page table on host is the shadow page table for the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4137)  * table in guest or amd nested guest, its mmu features completely
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4138)  * follow the features in guest.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4139)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4140) void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4141) reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4142) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4143) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4144) 	 * KVM uses NX when TDP is disabled to handle a variety of scenarios,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4145) 	 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4146) 	 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4147) 	 * The iTLB multi-hit workaround can be toggled at any time, so assume
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4148) 	 * NX can be used by any non-nested shadow MMU to avoid having to reset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4149) 	 * MMU contexts.  Note, KVM forces EFER.NX=1 when TDP is disabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4150) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4151) 	bool uses_nx = context->nx || !tdp_enabled ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4152) 		context->mmu_role.base.smep_andnot_wp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4153) 	struct rsvd_bits_validate *shadow_zero_check;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4154) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4155) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4156) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4157) 	 * Passing "true" to the last argument is okay; it adds a check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4158) 	 * on bit 8 of the SPTEs which KVM doesn't use anyway.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4159) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4160) 	shadow_zero_check = &context->shadow_zero_check;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4161) 	__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4162) 				shadow_phys_bits,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4163) 				context->shadow_root_level, uses_nx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4164) 				guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4165) 				is_pse(vcpu), true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4166) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4167) 	if (!shadow_me_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4168) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4169) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4170) 	for (i = context->shadow_root_level; --i >= 0;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4171) 		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4172) 		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4173) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4174) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4175) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4176) EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4177) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4178) static inline bool boot_cpu_is_amd(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4179) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4180) 	WARN_ON_ONCE(!tdp_enabled);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4181) 	return shadow_x_mask == 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4182) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4183) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4184) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4185)  * the direct page table on host, use as much mmu features as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4186)  * possible, however, kvm currently does not do execution-protection.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4187)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4188) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4189) reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4190) 				struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4191) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4192) 	struct rsvd_bits_validate *shadow_zero_check;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4193) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4194) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4195) 	shadow_zero_check = &context->shadow_zero_check;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4196) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4197) 	if (boot_cpu_is_amd())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4198) 		__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4199) 					shadow_phys_bits,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4200) 					context->shadow_root_level, false,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4201) 					boot_cpu_has(X86_FEATURE_GBPAGES),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4202) 					true, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4203) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4204) 		__reset_rsvds_bits_mask_ept(shadow_zero_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4205) 					    shadow_phys_bits,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4206) 					    false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4207) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4208) 	if (!shadow_me_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4209) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4210) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4211) 	for (i = context->shadow_root_level; --i >= 0;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4212) 		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4213) 		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4214) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4216) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4217) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4218)  * as the comments in reset_shadow_zero_bits_mask() except it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4219)  * is the shadow page table for intel nested guest.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4220)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4221) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4222) reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4223) 				struct kvm_mmu *context, bool execonly)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4224) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4225) 	__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4226) 				    shadow_phys_bits, execonly);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4227) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4228) 
/*
 * Build an 8-bit table indexed by a 3-bit combination value: bit i
 * (i = 1..7) of the result is set iff combination i has at least one bit in
 * common with @access.  Bit 0 is never set.
 */
#define BYTE_MASK(access) \
	(((!!((access) & 1)) << 1) | \
	 ((!!((access) & 2)) << 2) | \
	 ((!!((access) & 3)) << 3) | \
	 ((!!((access) & 4)) << 4) | \
	 ((!!((access) & 5)) << 5) | \
	 ((!!((access) & 6)) << 6) | \
	 ((!!((access) & 7)) << 7))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4237) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4238) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4239) static void update_permission_bitmask(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4240) 				      struct kvm_mmu *mmu, bool ept)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4241) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4242) 	unsigned byte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4243) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4244) 	const u8 x = BYTE_MASK(ACC_EXEC_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4245) 	const u8 w = BYTE_MASK(ACC_WRITE_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4246) 	const u8 u = BYTE_MASK(ACC_USER_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4247) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4248) 	bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4249) 	bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4250) 	bool cr0_wp = is_write_protection(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4251) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4252) 	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4253) 		unsigned pfec = byte << 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4254) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4255) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4256) 		 * Each "*f" variable has a 1 bit for each UWX value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4257) 		 * that causes a fault with the given PFEC.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4258) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4259) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4260) 		/* Faults from writes to non-writable pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4261) 		u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4262) 		/* Faults from user mode accesses to supervisor pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4263) 		u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4264) 		/* Faults from fetches of non-executable pages*/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4265) 		u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4266) 		/* Faults from kernel mode fetches of user pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4267) 		u8 smepf = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4268) 		/* Faults from kernel mode accesses of user pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4269) 		u8 smapf = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4270) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4271) 		if (!ept) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4272) 			/* Faults from kernel mode accesses to user pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4273) 			u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4274) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4275) 			/* Not really needed: !nx will cause pte.nx to fault */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4276) 			if (!mmu->nx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4277) 				ff = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4278) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4279) 			/* Allow supervisor writes if !cr0.wp */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4280) 			if (!cr0_wp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4281) 				wf = (pfec & PFERR_USER_MASK) ? wf : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4282) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4283) 			/* Disallow supervisor fetches of user code if cr4.smep */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4284) 			if (cr4_smep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4285) 				smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4286) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4287) 			/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4288) 			 * SMAP:kernel-mode data accesses from user-mode
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4289) 			 * mappings should fault. A fault is considered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4290) 			 * as a SMAP violation if all of the following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4291) 			 * conditions are true:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4292) 			 *   - X86_CR4_SMAP is set in CR4
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4293) 			 *   - A user page is accessed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4294) 			 *   - The access is not a fetch
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4295) 			 *   - Page fault in kernel mode
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4296) 			 *   - if CPL = 3 or X86_EFLAGS_AC is clear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4297) 			 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4298) 			 * Here, we cover the first three conditions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4299) 			 * The fourth is computed dynamically in permission_fault();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4300) 			 * PFERR_RSVD_MASK bit will be set in PFEC if the access is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4301) 			 * *not* subject to SMAP restrictions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4302) 			 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4303) 			if (cr4_smap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4304) 				smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4305) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4306) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4307) 		mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4308) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4309) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4310) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4311) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4312) * PKU is an additional mechanism by which the paging controls access to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4313) * user-mode addresses based on the value in the PKRU register.  Protection
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4314) * key violations are reported through a bit in the page fault error code.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4315) * Unlike other bits of the error code, the PK bit is not known at the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4316) * call site of e.g. gva_to_gpa; it must be computed directly in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4317) * permission_fault based on two bits of PKRU, on some machine state (CR4,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4318) * CR0, EFER, CPL), and on other bits of the error code and the page tables.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4319) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4320) * In particular the following conditions come from the error code, the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4321) * page tables and the machine state:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4322) * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4323) * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4324) * - PK is always zero if U=0 in the page tables
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4325) * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4326) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4327) * The PKRU bitmask caches the result of these four conditions.  The error
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4328) * code (minus the P bit) and the page table's U bit form an index into the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4329) * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4330) * with the two bits of the PKRU register corresponding to the protection key.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4331) * For the first three conditions above the bits will be 00, thus masking
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4332) * away both AD and WD.  For all reads or if the last condition holds, WD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4333) * only will be masked away.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4334) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4335) static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4336) 				bool ept)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4337) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4338) 	unsigned bit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4339) 	bool wp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4340) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4341) 	if (ept) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4342) 		mmu->pkru_mask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4343) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4344) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4345) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4346) 	/* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4347) 	if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4348) 		mmu->pkru_mask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4349) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4350) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4351) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4352) 	wp = is_write_protection(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4353) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4354) 	for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4355) 		unsigned pfec, pkey_bits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4356) 		bool check_pkey, check_write, ff, uf, wf, pte_user;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4357) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4358) 		pfec = bit << 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4359) 		ff = pfec & PFERR_FETCH_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4360) 		uf = pfec & PFERR_USER_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4361) 		wf = pfec & PFERR_WRITE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4362) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4363) 		/* PFEC.RSVD is replaced by ACC_USER_MASK. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4364) 		pte_user = pfec & PFERR_RSVD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4365) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4366) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4367) 		 * Only need to check the access which is not an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4368) 		 * instruction fetch and is to a user page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4369) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4370) 		check_pkey = (!ff && pte_user);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4371) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4372) 		 * write access is controlled by PKRU if it is a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4373) 		 * user access or CR0.WP = 1.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4374) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4375) 		check_write = check_pkey && wf && (uf || wp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4376) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4377) 		/* PKRU.AD stops both read and write access. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4378) 		pkey_bits = !!check_pkey;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4379) 		/* PKRU.WD stops write access. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4380) 		pkey_bits |= (!!check_write) << 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4381) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4382) 		mmu->pkru_mask |= (pkey_bits & 3) << pfec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4383) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4384) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4385) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4386) static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4387) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4388) 	unsigned root_level = mmu->root_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4389) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4390) 	mmu->last_nonleaf_level = root_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4391) 	if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4392) 		mmu->last_nonleaf_level++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4393) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4394) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4395) static void paging64_init_context_common(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4396) 					 struct kvm_mmu *context,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4397) 					 int level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4398) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4399) 	context->nx = is_nx(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4400) 	context->root_level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4401) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4402) 	reset_rsvds_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4403) 	update_permission_bitmask(vcpu, context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4404) 	update_pkru_bitmask(vcpu, context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4405) 	update_last_nonleaf_level(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4406) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4407) 	MMU_WARN_ON(!is_pae(vcpu));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4408) 	context->page_fault = paging64_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4409) 	context->gva_to_gpa = paging64_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4410) 	context->sync_page = paging64_sync_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4411) 	context->invlpg = paging64_invlpg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4412) 	context->shadow_root_level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4413) 	context->direct_map = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4414) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4415) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4416) static void paging64_init_context(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4417) 				  struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4418) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4419) 	int root_level = is_la57_mode(vcpu) ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4420) 			 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4421) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4422) 	paging64_init_context_common(vcpu, context, root_level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4423) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4424) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4425) static void paging32_init_context(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4426) 				  struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4427) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4428) 	context->nx = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4429) 	context->root_level = PT32_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4430) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4431) 	reset_rsvds_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4432) 	update_permission_bitmask(vcpu, context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4433) 	update_pkru_bitmask(vcpu, context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4434) 	update_last_nonleaf_level(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4435) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4436) 	context->page_fault = paging32_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4437) 	context->gva_to_gpa = paging32_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4438) 	context->sync_page = paging32_sync_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4439) 	context->invlpg = paging32_invlpg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4440) 	context->shadow_root_level = PT32E_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4441) 	context->direct_map = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4442) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4443) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4444) static void paging32E_init_context(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4445) 				   struct kvm_mmu *context)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4446) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4447) 	paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4448) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4449) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4450) static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4451) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4452) 	union kvm_mmu_extended_role ext = {0};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4453) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4454) 	ext.cr0_pg = !!is_paging(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4455) 	ext.cr4_pae = !!is_pae(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4456) 	ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4457) 	ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4458) 	ext.cr4_pse = !!is_pse(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4459) 	ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4460) 	ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4461) 	ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4462) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4463) 	ext.valid = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4464) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4465) 	return ext;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4466) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4467) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4468) static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4469) 						   bool base_only)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4470) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4471) 	union kvm_mmu_role role = {0};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4472) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4473) 	role.base.access = ACC_ALL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4474) 	role.base.nxe = !!is_nx(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4475) 	role.base.cr0_wp = is_write_protection(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4476) 	role.base.smm = is_smm(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4477) 	role.base.guest_mode = is_guest_mode(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4478) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4479) 	if (base_only)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4480) 		return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4481) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4482) 	role.ext = kvm_calc_mmu_role_ext(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4483) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4484) 	return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4485) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4486) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4487) static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4488) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4489) 	/* Use 5-level TDP if and only if it's useful/necessary. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4490) 	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4491) 		return 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4492) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4493) 	return max_tdp_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4494) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4495) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4496) static union kvm_mmu_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4497) kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4498) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4499) 	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4500) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4501) 	role.base.ad_disabled = (shadow_accessed_mask == 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4502) 	role.base.level = kvm_mmu_get_tdp_level(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4503) 	role.base.direct = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4504) 	role.base.gpte_is_8_bytes = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4505) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4506) 	return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4507) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4508) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4509) static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4510) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4511) 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4512) 	union kvm_mmu_role new_role =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4513) 		kvm_calc_tdp_mmu_root_page_role(vcpu, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4514) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4515) 	if (new_role.as_u64 == context->mmu_role.as_u64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4516) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4517) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4518) 	context->mmu_role.as_u64 = new_role.as_u64;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4519) 	context->page_fault = kvm_tdp_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4520) 	context->sync_page = nonpaging_sync_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4521) 	context->invlpg = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4522) 	context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4523) 	context->direct_map = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4524) 	context->get_guest_pgd = get_cr3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4525) 	context->get_pdptr = kvm_pdptr_read;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4526) 	context->inject_page_fault = kvm_inject_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4527) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4528) 	if (!is_paging(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4529) 		context->nx = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4530) 		context->gva_to_gpa = nonpaging_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4531) 		context->root_level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4532) 	} else if (is_long_mode(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4533) 		context->nx = is_nx(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4534) 		context->root_level = is_la57_mode(vcpu) ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4535) 				PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4536) 		reset_rsvds_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4537) 		context->gva_to_gpa = paging64_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4538) 	} else if (is_pae(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4539) 		context->nx = is_nx(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4540) 		context->root_level = PT32E_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4541) 		reset_rsvds_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4542) 		context->gva_to_gpa = paging64_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4543) 	} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4544) 		context->nx = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4545) 		context->root_level = PT32_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4546) 		reset_rsvds_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4547) 		context->gva_to_gpa = paging32_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4548) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4549) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4550) 	update_permission_bitmask(vcpu, context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4551) 	update_pkru_bitmask(vcpu, context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4552) 	update_last_nonleaf_level(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4553) 	reset_tdp_shadow_zero_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4554) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4555) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4556) static union kvm_mmu_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4557) kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4558) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4559) 	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4560) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4561) 	role.base.smep_andnot_wp = role.ext.cr4_smep &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4562) 		!is_write_protection(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4563) 	role.base.smap_andnot_wp = role.ext.cr4_smap &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4564) 		!is_write_protection(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4565) 	role.base.gpte_is_8_bytes = !!is_pae(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4566) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4567) 	return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4568) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4569) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4570) static union kvm_mmu_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4571) kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4572) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4573) 	union kvm_mmu_role role =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4574) 		kvm_calc_shadow_root_page_role_common(vcpu, base_only);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4575) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4576) 	role.base.direct = !is_paging(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4577) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4578) 	if (!is_long_mode(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4579) 		role.base.level = PT32E_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4580) 	else if (is_la57_mode(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4581) 		role.base.level = PT64_ROOT_5LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4582) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4583) 		role.base.level = PT64_ROOT_4LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4584) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4585) 	return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4586) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4587) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4588) static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4589) 				    u32 cr0, u32 cr4, u32 efer,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4590) 				    union kvm_mmu_role new_role)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4591) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4592) 	if (!(cr0 & X86_CR0_PG))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4593) 		nonpaging_init_context(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4594) 	else if (efer & EFER_LMA)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4595) 		paging64_init_context(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4596) 	else if (cr4 & X86_CR4_PAE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4597) 		paging32E_init_context(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4598) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4599) 		paging32_init_context(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4600) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4601) 	context->mmu_role.as_u64 = new_role.as_u64;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4602) 	reset_shadow_zero_bits_mask(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4603) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4604) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4605) static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4606) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4607) 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4608) 	union kvm_mmu_role new_role =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4609) 		kvm_calc_shadow_mmu_root_page_role(vcpu, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4610) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4611) 	if (new_role.as_u64 != context->mmu_role.as_u64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4612) 		shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4613) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4614) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4615) static union kvm_mmu_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4616) kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4617) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4618) 	union kvm_mmu_role role =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4619) 		kvm_calc_shadow_root_page_role_common(vcpu, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4620) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4621) 	role.base.direct = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4622) 	role.base.level = kvm_mmu_get_tdp_level(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4623) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4624) 	return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4625) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4626) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4627) void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4628) 			     gpa_t nested_cr3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4629) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4630) 	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4631) 	union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4632) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4633) 	__kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4634) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4635) 	if (new_role.as_u64 != context->mmu_role.as_u64) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4636) 		shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4637) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4638) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4639) 		 * Override the level set by the common init helper, nested TDP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4640) 		 * always uses the host's TDP configuration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4641) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4642) 		context->shadow_root_level = new_role.base.level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4643) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4644) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4645) EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4646) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4647) static union kvm_mmu_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4648) kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4649) 				   bool execonly, u8 level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4650) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4651) 	union kvm_mmu_role role = {0};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4652) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4653) 	/* SMM flag is inherited from root_mmu */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4654) 	role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4655) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4656) 	role.base.level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4657) 	role.base.gpte_is_8_bytes = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4658) 	role.base.direct = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4659) 	role.base.ad_disabled = !accessed_dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4660) 	role.base.guest_mode = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4661) 	role.base.access = ACC_ALL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4662) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4663) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4664) 	 * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4665) 	 * SMAP variation to denote shadow EPT entries.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4666) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4667) 	role.base.cr0_wp = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4668) 	role.base.smap_andnot_wp = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4669) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4670) 	role.ext = kvm_calc_mmu_role_ext(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4671) 	role.ext.execonly = execonly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4672) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4673) 	return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4674) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4675) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4676) void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4677) 			     bool accessed_dirty, gpa_t new_eptp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4678) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4679) 	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4680) 	u8 level = vmx_eptp_page_walk_level(new_eptp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4681) 	union kvm_mmu_role new_role =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4682) 		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4683) 						   execonly, level);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4684) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4685) 	__kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4686) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4687) 	if (new_role.as_u64 == context->mmu_role.as_u64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4688) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4689) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4690) 	context->shadow_root_level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4691) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4692) 	context->nx = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4693) 	context->ept_ad = accessed_dirty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4694) 	context->page_fault = ept_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4695) 	context->gva_to_gpa = ept_gva_to_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4696) 	context->sync_page = ept_sync_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4697) 	context->invlpg = ept_invlpg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4698) 	context->root_level = level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4699) 	context->direct_map = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4700) 	context->mmu_role.as_u64 = new_role.as_u64;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4701) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4702) 	update_permission_bitmask(vcpu, context, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4703) 	update_pkru_bitmask(vcpu, context, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4704) 	update_last_nonleaf_level(vcpu, context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4705) 	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4706) 	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4707) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4708) EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4709) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4710) static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4711) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4712) 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4713) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4714) 	kvm_init_shadow_mmu(vcpu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4715) 			    kvm_read_cr0_bits(vcpu, X86_CR0_PG),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4716) 			    kvm_read_cr4_bits(vcpu, X86_CR4_PAE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4717) 			    vcpu->arch.efer);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4718) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4719) 	context->get_guest_pgd     = get_cr3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4720) 	context->get_pdptr         = kvm_pdptr_read;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4721) 	context->inject_page_fault = kvm_inject_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4722) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4723) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4724) static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4725) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4726) 	union kvm_mmu_role role = kvm_calc_shadow_root_page_role_common(vcpu, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4727) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4728) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4729) 	 * Nested MMUs are used only for walking L2's gva->gpa, they never have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4730) 	 * shadow pages of their own and so "direct" has no meaning.   Set it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4731) 	 * to "true" to try to detect bogus usage of the nested MMU.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4732) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4733) 	role.base.direct = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4734) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4735) 	if (!is_paging(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4736) 		role.base.level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4737) 	else if (is_long_mode(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4738) 		role.base.level = is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4739) 						       PT64_ROOT_4LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4740) 	else if (is_pae(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4741) 		role.base.level = PT32E_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4742) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4743) 		role.base.level = PT32_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4744) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4745) 	return role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4746) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4747) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4748) static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4749) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4750) 	union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4751) 	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4752) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4753) 	if (new_role.as_u64 == g_context->mmu_role.as_u64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4754) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4755) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4756) 	g_context->mmu_role.as_u64 = new_role.as_u64;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4757) 	g_context->get_guest_pgd     = get_cr3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4758) 	g_context->get_pdptr         = kvm_pdptr_read;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4759) 	g_context->inject_page_fault = kvm_inject_page_fault;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4760) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4761) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4762) 	 * L2 page tables are never shadowed, so there is no need to sync
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4763) 	 * SPTEs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4764) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4765) 	g_context->invlpg            = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4766) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4767) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4768) 	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4769) 	 * L1's nested page tables (e.g. EPT12). The nested translation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4770) 	 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4771) 	 * L2's page tables as the first level of translation and L1's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4772) 	 * nested page tables as the second level of translation. Basically
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4773) 	 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4774) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4775) 	if (!is_paging(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4776) 		g_context->nx = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4777) 		g_context->root_level = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4778) 		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4779) 	} else if (is_long_mode(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4780) 		g_context->nx = is_nx(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4781) 		g_context->root_level = is_la57_mode(vcpu) ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4782) 					PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4783) 		reset_rsvds_bits_mask(vcpu, g_context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4784) 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4785) 	} else if (is_pae(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4786) 		g_context->nx = is_nx(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4787) 		g_context->root_level = PT32E_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4788) 		reset_rsvds_bits_mask(vcpu, g_context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4789) 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4790) 	} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4791) 		g_context->nx = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4792) 		g_context->root_level = PT32_ROOT_LEVEL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4793) 		reset_rsvds_bits_mask(vcpu, g_context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4794) 		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4795) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4796) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4797) 	update_permission_bitmask(vcpu, g_context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4798) 	update_pkru_bitmask(vcpu, g_context, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4799) 	update_last_nonleaf_level(vcpu, g_context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4800) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4801) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4802) void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4803) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4804) 	if (reset_roots) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4805) 		uint i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4806) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4807) 		vcpu->arch.mmu->root_hpa = INVALID_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4808) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4809) 		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4810) 			vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4811) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4812) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4813) 	if (mmu_is_nested(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4814) 		init_kvm_nested_mmu(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4815) 	else if (tdp_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4816) 		init_kvm_tdp_mmu(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4817) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4818) 		init_kvm_softmmu(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4819) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4820) EXPORT_SYMBOL_GPL(kvm_init_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4821) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4822) static union kvm_mmu_page_role
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4823) kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4824) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4825) 	union kvm_mmu_role role;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4826) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4827) 	if (tdp_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4828) 		role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4829) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4830) 		role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4831) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4832) 	return role.base;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4833) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4834) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4835) void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4836) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4837) 	kvm_mmu_unload(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4838) 	kvm_init_mmu(vcpu, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4839) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4840) EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4841) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4842) int kvm_mmu_load(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4843) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4844) 	int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4845) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4846) 	r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4847) 	if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4848) 		goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4849) 	r = mmu_alloc_roots(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4850) 	kvm_mmu_sync_roots(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4851) 	if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4852) 		goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4853) 	kvm_mmu_load_pgd(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4854) 	kvm_x86_ops.tlb_flush_current(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4855) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4856) 	return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4857) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4858) EXPORT_SYMBOL_GPL(kvm_mmu_load);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4859) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4860) void kvm_mmu_unload(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4861) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4862) 	kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4863) 	WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4864) 	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4865) 	WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4866) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4867) EXPORT_SYMBOL_GPL(kvm_mmu_unload);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4868) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4869) static bool need_remote_flush(u64 old, u64 new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4870) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4871) 	if (!is_shadow_present_pte(old))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4872) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4873) 	if (!is_shadow_present_pte(new))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4874) 		return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4875) 	if ((old ^ new) & PT64_BASE_ADDR_MASK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4876) 		return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4877) 	old ^= shadow_nx_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4878) 	new ^= shadow_nx_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4879) 	return (old & ~new & PT64_PERM_MASK) != 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4880) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4881) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4882) static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4883) 				    int *bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4884) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4885) 	u64 gentry = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4886) 	int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4887) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4888) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4889) 	 * Assume that the pte write on a page table of the same type
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4890) 	 * as the current vcpu paging mode since we update the sptes only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4891) 	 * when they have the same mode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4892) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4893) 	if (is_pae(vcpu) && *bytes == 4) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4894) 		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4895) 		*gpa &= ~(gpa_t)7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4896) 		*bytes = 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4897) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4898) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4899) 	if (*bytes == 4 || *bytes == 8) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4900) 		r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4901) 		if (r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4902) 			gentry = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4903) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4904) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4905) 	return gentry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4906) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4907) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4908) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4909)  * If we're seeing too many writes to a page, it may no longer be a page table,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4910)  * or we may be forking, in which case it is better to unmap the page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4911)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4912) static bool detect_write_flooding(struct kvm_mmu_page *sp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4913) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4914) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4915) 	 * Skip write-flooding detected for the sp whose level is 1, because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4916) 	 * it can become unsync, then the guest page is not write-protected.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4917) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4918) 	if (sp->role.level == PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4919) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4920) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4921) 	atomic_inc(&sp->write_flooding_count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4922) 	return atomic_read(&sp->write_flooding_count) >= 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4923) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4924) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4925) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4926)  * Misaligned accesses are too much trouble to fix up; also, they usually
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4927)  * indicate a page is not used as a page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4928)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4929) static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4930) 				    int bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4931) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4932) 	unsigned offset, pte_size, misaligned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4933) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4934) 	pgprintk("misaligned: gpa %llx bytes %d role %x\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4935) 		 gpa, bytes, sp->role.word);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4936) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4937) 	offset = offset_in_page(gpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4938) 	pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4939) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4940) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4941) 	 * Sometimes, the OS only writes the last one bytes to update status
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4942) 	 * bits, for example, in linux, andb instruction is used in clear_bit().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4943) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4944) 	if (!(offset & (pte_size - 1)) && bytes == 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4945) 		return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4946) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4947) 	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4948) 	misaligned |= bytes < 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4949) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4950) 	return misaligned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4951) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4952) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4953) static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4954) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4955) 	unsigned page_offset, quadrant;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4956) 	u64 *spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4957) 	int level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4958) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4959) 	page_offset = offset_in_page(gpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4960) 	level = sp->role.level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4961) 	*nspte = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4962) 	if (!sp->role.gpte_is_8_bytes) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4963) 		page_offset <<= 1;	/* 32->64 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4964) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4965) 		 * A 32-bit pde maps 4MB while the shadow pdes map
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4966) 		 * only 2MB.  So we need to double the offset again
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4967) 		 * and zap two pdes instead of one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4968) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4969) 		if (level == PT32_ROOT_LEVEL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4970) 			page_offset &= ~7; /* kill rounding error */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4971) 			page_offset <<= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4972) 			*nspte = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4973) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4974) 		quadrant = page_offset >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4975) 		page_offset &= ~PAGE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4976) 		if (quadrant != sp->role.quadrant)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4977) 			return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4978) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4979) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4980) 	spte = &sp->spt[page_offset / sizeof(*spte)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4981) 	return spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4982) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4983) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4984) static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4985) 			      const u8 *new, int bytes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4986) 			      struct kvm_page_track_notifier_node *node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4987) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4988) 	gfn_t gfn = gpa >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4989) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4990) 	LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4991) 	u64 entry, gentry, *spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4992) 	int npte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4993) 	bool remote_flush, local_flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4994) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4995) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4996) 	 * If we don't have indirect shadow pages, it means no page is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4997) 	 * write-protected, so we can exit simply.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4998) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4999) 	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5000) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5001) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5002) 	remote_flush = local_flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5003) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5004) 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5005) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5006) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5007) 	 * No need to care whether allocation memory is successful
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5008) 	 * or not since pte prefetch is skiped if it does not have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5009) 	 * enough objects in the cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5010) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5011) 	mmu_topup_memory_caches(vcpu, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5012) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5013) 	spin_lock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5014) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5015) 	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5016) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5017) 	++vcpu->kvm->stat.mmu_pte_write;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5018) 	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5019) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5020) 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5021) 		if (detect_write_misaligned(sp, gpa, bytes) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5022) 		      detect_write_flooding(sp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5023) 			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5024) 			++vcpu->kvm->stat.mmu_flooded;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5025) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5026) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5027) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5028) 		spte = get_written_sptes(sp, gpa, &npte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5029) 		if (!spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5030) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5031) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5032) 		local_flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5033) 		while (npte--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5034) 			entry = *spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5035) 			mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5036) 			if (gentry && sp->role.level != PG_LEVEL_4K)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5037) 				++vcpu->kvm->stat.mmu_pde_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5038) 			if (need_remote_flush(entry, *spte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5039) 				remote_flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5040) 			++spte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5041) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5042) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5043) 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5044) 	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5045) 	spin_unlock(&vcpu->kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5046) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5047) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5048) int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5049) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5050) 	gpa_t gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5051) 	int r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5052) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5053) 	if (vcpu->arch.mmu->direct_map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5054) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5055) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5056) 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5057) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5058) 	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5059) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5060) 	return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5061) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5062) EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5063) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5064) int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5065) 		       void *insn, int insn_len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5066) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5067) 	int r, emulation_type = EMULTYPE_PF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5068) 	bool direct = vcpu->arch.mmu->direct_map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5069) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5070) 	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5071) 		return RET_PF_RETRY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5072) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5073) 	r = RET_PF_INVALID;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5074) 	if (unlikely(error_code & PFERR_RSVD_MASK)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5075) 		r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5076) 		if (r == RET_PF_EMULATE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5077) 			goto emulate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5078) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5079) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5080) 	if (r == RET_PF_INVALID) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5081) 		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5082) 					  lower_32_bits(error_code), false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5083) 		if (WARN_ON_ONCE(r == RET_PF_INVALID))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5084) 			return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5085) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5086) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5087) 	if (r < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5088) 		return r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5089) 	if (r != RET_PF_EMULATE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5090) 		return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5091) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5092) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5093) 	 * Before emulating the instruction, check if the error code
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5094) 	 * was due to a RO violation while translating the guest page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5095) 	 * This can occur when using nested virtualization with nested
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5096) 	 * paging in both guests. If true, we simply unprotect the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5097) 	 * and resume the guest.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5098) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5099) 	if (vcpu->arch.mmu->direct_map &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5100) 	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5101) 		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5102) 		return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5103) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5104) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5105) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5106) 	 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5107) 	 * optimistically try to just unprotect the page and let the processor
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5108) 	 * re-execute the instruction that caused the page fault.  Do not allow
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5109) 	 * retrying MMIO emulation, as it's not only pointless but could also
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5110) 	 * cause us to enter an infinite loop because the processor will keep
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5111) 	 * faulting on the non-existent MMIO address.  Retrying an instruction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5112) 	 * from a nested guest is also pointless and dangerous as we are only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5113) 	 * explicitly shadowing L1's page tables, i.e. unprotecting something
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5114) 	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5115) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5116) 	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5117) 		emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5118) emulate:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5119) 	return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5120) 				       insn_len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5121) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5122) EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5123) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5124) void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5125) 			    gva_t gva, hpa_t root_hpa)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5126) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5127) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5128) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5129) 	/* It's actually a GPA for vcpu->arch.guest_mmu.  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5130) 	if (mmu != &vcpu->arch.guest_mmu) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5131) 		/* INVLPG on a non-canonical address is a NOP according to the SDM.  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5132) 		if (is_noncanonical_address(gva, vcpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5133) 			return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5134) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5135) 		kvm_x86_ops.tlb_flush_gva(vcpu, gva);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5136) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5137) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5138) 	if (!mmu->invlpg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5139) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5140) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5141) 	if (root_hpa == INVALID_PAGE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5142) 		mmu->invlpg(vcpu, gva, mmu->root_hpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5143) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5144) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5145) 		 * INVLPG is required to invalidate any global mappings for the VA,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5146) 		 * irrespective of PCID. Since it would take us roughly similar amount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5147) 		 * of work to determine whether any of the prev_root mappings of the VA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5148) 		 * is marked global, or to just sync it blindly, so we might as well
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5149) 		 * just always sync it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5150) 		 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5151) 		 * Mappings not reachable via the current cr3 or the prev_roots will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5152) 		 * synced when switching to that cr3, so nothing needs to be done here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5153) 		 * for them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5154) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5155) 		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5156) 			if (VALID_PAGE(mmu->prev_roots[i].hpa))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5157) 				mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5158) 	} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5159) 		mmu->invlpg(vcpu, gva, root_hpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5160) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5161) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5162) EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5163) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5164) void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5165) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5166) 	kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5167) 	++vcpu->stat.invlpg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5168) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5169) EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5170) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5171) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5172) void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5173) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5174) 	struct kvm_mmu *mmu = vcpu->arch.mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5175) 	bool tlb_flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5176) 	uint i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5177) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5178) 	if (pcid == kvm_get_active_pcid(vcpu)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5179) 		mmu->invlpg(vcpu, gva, mmu->root_hpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5180) 		tlb_flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5181) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5182) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5183) 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5184) 		if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5185) 		    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5186) 			mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5187) 			tlb_flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5188) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5189) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5190) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5191) 	if (tlb_flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5192) 		kvm_x86_ops.tlb_flush_gva(vcpu, gva);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5193) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5194) 	++vcpu->stat.invlpg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5195) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5196) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5197) 	 * Mappings not reachable via the current cr3 or the prev_roots will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5198) 	 * synced when switching to that cr3, so nothing needs to be done here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5199) 	 * for them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5200) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5201) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5202) EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5203) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5204) void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5205) 		       int tdp_huge_page_level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5206) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5207) 	tdp_enabled = enable_tdp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5208) 	max_tdp_level = tdp_max_root_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5209) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5210) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5211) 	 * max_huge_page_level reflects KVM's MMU capabilities irrespective
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5212) 	 * of kernel support, e.g. KVM may be capable of using 1GB pages when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5213) 	 * the kernel is not.  But, KVM never creates a page size greater than
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5214) 	 * what is used by the kernel for any given HVA, i.e. the kernel's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5215) 	 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5216) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5217) 	if (tdp_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5218) 		max_huge_page_level = tdp_huge_page_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5219) 	else if (boot_cpu_has(X86_FEATURE_GBPAGES))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5220) 		max_huge_page_level = PG_LEVEL_1G;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5221) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5222) 		max_huge_page_level = PG_LEVEL_2M;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5223) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5224) EXPORT_SYMBOL_GPL(kvm_configure_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5225) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5226) /* The return value indicates if tlb flush on all vcpus is needed. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5227) typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5228) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5229) /* The caller should hold mmu-lock before calling this function. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5230) static __always_inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5231) slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5232) 			slot_level_handler fn, int start_level, int end_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5233) 			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5234) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5235) 	struct slot_rmap_walk_iterator iterator;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5236) 	bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5237) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5238) 	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5239) 			end_gfn, &iterator) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5240) 		if (iterator.rmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5241) 			flush |= fn(kvm, iterator.rmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5242) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5243) 		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5244) 			if (flush && lock_flush_tlb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5245) 				kvm_flush_remote_tlbs_with_address(kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5246) 						start_gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5247) 						iterator.gfn - start_gfn + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5248) 				flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5249) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5250) 			cond_resched_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5251) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5252) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5253) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5254) 	if (flush && lock_flush_tlb) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5255) 		kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5256) 						   end_gfn - start_gfn + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5257) 		flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5258) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5259) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5260) 	return flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5261) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5262) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5263) static __always_inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5264) slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5265) 		  slot_level_handler fn, int start_level, int end_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5266) 		  bool lock_flush_tlb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5267) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5268) 	return slot_handle_level_range(kvm, memslot, fn, start_level,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5269) 			end_level, memslot->base_gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5270) 			memslot->base_gfn + memslot->npages - 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5271) 			lock_flush_tlb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5272) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5273) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5274) static __always_inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5275) slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5276) 		      slot_level_handler fn, bool lock_flush_tlb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5277) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5278) 	return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5279) 				 KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5280) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5281) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5282) static __always_inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5283) slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5284) 			slot_level_handler fn, bool lock_flush_tlb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5285) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5286) 	return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5287) 				 KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5288) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5289) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5290) static __always_inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5291) slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5292) 		 slot_level_handler fn, bool lock_flush_tlb)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5293) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5294) 	return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5295) 				 PG_LEVEL_4K, lock_flush_tlb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5296) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5297) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5298) static void free_mmu_pages(struct kvm_mmu *mmu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5299) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5300) 	free_page((unsigned long)mmu->pae_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5301) 	free_page((unsigned long)mmu->lm_root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5302) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5303) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5304) static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5305) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5306) 	struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5307) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5308) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5309) 	mmu->root_hpa = INVALID_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5310) 	mmu->root_pgd = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5311) 	mmu->translate_gpa = translate_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5312) 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5313) 		mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5314) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5315) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5316) 	 * When using PAE paging, the four PDPTEs are treated as 'root' pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5317) 	 * while the PDP table is a per-vCPU construct that's allocated at MMU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5318) 	 * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5319) 	 * x86_64.  Therefore we need to allocate the PDP table in the first
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5320) 	 * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5321) 	 * generally doesn't use PAE paging and can skip allocating the PDP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5322) 	 * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5323) 	 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5324) 	 * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5325) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5326) 	if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5327) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5328) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5329) 	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5330) 	if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5331) 		return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5332) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5333) 	mmu->pae_root = page_address(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5334) 	for (i = 0; i < 4; ++i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5335) 		mmu->pae_root[i] = INVALID_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5336) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5337) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5338) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5339) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5340) int kvm_mmu_create(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5341) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5342) 	int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5343) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5344) 	vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5345) 	vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5346) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5347) 	vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5348) 	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5349) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5350) 	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5351) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5352) 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5353) 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5354) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5355) 	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5356) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5357) 	ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5358) 	if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5359) 		return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5360) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5361) 	ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5362) 	if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5363) 		goto fail_allocate_root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5364) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5365) 	return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5366)  fail_allocate_root:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5367) 	free_mmu_pages(&vcpu->arch.guest_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5368) 	return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5369) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5370) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5371) #define BATCH_ZAP_PAGES	10
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5372) static void kvm_zap_obsolete_pages(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5373) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5374) 	struct kvm_mmu_page *sp, *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5375) 	int nr_zapped, batch = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5376) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5377) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5378) 	list_for_each_entry_safe_reverse(sp, node,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5379) 	      &kvm->arch.active_mmu_pages, link) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5380) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5381) 		 * No obsolete valid page exists before a newly created page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5382) 		 * since active_mmu_pages is a FIFO list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5383) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5384) 		if (!is_obsolete_sp(kvm, sp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5385) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5386) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5387) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5388) 		 * Invalid pages should never land back on the list of active
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5389) 		 * pages.  Skip the bogus page, otherwise we'll get stuck in an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5390) 		 * infinite loop if the page gets put back on the list (again).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5391) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5392) 		if (WARN_ON(sp->role.invalid))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5393) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5394) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5395) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5396) 		 * No need to flush the TLB since we're only zapping shadow
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5397) 		 * pages with an obsolete generation number and all vCPUS have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5398) 		 * loaded a new root, i.e. the shadow pages being zapped cannot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5399) 		 * be in active use by the guest.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5400) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5401) 		if (batch >= BATCH_ZAP_PAGES &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5402) 		    cond_resched_lock(&kvm->mmu_lock)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5403) 			batch = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5404) 			goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5405) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5406) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5407) 		if (__kvm_mmu_prepare_zap_page(kvm, sp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5408) 				&kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5409) 			batch += nr_zapped;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5410) 			goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5411) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5412) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5413) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5414) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5415) 	 * Trigger a remote TLB flush before freeing the page tables to ensure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5416) 	 * KVM is not in the middle of a lockless shadow page table walk, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5417) 	 * may reference the pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5418) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5419) 	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5420) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5421) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5422) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5423)  * Fast invalidate all shadow pages and use lock-break technique
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5424)  * to zap obsolete pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5425)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5426)  * It's required when memslot is being deleted or VM is being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5427)  * destroyed, in these cases, we should ensure that KVM MMU does
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5428)  * not use any resource of the being-deleted slot or all slots
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5429)  * after calling the function.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5430)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5431) static void kvm_mmu_zap_all_fast(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5432) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5433) 	lockdep_assert_held(&kvm->slots_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5434) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5435) 	spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5436) 	trace_kvm_mmu_zap_all_fast(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5437) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5438) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5439) 	 * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5440) 	 * held for the entire duration of zapping obsolete pages, it's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5441) 	 * impossible for there to be multiple invalid generations associated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5442) 	 * with *valid* shadow pages at any given time, i.e. there is exactly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5443) 	 * one valid generation and (at most) one invalid generation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5444) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5445) 	kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5446) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5447) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5448) 	 * Notify all vcpus to reload its shadow page table and flush TLB.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5449) 	 * Then all vcpus will switch to new shadow page table with the new
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5450) 	 * mmu_valid_gen.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5451) 	 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5452) 	 * Note: we need to do this under the protection of mmu_lock,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5453) 	 * otherwise, vcpu would purge shadow page but miss tlb flush.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5454) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5455) 	kvm_reload_remote_mmus(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5456) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5457) 	kvm_zap_obsolete_pages(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5458) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5459) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5460) 		kvm_tdp_mmu_zap_all(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5461) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5462) 	spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5463) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5464) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5465) static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5466) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5467) 	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5468) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5469) 
/*
 * Page-track flush-slot callback.  @slot and @node are not used: every
 * shadow page is invalidated regardless of which memslot is affected.
 */
static void
kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
					struct kvm_memory_slot *slot,
					struct kvm_page_track_notifier_node *node)
{
	kvm_mmu_zap_all_fast(kvm);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5476) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5477) void kvm_mmu_init_vm(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5478) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5479) 	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5480) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5481) 	kvm_mmu_init_tdp_mmu(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5482) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5483) 	node->track_write = kvm_mmu_pte_write;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5484) 	node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5485) 	kvm_page_track_register_notifier(kvm, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5486) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5487) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5488) void kvm_mmu_uninit_vm(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5489) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5490) 	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5491) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5492) 	kvm_page_track_unregister_notifier(kvm, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5493) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5494) 	kvm_mmu_uninit_tdp_mmu(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5495) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5496) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5497) void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5498) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5499) 	struct kvm_memslots *slots;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5500) 	struct kvm_memory_slot *memslot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5501) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5502) 	bool flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5503) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5504) 	spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5505) 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5506) 		slots = __kvm_memslots(kvm, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5507) 		kvm_for_each_memslot(memslot, slots) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5508) 			gfn_t start, end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5509) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5510) 			start = max(gfn_start, memslot->base_gfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5511) 			end = min(gfn_end, memslot->base_gfn + memslot->npages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5512) 			if (start >= end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5513) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5514) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5515) 			slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5516) 						PG_LEVEL_4K,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5517) 						KVM_MAX_HUGEPAGE_LEVEL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5518) 						start, end - 1, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5519) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5520) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5521) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5522) 	if (kvm->arch.tdp_mmu_enabled) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5523) 		flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5524) 		if (flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5525) 			kvm_flush_remote_tlbs(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5526) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5527) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5528) 	spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5529) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5530) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5531) static bool slot_rmap_write_protect(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5532) 				    struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5533) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5534) 	return __rmap_write_protect(kvm, rmap_head, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5535) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5536) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5537) void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5538) 				      struct kvm_memory_slot *memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5539) 				      int start_level)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5540) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5541) 	bool flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5542) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5543) 	spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5544) 	flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5545) 				start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5546) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5547) 		flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5548) 	spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5549) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5550) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5551) 	 * We can flush all the TLBs out of the mmu lock without TLB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5552) 	 * corruption since we just change the spte from writable to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5553) 	 * readonly so that we only need to care the case of changing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5554) 	 * spte from present to present (changing the spte from present
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5555) 	 * to nonpresent will flush all the TLBs immediately), in other
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5556) 	 * words, the only case we care is mmu_spte_update() where we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5557) 	 * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5558) 	 * instead of PT_WRITABLE_MASK, that means it does not depend
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5559) 	 * on PT_WRITABLE_MASK anymore.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5560) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5561) 	if (flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5562) 		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5563) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5564) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5565) static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5566) 					 struct kvm_rmap_head *rmap_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5567) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5568) 	u64 *sptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5569) 	struct rmap_iterator iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5570) 	int need_tlb_flush = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5571) 	kvm_pfn_t pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5572) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5573) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5574) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5575) 	for_each_rmap_spte(rmap_head, &iter, sptep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5576) 		sp = sptep_to_sp(sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5577) 		pfn = spte_to_pfn(*sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5578) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5579) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5580) 		 * We cannot do huge page mapping for indirect shadow pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5581) 		 * which are found on the last rmap (level = 1) when not using
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5582) 		 * tdp; such shadow pages are synced with the page table in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5583) 		 * the guest, and the guest page table is using 4K page size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5584) 		 * mapping if the indirect sp has level = 1.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5585) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5586) 		if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5587) 		    (kvm_is_zone_device_pfn(pfn) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5588) 		     PageCompound(pfn_to_page(pfn)))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5589) 			pte_list_remove(rmap_head, sptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5590) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5591) 			if (kvm_available_flush_tlb_with_range())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5592) 				kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5593) 					KVM_PAGES_PER_HPAGE(sp->role.level));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5594) 			else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5595) 				need_tlb_flush = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5596) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5597) 			goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5598) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5599) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5600) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5601) 	return need_tlb_flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5602) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5603) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5604) void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5605) 				   const struct kvm_memory_slot *memslot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5606) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5607) 	/* FIXME: const-ify all uses of struct kvm_memory_slot.  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5608) 	spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5609) 	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5610) 			 kvm_mmu_zap_collapsible_spte, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5611) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5612) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5613) 		kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5614) 	spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5615) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5616) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5617) void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5618) 					struct kvm_memory_slot *memslot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5619) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5620) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5621) 	 * All current use cases for flushing the TLBs for a specific memslot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5622) 	 * are related to dirty logging, and do the TLB flush out of mmu_lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5623) 	 * The interaction between the various operations on memslot must be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5624) 	 * serialized by slots_locks to ensure the TLB flush from one operation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5625) 	 * is observed by any other operation on the same memslot.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5626) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5627) 	lockdep_assert_held(&kvm->slots_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5628) 	kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5629) 					   memslot->npages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5630) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5631) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5632) void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5633) 				   struct kvm_memory_slot *memslot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5634) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5635) 	bool flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5636) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5637) 	spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5638) 	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5639) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5640) 		flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5641) 	spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5642) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5643) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5644) 	 * It's also safe to flush TLBs out of mmu lock here as currently this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5645) 	 * function is only used for dirty logging, in which case flushing TLB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5646) 	 * out of mmu lock also guarantees no dirty pages will be lost in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5647) 	 * dirty_bitmap.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5648) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5649) 	if (flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5650) 		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5651) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5652) EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5653) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5654) void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5655) 					struct kvm_memory_slot *memslot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5656) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5657) 	bool flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5658) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5659) 	spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5660) 	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5661) 					false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5662) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5663) 		flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5664) 	spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5665) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5666) 	if (flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5667) 		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5668) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5669) EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5670) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5671) void kvm_mmu_slot_set_dirty(struct kvm *kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5672) 			    struct kvm_memory_slot *memslot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5673) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5674) 	bool flush;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5675) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5676) 	spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5677) 	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5678) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5679) 		flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5680) 	spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5681) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5682) 	if (flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5683) 		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5684) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5685) EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5686) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5687) void kvm_mmu_zap_all(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5688) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5689) 	struct kvm_mmu_page *sp, *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5690) 	LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5691) 	int ign;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5692) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5693) 	spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5694) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5695) 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5696) 		if (WARN_ON(sp->role.invalid))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5697) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5698) 		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5699) 			goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5700) 		if (cond_resched_lock(&kvm->mmu_lock))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5701) 			goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5702) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5703) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5704) 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5705) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5706) 	if (kvm->arch.tdp_mmu_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5707) 		kvm_tdp_mmu_zap_all(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5708) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5709) 	spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5710) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5711) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5712) void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5713) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5714) 	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5715) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5716) 	gen &= MMIO_SPTE_GEN_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5717) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5718) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5719) 	 * Generation numbers are incremented in multiples of the number of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5720) 	 * address spaces in order to provide unique generations across all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5721) 	 * address spaces.  Strip what is effectively the address space
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5722) 	 * modifier prior to checking for a wrap of the MMIO generation so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5723) 	 * that a wrap in any address space is detected.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5724) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5725) 	gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5726) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5727) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5728) 	 * The very rare case: if the MMIO generation number has wrapped,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5729) 	 * zap all shadow pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5730) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5731) 	if (unlikely(gen == 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5732) 		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5733) 		kvm_mmu_zap_all_fast(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5734) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5735) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5736) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5737) static unsigned long
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5738) mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5739) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5740) 	struct kvm *kvm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5741) 	int nr_to_scan = sc->nr_to_scan;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5742) 	unsigned long freed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5743) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5744) 	mutex_lock(&kvm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5745) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5746) 	list_for_each_entry(kvm, &vm_list, vm_list) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5747) 		int idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5748) 		LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5749) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5750) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5751) 		 * Never scan more than sc->nr_to_scan VM instances.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5752) 		 * Will not hit this condition practically since we do not try
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5753) 		 * to shrink more than one VM and it is very unlikely to see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5754) 		 * !n_used_mmu_pages so many times.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5755) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5756) 		if (!nr_to_scan--)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5757) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5758) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5759) 		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5760) 		 * here. We may skip a VM instance errorneosly, but we do not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5761) 		 * want to shrink a VM that only started to populate its MMU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5762) 		 * anyway.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5763) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5764) 		if (!kvm->arch.n_used_mmu_pages &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5765) 		    !kvm_has_zapped_obsolete_pages(kvm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5766) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5767) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5768) 		idx = srcu_read_lock(&kvm->srcu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5769) 		spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5770) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5771) 		if (kvm_has_zapped_obsolete_pages(kvm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5772) 			kvm_mmu_commit_zap_page(kvm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5773) 			      &kvm->arch.zapped_obsolete_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5774) 			goto unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5775) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5776) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5777) 		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5778) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5779) unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5780) 		spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5781) 		srcu_read_unlock(&kvm->srcu, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5782) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5783) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5784) 		 * unfair on small ones
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5785) 		 * per-vm shrinkers cry out
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5786) 		 * sadness comes quickly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5787) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5788) 		list_move_tail(&kvm->vm_list, &vm_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5789) 		break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5790) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5791) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5792) 	mutex_unlock(&kvm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5793) 	return freed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5794) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5795) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5796) static unsigned long
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5797) mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5798) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5799) 	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5800) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5801) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5802) static struct shrinker mmu_shrinker = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5803) 	.count_objects = mmu_shrink_count,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5804) 	.scan_objects = mmu_shrink_scan,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5805) 	.seeks = DEFAULT_SEEKS * 10,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5806) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5807) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5808) static void mmu_destroy_caches(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5809) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5810) 	kmem_cache_destroy(pte_list_desc_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5811) 	kmem_cache_destroy(mmu_page_header_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5812) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5813) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5814) static void kvm_set_mmio_spte_mask(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5815) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5816) 	u64 mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5817) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5818) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5819) 	 * Set a reserved PA bit in MMIO SPTEs to generate page faults with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5820) 	 * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5821) 	 * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5822) 	 * 52-bit physical addresses then there are no reserved PA bits in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5823) 	 * PTEs and so the reserved PA approach must be disabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5824) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5825) 	if (shadow_phys_bits < 52)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5826) 		mask = BIT_ULL(51) | PT_PRESENT_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5827) 	else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5828) 		mask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5829) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5830) 	kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5831) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5832) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5833) static bool get_nx_auto_mode(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5834) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5835) 	/* Return true when CPU has the bug, and mitigations are ON */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5836) 	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5837) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5838) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5839) static void __set_nx_huge_pages(bool val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5840) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5841) 	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5842) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5843) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5844) static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5845) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5846) 	bool old_val = nx_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5847) 	bool new_val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5848) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5849) 	/* In "auto" mode deploy workaround only if CPU has the bug. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5850) 	if (sysfs_streq(val, "off"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5851) 		new_val = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5852) 	else if (sysfs_streq(val, "force"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5853) 		new_val = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5854) 	else if (sysfs_streq(val, "auto"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5855) 		new_val = get_nx_auto_mode();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5856) 	else if (strtobool(val, &new_val) < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5857) 		return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5858) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5859) 	__set_nx_huge_pages(new_val);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5860) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5861) 	if (new_val != old_val) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5862) 		struct kvm *kvm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5863) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5864) 		mutex_lock(&kvm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5865) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5866) 		list_for_each_entry(kvm, &vm_list, vm_list) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5867) 			mutex_lock(&kvm->slots_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5868) 			kvm_mmu_zap_all_fast(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5869) 			mutex_unlock(&kvm->slots_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5870) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5871) 			wake_up_process(kvm->arch.nx_lpage_recovery_thread);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5872) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5873) 		mutex_unlock(&kvm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5874) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5875) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5876) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5877) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5878) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5879) int kvm_mmu_module_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5880) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5881) 	int ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5882) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5883) 	if (nx_huge_pages == -1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5884) 		__set_nx_huge_pages(get_nx_auto_mode());
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5885) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5886) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5887) 	 * MMU roles use union aliasing which is, generally speaking, an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5888) 	 * undefined behavior. However, we supposedly know how compilers behave
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5889) 	 * and the current status quo is unlikely to change. Guardians below are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5890) 	 * supposed to let us know if the assumption becomes false.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5891) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5892) 	BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5893) 	BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5894) 	BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5895) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5896) 	kvm_mmu_reset_all_pte_masks();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5897) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5898) 	kvm_set_mmio_spte_mask();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5899) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5900) 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5901) 					    sizeof(struct pte_list_desc),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5902) 					    0, SLAB_ACCOUNT, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5903) 	if (!pte_list_desc_cache)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5904) 		goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5905) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5906) 	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5907) 						  sizeof(struct kvm_mmu_page),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5908) 						  0, SLAB_ACCOUNT, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5909) 	if (!mmu_page_header_cache)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5910) 		goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5911) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5912) 	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5913) 		goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5914) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5915) 	ret = register_shrinker(&mmu_shrinker);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5916) 	if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5917) 		goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5918) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5919) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5920) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5921) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5922) 	mmu_destroy_caches();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5923) 	return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5924) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5925) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5926) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5927)  * Calculate mmu pages needed for kvm.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5928)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5929) unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5930) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5931) 	unsigned long nr_mmu_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5932) 	unsigned long nr_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5933) 	struct kvm_memslots *slots;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5934) 	struct kvm_memory_slot *memslot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5935) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5936) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5937) 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5938) 		slots = __kvm_memslots(kvm, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5939) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5940) 		kvm_for_each_memslot(memslot, slots)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5941) 			nr_pages += memslot->npages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5942) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5943) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5944) 	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5945) 	nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5946) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5947) 	return nr_mmu_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5948) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5949) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5950) void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5951) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5952) 	kvm_mmu_unload(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5953) 	free_mmu_pages(&vcpu->arch.root_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5954) 	free_mmu_pages(&vcpu->arch.guest_mmu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5955) 	mmu_free_memory_caches(vcpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5956) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5957) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5958) void kvm_mmu_module_exit(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5959) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5960) 	mmu_destroy_caches();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5961) 	percpu_counter_destroy(&kvm_total_used_mmu_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5962) 	unregister_shrinker(&mmu_shrinker);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5963) 	mmu_audit_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5964) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5965) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5966) static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5967) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5968) 	unsigned int old_val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5969) 	int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5970) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5971) 	old_val = nx_huge_pages_recovery_ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5972) 	err = param_set_uint(val, kp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5973) 	if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5974) 		return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5975) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5976) 	if (READ_ONCE(nx_huge_pages) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5977) 	    !old_val && nx_huge_pages_recovery_ratio) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5978) 		struct kvm *kvm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5979) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5980) 		mutex_lock(&kvm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5981) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5982) 		list_for_each_entry(kvm, &vm_list, vm_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5983) 			wake_up_process(kvm->arch.nx_lpage_recovery_thread);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5984) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5985) 		mutex_unlock(&kvm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5986) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5987) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5988) 	return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5989) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5990) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5991) static void kvm_recover_nx_lpages(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5992) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5993) 	int rcu_idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5994) 	struct kvm_mmu_page *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5995) 	unsigned int ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5996) 	LIST_HEAD(invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5997) 	bool flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5998) 	ulong to_zap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5999) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6000) 	rcu_idx = srcu_read_lock(&kvm->srcu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6001) 	spin_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6002) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6003) 	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6004) 	to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6005) 	for ( ; to_zap; --to_zap) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6006) 		if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6007) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6008) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6009) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6010) 		 * We use a separate list instead of just using active_mmu_pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6011) 		 * because the number of lpage_disallowed pages is expected to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6012) 		 * be relatively small compared to the total.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6013) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6014) 		sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6015) 				      struct kvm_mmu_page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6016) 				      lpage_disallowed_link);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6017) 		WARN_ON_ONCE(!sp->lpage_disallowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6018) 		if (sp->tdp_mmu_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6019) 			flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6020) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6021) 			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6022) 			WARN_ON_ONCE(sp->lpage_disallowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6023) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6024) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6025) 		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6026) 			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6027) 			cond_resched_lock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6028) 			flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6029) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6030) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6031) 	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6032) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6033) 	spin_unlock(&kvm->mmu_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6034) 	srcu_read_unlock(&kvm->srcu, rcu_idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6035) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6036) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6037) static long get_nx_lpage_recovery_timeout(u64 start_time)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6038) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6039) 	return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6040) 		? start_time + 60 * HZ - get_jiffies_64()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6041) 		: MAX_SCHEDULE_TIMEOUT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6042) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6043) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6044) static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6045) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6046) 	u64 start_time;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6047) 	long remaining_time;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6048) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6049) 	while (true) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6050) 		start_time = get_jiffies_64();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6051) 		remaining_time = get_nx_lpage_recovery_timeout(start_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6052) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6053) 		set_current_state(TASK_INTERRUPTIBLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6054) 		while (!kthread_should_stop() && remaining_time > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6055) 			schedule_timeout(remaining_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6056) 			remaining_time = get_nx_lpage_recovery_timeout(start_time);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6057) 			set_current_state(TASK_INTERRUPTIBLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6058) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6059) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6060) 		set_current_state(TASK_RUNNING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6061) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6062) 		if (kthread_should_stop())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6063) 			return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6064) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6065) 		kvm_recover_nx_lpages(kvm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6066) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6067) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6068) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6069) int kvm_mmu_post_init_vm(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6070) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6071) 	int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6072) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6073) 	err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6074) 					  "kvm-nx-lpage-recovery",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6075) 					  &kvm->arch.nx_lpage_recovery_thread);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6076) 	if (!err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6077) 		kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6078) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6079) 	return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6080) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6081) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6082) void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6083) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6084) 	if (kvm->arch.nx_lpage_recovery_thread)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6085) 		kthread_stop(kvm->arch.nx_lpage_recovery_thread);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6086) }