// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

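/*
 * struct hmm_vma_walk - private state shared by the page walk callbacks below
 * @range: the hmm_range being filled in by this walk
 * @last: last address processed, used to restart the walk after -EBUSY
 */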
struct hmm_vma_walk {
	struct hmm_range	*range;
	unsigned long		last;
};

enum {
	HMM_NEED_FAULT = 1 << 0,
	HMM_NEED_WRITE_FAULT = 1 << 1,
	HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};

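/*
 * Fill the hmm_pfns output array for [addr, end) with a single flags value,
 * e.g. 0 for a hole or HMM_PFN_ERROR for an unusable range.
 */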
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
			 struct hmm_range *range, unsigned long cpu_flags)
{
	unsigned long i = (addr - range->start) >> PAGE_SHIFT;

	for (; addr < end; addr += PAGE_SIZE, i++)
		range->hmm_pfns[i] = cpu_flags;
	return 0;
}

/*
 * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @required_fault: HMM_NEED_* flags
 * @walk: mm_walk structure
 * Return: -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_fault(unsigned long addr, unsigned long end,
			 unsigned int required_fault, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct vm_area_struct *vma = walk->vma;
	unsigned int fault_flags = FAULT_FLAG_REMOTE;

	WARN_ON_ONCE(!required_fault);
	hmm_vma_walk->last = addr;

	if (required_fault & HMM_NEED_WRITE_FAULT) {
		if (!(vma->vm_flags & VM_WRITE))
			return -EPERM;
		fault_flags |= FAULT_FLAG_WRITE;
	}

	for (; addr < end; addr += PAGE_SIZE)
		if (handle_mm_fault(vma, addr, fault_flags, NULL) &
		    VM_FAULT_ERROR)
			return -EFAULT;
	return -EBUSY;
}

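/*
 * Combine the per-pfn request flags with the range-wide defaults and decide
 * which HMM_NEED_* bits, if any, are required to satisfy the request given
 * the flags currently present in the CPU page table entry (cpu_flags).
 */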
static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				       unsigned long pfn_req_flags,
				       unsigned long cpu_flags)
{
	struct hmm_range *range = hmm_vma_walk->range;

	/*
	 * We not only consider the individual per-page request, we also
	 * consider the default flags requested for the range. The API can
	 * be used two ways: the first one where the HMM user coalesces
	 * multiple page faults into one request and sets flags per pfn for
	 * those faults, and the second one where the HMM user wants to
	 * pre-fault a range with specific flags. For the latter it would be
	 * a waste to have the user pre-fill the pfn array with a default
	 * flags value.
	 */
	pfn_req_flags &= range->pfn_flags_mask;
	pfn_req_flags |= range->default_flags;

	/* We aren't asked to do anything ... */
	if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
		return 0;

	/* Need to write fault? */
	if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
	    !(cpu_flags & HMM_PFN_WRITE))
		return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;

	/* If the CPU page table is not valid then we need to fault */
	if (!(cpu_flags & HMM_PFN_VALID))
		return HMM_NEED_FAULT;
	return 0;
}

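/*
 * OR together the HMM_NEED_* bits for npages entries starting at hmm_pfns[],
 * bailing out early once both fault bits are already set.
 */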
static unsigned int
hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
		     const unsigned long hmm_pfns[], unsigned long npages,
		     unsigned long cpu_flags)
{
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault = 0;
	unsigned long i;

	/*
	 * If the default flags do not request to fault pages, and the mask does
	 * not allow for individual pages to be faulted, then
	 * hmm_pte_need_fault() will always return 0.
	 */
	if (!((range->default_flags | range->pfn_flags_mask) &
	      HMM_PFN_REQ_FAULT))
		return 0;

	for (i = 0; i < npages; ++i) {
		required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
						     cpu_flags);
		if (required_fault == HMM_NEED_ALL_BITS)
			return required_fault;
	}
	return required_fault;
}

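/*
 * pte_hole callback: called for address ranges with no page table entries.
 * Fault the range in if that was requested (failing with -EFAULT when there
 * is no vma to fault against); otherwise report the hole, as HMM_PFN_ERROR
 * entries without a vma or as empty entries with one.
 */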
static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     __always_unused int depth, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault;
	unsigned long i, npages;
	unsigned long *hmm_pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	hmm_pfns = &range->hmm_pfns[i];
	required_fault =
		hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0);
	if (!walk->vma) {
		if (required_fault)
			return -EFAULT;
		return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
	}
	if (required_fault)
		return hmm_vma_fault(addr, end, required_fault, walk);
	return hmm_pfns_fill(addr, end, range, 0);
}

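/* Encode a mapping order into the HMM_PFN_ORDER field of an hmm_pfn value. */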
static inline unsigned long hmm_pfn_flags_order(unsigned long order)
{
	return order << HMM_PFN_ORDER_SHIFT;
}

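/*
 * Convert the protection bits of a huge pmd into HMM_PFN_* output flags,
 * including the PMD-sized mapping order.
 */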
static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
						 pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
				 HMM_PFN_VALID) |
	       hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
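/*
 * Report a transparent huge (or devmap) pmd: fault it in if the caller
 * requires more permissions than the mapping currently grants, otherwise
 * emit one pfn per page covered by the pmd, all carrying the same flags.
 */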
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, unsigned long hmm_pfns[],
			      pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	unsigned int required_fault;
	unsigned long cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	required_fault =
		hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
	if (required_fault)
		return hmm_vma_fault(addr, end, required_fault, walk);

	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
		hmm_pfns[i] = pfn | cpu_flags;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

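/*
 * A device private entry is only meaningful to the driver that owns the
 * backing pgmap; treat it as such only when the owners match.
 */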
static inline bool hmm_is_device_private_entry(struct hmm_range *range,
					       swp_entry_t entry)
{
	return is_device_private_entry(entry) &&
		device_private_entry_to_page(entry)->pgmap->owner ==
		range->dev_private_owner;
}

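/*
 * Convert a pte's protection bits into HMM_PFN_* output flags; non-present
 * or PROT_NONE ptes map to 0.
 */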
static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
						 pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}

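/*
 * Translate a single pte into one hmm_pfn output entry. Swap, migration and
 * device private entries are handled here; on any path that faults, waits or
 * fails, the pte map taken by the caller is dropped via pte_unmap() before
 * returning.
 */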
static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      unsigned long *hmm_pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned int required_fault;
	unsigned long cpu_flags;
	pte_t pte = *ptep;
	uint64_t pfn_req_flags = *hmm_pfn;

	if (pte_none(pte)) {
		required_fault =
			hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
		if (required_fault)
			goto fault;
		*hmm_pfn = 0;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		/*
		 * Never fault in device private pages, but just report
		 * the PFN even if not present.
		 */
		if (hmm_is_device_private_entry(range, entry)) {
			cpu_flags = HMM_PFN_VALID;
			if (is_write_device_private_entry(entry))
				cpu_flags |= HMM_PFN_WRITE;
			*hmm_pfn = device_private_entry_to_pfn(entry) |
					cpu_flags;
			return 0;
		}

		required_fault =
			hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
		if (!required_fault) {
			*hmm_pfn = 0;
			return 0;
		}

		if (!non_swap_entry(entry))
			goto fault;

		if (is_migration_entry(entry)) {
			pte_unmap(ptep);
			hmm_vma_walk->last = addr;
			migration_entry_wait(walk->mm, pmdp, addr);
			return -EBUSY;
		}

		/* Report error for everything else */
		pte_unmap(ptep);
		return -EFAULT;
	}

	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	required_fault =
		hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
	if (required_fault)
		goto fault;

	/*
	 * Bypass devmap ptes, such as DAX pages, when all the requested pfn
	 * flags (pfn_req_flags) are fulfilled.
	 * Since each architecture defines a struct page for the zero page, just
	 * fall through and treat it like a normal page.
	 */
	if (!vm_normal_page(walk->vma, addr, pte) &&
	    !pte_devmap(pte) &&
	    !is_zero_pfn(pte_pfn(pte))) {
		if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
			pte_unmap(ptep);
			return -EFAULT;
		}
		*hmm_pfn = HMM_PFN_ERROR;
		return 0;
	}

	*hmm_pfn = pte_pfn(pte) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_fault(addr, end, required_fault, walk);
}

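/*
 * pmd_entry callback: dispatch a pmd to the hole, migration-wait, huge page
 * or per-pte handlers above, rereading the pmd if it is being split
 * concurrently.
 */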
static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long *hmm_pfns =
		&range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long addr = start;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, -1, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		return hmm_pfns_fill(start, end, range, 0);
	}

	if (!pmd_present(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take the pmd lock here: even if some other thread
		 * is splitting the huge pmd we will get that event through the
		 * mmu_notifier callback.
		 *
		 * So just read the pmd value again and check that it is still
		 * a transparent huge or device mapping entry, then compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point the pmd is either
	 * a valid entry pointing to a pte directory or a bad pmd that will not
	 * recover.
	 */
	if (pmd_bad(pmd)) {
		if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
		if (r) {
			/* hmm_vma_handle_pte() did pte_unmap() */
			return r;
		}
	}
	pte_unmap(ptep - 1);
	return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
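/*
 * Convert the protection bits of a huge pud into HMM_PFN_* output flags,
 * including the PUD-sized mapping order.
 */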
static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
						 pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return (pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
				 HMM_PFN_VALID) |
	       hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT);
}

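/*
 * pud_entry callback: handle huge devmap puds in place under the pud lock,
 * otherwise let the page walker continue into the subtree below the pud.
 */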
static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
		struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start;
	pud_t pud;
	int ret = 0;
	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);

	if (!ptl)
		return 0;

	/* Normally we don't want to split the huge page */
	walk->action = ACTION_CONTINUE;

	pud = READ_ONCE(*pudp);
	if (pud_none(pud)) {
		spin_unlock(ptl);
		return hmm_vma_walk_hole(start, end, -1, walk);
	}

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		unsigned int required_fault;
		unsigned long *hmm_pfns;
		unsigned long cpu_flags;

		if (!pud_present(pud)) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole(start, end, -1, walk);
		}

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		hmm_pfns = &range->hmm_pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
						      npages, cpu_flags);
		if (required_fault) {
			spin_unlock(ptl);
			return hmm_vma_fault(addr, end, required_fault, walk);
		}

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn)
			hmm_pfns[i] = pfn | cpu_flags;
		goto out_unlock;
	}

	/* Ask for the PUD to be split */
	walk->action = ACTION_SUBTREE;

out_unlock:
	spin_unlock(ptl);
	return ret;
}
#else
#define hmm_vma_walk_pud	NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
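/*
 * hugetlb_entry callback: report one huge pte as a run of identically
 * flagged pfns, faulting first if the request cannot be met as mapped.
 */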
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	unsigned long addr = start, i, pfn;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	unsigned int required_fault;
	unsigned long pfn_req_flags;
	unsigned long cpu_flags;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> PAGE_SHIFT;
	pfn_req_flags = range->hmm_pfns[i];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry) |
		    hmm_pfn_flags_order(huge_page_order(hstate_vma(vma)));
	required_fault =
		hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
	if (required_fault) {
		spin_unlock(ptl);
		return hmm_vma_fault(addr, end, required_fault, walk);
	}

	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
		range->hmm_pfns[i] = pfn | cpu_flags;

	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

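/*
 * test_walk callback: return 0 to walk a supported vma, 1 to skip an
 * unsupported one after marking its pfns HMM_PFN_ERROR, or -EFAULT when a
 * fault was explicitly requested for such an unsupported vma.
 */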
static int hmm_vma_walk_test(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;

	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
	    vma->vm_flags & VM_READ)
		return 0;

	/*
	 * vma ranges that don't have struct page backing them or map I/O
	 * devices directly cannot be handled by hmm_range_fault().
	 *
	 * If the vma does not allow read access, then assume that it does not
	 * allow write access either. HMM does not support architectures that
	 * allow write without read.
	 *
	 * If a fault is requested for an unsupported range then it is a hard
	 * failure.
	 */
	if (hmm_range_need_fault(hmm_vma_walk,
				 range->hmm_pfns +
					 ((start - range->start) >> PAGE_SHIFT),
				 (end - start) >> PAGE_SHIFT, 0))
		return -EFAULT;

	hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

	/* Skip this vma and continue processing the next vma. */
	return 1;
}

static const struct mm_walk_ops hmm_walk_ops = {
	.pud_entry	= hmm_vma_walk_pud,
	.pmd_entry	= hmm_vma_walk_pmd,
	.pte_hole	= hmm_vma_walk_hole,
	.hugetlb_entry	= hmm_vma_walk_hugetlb_entry,
	.test_walk	= hmm_vma_walk_test,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range: argument structure
 *
 * Returns 0 on success or one of the following error codes:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid vma
 *		(e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is read
 *		only).
 * -EBUSY:	The range has been invalidated and the caller needs to wait for
 *		the invalidation to finish.
 * -EFAULT:	A page was requested to be valid and could not be made valid,
 *		i.e. it has no backing VMA or it is illegal to access.
 *
 * This is similar to get_user_pages(), except that it can read the page tables
 * without mutating them (i.e. causing faults).
 */
int hmm_range_fault(struct hmm_range *range)
{
	struct hmm_vma_walk hmm_vma_walk = {
		.range = range,
		.last = range->start,
	};
	struct mm_struct *mm = range->notifier->mm;
	int ret;

	mmap_assert_locked(mm);

	do {
		/* If range is no longer valid force retry. */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq))
			return -EBUSY;
		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
				      &hmm_walk_ops, &hmm_vma_walk);
		/*
		 * When -EBUSY is returned the loop restarts with
		 * hmm_vma_walk.last set to an address that has not been stored
		 * in pfns. All entries < last in the pfn array are set to their
		 * output, and all >= are still at their input values.
		 */
	} while (ret == -EBUSY);
	return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
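
/*
 * Illustrative usage sketch (not part of this file's API): one way a driver
 * might drive hmm_range_fault() together with an mmu_interval_notifier. The
 * helper name drv_fault_range() and its parameters are hypothetical; the
 * retry on -EBUSY with a fresh notifier sequence follows the contract
 * documented above. A real driver would additionally take its device page
 * table lock and check mmu_interval_read_retry() before consuming the
 * HMM_PFN_* encoded entries left in pfns[].
 *
 *	int drv_fault_range(struct mmu_interval_notifier *notifier,
 *			    unsigned long start, unsigned long npages,
 *			    unsigned long *pfns)
 *	{
 *		struct hmm_range range = {
 *			.notifier = notifier,
 *			.start = start,
 *			.end = start + (npages << PAGE_SHIFT),
 *			.hmm_pfns = pfns,
 *			.default_flags = HMM_PFN_REQ_FAULT,
 *		};
 *		int ret;
 *
 *		do {
 *			range.notifier_seq = mmu_interval_read_begin(notifier);
 *			mmap_read_lock(notifier->mm);
 *			ret = hmm_range_fault(&range);
 *			mmap_read_unlock(notifier->mm);
 *		} while (ret == -EBUSY);
 *
 *		return ret;
 *	}
 */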