// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/userfaultfd.c
 *
 * Copyright (C) 2015 Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include "internal.h"

static __always_inline
struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
				    unsigned long dst_start,
				    unsigned long len)
{
	/*
	 * Make sure that the dst range is both valid and fully within a
	 * single existing vma.
	 */
	struct vm_area_struct *dst_vma;

	dst_vma = find_vma(dst_mm, dst_start);
	if (!dst_vma)
		return NULL;

	if (dst_start < dst_vma->vm_start ||
	    dst_start + len > dst_vma->vm_end)
		return NULL;

	/*
	 * Check that the vma is registered in uffd; this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return NULL;

	return dst_vma;
}
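
/*
 * For reference, vm_userfaultfd_ctx.ctx is set when userspace registers the
 * range with a userfaultfd file descriptor.  A minimal, illustrative sketch
 * of that userspace step ("addr" and "len" are placeholders, error handling
 * omitted):
 *
 *	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *	struct uffdio_api api = { .api = UFFD_API };
 *	struct uffdio_register reg = {
 *		.range = { .start = (unsigned long)addr, .len = len },
 *		.mode  = UFFDIO_REGISTER_MODE_MISSING,
 *	};
 *
 *	ioctl(uffd, UFFDIO_API, &api);
 *	ioctl(uffd, UFFDIO_REGISTER, &reg);
 *
 * Only after UFFDIO_REGISTER succeeds will find_dst_vma() accept the vma.
 */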

/*
 * Install PTEs to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
			     struct vm_area_struct *dst_vma,
			     unsigned long dst_addr, struct page *page,
			     bool newly_allocated, bool wp_copy)
{
	int ret;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	bool page_in_cache = page->mapping;
	spinlock_t *ptl;
	struct inode *inode;
	pgoff_t offset, max_off;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (page_in_cache && !vm_shared)
		writable = false;
	if (writable || !page_in_cache)
		_dst_pte = pte_mkdirty(_dst_pte);
	if (writable) {
		if (wp_copy)
			_dst_pte = pte_mkuffd_wp(_dst_pte);
		else
			_dst_pte = pte_mkwrite(_dst_pte);
	}

	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);

	if (vma_is_shmem(dst_vma)) {
		/* serialize against truncate with the page table lock */
		inode = dst_vma->vm_file->f_inode;
		offset = linear_page_index(dst_vma, dst_addr);
		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
		ret = -EFAULT;
		if (unlikely(offset >= max_off))
			goto out_unlock;
	}

	ret = -EEXIST;
	if (!pte_none(*dst_pte))
		goto out_unlock;

	if (page_in_cache)
		page_add_file_rmap(page, false);
	else
		page_add_new_anon_rmap(page, dst_vma, dst_addr, false);

	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
	inc_mm_counter(dst_mm, mm_counter(page));

	if (newly_allocated)
		lru_cache_add_inactive_or_unevictable(page, dst_vma);

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	return ret;
}
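
/*
 * To summarize the PTE bits chosen above (derived directly from the checks
 * in mfill_atomic_install_pte()):
 *
 *   - page cache page into a private (!VM_SHARED) vma: the PTE is installed
 *     read-only, so a later write still goes through the normal COW path;
 *   - newly allocated anon page, or page cache page into a shared vma, in a
 *     VM_WRITE vma: the PTE is dirty, and either write-enabled or, if
 *     wp_copy was requested, marked uffd-wp so the first write faults;
 *   - vma without VM_WRITE: the PTE stays read-only, and is only dirtied
 *     when the page is not in the page cache.
 */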

static int mcopy_atomic_pte(struct mm_struct *dst_mm,
			    pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr,
			    unsigned long src_addr,
			    struct page **pagep,
			    bool wp_copy)
{
	void *page_kaddr;
	int ret;
	struct page *page;

	if (!*pagep) {
		ret = -ENOMEM;
		page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
		if (!page)
			goto out;

		page_kaddr = kmap_atomic(page);
		ret = copy_from_user(page_kaddr,
				     (const void __user *) src_addr,
				     PAGE_SIZE);
		kunmap_atomic(page_kaddr);

		/* fallback to copy_from_user outside mmap_lock */
		if (unlikely(ret)) {
			ret = -ENOENT;
			*pagep = page;
			/* don't free the page */
			goto out;
		}
	} else {
		page = *pagep;
		*pagep = NULL;
	}

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	ret = -ENOMEM;
	if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
		goto out_release;

	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				       page, true, wp_copy);
	if (ret)
		goto out_release;
out:
	return ret;
out_release:
	put_page(page);
	goto out;
}
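
/*
 * The -ENOENT return above implements a two-phase copy: the first
 * copy_from_user() runs under kmap_atomic() (which disables page faults)
 * with mmap_lock held, so it fails if the source page is not resident.  In
 * that case the freshly allocated page is handed back through *pagep and the
 * caller redoes the copy outside the lock.  The caller-side half of the
 * protocol in __mcopy_atomic() is, schematically:
 *
 *	err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
 *			       src_addr, &page, mcopy_mode, wp_copy);
 *	if (err == -ENOENT) {
 *		mmap_read_unlock(dst_mm);
 *		copy_from_user(kmap(page), (const void __user *)src_addr,
 *			       PAGE_SIZE);
 *		kunmap(page);
 *		goto retry;
 *	}
 *
 * and on the retry *pagep is non-NULL, so the "else" branch above picks the
 * page back up instead of allocating a new one.
 */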

static int mfill_zeropage_pte(struct mm_struct *dst_mm,
			      pmd_t *dst_pmd,
			      struct vm_area_struct *dst_vma,
			      unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;
	pgoff_t offset, max_off;
	struct inode *inode;

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (dst_vma->vm_file) {
		/* the shmem MAP_PRIVATE case requires checking the i_size */
		inode = dst_vma->vm_file->f_inode;
		offset = linear_page_index(dst_vma, dst_addr);
		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
		ret = -EFAULT;
		if (unlikely(offset >= max_off))
			goto out_unlock;
	}
	ret = -EEXIST;
	if (!pte_none(*dst_pte))
		goto out_unlock;
	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	return ret;
}
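
/*
 * This is the backend of the UFFDIO_ZEROPAGE ioctl.  A minimal, illustrative
 * userspace sketch that resolves a fault at "fault_addr" with the shared
 * zero page ("fault_addr" and "page_size" are placeholders, error handling
 * omitted):
 *
 *	struct uffdio_zeropage zp = {
 *		.range = { .start = fault_addr & ~(page_size - 1),
 *			   .len   = page_size },
 *		.mode  = 0,
 *	};
 *
 *	ioctl(uffd, UFFDIO_ZEROPAGE, &zp);
 *
 * On return zp.zeropage holds the number of bytes zeroed or a negative errno.
 */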

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
				pmd_t *dst_pmd,
				struct vm_area_struct *dst_vma,
				unsigned long dst_addr,
				bool wp_copy)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct page *page;
	int ret;

	ret = shmem_getpage(inode, pgoff, &page, SGP_READ);
	if (ret)
		goto out;
	if (!page) {
		ret = -EFAULT;
		goto out;
	}

	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				       page, false, wp_copy);
	if (ret)
		goto out_release;

	unlock_page(page);
	ret = 0;
out:
	return ret;
out_release:
	unlock_page(page);
	put_page(page);
	goto out;
}
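
/*
 * UFFDIO_CONTINUE resolves a minor fault: the page is already present in the
 * shmem page cache, so only the page table entry needs to be installed.  An
 * illustrative userspace sequence (placeholder names, no error handling),
 * assuming the range was registered with UFFDIO_REGISTER_MODE_MINOR:
 *
 *	struct uffdio_continue cont = {
 *		.range = { .start = fault_addr & ~(page_size - 1),
 *			   .len   = page_size },
 *		.mode  = 0,
 *	};
 *
 *	ioctl(uffd, UFFDIO_CONTINUE, &cont);
 *
 * On return cont.mapped holds the number of bytes mapped or a negative errno.
 */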

static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that this is not called only when the pmd is missing:
	 * the *pmd may already be established, and it may even be a
	 * trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with mmap_lock held; it will release mmap_lock before returning.
 */
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      enum mcopy_atomic_mode mode)
{
	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
	int vm_shared = dst_vma->vm_flags & VM_SHARED;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes
	 * supported by hugetlb.  A PMD_SIZE zero huge page may exist, as
	 * used by THP, but since we cannot reliably insert a zero page for
	 * every hugetlb page size, this feature is not supported.
	 */
	if (mode == MCOPY_ATOMIC_ZEROPAGE) {
		mmap_read_unlock(dst_mm);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must look it up again.
	 */
	if (!dst_vma) {
		err = -ENOENT;
		dst_vma = find_dst_vma(dst_mm, dst_start, len);
		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
			goto out_unlock;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock;

		vm_shared = dst_vma->vm_flags & VM_SHARED;
	}

	/*
	 * If not shared, ensure the dst_vma has an anon_vma.
	 */
	err = -ENOMEM;
	if (!vm_shared) {
		if (unlikely(anon_vma_prepare(dst_vma)))
			goto out_unlock;
	}

	while (src_addr < src_start + len) {
		BUG_ON(dst_addr >= dst_start + len);

		/*
		 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
		 * i_mmap_rwsem ensures the dst_pte remains valid even
		 * in the case of shared pmds.  The fault mutex prevents
		 * races with other faulting threads.
		 */
		mapping = dst_vma->vm_file->f_mapping;
		i_mmap_lock_read(mapping);
		idx = linear_page_index(dst_vma, dst_addr);
		hash = hugetlb_fault_mutex_hash(mapping, idx);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
		if (!dst_pte) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			i_mmap_unlock_read(mapping);
			goto out_unlock;
		}

		if (mode != MCOPY_ATOMIC_CONTINUE &&
		    !huge_pte_none(huge_ptep_get(dst_pte))) {
			err = -EEXIST;
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			i_mmap_unlock_read(mapping);
			goto out_unlock;
		}

		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
					       dst_addr, src_addr, mode, &page);

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		i_mmap_unlock_read(mapping);
		vm_alloc_shared = vm_shared;

		cond_resched();

		if (unlikely(err == -ENOENT)) {
			mmap_read_unlock(dst_mm);
			BUG_ON(!page);

			err = copy_huge_page_from_user(page,
						(const void __user *)src_addr,
						vma_hpagesize / PAGE_SIZE,
						true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			mmap_read_lock(dst_mm);

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	mmap_read_unlock(dst_mm);
out:
	if (page) {
		/*
		 * We encountered an error and are about to free a newly
		 * allocated huge page.
		 *
		 * Reservation handling is very subtle, and is different for
		 * private and shared mappings.  See the routine
		 * restore_reserve_on_error for details.  Unfortunately, we
		 * cannot call restore_reserve_on_error now as it would
		 * require holding mmap_lock.
		 *
		 * If a reservation for the page existed in the reservation
		 * map of a private mapping, the map was modified to indicate
		 * the reservation was consumed when the page was allocated.
		 * We clear the PagePrivate flag now so that the global
		 * reserve count will not be incremented in free_huge_page.
		 * The reservation map will still indicate the reservation
		 * was consumed and possibly prevent later page allocation.
		 * This is better than leaking a global reservation.  If no
		 * reservation existed, it is still safe to clear PagePrivate
		 * as no adjustments to reservation counts were made during
		 * allocation.
		 *
		 * The reservation map for shared mappings indicates which
		 * pages have reservations.  When a huge page is allocated
		 * for an address with a reservation, no change is made to
		 * the reserve map.  In this case PagePrivate will be set
		 * to indicate that the global reservation count should be
		 * incremented when the page is freed.  This is the desired
		 * behavior.  However, when a huge page is allocated for an
		 * address without a reservation, a reservation entry is
		 * added to the reservation map, and PagePrivate will not be
		 * set.  When the page is freed, the global reserve count
		 * will NOT be incremented and it will appear as though we
		 * have leaked a reserved page.  In this case, set PagePrivate
		 * so that the global reserve count will be incremented to
		 * match the reservation map entry which was created.
		 *
		 * Note that vm_alloc_shared is based on the flags of the vma
		 * for which the page was originally allocated.  dst_vma could
		 * be different or NULL on error.
		 */
		if (vm_alloc_shared)
			SetPagePrivate(page);
		else
			ClearPagePrivate(page);
		put_page(page);
	}
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
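
/*
 * For hugetlb destinations the whole operation works in huge-page-sized
 * units, so userspace must supply a destination address and length aligned
 * to the vma's huge page size (the -EINVAL check near the top of this
 * routine).  An illustrative UFFDIO_COPY call for a 2MB hugetlb mapping,
 * where src_buf points to at least 2MB of readable memory (placeholder
 * names, no error handling):
 *
 *	size_t hpage = 2UL * 1024 * 1024;
 *	struct uffdio_copy copy = {
 *		.dst  = fault_addr & ~(hpage - 1),
 *		.src  = (unsigned long)src_buf,
 *		.len  = hpage,
 *		.mode = 0,
 *	};
 *
 *	ioctl(uffd, UFFDIO_COPY, &copy);
 */
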
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
				      struct vm_area_struct *dst_vma,
				      unsigned long dst_start,
				      unsigned long src_start,
				      unsigned long len,
				      enum mcopy_atomic_mode mode);
#endif /* CONFIG_HUGETLB_PAGE */

static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
						pmd_t *dst_pmd,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						struct page **page,
						enum mcopy_atomic_mode mode,
						bool wp_copy)
{
	ssize_t err;

	if (mode == MCOPY_ATOMIC_CONTINUE) {
		return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
					    wp_copy);
	}

	/*
	 * The normal page fault path for a shmem mapping will invoke the
	 * fault, fill the hole in the file and COW it right away.  The
	 * result generates plain anonymous memory.  So when we are asked
	 * to fill a hole in a MAP_PRIVATE shmem mapping, we'll generate
	 * anonymous memory directly without actually filling the hole.
	 * For the MAP_PRIVATE case the robustness check only happens in
	 * the pagetable (to verify it's still none) and not in the radix
	 * tree.
	 */
	if (!(dst_vma->vm_flags & VM_SHARED)) {
		if (mode == MCOPY_ATOMIC_NORMAL)
			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
					       dst_addr, src_addr, page,
					       wp_copy);
		else
			err = mfill_zeropage_pte(dst_mm, dst_pmd,
						 dst_vma, dst_addr);
	} else {
		VM_WARN_ON_ONCE(wp_copy);
		err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
					     dst_addr, src_addr,
					     mode != MCOPY_ATOMIC_NORMAL,
					     page);
	}

	return err;
}
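
/*
 * Dispatch summary for mfill_atomic_pte(), derived from the branches above:
 *
 *	MCOPY_ATOMIC_CONTINUE                -> mcontinue_atomic_pte()
 *	MCOPY_ATOMIC_NORMAL,   private vma   -> mcopy_atomic_pte()
 *	MCOPY_ATOMIC_ZEROPAGE, private vma   -> mfill_zeropage_pte()
 *	NORMAL or ZEROPAGE,    shared shmem  -> shmem_mfill_atomic_pte()
 *
 * Hugetlb vmas never reach this helper; __mcopy_atomic() hands them to
 * __mcopy_atomic_hugetlb() beforehand.
 */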

static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      enum mcopy_atomic_mode mcopy_mode,
					      bool *mmap_changing,
					      __u64 mode)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;
	bool wp_copy;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
retry:
	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of a non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later.
	 */
	err = -EAGAIN;
	if (mmap_changing && READ_ONCE(*mmap_changing))
		goto out_unlock;

	/*
	 * Make sure that the dst range is both valid and fully within a
	 * single existing vma registered with userfaultfd.
	 */
	err = -ENOENT;
	dst_vma = find_dst_vma(dst_mm, dst_start, len);
	if (!dst_vma)
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * Validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
	wp_copy = mode & UFFDIO_COPY_MODE_WP;
	if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to the appropriate routine.
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
					      src_start, len, mcopy_mode);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;
	if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma or this page
	 * would get a NULL anon_vma when moved into the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (!(dst_vma->vm_flags & VM_SHARED) &&
	    unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmd_read_atomic(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP, don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail. */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				       src_addr, &page, mcopy_mode, wp_copy);
		cond_resched();

		if (unlikely(err == -ENOENT)) {
			void *page_kaddr;

			mmap_read_unlock(dst_mm);
			BUG_ON(!page);

			page_kaddr = kmap(page);
			err = copy_from_user(page_kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap(page);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	mmap_read_unlock(dst_mm);
out:
	if (page)
		put_page(page);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
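
/*
 * __mcopy_atomic() returns the number of bytes filled in if it made any
 * progress, and a negative errno only when nothing was copied.  The
 * UFFDIO_COPY ioctl layer reports that count back in uffdio_copy.copy and
 * signals a short completion with EAGAIN, so userspace callers typically
 * loop.  A hedged sketch of that pattern (placeholder names, minimal error
 * handling; copy.copy can also hold a negative errno, hence the sign check):
 *
 *	struct uffdio_copy copy = {
 *		.dst = dst, .src = src, .len = len, .mode = 0,
 *	};
 *
 *	while (ioctl(uffd, UFFDIO_COPY, &copy) && errno == EAGAIN) {
 *		if (copy.copy > 0) {
 *			copy.dst += copy.copy;
 *			copy.src += copy.copy;
 *			copy.len -= copy.copy;
 *		}
 *		copy.copy = 0;
 *	}
 */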

ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
		     unsigned long src_start, unsigned long len,
		     bool *mmap_changing, __u64 mode)
{
	return __mcopy_atomic(dst_mm, dst_start, src_start, len,
			      MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
}

ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len, bool *mmap_changing)
{
	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
			      mmap_changing, 0);
}

ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len, bool *mmap_changing)
{
	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
			      mmap_changing, 0);
}

int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
			unsigned long len, bool enable_wp, bool *mmap_changing)
{
	struct vm_area_struct *dst_vma;
	pgprot_t newprot;
	int err;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(start + len <= start);

	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of a non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later.
	 */
	err = -EAGAIN;
	if (mmap_changing && READ_ONCE(*mmap_changing))
		goto out_unlock;

	err = -ENOENT;
	dst_vma = find_dst_vma(dst_mm, start, len);
	/*
	 * Make sure the vma is not shared and that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;
	if (!userfaultfd_wp(dst_vma))
		goto out_unlock;
	if (!vma_is_anonymous(dst_vma))
		goto out_unlock;

	if (enable_wp)
		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
	else
		newprot = vm_get_page_prot(dst_vma->vm_flags);

	change_protection(dst_vma, start, start + len, newprot,
			  enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);

	err = 0;
out_unlock:
	mmap_read_unlock(dst_mm);
	return err;
}
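
/*
 * mwriteprotect_range() backs the UFFDIO_WRITEPROTECT ioctl.  A hedged
 * userspace sketch (placeholder names, no error handling), assuming the
 * range was registered with UFFDIO_REGISTER_MODE_WP: first write-protect a
 * range, then later resolve a write fault by dropping the protection and
 * waking the faulting thread:
 *
 *	struct uffdio_writeprotect wp = {
 *		.range = { .start = addr, .len = len },
 *		.mode  = UFFDIO_WRITEPROTECT_MODE_WP,
 *	};
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
 *
 *	wp.mode = 0;
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
 */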