Orange Pi 5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

mm/userfaultfd.c @ commit 8f3ce5b39 (kx, 2023-10-28 12:00:06 +0300):

// SPDX-License-Identifier: GPL-2.0-only
/*
 *  mm/userfaultfd.c
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include "internal.h"

static __always_inline
struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
				    unsigned long dst_start,
				    unsigned long len)
{
	/*
	 * Make sure that the dst range is both valid and fully within a
	 * single existing vma.
	 */
	struct vm_area_struct *dst_vma;

	dst_vma = find_vma(dst_mm, dst_start);
	if (!dst_vma)
		return NULL;

	if (dst_start < dst_vma->vm_start ||
	    dst_start + len > dst_vma->vm_end)
		return NULL;

	/*
	 * Check the vma is registered in uffd, this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return NULL;

	return dst_vma;
}

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
			     struct vm_area_struct *dst_vma,
			     unsigned long dst_addr, struct page *page,
			     bool newly_allocated, bool wp_copy)
{
	int ret;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	bool page_in_cache = page->mapping;
	spinlock_t *ptl;
	struct inode *inode;
	pgoff_t offset, max_off;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (page_in_cache && !vm_shared)
		writable = false;
	if (writable || !page_in_cache)
		_dst_pte = pte_mkdirty(_dst_pte);
	if (writable) {
		if (wp_copy)
			_dst_pte = pte_mkuffd_wp(_dst_pte);
		else
			_dst_pte = pte_mkwrite(_dst_pte);
	}

	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);

	if (vma_is_shmem(dst_vma)) {
		/* serialize against truncate with the page table lock */
		inode = dst_vma->vm_file->f_inode;
		offset = linear_page_index(dst_vma, dst_addr);
		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
		ret = -EFAULT;
		if (unlikely(offset >= max_off))
			goto out_unlock;
	}

	ret = -EEXIST;
	if (!pte_none(*dst_pte))
		goto out_unlock;

	if (page_in_cache)
		page_add_file_rmap(page, false);
	else
		page_add_new_anon_rmap(page, dst_vma, dst_addr, false);

	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
	inc_mm_counter(dst_mm, mm_counter(page));

	if (newly_allocated)
		lru_cache_add_inactive_or_unevictable(page, dst_vma);

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	return ret;
}

static int mcopy_atomic_pte(struct mm_struct *dst_mm,
			    pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr,
			    unsigned long src_addr,
			    struct page **pagep,
			    bool wp_copy)
{
	void *page_kaddr;
	int ret;
	struct page *page;

	if (!*pagep) {
		ret = -ENOMEM;
		page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
		if (!page)
			goto out;

		page_kaddr = kmap_atomic(page);
		ret = copy_from_user(page_kaddr,
				     (const void __user *) src_addr,
				     PAGE_SIZE);
		kunmap_atomic(page_kaddr);

		/* fallback to copy_from_user outside mmap_lock */
		if (unlikely(ret)) {
			ret = -ENOENT;
			*pagep = page;
			/* don't free the page */
			goto out;
		}
	} else {
		page = *pagep;
		*pagep = NULL;
	}

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	ret = -ENOMEM;
	if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
		goto out_release;

	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				       page, true, wp_copy);
	if (ret)
		goto out_release;
out:
	return ret;
out_release:
	put_page(page);
	goto out;
}

static int mfill_zeropage_pte(struct mm_struct *dst_mm,
			      pmd_t *dst_pmd,
			      struct vm_area_struct *dst_vma,
			      unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;
	pgoff_t offset, max_off;
	struct inode *inode;

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (dst_vma->vm_file) {
		/* the shmem MAP_PRIVATE case requires checking the i_size */
		inode = dst_vma->vm_file->f_inode;
		offset = linear_page_index(dst_vma, dst_addr);
		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
		ret = -EFAULT;
		if (unlikely(offset >= max_off))
			goto out_unlock;
	}
	ret = -EEXIST;
	if (!pte_none(*dst_pte))
		goto out_unlock;
	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	return ret;
}

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
				pmd_t *dst_pmd,
				struct vm_area_struct *dst_vma,
				unsigned long dst_addr,
				bool wp_copy)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct page *page;
	int ret;

	ret = shmem_getpage(inode, pgoff, &page, SGP_READ);
	if (ret)
		goto out;
	if (!page) {
		ret = -EFAULT;
		goto out;
	}

	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				       page, false, wp_copy);
	if (ret)
		goto out_release;

	unlock_page(page);
	ret = 0;
out:
	return ret;
out_release:
	unlock_page(page);
	put_page(page);
	goto out;
}

static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that we didn't run this because the pmd was
	 * missing, the *pmd may be already established and in
	 * turn it may also be a trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with mmap_lock held, it will release mmap_lock before returning.
 */
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      enum mcopy_atomic_mode mode)
{
	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
	int vm_shared = dst_vma->vm_flags & VM_SHARED;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
	 * by THP.  Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (mode == MCOPY_ATOMIC_ZEROPAGE) {
		mmap_read_unlock(dst_mm);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		err = -ENOENT;
		dst_vma = find_dst_vma(dst_mm, dst_start, len);
		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
			goto out_unlock;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock;

		vm_shared = dst_vma->vm_flags & VM_SHARED;
	}

	/*
	 * If not shared, ensure the dst_vma has a anon_vma.
	 */
	err = -ENOMEM;
	if (!vm_shared) {
		if (unlikely(anon_vma_prepare(dst_vma)))
			goto out_unlock;
	}

	while (src_addr < src_start + len) {
		BUG_ON(dst_addr >= dst_start + len);

		/*
		 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
		 * i_mmap_rwsem ensures the dst_pte remains valid even
		 * in the case of shared pmds.  fault mutex prevents
		 * races with other faulting threads.
		 */
		mapping = dst_vma->vm_file->f_mapping;
		i_mmap_lock_read(mapping);
		idx = linear_page_index(dst_vma, dst_addr);
		hash = hugetlb_fault_mutex_hash(mapping, idx);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
		if (!dst_pte) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			i_mmap_unlock_read(mapping);
			goto out_unlock;
		}

		if (mode != MCOPY_ATOMIC_CONTINUE &&
		    !huge_pte_none(huge_ptep_get(dst_pte))) {
			err = -EEXIST;
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			i_mmap_unlock_read(mapping);
			goto out_unlock;
		}

		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
					       dst_addr, src_addr, mode, &page);

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		i_mmap_unlock_read(mapping);
		vm_alloc_shared = vm_shared;

		cond_resched();

		if (unlikely(err == -ENOENT)) {
			mmap_read_unlock(dst_mm);
			BUG_ON(!page);

			err = copy_huge_page_from_user(page,
						(const void __user *)src_addr,
						vma_hpagesize / PAGE_SIZE,
						true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			mmap_read_lock(dst_mm);

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	mmap_read_unlock(dst_mm);
out:
	if (page) {
		/*
		 * We encountered an error and are about to free a newly
		 * allocated huge page.
		 *
		 * Reservation handling is very subtle, and is different for
		 * private and shared mappings.  See the routine
		 * restore_reserve_on_error for details.  Unfortunately, we
		 * can not call restore_reserve_on_error now as it would
		 * require holding mmap_lock.
		 *
		 * If a reservation for the page existed in the reservation
		 * map of a private mapping, the map was modified to indicate
		 * the reservation was consumed when the page was allocated.
		 * We clear the PagePrivate flag now so that the global
		 * reserve count will not be incremented in free_huge_page.
		 * The reservation map will still indicate the reservation
		 * was consumed and possibly prevent later page allocation.
		 * This is better than leaking a global reservation.  If no
		 * reservation existed, it is still safe to clear PagePrivate
		 * as no adjustments to reservation counts were made during
		 * allocation.
		 *
		 * The reservation map for shared mappings indicates which
		 * pages have reservations.  When a huge page is allocated
		 * for an address with a reservation, no change is made to
		 * the reserve map.  In this case PagePrivate will be set
		 * to indicate that the global reservation count should be
		 * incremented when the page is freed.  This is the desired
		 * behavior.  However, when a huge page is allocated for an
		 * address without a reservation a reservation entry is added
		 * to the reservation map, and PagePrivate will not be set.
		 * When the page is freed, the global reserve count will NOT
		 * be incremented and it will appear as though we have leaked
		 * reserved page.  In this case, set PagePrivate so that the
		 * global reserve count will be incremented to match the
		 * reservation map entry which was created.
		 *
		 * Note that vm_alloc_shared is based on the flags of the vma
		 * for which the page was originally allocated.  dst_vma could
		 * be different or NULL on error.
		 */
		if (vm_alloc_shared)
			SetPagePrivate(page);
		else
			ClearPagePrivate(page);
		put_page(page);
	}
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
				      struct vm_area_struct *dst_vma,
				      unsigned long dst_start,
				      unsigned long src_start,
				      unsigned long len,
				      enum mcopy_atomic_mode mode);
#endif /* CONFIG_HUGETLB_PAGE */

static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
						pmd_t *dst_pmd,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						struct page **page,
						enum mcopy_atomic_mode mode,
						bool wp_copy)
{
	ssize_t err;

	if (mode == MCOPY_ATOMIC_CONTINUE) {
		return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
					    wp_copy);
	}

	/*
	 * The normal page fault path for a shmem will invoke the
	 * fault, fill the hole in the file and COW it right away. The
	 * result generates plain anonymous memory. So when we are
	 * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole. For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
	 */
	if (!(dst_vma->vm_flags & VM_SHARED)) {
		if (mode == MCOPY_ATOMIC_NORMAL)
			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
					       dst_addr, src_addr, page,
					       wp_copy);
		else
			err = mfill_zeropage_pte(dst_mm, dst_pmd,
						 dst_vma, dst_addr);
	} else {
		VM_WARN_ON_ONCE(wp_copy);
		err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
					     dst_addr, src_addr,
					     mode != MCOPY_ATOMIC_NORMAL,
					     page);
	}

	return err;
}

static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      enum mcopy_atomic_mode mcopy_mode,
					      bool *mmap_changing,
					      __u64 mode)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;
	bool wp_copy;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
retry:
	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	err = -EAGAIN;
	if (mmap_changing && READ_ONCE(*mmap_changing))
		goto out_unlock;

	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	err = -ENOENT;
	dst_vma = find_dst_vma(dst_mm, dst_start, len);
	if (!dst_vma)
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
	wp_copy = mode & UFFDIO_COPY_MODE_WP;
	if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
						src_start, len, mcopy_mode);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;
	if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
		goto out_unlock;

	/*
	 * Ensure the dst_vma has a anon_vma or this page
	 * would get a NULL anon_vma when moved in the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (!(dst_vma->vm_flags & VM_SHARED) &&
	    unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmd_read_atomic(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
			err = -ENOMEM;
			break;
		}
		/* If an huge pmd materialized from under us fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				       src_addr, &page, mcopy_mode, wp_copy);
		cond_resched();

		if (unlikely(err == -ENOENT)) {
			void *page_kaddr;

			mmap_read_unlock(dst_mm);
			BUG_ON(!page);

			page_kaddr = kmap(page);
			err = copy_from_user(page_kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap(page);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	mmap_read_unlock(dst_mm);
out:
	if (page)
		put_page(page);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}

ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
		     unsigned long src_start, unsigned long len,
		     bool *mmap_changing, __u64 mode)
{
	return __mcopy_atomic(dst_mm, dst_start, src_start, len,
			      MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
}

ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len, bool *mmap_changing)
{
	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
			      mmap_changing, 0);
}

ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len, bool *mmap_changing)
{
	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
			      mmap_changing, 0);
}

int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
			unsigned long len, bool enable_wp, bool *mmap_changing)
{
	struct vm_area_struct *dst_vma;
	pgprot_t newprot;
	int err;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(start + len <= start);

	mmap_read_lock(dst_mm);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	err = -EAGAIN;
	if (mmap_changing && READ_ONCE(*mmap_changing))
		goto out_unlock;

	err = -ENOENT;
	dst_vma = find_dst_vma(dst_mm, start, len);
	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;
	if (!userfaultfd_wp(dst_vma))
		goto out_unlock;
	if (!vma_is_anonymous(dst_vma))
		goto out_unlock;

	if (enable_wp)
		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
	else
		newprot = vm_get_page_prot(dst_vma->vm_flags);

	change_protection(dst_vma, start, start + len, newprot,
			  enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);

	err = 0;
out_unlock:
	mmap_read_unlock(dst_mm);
	return err;
}
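
For reference, the entry points at the bottom of this file (mcopy_atomic, mfill_zeropage, mcopy_continue and mwriteprotect_range) are invoked from fs/userfaultfd.c when userspace issues the UFFDIO_COPY, UFFDIO_ZEROPAGE, UFFDIO_CONTINUE and UFFDIO_WRITEPROTECT ioctls. The program below is a minimal userspace sketch, not part of this file: it assumes userfaultfd is usable by the caller on this 5.10 kernel (vm.unprivileged_userfaultfd=1 or sufficient privilege), registers one anonymous page for missing faults and fills it with UFFDIO_COPY, which lands in mcopy_atomic() above. Error handling is trimmed to the minimum.

/* uffd_copy_demo.c - illustrative only; build with: gcc -o uffd_copy_demo uffd_copy_demo.c */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);

	/* Open a userfaultfd descriptor and handshake on the API version. */
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) { perror("userfaultfd"); return 1; }

	struct uffdio_api api = { .api = UFFD_API };
	if (ioctl(uffd, UFFDIO_API, &api)) { perror("UFFDIO_API"); return 1; }

	/* Anonymous destination range, registered for missing-page faults. */
	char *dst = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (dst == MAP_FAILED) { perror("mmap"); return 1; }

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)dst, .len = page_size },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg)) { perror("UFFDIO_REGISTER"); return 1; }

	/* Source buffer whose contents are copied into the missing page. */
	char *src = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 0x5a, page_size);

	struct uffdio_copy copy = {
		.dst  = (unsigned long)dst,
		.src  = (unsigned long)src,
		.len  = page_size,
		.mode = 0,	/* UFFDIO_COPY_MODE_WP would request a wp_copy */
	};
	if (ioctl(uffd, UFFDIO_COPY, &copy)) { perror("UFFDIO_COPY"); return 1; }

	printf("UFFDIO_COPY installed %lld bytes, first byte now 0x%02x\n",
	       (long long)copy.copy, (unsigned char)dst[0]);
	return 0;
}

Because the destination PTE is still none when the ioctl runs, this direct UFFDIO_COPY succeeds without an outstanding fault. A real resolver would instead read uffd_msg events from the descriptor (usually in a dedicated thread) and issue the same ioctl per faulting address, retrying on -EAGAIN when a concurrent non-cooperative event (fork, mremap) has set mmap_changing.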