// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2009 Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
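
/*
 * These flag bits are only read via test_bit(), e.g. by the sysfs show/store
 * handlers and by alloc_hugepage_direct_gfpmask() below, and are flipped at
 * runtime through /sys/kernel/mm/transparent_hugepage/ or at boot via the
 * transparent_hugepage= command-line parameter.
 */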

static struct shrinker deferred_split_shrinker;

static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;

static inline bool file_thp_enabled(struct vm_area_struct *vma)
{
	return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
	       !inode_is_open_for_write(vma->vm_file->f_inode) &&
	       (vma->vm_flags & VM_EXEC);
}

bool transparent_hugepage_active(struct vm_area_struct *vma)
{
	/* The addr is used to check if the vma size fits */
	unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;

	if (!transhuge_vma_suitable(vma, addr))
		return false;
	if (vma_is_anonymous(vma))
		return __transparent_hugepage_enabled(vma);
	if (vma_is_shmem(vma))
		return shmem_huge_enabled(vma);
	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
		return file_thp_enabled(vma);

	return false;
}

static struct page *get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return READ_ONCE(huge_zero_page);

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return NULL;
	}
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_pages(zero_page, compound_order(zero_page));
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	return READ_ONCE(huge_zero_page);
}
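
/*
 * Note on the retry loop above: two tasks can both observe a zero refcount
 * and both allocate a zero page; the cmpxchg() publishes only one of them,
 * the loser frees its copy and retries, picking up a reference on the
 * winner's page. The refcount is set to 2 so the caller owns one reference
 * and the shrinker owns the other.
 */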

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

struct page *mm_get_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_page);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_page);
}

void mm_put_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}
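
/*
 * Huge zero page lifetime, as implemented by the helpers above (illustrative
 * summary, not new API):
 *
 *	page = mm_get_huge_zero_page(mm);  // first call per mm pins the page
 *	...                                // map it read-only in page faults
 *	mm_put_huge_zero_page(mm);         // on mmput(), drops the mm's ref
 *
 * The MMF_HUGE_ZERO_PAGE bit ensures each mm contributes at most one
 * reference; once only the shrinker's own reference is left,
 * shrink_huge_zero_page_scan() below frees the page under memory pressure.
 */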

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct page *zero_page = xchg(&huge_zero_page, NULL);
		BUG_ON(zero_page == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		__free_pages(zero_page, compound_order(zero_page));
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker huge_zero_page_shrinker = {
	.count_objects = shrink_huge_zero_page_count,
	.scan_objects = shrink_huge_zero_page_scan,
	.seeks = DEFAULT_SEEKS,
};

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] madvise never\n");
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [madvise] never\n");
	else
		return sprintf(buf, "always madvise [never]\n");
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}
static struct kobj_attribute enabled_attr =
	__ATTR(enabled, 0644, enabled_show, enabled_store);
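
/*
 * Usage example (the sysfs directory is created by hugepage_init_sysfs()
 * below; the sample output is illustrative):
 *
 *	# cat /sys/kernel/mm/transparent_hugepage/enabled
 *	always [madvise] never
 *	# echo never > /sys/kernel/mm/transparent_hugepage/enabled
 *
 * The bracketed word marks the current mode; writing any string other than
 * "always", "madvise" or "never" makes enabled_store() return -EINVAL.
 */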

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sprintf(buf, "%d\n",
		       !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count,
				   enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] defer defer+madvise madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [defer] defer+madvise madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always defer [defer+madvise] madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always defer defer+madvise [madvise] never\n");
	return sprintf(buf, "always defer defer+madvise madvise [never]\n");
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr =
	__ATTR(defrag, 0644, defrag_show, defrag_store);

static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr =
	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	return 0;

remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		/*
		 * Hardware doesn't support hugepages, hence disable
		 * DAX PMD support.
		 */
		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = register_shrinker(&huge_zero_page_shrinker);
	if (err)
		goto err_hzp_shrinker;
	err = register_shrinker(&deferred_split_shrinker);
	if (err)
		goto err_split_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save. The admin can still enable it through /sys.
	 */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	unregister_shrinker(&deferred_split_shrinker);
err_split_shrinker:
	unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
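
/*
 * Example: booting with "transparent_hugepage=madvise" on the kernel command
 * line puts the flags in the same state as writing "madvise" to
 * /sys/kernel/mm/transparent_hugepage/enabled at runtime (see enabled_store()
 * above); unrecognized values are reported by the pr_warn() above and ignored.
 */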

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd);
	return pmd;
}

#ifdef CONFIG_MEMCG
static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));

	return &pgdat->deferred_split_queue;
}
#endif

void prep_transhuge_page(struct page *page)
{
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */

	INIT_LIST_HEAD(page_deferred_list(page));
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}

bool is_transparent_hugepage(struct page *page)
{
	if (!PageCompound(page))
		return false;

	page = compound_head(page);
	return is_huge_zero_page(page) ||
	       page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(is_transparent_hugepage);

static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = current->mm->get_unmapped_area(filp, addr, len_pad,
					     off >> PAGE_SHIFT, flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
	 */
	if (ret == addr)
		return addr;

	ret += (off - ret) & (size - 1);
	return ret;
}
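
/*
 * Worked example for the final alignment step above, with size == 2MB
 * (the values are hypothetical): if the padded search returned
 * ret == 0x7f1200201000 and the file offset is off == 0x3ff000, then
 * (off - ret) & (size - 1) == 0x1fe000, so the returned address becomes
 * 0x7f12003ff000. The adjustment keeps ret inside the padded area and makes
 * ret congruent to off modulo size, which is what allows the file to be
 * mapped with PMD-sized pages.
 */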

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
		goto out;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
	if (ret)
		return ret;
out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;

	VM_BUG_ON_PAGE(!PageCompound(page), page);

	if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	cgroup_throttle_swaprate(page, gfp);

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__SetPageUptodate(page);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			vm_fault_t ret2;

			spin_unlock(vmf->ptl);
			put_page(page);
			pte_free(vma->vm_mm, pgtable);
			ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
			return ret2;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		page_add_new_anon_rmap(page, vma, haddr, true);
		lru_cache_add_inactive_or_unevictable(page, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(vma->vm_mm);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	put_page(page);
	return ret;
}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
}

/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct page *zero_page)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return false;
	entry = mk_pmd(zero_page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	if (pgtable)
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
	return true;
}

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;

	if (!transhuge_vma_suitable(vma, haddr))
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
		return VM_FAULT_OOM;
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct page *zero_page;
		vm_fault_t ret;
		pgtable = pte_alloc_one(vma->vm_mm);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_page = mm_get_huge_zero_page(vma->vm_mm);
		if (unlikely(!zero_page)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_page(pgtable, vma->vm_mm, vma,
						   haddr, vmf->pmd, zero_page);
				spin_unlock(vmf->ptl);
			}
		} else {
			spin_unlock(vmf->ptl);
			pte_free(vma->vm_mm, pgtable);
		}
		return ret;
	}
	gfp = alloc_hugepage_direct_gfpmask(vma);
	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	prep_transhuge_page(page);
	return __do_huge_pmd_anonymous_page(vmf, page, gfp);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771)
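/*
 * Install a huge PMD mapping of @pfn at @addr under the PMD lock. If the PMD
 * is already populated, at most the access/dirty bits are refreshed (on a
 * write); a pre-allocated @pgtable that ends up unused is freed on return.
 */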
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) pgtable_t pgtable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) pmd_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) ptl = pmd_lock(mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) if (!pmd_none(*pmd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) if (write) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) entry = pmd_mkyoung(*pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) update_mmu_cache_pmd(vma, addr, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) if (pfn_t_devmap(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) entry = pmd_mkdevmap(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) if (write) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) entry = pmd_mkyoung(pmd_mkdirty(entry));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) entry = maybe_pmd_mkwrite(entry, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) if (pgtable) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) pgtable_trans_huge_deposit(mm, pmd, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) mm_inc_nr_ptes(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) pgtable = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) set_pmd_at(mm, addr, pmd, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) update_mmu_cache_pmd(vma, addr, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) if (pgtable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) pte_free(mm, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) * @vmf: Structure describing the fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) * @pfn: pfn to insert
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) * @pgprot: page protection to use
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) * @write: whether it's a write fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) * also consult the vmf_insert_mixed_prot() documentation when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) * @pgprot != @vmf->vma->vm_page_prot.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) * Return: vm_fault_t value.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) pgprot_t pgprot, bool write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) unsigned long addr = vmf->address & PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) struct vm_area_struct *vma = vmf->vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) pgtable_t pgtable = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) * If we had pmd_special, we could avoid all these restrictions,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) * but we need to be consistent with PTEs and architectures that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) * can't support a 'special' bit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) !pfn_t_devmap(pfn));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) (VM_PFNMAP|VM_MIXEDMAP));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) if (addr < vma->vm_start || addr >= vma->vm_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) return VM_FAULT_SIGBUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) if (arch_needs_pgtable_deposit()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) pgtable = pte_alloc_one(vma->vm_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) if (!pgtable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) return VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) track_pfn_insert(vma, &pgprot, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) return VM_FAULT_NOPAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);
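/*
 * A minimal sketch of how a driver's ->huge_fault() handler might use this
 * (my_lookup_pfn() is a hypothetical driver-specific helper, not a real API):
 *
 *	static vm_fault_t my_huge_fault(struct vm_fault *vmf,
 *					enum page_entry_size pe_size)
 *	{
 *		if (pe_size != PE_SIZE_PMD)
 *			return VM_FAULT_FALLBACK;
 *		return vmf_insert_pfn_pmd_prot(vmf, my_lookup_pfn(vmf),
 *					       vmf->vma->vm_page_prot,
 *					       vmf->flags & FAULT_FLAG_WRITE);
 *	}
 */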
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
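/* Like maybe_pmd_mkwrite(): only mark the PUD writable if the VMA is. */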
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) if (likely(vma->vm_flags & VM_WRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) pud = pud_mkwrite(pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) return pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873)
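/*
 * PUD-sized counterpart of insert_pfn_pmd(): install a huge PUD mapping of
 * @pfn at @addr under the PUD lock. An already populated PUD at most has its
 * access/dirty bits refreshed on a write.
 */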
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) pud_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) ptl = pud_lock(mm, pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) if (!pud_none(*pud)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) if (write) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) WARN_ON_ONCE(!is_huge_zero_pud(*pud));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) entry = pud_mkyoung(*pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) if (pudp_set_access_flags(vma, addr, pud, entry, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) update_mmu_cache_pud(vma, addr, pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) entry = pud_mkhuge(pfn_t_pud(pfn, prot));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) if (pfn_t_devmap(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) entry = pud_mkdevmap(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) if (write) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) entry = pud_mkyoung(pud_mkdirty(entry));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) entry = maybe_pud_mkwrite(entry, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) set_pud_at(mm, addr, pud, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) update_mmu_cache_pud(vma, addr, pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) * vmf_insert_pfn_pud_prot - insert a pud size pfn
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) * @vmf: Structure describing the fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) * @pfn: pfn to insert
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) * @pgprot: page protection to use
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) * @write: whether it's a write fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) * also consult the vmf_insert_mixed_prot() documentation when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) * @pgprot != @vmf->vma->vm_page_prot.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) * Return: vm_fault_t value.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) pgprot_t pgprot, bool write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) unsigned long addr = vmf->address & PUD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) struct vm_area_struct *vma = vmf->vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) * If we had pud_special, we could avoid all these restrictions,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) * but we need to be consistent with PTEs and architectures that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) * can't support a 'special' bit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) !pfn_t_devmap(pfn));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) (VM_PFNMAP|VM_MIXEDMAP));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) if (addr < vma->vm_start || addr >= vma->vm_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) return VM_FAULT_SIGBUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) track_pfn_insert(vma, &pgprot, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) return VM_FAULT_NOPAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950)
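/*
 * Mark the huge PMD young (and dirty for FOLL_WRITE) on behalf of a GUP
 * reference, updating the MMU cache if the access bits actually changed.
 */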
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) pmd_t *pmd, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) pmd_t _pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) _pmd = pmd_mkyoung(*pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) if (flags & FOLL_WRITE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) _pmd = pmd_mkdirty(_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) pmd, _pmd, flags & FOLL_WRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) update_mmu_cache_pmd(vma, addr, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963)
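/*
 * GUP helper for a device-mapped (devmap) huge PMD: look up the subpage that
 * maps @addr, take a reference on its dev_pagemap and grab the page according
 * to the FOLL_* flags. Called with the PMD lock already held.
 */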
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) unsigned long pfn = pmd_pfn(*pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) assert_spin_locked(pmd_lockptr(mm, pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) * When we COW a devmap PMD entry, we split it into PTEs, so we should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) * not be in this function with `flags & FOLL_COW` set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) /* FOLL_GET and FOLL_PIN are mutually exclusive. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) (FOLL_PIN | FOLL_GET)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) if (flags & FOLL_WRITE && !pmd_write(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) if (pmd_present(*pmd) && pmd_devmap(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) /* pass */;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) if (flags & FOLL_TOUCH)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) touch_pmd(vma, addr, pmd, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) * device mapped pages can only be returned if the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) * caller will manage the page reference count.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) if (!(flags & (FOLL_GET | FOLL_PIN)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) return ERR_PTR(-EEXIST);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) *pgmap = get_dev_pagemap(pfn, *pgmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) if (!*pgmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) return ERR_PTR(-EFAULT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) if (!try_grab_page(page, flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) page = ERR_PTR(-ENOMEM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012)
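/*
 * Copy a huge PMD from parent to child at fork() time. File-backed mappings
 * are skipped since they can be refilled on fault. Anonymous THPs are shared
 * copy-on-write, with both the source and destination PMDs write-protected;
 * a potentially DMA-pinned THP is split instead and -EAGAIN tells the caller
 * to retry the copy at the PTE level. Returns 0 on success, -ENOMEM if the
 * page table to deposit cannot be allocated, or -EAGAIN for a PTE-level retry.
 */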
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) spinlock_t *dst_ptl, *src_ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) struct page *src_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) pmd_t pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) pgtable_t pgtable = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) int ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) 	/* Skip if it can be refilled on fault */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) if (!vma_is_anonymous(dst_vma))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) pgtable = pte_alloc_one(dst_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) if (unlikely(!pgtable))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) dst_ptl = pmd_lock(dst_mm, dst_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) src_ptl = pmd_lockptr(src_mm, src_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) ret = -EAGAIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) pmd = *src_pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) if (unlikely(is_swap_pmd(pmd))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) swp_entry_t entry = pmd_to_swp_entry(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) VM_BUG_ON(!is_pmd_migration_entry(pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) if (is_write_migration_entry(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) make_migration_entry_read(&entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) pmd = swp_entry_to_pmd(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) if (pmd_swp_soft_dirty(*src_pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) pmd = pmd_swp_mksoft_dirty(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) if (pmd_swp_uffd_wp(*src_pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) pmd = pmd_swp_mkuffd_wp(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) set_pmd_at(src_mm, addr, src_pmd, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) mm_inc_nr_ptes(dst_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) if (!userfaultfd_wp(dst_vma))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) pmd = pmd_swp_clear_uffd_wp(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) set_pmd_at(dst_mm, addr, dst_pmd, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) if (unlikely(!pmd_trans_huge(pmd))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) pte_free(dst_mm, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) 	 * While the page table lock is held, the huge zero pmd cannot be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) 	 * split under us: we never split the zero page itself, only the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) 	 * pmd into a page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) if (is_huge_zero_pmd(pmd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) * get_huge_zero_page() will never allocate a new page here,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) * since we already have a zero page to copy. It just takes a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) * reference.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) mm_get_huge_zero_page(dst_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) goto out_zero_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) src_page = pmd_page(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) 	 * If this page may be pinned, split it and retry the fault with a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) 	 * smaller page size. Normally this should not happen because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) 	 * userspace should use MADV_DONTFORK on pinned regions. This is a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) 	 * best-effort attempt to ensure that the pinned pages are not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) 	 * replaced by some other random page during the coming copy-on-write.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) if (unlikely(is_cow_mapping(src_vma->vm_flags) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) atomic_read(&src_mm->has_pinned) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) page_maybe_dma_pinned(src_page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) pte_free(dst_mm, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) spin_unlock(src_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) spin_unlock(dst_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) return -EAGAIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) get_page(src_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) page_dup_rmap(src_page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) out_zero_page:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) mm_inc_nr_ptes(dst_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) pmdp_set_wrprotect(src_mm, addr, src_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) if (!userfaultfd_wp(dst_vma))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) pmd = pmd_clear_uffd_wp(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) pmd = pmd_mkold(pmd_wrprotect(pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) set_pmd_at(dst_mm, addr, dst_pmd, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) spin_unlock(src_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) spin_unlock(dst_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
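/*
 * PUD-sized counterpart of touch_pmd(): mark the entry young (and dirty
 * for FOLL_WRITE).
 */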
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) pud_t *pud, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) pud_t _pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) _pud = pud_mkyoung(*pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) if (flags & FOLL_WRITE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) _pud = pud_mkdirty(_pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) pud, _pud, flags & FOLL_WRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) update_mmu_cache_pud(vma, addr, pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135)
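/*
 * PUD-sized counterpart of follow_devmap_pmd(). Called with the PUD lock
 * already held.
 */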
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) pud_t *pud, int flags, struct dev_pagemap **pgmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) unsigned long pfn = pud_pfn(*pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) assert_spin_locked(pud_lockptr(mm, pud));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) if (flags & FOLL_WRITE && !pud_write(*pud))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) /* FOLL_GET and FOLL_PIN are mutually exclusive. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) (FOLL_PIN | FOLL_GET)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) if (pud_present(*pud) && pud_devmap(*pud))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) /* pass */;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) if (flags & FOLL_TOUCH)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) touch_pud(vma, addr, pud, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) * device mapped pages can only be returned if the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) * caller will manage the page reference count.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) if (!(flags & (FOLL_GET | FOLL_PIN)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) return ERR_PTR(-EEXIST);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) *pgmap = get_dev_pagemap(pfn, *pgmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) if (!*pgmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) return ERR_PTR(-EFAULT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) if (!try_grab_page(page, flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) page = ERR_PTR(-ENOMEM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180)
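/*
 * Copy a huge PUD at fork() time, write-protecting both the source and the
 * destination entries. As in copy_huge_pmd(), a potentially DMA-pinned page
 * is handled by splitting the PUD and returning -EAGAIN so the copy is
 * retried at a smaller granularity.
 */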
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) spinlock_t *dst_ptl, *src_ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) pud_t pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) dst_ptl = pud_lock(dst_mm, dst_pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) src_ptl = pud_lockptr(src_mm, src_pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) ret = -EAGAIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) pud = *src_pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) 	 * While the page table lock is held, the huge zero pud cannot be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) 	 * split under us: we never split the zero page itself, only the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) 	 * pud into a page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) if (is_huge_zero_pud(pud)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) /* No huge zero pud yet */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) /* Please refer to comments in copy_huge_pmd() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) if (unlikely(is_cow_mapping(vma->vm_flags) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) atomic_read(&src_mm->has_pinned) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) page_maybe_dma_pinned(pud_page(pud)))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) spin_unlock(src_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) spin_unlock(dst_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) __split_huge_pud(vma, src_pud, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) return -EAGAIN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) pudp_set_wrprotect(src_mm, addr, src_pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) pud = pud_mkold(pud_wrprotect(pud));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) set_pud_at(dst_mm, addr, dst_pud, pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) spin_unlock(src_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) spin_unlock(dst_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227)
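/*
 * Mark a huge PUD young (and dirty on a write fault) after re-validating it
 * against @orig_pud under the PUD lock.
 */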
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) pud_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) unsigned long haddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) bool write = vmf->flags & FAULT_FLAG_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) if (unlikely(!pud_same(*vmf->pud, orig_pud)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) goto unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) entry = pud_mkyoung(orig_pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) if (write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) entry = pud_mkdirty(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) haddr = vmf->address & HPAGE_PUD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) spin_unlock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249)
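/*
 * Mark a huge PMD young (and dirty on a write fault) after re-validating it
 * against @orig_pmd under the PMD lock.
 */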
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) pmd_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) unsigned long haddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) bool write = vmf->flags & FAULT_FLAG_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) goto unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) entry = pmd_mkyoung(orig_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) if (write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) entry = pmd_mkdirty(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) haddr = vmf->address & HPAGE_PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) spin_unlock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270)
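/*
 * Handle a write fault on a write-protected huge PMD. If we are the only
 * user of the page it is reused in place and made writable; otherwise (or
 * for the huge zero page) the PMD is split and VM_FAULT_FALLBACK is returned
 * so the fault is retried at the PTE level.
 */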
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) struct vm_area_struct *vma = vmf->vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) VM_BUG_ON_VMA(!vma->anon_vma, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) if (is_huge_zero_pmd(orig_pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) goto fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) spin_lock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) spin_unlock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) page = pmd_page(orig_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) /* Lock page for reuse_swap_page() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) if (!trylock_page(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) get_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) spin_unlock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) lock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) spin_lock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) spin_unlock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) 	 * We can only reuse the page if nobody else maps the huge page or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) 	 * any of its subpages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) if (reuse_swap_page(page, NULL)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) pmd_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) entry = pmd_mkyoung(orig_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) spin_unlock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) return VM_FAULT_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) spin_unlock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) fallback:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) return VM_FAULT_FALLBACK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331)  * FOLL_FORCE can write to even unwritable pmds, but only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) * after we've gone through a COW cycle and they are dirty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) return pmd_write(pmd) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339)
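/*
 * GUP helper for a regular transparent huge PMD: return the subpage of the
 * THP that maps @addr, with a reference grabbed according to the FOLL_*
 * flags. Called with the PMD lock already held.
 */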
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) unsigned long addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) pmd_t *pmd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) unsigned int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) assert_spin_locked(pmd_lockptr(mm, pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) /* Avoid dumping huge zero page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) return ERR_PTR(-EFAULT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) /* Full NUMA hinting faults to serialise migration in fault paths */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) page = pmd_page(*pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) if (!try_grab_page(page, flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) return ERR_PTR(-ENOMEM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) if (flags & FOLL_TOUCH)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) touch_pmd(vma, addr, pmd, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) * We don't mlock() pte-mapped THPs. This way we can avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) * leaking mlocked pages into non-VM_LOCKED VMAs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) * For anon THP:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) * In most cases the pmd is the only mapping of the page as we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) * writable private mappings in populate_vma_page_range().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) 	 * The only scenario in which the page is shared here is when we are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) 	 * mlocking a read-only mapping shared over fork(). We skip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) * mlocking such pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) * For file THP:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) * We can expect PageDoubleMap() to be stable under page lock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) * for file pages we set it in page_add_file_rmap(), which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) * requires page to be locked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) if (PageAnon(page) && compound_mapcount(page) != 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) goto skip_mlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) if (PageDoubleMap(page) || !page->mapping)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) goto skip_mlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) if (!trylock_page(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) goto skip_mlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) if (page->mapping && !PageDoubleMap(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) mlock_vma_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) skip_mlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) /* NUMA hinting page fault entry point for trans huge pmds */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) struct vm_area_struct *vma = vmf->vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) struct anon_vma *anon_vma = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) int target_nid, last_cpupid = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) bool page_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) bool migrated = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) bool was_writable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) int flags = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) if (unlikely(!pmd_same(pmd, *vmf->pmd)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) * If there are potential migrations, wait for completion and retry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) * without disrupting NUMA hinting information. Do not relock and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) 	 * re-check pmd_same() as the page may no longer be mapped.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) page = pmd_page(*vmf->pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) if (!get_page_unless_zero(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) spin_unlock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) put_and_wait_on_page_locked(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) page = pmd_page(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) BUG_ON(is_huge_zero_page(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) page_nid = page_to_nid(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) last_cpupid = page_cpupid_last(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) count_vm_numa_event(NUMA_HINT_FAULTS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) if (page_nid == this_nid) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) flags |= TNF_FAULT_LOCAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) /* See similar comment in do_numa_page for explanation */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) if (!pmd_savedwrite(pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) flags |= TNF_NO_GROUP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) 	 * Acquire the page lock to serialise THP migrations, but avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) 	 * dropping the page_table_lock if at all possible.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) page_locked = trylock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) target_nid = mpol_misplaced(page, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) if (target_nid == NUMA_NO_NODE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) /* If the page was locked, there are no parallel migrations */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) if (page_locked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) goto clear_pmdnuma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) /* Migration could have started since the pmd_trans_migrating check */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) if (!page_locked) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) page_nid = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) if (!get_page_unless_zero(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) spin_unlock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) put_and_wait_on_page_locked(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) 	 * Page is misplaced. The page lock serialises migrations. Acquire
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) 	 * the anon_vma lock to serialise splits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) get_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) spin_unlock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) anon_vma = page_lock_anon_vma_read(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) /* Confirm the PMD did not change while page_table_lock was released */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) spin_lock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) page_nid = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) /* Bail if we fail to protect against THP splits for any reason */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) if (unlikely(!anon_vma)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) page_nid = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) goto clear_pmdnuma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) * Since we took the NUMA fault, we must have observed the !accessible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) * bit. Make sure all other CPUs agree with that, to avoid them
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) * modifying the page we're about to migrate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) * Must be done under PTL such that we'll observe the relevant
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) * inc_tlb_flush_pending().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) 	 * We are not sure whether a pending TLB flush here is for a huge
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) 	 * page mapping or not, hence use the TLB range variant.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) if (mm_tlb_flush_pending(vma->vm_mm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) * change_huge_pmd() released the pmd lock before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) * invalidating the secondary MMUs sharing the primary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) * MMU pagetables (with ->invalidate_range()). The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) * mmu_notifier_invalidate_range_end() (which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) * internally calls ->invalidate_range()) in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) * change_pmd_range() will run after us, so we can't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) * rely on it here and we need an explicit invalidate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) mmu_notifier_invalidate_range(vma->vm_mm, haddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) haddr + HPAGE_PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) * Migrate the THP to the requested node, returns with page unlocked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) * and access rights restored.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) spin_unlock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) vmf->pmd, pmd, vmf->address, page, target_nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) if (migrated) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) flags |= TNF_MIGRATED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) page_nid = target_nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) flags |= TNF_MIGRATE_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) clear_pmdnuma:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) BUG_ON(!PageLocked(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) was_writable = pmd_savedwrite(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) pmd = pmd_modify(pmd, vma->vm_page_prot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) pmd = pmd_mkyoung(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) if (was_writable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) pmd = pmd_mkwrite(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) spin_unlock(vmf->ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) if (anon_vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) page_unlock_anon_vma_read(anon_vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) if (page_nid != NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) * Return true if we do MADV_FREE successfully on the entire pmd page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) * Otherwise, return false.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) pmd_t *pmd, unsigned long addr, unsigned long next)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) pmd_t orig_pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) struct mm_struct *mm = tlb->mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) bool ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) ptl = pmd_trans_huge_lock(pmd, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) if (!ptl)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) goto out_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) orig_pmd = *pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) if (is_huge_zero_pmd(orig_pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) if (unlikely(!pmd_present(orig_pmd))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) VM_BUG_ON(thp_migration_supported() &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) !is_pmd_migration_entry(orig_pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596) page = pmd_page(orig_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) * If other processes are mapping this page, we can't discard
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) * it unless they all do MADV_FREE, so let's skip the page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) if (total_mapcount(page) != 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) if (!trylock_page(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) * If the user wants to discard only part of the THP, split it so that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) * MADV_FREE will deactivate only those pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) if (next - addr != HPAGE_PMD_SIZE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) get_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) split_huge_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) goto out_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) if (PageDirty(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) ClearPageDirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) pmdp_invalidate(vma, addr, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) orig_pmd = pmd_mkold(orig_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) orig_pmd = pmd_mkclean(orig_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) set_pmd_at(mm, addr, pmd, orig_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) mark_page_lazyfree(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) out_unlocked:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640)
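/*
 * Free the page table that was deposited for a huge pmd: withdraw it,
 * release the page and drop the mm's page table accounting.
 */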
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) pgtable_t pgtable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) pgtable = pgtable_trans_huge_withdraw(mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) pte_free(mm, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) mm_dec_nr_ptes(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649)
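/*
 * Zap a huge pmd: clear it under its page table lock, record the range in
 * the mmu_gather, update rmap and the mm counters for the mapped page
 * (handling the huge zero page and pmd migration entries as well), and free
 * any deposited page table. Returns 1 if a huge pmd was zapped, 0 if the
 * pmd did not map a huge page.
 */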
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) pmd_t *pmd, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) pmd_t orig_pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) ptl = __pmd_trans_huge_lock(pmd, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) if (!ptl)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) * For architectures like ppc64 we look at the deposited pgtable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) * when calling pmdp_huge_get_and_clear. So do the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) * pgtable_trans_huge_withdraw only after finishing the pmdp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) * related operations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) tlb->fullmm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) if (vma_is_special_huge(vma)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) if (arch_needs_pgtable_deposit())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) zap_deposited_table(tlb->mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) if (is_huge_zero_pmd(orig_pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) } else if (is_huge_zero_pmd(orig_pmd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) zap_deposited_table(tlb->mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) int flush_needed = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) if (pmd_present(orig_pmd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) page = pmd_page(orig_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) page_remove_rmap(page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) VM_BUG_ON_PAGE(!PageHead(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) } else if (thp_migration_supported()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) swp_entry_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) entry = pmd_to_swp_entry(orig_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) page = pfn_to_page(swp_offset(entry));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) flush_needed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) if (PageAnon(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) zap_deposited_table(tlb->mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) if (arch_needs_pgtable_deposit())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) zap_deposited_table(tlb->mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) if (flush_needed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) #ifndef pmd_move_must_withdraw
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) spinlock_t *old_pmd_ptl,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) * With the split pmd lock we also need to move the preallocated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) * PTE page table if new_pmd is on a different PMD page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) * We also don't deposit and withdraw tables for file pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729)
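/*
 * Mark a pmd that is being moved as soft-dirty, using the swap variant for
 * pmd migration entries. Without CONFIG_MEM_SOFT_DIRTY this is a no-op.
 */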
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) static pmd_t move_soft_dirty_pmd(pmd_t pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) #ifdef CONFIG_MEM_SOFT_DIRTY
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) if (unlikely(is_pmd_migration_entry(pmd)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) pmd = pmd_swp_mksoft_dirty(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) else if (pmd_present(pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) pmd = pmd_mksoft_dirty(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) return pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740)
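/*
 * Move a huge pmd from @old_addr to @new_addr for mremap(): clear the old
 * entry under the pmd lock(s), move the deposited page table when the
 * architecture requires it, propagate soft-dirty, and install the entry at
 * the new location, flushing the TLB for the old range if the pmd was
 * present. Returns true if a huge pmd was moved, false otherwise.
 */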
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) spinlock_t *old_ptl, *new_ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) pmd_t pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) bool force_flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) * The destination pmd shouldn't be established; free_pgtables()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) * should have released it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) if (WARN_ON(!pmd_none(*new_pmd))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) VM_BUG_ON(pmd_trans_huge(*new_pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) * We don't have to worry about the ordering of src and dst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) * ptlocks because exclusive mmap_lock prevents deadlock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) if (old_ptl) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) new_ptl = pmd_lockptr(mm, new_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) if (new_ptl != old_ptl)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) if (pmd_present(pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) force_flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) VM_BUG_ON(!pmd_none(*new_pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) pgtable_t pgtable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) pmd = move_soft_dirty_pmd(pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) set_pmd_at(mm, new_addr, new_pmd, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) if (force_flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) if (new_ptl != old_ptl)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) spin_unlock(new_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) spin_unlock(old_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) * Returns
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) *  - 0 if the PMD could not be locked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) *  - 1 if the PMD was locked but protections were unchanged and no TLB flush is needed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) *  - HPAGE_PMD_NR if protections were changed and a TLB flush is necessary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) pmd_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) bool preserve_write;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) ptl = __pmd_trans_huge_lock(pmd, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) if (!ptl)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) preserve_write = prot_numa && pmd_write(*pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) if (is_swap_pmd(*pmd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) swp_entry_t entry = pmd_to_swp_entry(*pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) VM_BUG_ON(!is_pmd_migration_entry(*pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) if (is_write_migration_entry(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) pmd_t newpmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) * A protection check is difficult so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) * just be safe and disable write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) make_migration_entry_read(&entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) newpmd = swp_entry_to_pmd(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) if (pmd_swp_soft_dirty(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) newpmd = pmd_swp_mksoft_dirty(newpmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) if (pmd_swp_uffd_wp(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) newpmd = pmd_swp_mkuffd_wp(newpmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) set_pmd_at(mm, addr, pmd, newpmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) goto unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) * Avoid trapping faults against the zero page. The read-only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) * data is likely to be read-cached on the local CPU and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) * local/remote hits to the zero page are not interesting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) if (prot_numa && is_huge_zero_pmd(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) goto unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) if (prot_numa && pmd_protnone(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) goto unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) * In the prot_numa case, we are under mmap_read_lock(mm). It's critical
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) * to not clear the pmd intermittently in order to avoid a race with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) * MADV_DONTNEED, which is also run under mmap_read_lock(mm):
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) * CPU0: CPU1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) * change_huge_pmd(prot_numa=1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) * pmdp_huge_get_and_clear_notify()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) * madvise_dontneed()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) * zap_pmd_range()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) * pmd_trans_huge(*pmd) == 0 (without ptl)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) * // skip the pmd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) * set_pmd_at();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) * // pmd is re-established
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) * which may break userspace.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) * pmdp_invalidate() is required to make sure we don't miss
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) * dirty/young flags set by hardware.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) entry = pmdp_invalidate(vma, addr, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) entry = pmd_modify(entry, newprot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) if (preserve_write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) entry = pmd_mk_savedwrite(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) if (uffd_wp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) entry = pmd_wrprotect(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) entry = pmd_mkuffd_wp(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) } else if (uffd_wp_resolve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) * Leave the write bit to be handled by the page fault
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) * handler, so that things like COW can be properly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) * handled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) entry = pmd_clear_uffd_wp(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) ret = HPAGE_PMD_NR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) set_pmd_at(mm, addr, pmd, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) * Note that if it returns the page table lock pointer, this routine returns
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) * without unlocking the page table lock, so callers must unlock it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) ptl = pmd_lock(vma->vm_mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) pmd_devmap(*pmd)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) return ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) * Returns the page table lock pointer if a given pud maps a thp, NULL otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) * Note that if it returns the page table lock pointer, this routine returns
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) * without unlocking the page table lock, so callers must unlock it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) ptl = pud_lock(vma->vm_mm, pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) return ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
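/*
 * Zap a huge pud: clear it under the pud lock and record the range in the
 * mmu_gather. Only special huge puds (as per vma_is_special_huge()) are
 * supported here; anything else hits the BUG() below. Returns 1 if a huge
 * pud was zapped, 0 if the pud did not map a huge page.
 */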
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) pud_t *pud, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) ptl = __pud_trans_huge_lock(pud, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) if (!ptl)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) * For architectures like ppc64 we look at the deposited pgtable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) * when calling pudp_huge_get_and_clear. So do the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) * pgtable_trans_huge_withdraw only after finishing the pudp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) * related operations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) tlb_remove_pud_tlb_entry(tlb, pud, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) if (vma_is_special_huge(vma)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) /* No zero page support yet */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) /* No support for anonymous PUD pages yet */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) unsigned long haddr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) count_vm_event(THP_SPLIT_PUD);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) pudp_huge_clear_flush_notify(vma, haddr, pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966)
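/*
 * Split the huge pud covering @address, if any: under the pud lock and an
 * mmu notifier range, the pud is simply cleared and flushed; there is no
 * pud-to-pmd rebuild here.
 */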
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) address & HPAGE_PUD_MASK,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) ptl = pud_lock(vma->vm_mm, pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) __split_huge_pud_locked(vma, pud, range.start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) * No need to double call mmu_notifier->invalidate_range() callback as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) * the above pudp_huge_clear_flush_notify() did already call it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) mmu_notifier_invalidate_range_only_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991)
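/*
 * Split a huge zero page pmd in place: clear and flush the pmd, then fill
 * the deposited page table with read-only special ptes of the small zero
 * page and install it under the same pmd.
 */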
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) unsigned long haddr, pmd_t *pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) pgtable_t pgtable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) pmd_t _pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) * Leave the pmd empty until the ptes are filled. Note that it is fine to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) * delay the notification until mmu_notifier_invalidate_range_end(), as we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) * are replacing a write-protected zero page mapped by a pmd with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) * write-protected zero pages mapped by ptes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) * See Documentation/vm/mmu_notifier.rst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) pmdp_huge_clear_flush(vma, haddr, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) pgtable = pgtable_trans_huge_withdraw(mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) pmd_populate(mm, &_pmd, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) pte_t *pte, entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) entry = pte_mkspecial(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) pte = pte_offset_map(&_pmd, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) VM_BUG_ON(!pte_none(*pte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) set_pte_at(mm, haddr, pte, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) pte_unmap(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) smp_wmb(); /* make pte visible before pmd */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) pmd_populate(mm, pmd, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025)
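/*
 * Split a huge pmd in place, with the pmd lock held by the caller.
 * File-backed (non-anonymous) pmds are simply zapped; the huge zero page
 * pmd is rebuilt as zero page ptes; otherwise the pmd is invalidated and
 * rebuilt as ptes (or as migration entries when @freeze is set), carrying
 * over the write, young, dirty, soft-dirty and uffd-wp state.
 */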
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) unsigned long haddr, bool freeze)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) pgtable_t pgtable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) pmd_t old_pmd, _pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) unsigned long addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) && !pmd_devmap(*pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) count_vm_event(THP_SPLIT_PMD);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) if (!vma_is_anonymous(vma)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) * We are going to unmap this huge page. So
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) * just go ahead and zap it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) if (arch_needs_pgtable_deposit())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) zap_deposited_table(mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) if (vma_is_special_huge(vma))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) if (unlikely(is_pmd_migration_entry(old_pmd))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) swp_entry_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) entry = pmd_to_swp_entry(old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) page = migration_entry_to_page(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) page = pmd_page(old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) if (!PageDirty(page) && pmd_dirty(old_pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) set_page_dirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) if (!PageReferenced(page) && pmd_young(old_pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) SetPageReferenced(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) page_remove_rmap(page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) if (is_huge_zero_pmd(*pmd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) * FIXME: Do we want to invalidate the secondary mmu by calling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) * mmu_notifier_invalidate_range()? See the comments below inside
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) * __split_huge_pmd().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) * We are going from a write-protected huge zero page to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) * write-protected small zero pages, so it does not seem useful
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) * to invalidate the secondary mmu at this time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) return __split_huge_zero_page_pmd(vma, haddr, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) * Up to this point the pmd is present and huge and userland has
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) * full access to the hugepage during the split (which happens in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089) * place). If we overwrite the pmd with the not-huge version pointing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) * to the pte here (which of course we could if all CPUs were bug
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) * free), userland could trigger a small page size TLB miss on the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) * small sized TLB while the hugepage TLB entry is still established in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) * the huge TLB. Some CPUs don't like that.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) * 383 on page 105. Intel should be safe, but it also warns that it's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) * only safe if the permission and cache attributes of the two entries
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) * loaded in the two TLBs are identical (which should be the case here).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) * But it is generally safer to never allow small and huge TLB entries
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) * for the same virtual address to be loaded simultaneously. So instead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) * current pmd notpresent (atomically because here the pmd_trans_huge
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) * must remain set at all times on the pmd until the split is complete
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) * for this pmd), then we flush the SMP TLB and finally we write the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104) * non-huge version of the pmd entry with pmd_populate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) old_pmd = pmdp_invalidate(vma, haddr, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108) pmd_migration = is_pmd_migration_entry(old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) if (unlikely(pmd_migration)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) swp_entry_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) entry = pmd_to_swp_entry(old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) page = pfn_to_page(swp_offset(entry));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) write = is_write_migration_entry(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) young = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) soft_dirty = pmd_swp_soft_dirty(old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) uffd_wp = pmd_swp_uffd_wp(old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) page = pmd_page(old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) if (pmd_dirty(old_pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) SetPageDirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) write = pmd_write(old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) young = pmd_young(old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) soft_dirty = pmd_soft_dirty(old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) uffd_wp = pmd_uffd_wp(old_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) VM_BUG_ON_PAGE(!page_count(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) page_ref_add(page, HPAGE_PMD_NR - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) * Withdraw the table only after we mark the pmd entry invalid.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) * This is critical for some architectures (Power).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) pgtable = pgtable_trans_huge_withdraw(mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) pmd_populate(mm, &_pmd, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) pte_t entry, *pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) * Note that NUMA hinting access restrictions are not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) * transferred to avoid any possibility of altering
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) * permissions across VMAs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) if (freeze || pmd_migration) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) swp_entry_t swp_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) swp_entry = make_migration_entry(page + i, write);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) entry = swp_entry_to_pte(swp_entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) if (soft_dirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) entry = pte_swp_mksoft_dirty(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) if (uffd_wp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) entry = pte_swp_mkuffd_wp(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) entry = maybe_mkwrite(entry, vma->vm_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) if (!write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) entry = pte_wrprotect(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) if (!young)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) entry = pte_mkold(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) if (soft_dirty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) entry = pte_mksoft_dirty(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) if (uffd_wp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) entry = pte_mkuffd_wp(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) pte = pte_offset_map(&_pmd, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) BUG_ON(!pte_none(*pte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) set_pte_at(mm, addr, pte, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) if (!pmd_migration)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) atomic_inc(&page[i]._mapcount);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) pte_unmap(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) if (!pmd_migration) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) * Set PG_double_map before dropping compound_mapcount to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) * false-negative page_mapped().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) if (compound_mapcount(page) > 1 &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) !TestSetPageDoubleMap(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) for (i = 0; i < HPAGE_PMD_NR; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) atomic_inc(&page[i]._mapcount);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) lock_page_memcg(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) /* Last compound_mapcount is gone. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) __dec_lruvec_page_state(page, NR_ANON_THPS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) if (TestClearPageDoubleMap(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) /* No need in mapcount reference anymore */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) for (i = 0; i < HPAGE_PMD_NR; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) atomic_dec(&page[i]._mapcount);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) unlock_page_memcg(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) smp_wmb(); /* make pte visible before pmd */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) pmd_populate(mm, pmd, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) if (freeze) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) for (i = 0; i < HPAGE_PMD_NR; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) page_remove_rmap(page + i, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) put_page(page + i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206)
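/*
 * Split the huge pmd covering @address, if any. Takes the pmd lock, locks
 * the anonymous head page when needed, and wraps the actual split in
 * mmu_notifier invalidate_range_start()/only_end() calls. When @freeze is
 * set, @page must be the locked page whose mapping is to be frozen into
 * migration entries.
 */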
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) unsigned long address, bool freeze, struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) bool do_unlock_page = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) pmd_t _pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) address & HPAGE_PMD_MASK,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) ptl = pmd_lock(vma->vm_mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) * If the caller asks to set up migration entries, we need a page to check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) * the pmd against. Otherwise we can end up replacing the wrong page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) VM_BUG_ON(freeze && !page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) if (page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) VM_WARN_ON_ONCE(!PageLocked(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) if (page != pmd_page(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) repeat:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) if (pmd_trans_huge(*pmd)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) if (!page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) page = pmd_page(*pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) * An anonymous page must be locked, to ensure that a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) * concurrent reuse_swap_page() sees stable mapcount;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) * but reuse_swap_page() is not used on shmem or file,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) * and page lock must not be taken when zap_pmd_range()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) * calls __split_huge_pmd() while i_mmap_lock is held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) if (PageAnon(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) if (unlikely(!trylock_page(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) get_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) _pmd = *pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) lock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) spin_lock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) if (unlikely(!pmd_same(*pmd, _pmd))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) goto repeat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) do_unlock_page = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) if (PageMlocked(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) clear_page_mlock(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) __split_huge_pmd_locked(vma, pmd, range.start, freeze);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) if (do_unlock_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) * No need to double call mmu_notifier->invalidate_range() callback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) * There are 3 cases to consider inside __split_huge_pmd_locked():
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) * 1) pmdp_huge_clear_flush_notify() calls invalidate_range(), which is the obvious case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) * 2) __split_huge_zero_page_pmd() maps only the read-only zero page, and any write
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) *    fault will trigger a flush_notify before pointing to a new page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) *    (it is fine if the secondary mmu keeps pointing to the old zero
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) *    page in the meantime).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) * 3) Splitting a huge pmd into ptes pointing to the same page. No need
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) *    to invalidate the secondary tlb entries; they are all still valid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) *    and any further changes to individual ptes will notify. So there is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) *    no need to call mmu_notifier->invalidate_range().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) mmu_notifier_invalidate_range_only_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285)
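/*
 * Walk the page tables down to the pmd covering @address and split it via
 * __split_huge_pmd(). Returns silently if any intermediate level is not
 * present.
 */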
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) bool freeze, struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) pgd_t *pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) p4d_t *p4d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) pud_t *pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) pmd_t *pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) pgd = pgd_offset(vma->vm_mm, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295) if (!pgd_present(*pgd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) p4d = p4d_offset(pgd, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) if (!p4d_present(*p4d))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) pud = pud_offset(p4d, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) if (!pud_present(*pud))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) pmd = pmd_offset(pud, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) __split_huge_pmd(vma, pmd, address, freeze, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310)
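/*
 * Keep huge pmds from straddling the adjusted VMA boundaries: split any huge
 * pmd that maps the new start, the new end, or (when @adjust_next is used)
 * the next VMA's new start whenever that address is not hugepage aligned.
 */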
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) void vma_adjust_trans_huge(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) unsigned long start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) unsigned long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) long adjust_next)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) * If the new start address isn't hpage aligned and it could
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) * previously contain a hugepage: check if we need to split
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) * a huge pmd.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) if (start & ~HPAGE_PMD_MASK &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) (start & HPAGE_PMD_MASK) >= vma->vm_start &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) split_huge_pmd_address(vma, start, false, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) * If the new end address isn't hpage aligned and it could
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) * previously contain a hugepage: check if we need to split
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) * a huge pmd.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) if (end & ~HPAGE_PMD_MASK &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) (end & HPAGE_PMD_MASK) >= vma->vm_start &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) split_huge_pmd_address(vma, end, false, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) * If we're also updating the vma->vm_next->vm_start, and the new
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) * vm_next->vm_start isn't hpage aligned and could previously
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) * contain a hugepage: check if we need to split a huge pmd.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) if (adjust_next > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) struct vm_area_struct *next = vma->vm_next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) unsigned long nstart = next->vm_start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) nstart += adjust_next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) if (nstart & ~HPAGE_PMD_MASK &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) split_huge_pmd_address(next, nstart, false, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) static void unmap_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356)
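/*
 * TTU_SPLIT_HUGE_PMD makes the rmap walk split any PMD mapping of the
 * page before unmapping it.  For anonymous pages TTU_SPLIT_FREEZE
 * (added below) installs migration entries instead of just zapping the
 * mappings, so remap_page() can restore them after the split (or if it
 * fails); file pages can simply be re-faulted from the page cache.
 */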
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) VM_BUG_ON_PAGE(!PageHead(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) if (PageAnon(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) ttu_flags |= TTU_SPLIT_FREEZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) try_to_unmap(page, ttu_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) static void remap_page(struct page *page, unsigned int nr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) if (PageTransHuge(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) remove_migration_ptes(page, page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) for (i = 0; i < nr; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) remove_migration_ptes(page + i, page + i, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) static void __split_huge_page_tail(struct page *head, int tail,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) struct lruvec *lruvec, struct list_head *list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) struct page *page_tail = head + tail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) * Clone page flags before unfreezing refcount.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) * A flags change might follow a successful get_page_unless_zero(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) * for example lock_page(), which sets PG_waiters.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) page_tail->flags |= (head->flags &
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393) ((1L << PG_referenced) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) (1L << PG_swapbacked) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) (1L << PG_swapcache) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) (1L << PG_mlocked) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) (1L << PG_uptodate) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) (1L << PG_active) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399) (1L << PG_workingset) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400) (1L << PG_locked) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401) (1L << PG_unevictable) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402) #ifdef CONFIG_64BIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) (1L << PG_arch_2) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405) (1L << PG_dirty)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) /* ->mapping in first tail page is compound_mapcount */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408) VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) page_tail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) page_tail->mapping = head->mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411) page_tail->index = head->index + tail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) /* Page flags must be visible before we make the page non-compound. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) smp_wmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) * Clear PageTail before unfreezing page refcount.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) * A put_page() might follow a successful get_page_unless_zero(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) * and it needs a correct compound_head().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) clear_compound_head(page_tail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424) /* Finally unfreeze refcount. Additional reference from page cache. */
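/*
 * Anonymous tails that are not in the swap cache are unfrozen to a
 * refcount of 1; file-backed tails and tails in the swap cache get 2,
 * the extra one being the page cache / swap cache reference that now
 * points at the tail page.
 */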
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) PageSwapCache(head)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) if (page_is_young(head))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) set_page_young(page_tail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) if (page_is_idle(head))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) set_page_idle(page_tail);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) * Always add to the tail because some iterators expect new
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) * pages to show up after the currently processed elements - e.g.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) * migrate_pages().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) lru_add_page_tail(head, page_tail, lruvec, list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) static void __split_huge_page(struct page *page, struct list_head *list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) pgoff_t end, unsigned long flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) struct page *head = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) pg_data_t *pgdat = page_pgdat(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) struct lruvec *lruvec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) struct address_space *swap_cache = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) unsigned long offset = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451) unsigned int nr = thp_nr_pages(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) lruvec = mem_cgroup_page_lruvec(head, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) /* complete the memcg work before adding pages to the LRU */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) split_page_memcg(head, nr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459) if (PageAnon(head) && PageSwapCache(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) swp_entry_t entry = { .val = page_private(head) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462) offset = swp_offset(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) swap_cache = swap_address_space(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) xa_lock(&swap_cache->i_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466)
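/*
 * Walk the tails from the last subpage down to 1, turning each into an
 * independent page and fixing up its page cache or swap cache slot;
 * subpage 0 stays as the (about to be non-compound) head page.
 */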
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) for (i = nr - 1; i >= 1; i--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) __split_huge_page_tail(head, i, lruvec, list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) /* Some pages can be beyond i_size: drop them from page cache */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470) if (head[i].index >= end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) ClearPageDirty(head + i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) __delete_from_page_cache(head + i, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) shmem_uncharge(head->mapping->host, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) put_page(head + i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) } else if (!PageAnon(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) __xa_store(&head->mapping->i_pages, head[i].index,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) head + i, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479) } else if (swap_cache) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) __xa_store(&swap_cache->i_pages, offset + i,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481) head + i, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485) ClearPageCompound(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487) split_page_owner(head, nr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) /* See comment in __split_huge_page_tail() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) if (PageAnon(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) /* Additional pin to swap cache */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) if (PageSwapCache(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) page_ref_add(head, 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) xa_unlock(&swap_cache->i_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) page_ref_inc(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) /* Additional pin to page cache */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) page_ref_add(head, 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) xa_unlock(&head->mapping->i_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) spin_unlock_irqrestore(&pgdat->lru_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) remap_page(head, nr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) if (PageSwapCache(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) swp_entry_t entry = { .val = page_private(head) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) split_swap_cluster(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) for (i = 0; i < nr; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) struct page *subpage = head + i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) if (subpage == page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) unlock_page(subpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) * Subpages may be freed if there wasn't any mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) * e.g. if add_to_swap() is running on an LRU page that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) * had its mapping zapped. Freeing these pages requires
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) * taking the lru_lock, so we do the put_page of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) * tail pages after the split is complete.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) put_page(subpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530)
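/*
 * Sum of all mappings of the compound page: the PMD mappings counted in
 * compound_mapcount plus every PTE mapping of every subpage.
 *
 * Worked example for an anonymous THP: PMD-mapped by a parent and its
 * fork child gives compound_mapcount == 2 with untouched subpage
 * _mapcounts, so this returns 2; if the child instead had the THP
 * PTE-mapped (after a pmd split), each of its nr subpage mappings would
 * be counted individually and the result would be 1 + nr.
 */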
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) int total_mapcount(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) int i, compound, nr, ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) VM_BUG_ON_PAGE(PageTail(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) if (likely(!PageCompound(page)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) return atomic_read(&page->_mapcount) + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) compound = compound_mapcount(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) nr = compound_nr(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) if (PageHuge(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543) return compound;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) ret = compound;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) for (i = 0; i < nr; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) ret += atomic_read(&page[i]._mapcount) + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) /* File pages have compound_mapcount included in _mapcount */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) if (!PageAnon(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549) return ret - compound * nr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) if (PageDoubleMap(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551) ret -= nr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) * This calculates accurately how many mappings a transparent hugepage
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557) * has (unlike page_mapcount() which isn't fully accurate). This full
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) * accuracy is primarily needed to know if copy-on-write faults can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) * reuse the page and change the mapping to read-write instead of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) * copying it. At the same time this returns the total_mapcount too.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) * The function returns the highest mapcount any one of the subpages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) * has. If the return value is one, even if different processes are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) * mapping different subpages of the transparent hugepage, they can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565) * all reuse it, because each process is reusing a different subpage.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) * The total_mapcount is instead counting all virtual mappings of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) * subpages. If the total_mapcount is equal to "one", it tells the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) * caller all mappings belong to the same "mm" and in turn the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) * anon_vma of the transparent hugepage can become the vma->anon_vma
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) * local one as no other process may be mapping any of the subpages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) * It would be more accurate to replace page_mapcount() with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574) * page_trans_huge_mapcount(), but page_trans_huge_mapcount() is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) * slower than page_mapcount(), so we only use it in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576) * copy-on-write faults, where full accuracy is needed to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) * breaking page pinning.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579) int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) int i, ret, _total_mapcount, mapcount;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) /* hugetlbfs shouldn't call it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) VM_BUG_ON_PAGE(PageHuge(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) if (likely(!PageTransCompound(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) mapcount = atomic_read(&page->_mapcount) + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) if (total_mapcount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) *total_mapcount = mapcount;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) return mapcount;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) page = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595) _total_mapcount = ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) for (i = 0; i < thp_nr_pages(page); i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2597) mapcount = atomic_read(&page[i]._mapcount) + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2598) ret = max(ret, mapcount);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599) _total_mapcount += mapcount;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601) if (PageDoubleMap(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602) ret -= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) _total_mapcount -= thp_nr_pages(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) mapcount = compound_mapcount(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) ret += mapcount;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) _total_mapcount += mapcount;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) if (total_mapcount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609) *total_mapcount = _total_mapcount;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) /* Racy check whether the huge page can be split */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) bool can_split_huge_page(struct page *page, int *pextra_pins)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616) int extra_pins;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618) /* Additional pins from page cache */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619) if (PageAnon(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) extra_pins = thp_nr_pages(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) if (pextra_pins)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) *pextra_pins = extra_pins;
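/*
 * The split can only proceed if the only references are the caller's
 * pin, one per mapping, and the page cache / swap cache pins counted
 * above; any other pin (e.g. a GUP reference) makes this check fail.
 */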
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) return total_mapcount(page) == page_count(page) - extra_pins - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) * This function splits a huge page into normal pages. @page can point to any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) * subpage of the huge page to split. A split doesn't change the position of @page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) * The caller must hold a pin on the @page; otherwise the split fails with -EBUSY.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633) * The huge page must be locked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635) * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) * Both head page and tail pages will inherit mapping, flags, and so on from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638) * the hugepage.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) * The GUP pin and PG_locked are transferred to @page. The rest of the subpages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) * can be freed if they are not mapped.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) * Returns 0 if the hugepage is split successfully.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644) * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645) * us.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) int split_huge_page_to_list(struct page *page, struct list_head *list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649) struct page *head = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650) struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) struct deferred_split *ds_queue = get_deferred_split_queue(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652) struct anon_vma *anon_vma = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) struct address_space *mapping = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) int extra_pins, ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) pgoff_t end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658) VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659) VM_BUG_ON_PAGE(!PageLocked(head), head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) VM_BUG_ON_PAGE(!PageCompound(head), head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) if (PageWriteback(head))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) return -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) if (PageAnon(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) * The caller does not necessarily hold an mmap_lock that would
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) * prevent the anon_vma from disappearing, so we first take a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) * reference to it and then lock the anon_vma for write. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670) * is similar to page_lock_anon_vma_read except the write lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) * is taken to serialise against parallel split or collapse
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) * operations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) anon_vma = page_get_anon_vma(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) if (!anon_vma) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676) ret = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) }
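/*
 * Anonymous memory has no i_size to honour: end == -1 is the largest
 * possible pgoff_t, so __split_huge_page() never treats an anon tail
 * page as being beyond EOF.
 */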
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) end = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) mapping = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) anon_vma_lock_write(anon_vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) mapping = head->mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) /* Truncated? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686) if (!mapping) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687) ret = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691) anon_vma = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692) i_mmap_lock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) * __split_huge_page() may need to trim off pages beyond EOF:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) * which cannot be nested inside the page tree lock. So note
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698) * end now: i_size itself may be changed at any moment, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) * head page lock is good enough to serialize the trimming.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701) end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) * Racy check if we can split the page, before unmap_page() will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) * split PMDs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) if (!can_split_huge_page(head, &extra_pins)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) ret = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) unmap_page(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) /* prevent PageLRU from going away from under us, and freeze the LRU stats */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) spin_lock_irqsave(&pgdata->lru_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) if (mapping) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719) XA_STATE(xas, &mapping->i_pages, page_index(head));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) * Check if the head page is present in page cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) * We assume all tail pages are present too, if the head is there.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) xa_lock(&mapping->i_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) if (xas_load(&xas) != head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730) /* Prevent deferred_split_scan() from touching ->_refcount */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) spin_lock(&ds_queue->split_queue_lock);
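/*
 * page_ref_freeze() only succeeds if the refcount is exactly the
 * caller's pin plus the page cache / swap cache pins counted by
 * can_split_huge_page(); it drops the refcount to zero so no new
 * references can be taken while the page is being split up.
 */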
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) if (page_ref_freeze(head, 1 + extra_pins)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) if (!list_empty(page_deferred_list(head))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) ds_queue->split_queue_len--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) list_del(page_deferred_list(head));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) spin_unlock(&ds_queue->split_queue_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738) if (mapping) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) if (PageSwapBacked(head))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) __dec_node_page_state(head, NR_SHMEM_THPS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) __dec_node_page_state(head, NR_FILE_THPS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) __split_huge_page(page, list, end, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) spin_unlock(&ds_queue->split_queue_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) fail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) if (mapping)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751) xa_unlock(&mapping->i_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) spin_unlock_irqrestore(&pgdata->lru_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) remap_page(head, thp_nr_pages(head));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754) ret = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) if (anon_vma) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) anon_vma_unlock_write(anon_vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) put_anon_vma(anon_vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) if (mapping)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) void free_transhuge_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771) struct deferred_split *ds_queue = get_deferred_split_queue(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773)
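/*
 * This is the compound page destructor: the last reference is gone, so
 * make sure the page is no longer queued for deferred splitting before
 * its struct pages are handed back to the allocator.
 */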
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774) spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) if (!list_empty(page_deferred_list(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776) ds_queue->split_queue_len--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777) list_del(page_deferred_list(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780) free_compound_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) void deferred_split_huge_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) struct deferred_split *ds_queue = get_deferred_split_queue(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786) #ifdef CONFIG_MEMCG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787) struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791) VM_BUG_ON_PAGE(!PageTransHuge(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) * The try_to_unmap() in the page reclaim path might reach here too;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795) * this may cause a race condition that corrupts the deferred split
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796) * queue. Also, if page reclaim is already handling the same page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) * it is unnecessary to handle it again in the shrinker.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799) * Check PageSwapCache to determine if the page is being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800) * handled by page reclaim since THP swap would add the page into
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) * swap cache before calling try_to_unmap().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803) if (PageSwapCache(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) if (list_empty(page_deferred_list(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808) count_vm_event(THP_DEFERRED_SPLIT_PAGE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810) ds_queue->split_queue_len++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) #ifdef CONFIG_MEMCG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812) if (memcg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) memcg_set_shrinker_bit(memcg, page_to_nid(page),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) deferred_split_shrinker.id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) static unsigned long deferred_split_count(struct shrinker *shrink,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) struct shrink_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) struct pglist_data *pgdata = NODE_DATA(sc->nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824) struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826) #ifdef CONFIG_MEMCG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) if (sc->memcg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) ds_queue = &sc->memcg->deferred_split_queue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) return READ_ONCE(ds_queue->split_queue_len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) static unsigned long deferred_split_scan(struct shrinker *shrink,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) struct shrink_control *sc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836) struct pglist_data *pgdata = NODE_DATA(sc->nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) LIST_HEAD(list), *pos, *next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) int split = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) #ifdef CONFIG_MEMCG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) if (sc->memcg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) ds_queue = &sc->memcg->deferred_split_queue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849) /* Take a pin on all head pages so they cannot be freed under us */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) list_for_each_safe(pos, next, &ds_queue->split_queue) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851) page = list_entry((void *)pos, struct page, mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) page = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) if (get_page_unless_zero(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) list_move(page_deferred_list(page), &list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) /* We lost race with put_compound_page() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857) list_del_init(page_deferred_list(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) ds_queue->split_queue_len--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) if (!--sc->nr_to_scan)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865) list_for_each_safe(pos, next, &list) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866) page = list_entry((void *)pos, struct page, mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) if (!trylock_page(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) goto next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869) /* split_huge_page() removes page from list on success */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) if (!split_huge_page(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) split++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) next:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876)
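/*
 * Pages we managed to split were removed from the local list by
 * split_huge_page(); put whatever is left back on the queue so a later
 * scan can retry it.
 */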
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878) list_splice_tail(&list, &ds_queue->split_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879) spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882) * Stop the shrinker if we didn't split any page but the queue is empty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) * This can happen if the pages were freed under us.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2885) if (!split && list_empty(&ds_queue->split_queue))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2886) return SHRINK_STOP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2887) return split;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2888) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) static struct shrinker deferred_split_shrinker = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) .count_objects = deferred_split_count,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) .scan_objects = deferred_split_scan,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) .seeks = DEFAULT_SEEKS,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894) .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) SHRINKER_NONSLAB,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) #ifdef CONFIG_DEBUG_FS
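/*
 * Writing 1 to the split_huge_pages debugfs file walks every valid pfn
 * in every populated zone and tries to split each THP it finds on the
 * LRU, then reports via pr_info() how many were split.
 */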
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) static int split_huge_pages_set(void *data, u64 val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2903) unsigned long pfn, max_zone_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) unsigned long total = 0, split = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) if (val != 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) for_each_populated_zone(zone) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910) max_zone_pfn = zone_end_pfn(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911) for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) if (!pfn_valid(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) if (!get_page_unless_zero(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) if (zone != page_zone(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920) goto next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) goto next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) total++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) lock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) if (!split_huge_page(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928) split++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) next:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) pr_info("%lu of %lu THP split\n", split, total);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939) DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) "%llu\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) static int __init split_huge_pages_debugfs(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945) &split_huge_pages_fops);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) late_initcall(split_huge_pages_debugfs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952) void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955) struct vm_area_struct *vma = pvmw->vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957) unsigned long address = pvmw->address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) pmd_t pmdval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) swp_entry_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960) pmd_t pmdswp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961)
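/*
 * Only a PMD-mapped THP is handled here; page_vma_mapped_walk() sets
 * pvmw->pte when the page is mapped at the PTE level instead.
 */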
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962) if (!(pvmw->pmd && !pvmw->pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) if (pmd_dirty(pmdval))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968) set_page_dirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) entry = make_migration_entry(page, pmd_write(pmdval));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970) pmdswp = swp_entry_to_pmd(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) if (pmd_soft_dirty(pmdval))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) pmdswp = pmd_swp_mksoft_dirty(pmdswp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) set_pmd_at(mm, address, pvmw->pmd, pmdswp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974) page_remove_rmap(page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) struct vm_area_struct *vma = pvmw->vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982) unsigned long address = pvmw->address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983) unsigned long mmun_start = address & HPAGE_PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984) pmd_t pmde;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985) swp_entry_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987) if (!(pvmw->pmd && !pvmw->pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990) entry = pmd_to_swp_entry(*pvmw->pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991) get_page(new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) if (pmd_swp_soft_dirty(*pvmw->pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994) pmde = pmd_mksoft_dirty(pmde);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) if (is_write_migration_entry(entry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) pmde = maybe_pmd_mkwrite(pmde, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997) if (pmd_swp_uffd_wp(*pvmw->pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001) if (PageAnon(new))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) page_add_anon_rmap(new, vma, mmun_start, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) page_add_file_rmap(new, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006) if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007) mlock_vma_page(new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) update_mmu_cache_pmd(vma, address, pvmw->pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010) #endif