// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"

enum scan_result {
	SCAN_FAIL,
	SCAN_SUCCEED,
	SCAN_PMD_NULL,
	SCAN_EXCEED_NONE_PTE,
	SCAN_EXCEED_SWAP_PTE,
	SCAN_EXCEED_SHARED_PTE,
	SCAN_PTE_NON_PRESENT,
	SCAN_PTE_UFFD_WP,
	SCAN_PAGE_RO,
	SCAN_LACK_REFERENCED_PAGE,
	SCAN_PAGE_NULL,
	SCAN_SCAN_ABORT,
	SCAN_PAGE_COUNT,
	SCAN_PAGE_LRU,
	SCAN_PAGE_LOCK,
	SCAN_PAGE_ANON,
	SCAN_PAGE_COMPOUND,
	SCAN_ANY_PROCESS,
	SCAN_VMA_NULL,
	SCAN_VMA_CHECK,
	SCAN_ADDRESS_RANGE,
	SCAN_SWAP_CACHE_PAGE,
	SCAN_DEL_PAGE_LRU,
	SCAN_ALLOC_HUGE_PAGE_FAIL,
	SCAN_CGROUP_CHARGE_FAIL,
	SCAN_TRUNCATED,
	SCAN_PAGE_HAS_PRIVATE,
};

#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>

static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);

/* default scan 8*512 ptes (or vmas) every 10 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static unsigned long khugepaged_sleep_expire;
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * By default, collapse a hugepage if there is at least one pte mapped,
 * as it would have been mapped had the vma been large enough at
 * page-fault time.
 */
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;

#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct kmem_cache *mm_slot_cache __read_mostly;

#define MAX_PTE_MAPPED_THP 8

/**
 * struct mm_slot - hash lookup from mm to mm_slot
 * @hash: hash collision list
 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
 * @mm: the mm that this information is valid for
 * @nr_pte_mapped_thp: number of pte-mapped THPs recorded in @pte_mapped_thp
 * @pte_mapped_thp: addresses of pte-mapped THPs in this mm
 */
struct mm_slot {
	struct hlist_node hash;
	struct list_head mm_node;
	struct mm_struct *mm;

	/* pte-mapped THP in this mm */
	int nr_pte_mapped_thp;
	unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that mm to be scanned
 *
 * There is only one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct mm_slot *mm_slot;
	unsigned long address;
};

static struct khugepaged_scan khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};


#ifdef CONFIG_SYSFS
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}

static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_scan_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
	       scan_sleep_millisecs_store);

static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}

static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_alloc_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
	       alloc_sleep_millisecs_store);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long pages;

	err = kstrtoul(buf, 10, &pages);
	if (err || !pages || pages > UINT_MAX)
		return -EINVAL;

	khugepaged_pages_to_scan = pages;

	return count;
}
static struct kobj_attribute pages_to_scan_attr =
	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
	       pages_to_scan_store);

static ssize_t pages_collapsed_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
	__ATTR_RO(pages_collapsed);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
	__ATTR_RO(full_scans);

static ssize_t khugepaged_defrag_show(struct kobject *kobj,
				      struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t khugepaged_defrag_store(struct kobject *kobj,
				       struct kobj_attribute *attr,
				       const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
					  TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
	__ATTR(defrag, 0644, khugepaged_defrag_show,
	       khugepaged_defrag_store);

/*
 * max_ptes_none controls whether khugepaged should collapse hugepages
 * over any unmapped ptes, in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
 * reduce the available free memory in the system as it runs.
 * Increasing max_ptes_none will instead potentially reduce the free
 * memory in the system during the khugepaged scan.
 */
static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
}
static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_none;

	err = kstrtoul(buf, 10, &max_ptes_none);
	if (err || max_ptes_none > HPAGE_PMD_NR - 1)
		return -EINVAL;

	khugepaged_max_ptes_none = max_ptes_none;

	return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
	       khugepaged_max_ptes_none_store);

static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
}

static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_swap;

	err = kstrtoul(buf, 10, &max_ptes_swap);
	if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
		return -EINVAL;

	khugepaged_max_ptes_swap = max_ptes_swap;

	return count;
}

static struct kobj_attribute khugepaged_max_ptes_swap_attr =
	__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
	       khugepaged_max_ptes_swap_store);

static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
					       struct kobj_attribute *attr,
					       char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
}

static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
						struct kobj_attribute *attr,
						const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_shared;

	err = kstrtoul(buf, 10, &max_ptes_shared);
	if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
		return -EINVAL;

	khugepaged_max_ptes_shared = max_ptes_shared;

	return count;
}

static struct kobj_attribute khugepaged_max_ptes_shared_attr =
	__ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
	       khugepaged_max_ptes_shared_store);

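/*
 * The tunables above are exposed via sysfs. The attribute group below is
 * registered by the THP sysfs init code (see mm/huge_memory.c) under the
 * transparent_hugepage kobject, so the knobs appear as
 * /sys/kernel/mm/transparent_hugepage/khugepaged/<attribute>.
 */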
static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&khugepaged_max_ptes_swap_attr.attr,
	&khugepaged_max_ptes_shared_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	NULL,
};

struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
};
#endif /* CONFIG_SYSFS */

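/*
 * Called from madvise(2) for MADV_HUGEPAGE and MADV_NOHUGEPAGE: update
 * the VM_HUGEPAGE/VM_NOHUGEPAGE bits in *vm_flags and, for MADV_HUGEPAGE,
 * register the mm with khugepaged right away instead of waiting for a
 * page fault.
 */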
int hugepage_madvise(struct vm_area_struct *vma,
		     unsigned long *vm_flags, int advice)
{
	switch (advice) {
	case MADV_HUGEPAGE:
#ifdef CONFIG_S390
		/*
		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
		 * can't handle this properly after s390_enable_sie, so we simply
		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
		 */
		if (mm_has_pgste(vma->vm_mm))
			return 0;
#endif
		*vm_flags &= ~VM_NOHUGEPAGE;
		*vm_flags |= VM_HUGEPAGE;
		/*
		 * If the vma becomes suitable for khugepaged to scan,
		 * register it here without waiting for a page fault that
		 * may not happen any time soon.
		 */
		if (!(*vm_flags & VM_NO_KHUGEPAGED) &&
				khugepaged_enter_vma_merge(vma, *vm_flags))
			return -ENOMEM;
		break;
	case MADV_NOHUGEPAGE:
		*vm_flags &= ~VM_HUGEPAGE;
		*vm_flags |= VM_NOHUGEPAGE;
		/*
		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
		 * this vma even if the mm remains registered in khugepaged
		 * (it may have been registered before VM_NOHUGEPAGE was set).
		 */
		break;
	}

	return 0;
}

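/*
 * Set up the mm_slot cache and the default tunables. For example, on
 * x86-64 with 4K base pages HPAGE_PMD_NR is 512, so the defaults below
 * come out to pages_to_scan = 4096, max_ptes_none = 511,
 * max_ptes_swap = 64 and max_ptes_shared = 256.
 */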
int __init khugepaged_init(void)
{
	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
					  sizeof(struct mm_slot),
					  __alignof__(struct mm_slot), 0, NULL);
	if (!mm_slot_cache)
		return -ENOMEM;

	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
	khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;

	return 0;
}

void __init khugepaged_destroy(void)
{
	kmem_cache_destroy(mm_slot_cache);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;

	hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
		if (mm == mm_slot->mm)
			return mm_slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
}

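/*
 * An mm is considered "exiting" once its last user reference is gone
 * (mm_users == 0); khugepaged must then stop touching its page tables.
 */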
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

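/*
 * Decide whether a vma is a candidate for collapse: THP must be enabled
 * for it, file-backed vmas must be suitably aligned, shmem follows its
 * own mount/sysfs policy, read-only executable files are allowed when
 * CONFIG_READ_ONLY_THP_FOR_FS is set, and only plain anonymous vmas
 * (no special mappings, no temporary stacks) qualify otherwise.
 */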
static bool hugepage_vma_check(struct vm_area_struct *vma,
			       unsigned long vm_flags)
{
	if (!transhuge_vma_enabled(vma, vm_flags))
		return false;

	if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
				vma->vm_pgoff, HPAGE_PMD_NR))
		return false;

	/* Enabled via shmem mount options or sysfs settings. */
	if (shmem_file(vma->vm_file))
		return shmem_huge_enabled(vma);

	/* THP settings require madvise. */
	if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
		return false;

	/* Only regular files are valid. */
	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
	    !inode_is_open_for_write(vma->vm_file->f_inode) &&
	    (vm_flags & VM_EXEC)) {
		struct inode *inode = vma->vm_file->f_inode;

		return S_ISREG(inode->i_mode);
	}

	if (!vma->anon_vma || vma->vm_ops)
		return false;
	if (vma_is_temporary_stack(vma))
		return false;
	return !(vm_flags & VM_NO_KHUGEPAGED);
}

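/*
 * Register an mm with khugepaged: allocate an mm_slot, add it to the
 * hash and to the tail of the scan list, take an mmgrab() reference and
 * wake the daemon if the scan list was previously empty.
 */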
int __khugepaged_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
		free_mm_slot(mm_slot);
		return 0;
	}

	spin_lock(&khugepaged_mm_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little.
	 */
	wakeup = list_empty(&khugepaged_scan.mm_head);
	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	mmgrab(mm);
	if (wakeup)
		wake_up_interruptible(&khugepaged_wait);

	return 0;
}

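/*
 * Register the mm only if the vma can cover at least one properly
 * aligned PMD-sized range: hstart rounds vm_start up and hend rounds
 * vm_end down to HPAGE_PMD_SIZE. For example, on x86-64 with 4K pages
 * (HPAGE_PMD_SIZE == 2M) a vma spanning 0x201000-0x5ff000 has
 * hstart == 0x400000 and hend == 0x400000, so nothing can be collapsed
 * and registration is skipped.
 */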
int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
			       unsigned long vm_flags)
{
	unsigned long hstart, hend;

	/*
	 * khugepaged only supports read-only files for non-shmem mappings,
	 * does not yet work on special mappings, and file-private shmem
	 * THP is not supported.
	 */
	if (!hugepage_vma_check(vma, vm_flags))
		return 0;

	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = vma->vm_end & HPAGE_PMD_MASK;
	if (hstart < hend)
		return khugepaged_enter(vma, vm_flags);
	return 0;
}

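/*
 * Unregister an mm from khugepaged. If the scan cursor is not currently
 * on this mm, the slot and the mm reference are dropped here; otherwise
 * the slot is left for khugepaged to clean up via khugepaged_test_exit()
 * and the mmap write lock is taken and released to make sure khugepaged
 * is done with the page tables before they are torn down.
 */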
void __khugepaged_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int free = 0;

	spin_lock(&khugepaged_mm_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
		hash_del(&mm_slot->hash);
		list_del(&mm_slot->mm_node);
		free = 1;
	}
	spin_unlock(&khugepaged_mm_lock);

	if (free) {
		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		free_mm_slot(mm_slot);
		mmdrop(mm);
	} else if (mm_slot) {
		/*
		 * This is required to serialize against
		 * khugepaged_test_exit() (which is guaranteed to run
		 * under mmap sem read mode). Stop here (after we return,
		 * all page tables will be destroyed) until khugepaged
		 * has finished working on the page tables under the
		 * mmap_lock.
		 */
		mmap_write_lock(mm);
		mmap_write_unlock(mm);
	}
}

static void release_pte_page(struct page *page)
{
	mod_node_page_state(page_pgdat(page),
			NR_ISOLATED_ANON + page_is_file_lru(page),
			-compound_nr(page));
	unlock_page(page);
	putback_lru_page(page);
}

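/*
 * Undo the isolation done so far: walk back over the ptes already
 * processed and release every isolated non-compound page, then release
 * the compound pages collected on @compound_pagelist.
 */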
static void release_pte_pages(pte_t *pte, pte_t *_pte,
		struct list_head *compound_pagelist)
{
	struct page *page, *tmp;

	while (--_pte >= pte) {
		pte_t pteval = *_pte;

		page = pte_page(pteval);
		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
				!PageCompound(page))
			release_pte_page(page);
	}

	list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
		list_del(&page->lru);
		release_pte_page(page);
	}
}

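/*
 * A page is only safe to collapse if nothing besides the page tables
 * (and, for pages in the swap cache, the swap cache itself) holds a
 * reference to it. For instance, a page mapped by a single pte and not
 * in the swap cache should have page_count() == 1; anything higher
 * indicates an extra pin such as GUP.
 */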
static bool is_refcount_suitable(struct page *page)
{
	int expected_refcount;

	expected_refcount = total_mapcount(page);
	if (PageSwapCache(page))
		expected_refcount += compound_nr(page);

	return page_count(page) == expected_refcount;
}

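/*
 * Walk the HPAGE_PMD_NR ptes under @pte and lock + isolate every page
 * they map, enforcing the max_ptes_none/max_ptes_shared limits and
 * refusing pages with extra pins. Returns 1 with all pages locked and
 * off the LRU on success; on failure everything is released again and
 * 0 is returned.
 */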
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
					unsigned long address,
					pte_t *pte,
					struct list_head *compound_pagelist)
{
	struct page *page = NULL;
	pte_t *_pte;
	int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
	bool writable = false;

	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, address += PAGE_SIZE) {
		pte_t pteval = *_pte;
		if (pte_none(pteval) || (pte_present(pteval) &&
				is_zero_pfn(pte_pfn(pteval)))) {
			if (!userfaultfd_armed(vma) &&
			    ++none_or_zero <= khugepaged_max_ptes_none) {
				continue;
			} else {
				result = SCAN_EXCEED_NONE_PTE;
				goto out;
			}
		}
		if (!pte_present(pteval)) {
			result = SCAN_PTE_NON_PRESENT;
			goto out;
		}
		page = vm_normal_page(vma, address, pteval);
		if (unlikely(!page)) {
			result = SCAN_PAGE_NULL;
			goto out;
		}

		VM_BUG_ON_PAGE(!PageAnon(page), page);

		if (page_mapcount(page) > 1 &&
				++shared > khugepaged_max_ptes_shared) {
			result = SCAN_EXCEED_SHARED_PTE;
			goto out;
		}

		if (PageCompound(page)) {
			struct page *p;
			page = compound_head(page);

			/*
			 * Check if we have dealt with the compound page
			 * already
			 */
			list_for_each_entry(p, compound_pagelist, lru) {
				if (page == p)
					goto next;
			}
		}

		/*
		 * We can do it before isolate_lru_page because the
		 * page can't be freed from under us. NOTE: PG_lock
		 * is needed to serialize against split_huge_page
		 * when invoked from the VM.
		 */
		if (!trylock_page(page)) {
			result = SCAN_PAGE_LOCK;
			goto out;
		}

		/*
		 * Check if the page has any GUP (or other external) pins.
		 *
		 * The page table that maps the page has already been
		 * unlinked from the page table tree and this process
		 * cannot get an additional pin on the page.
		 *
		 * New pins can come later if the page is shared across fork,
		 * but not from this process. The other process cannot write to
		 * the page, only trigger CoW.
		 */
		if (!is_refcount_suitable(page)) {
			unlock_page(page);
			result = SCAN_PAGE_COUNT;
			goto out;
		}
		if (!pte_write(pteval) && PageSwapCache(page) &&
				!reuse_swap_page(page, NULL)) {
			/*
			 * Page is in the swap cache and cannot be re-used.
			 * It cannot be collapsed into a THP.
			 */
			unlock_page(page);
			result = SCAN_SWAP_CACHE_PAGE;
			goto out;
		}

		/*
		 * Isolate the page to avoid collapsing a hugepage
		 * currently in use by the VM.
		 */
		if (isolate_lru_page(page)) {
			unlock_page(page);
			result = SCAN_DEL_PAGE_LRU;
			goto out;
		}
		mod_node_page_state(page_pgdat(page),
				NR_ISOLATED_ANON + page_is_file_lru(page),
				compound_nr(page));
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		VM_BUG_ON_PAGE(PageLRU(page), page);

		if (PageCompound(page))
			list_add_tail(&page->lru, compound_pagelist);
next:
		/* There should be enough young ptes to collapse the page */
		if (pte_young(pteval) ||
		    page_is_young(page) || PageReferenced(page) ||
		    mmu_notifier_test_young(vma->vm_mm, address))
			referenced++;

		if (pte_write(pteval))
			writable = true;
	}

	if (unlikely(!writable)) {
		result = SCAN_PAGE_RO;
	} else if (unlikely(!referenced)) {
		result = SCAN_LACK_REFERENCED_PAGE;
	} else {
		result = SCAN_SUCCEED;
		trace_mm_collapse_huge_page_isolate(page, none_or_zero,
						    referenced, writable, result);
		return 1;
	}
out:
	release_pte_pages(pte, _pte, compound_pagelist);
	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
					    referenced, writable, result);
	return 0;
}

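/*
 * Copy the contents of the isolated small pages into the new huge
 * @page, clearing destination subpages that had no backing page, and
 * tear down the old ptes and rmap entries as we go. Finally release
 * the compound source pages collected on @compound_pagelist.
 */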
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
				      struct vm_area_struct *vma,
				      unsigned long address,
				      spinlock_t *ptl,
				      struct list_head *compound_pagelist)
{
	struct page *src_page, *tmp;
	pte_t *_pte;
	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
	     _pte++, page++, address += PAGE_SIZE) {
		pte_t pteval = *_pte;

		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
			clear_user_highpage(page, address);
			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
			if (is_zero_pfn(pte_pfn(pteval))) {
				/*
				 * ptl mostly unnecessary.
				 */
				spin_lock(ptl);
				/*
				 * paravirt calls inside pte_clear here are
				 * superfluous.
				 */
				pte_clear(vma->vm_mm, address, _pte);
				spin_unlock(ptl);
			}
		} else {
			src_page = pte_page(pteval);
			copy_user_highpage(page, src_page, address, vma);
			if (!PageCompound(src_page))
				release_pte_page(src_page);
			/*
			 * ptl mostly unnecessary, but preempt has to
			 * be disabled to update the per-cpu stats
			 * inside page_remove_rmap().
			 */
			spin_lock(ptl);
			/*
			 * paravirt calls inside pte_clear here are
			 * superfluous.
			 */
			pte_clear(vma->vm_mm, address, _pte);
			page_remove_rmap(src_page, false);
			spin_unlock(ptl);
			free_page_and_swap_cache(src_page);
		}
	}

	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
		list_del(&src_page->lru);
		release_pte_page(src_page);
	}
}

static void khugepaged_alloc_sleep(void)
{
	DEFINE_WAIT(wait);

	add_wait_queue(&khugepaged_wait, &wait);
	freezable_schedule_timeout_interruptible(
		msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
	remove_wait_queue(&khugepaged_wait, &wait);
}

static int khugepaged_node_load[MAX_NUMNODES];

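/*
 * khugepaged_node_load[] counts, per NUMA node, how many of the pages
 * seen in the current pmd range live on that node. Abort the scan when
 * the pages already found are spread across nodes farther apart than
 * node_reclaim_distance, since collapsing them would force remote
 * accesses for part of the huge page.
 */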
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) static bool khugepaged_scan_abort(int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) * If node_reclaim_mode is disabled, then no extra effort is made to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) * allocate memory locally.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) if (!node_reclaim_mode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) /* If there is a count for this node already, it must be acceptable */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) if (khugepaged_node_load[nid])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) for (i = 0; i < MAX_NUMNODES; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) if (!khugepaged_node_load[i])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) if (node_distance(nid, i) > node_reclaim_distance)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) #ifdef CONFIG_NUMA
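/*
 * Pick the node to allocate the huge page from: the first node with the
 * highest khugepaged_node_load count, rotating among tied nodes so that
 * no single node is always preferred.
 */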
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) static int khugepaged_find_target_node(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) static int last_khugepaged_target_node = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) int nid, target_node = 0, max_value = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839)
	/* find the first node with the max normal pages hit */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) for (nid = 0; nid < MAX_NUMNODES; nid++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) if (khugepaged_node_load[nid] > max_value) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) max_value = khugepaged_node_load[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) target_node = nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846)
	/* do some balancing if several nodes have the same hit record */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) if (target_node <= last_khugepaged_target_node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) nid++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) if (max_value == khugepaged_node_load[nid]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) target_node = nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) last_khugepaged_target_node = target_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) return target_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859)
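/*
 * NUMA: if the previous allocation attempt failed (*hpage is an ERR_PTR),
 * either sleep once or, if the one-shot wait budget (*wait) is already
 * spent, return false to tell the caller to give up for now; otherwise
 * drop any leftover preallocated page.
 */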
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) if (IS_ERR(*hpage)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) if (!*wait)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) *wait = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) *hpage = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) khugepaged_alloc_sleep();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) } else if (*hpage) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) put_page(*hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) *hpage = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876)
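/*
 * NUMA: allocate the huge page on @node and prep it as a transparent huge
 * page. On failure, count the vm event and leave ERR_PTR(-ENOMEM) in
 * *hpage so that khugepaged_prealloc_page() backs off next time.
 */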
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) static struct page *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) VM_BUG_ON_PAGE(*hpage, *hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) if (unlikely(!*hpage)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) *hpage = ERR_PTR(-ENOMEM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) prep_transhuge_page(*hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) count_vm_event(THP_COLLAPSE_ALLOC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) return *hpage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) static int khugepaged_find_target_node(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) static inline struct page *alloc_khugepaged_hugepage(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) HPAGE_PMD_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) prep_transhuge_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909)
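/*
 * !NUMA: retry the allocation, sleeping between failed attempts, until it
 * succeeds, khugepaged is disabled, or the one-shot wait budget (*wait)
 * is spent.
 */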
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) static struct page *khugepaged_alloc_hugepage(bool *wait)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) struct page *hpage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) hpage = alloc_khugepaged_hugepage();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) if (!hpage) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) if (!*wait)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) *wait = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) khugepaged_alloc_sleep();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) count_vm_event(THP_COLLAPSE_ALLOC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) } while (unlikely(!hpage) && likely(khugepaged_enabled()));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) return hpage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) * If the hpage allocated earlier was briefly exposed in page cache
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) * before collapse_file() failed, it is possible that racing lookups
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) * have not yet completed, and would then be unpleasantly surprised by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) * finding the hpage reused for the same mapping at a different offset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) * Just release the previous allocation if there is any danger of that.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) if (*hpage && page_count(*hpage) > 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) put_page(*hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) *hpage = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) if (!*hpage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) *hpage = khugepaged_alloc_hugepage(wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) if (unlikely(!*hpage))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952)
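/* !NUMA: the huge page was already preallocated by khugepaged_prealloc_page() */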
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) static struct page *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) VM_BUG_ON(!*hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) return *hpage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) /*
 * If the mmap_lock was temporarily dropped, revalidate the vma
 * before taking the mmap_lock again.
 * Return 0 if it succeeds, otherwise return a non-zero
 * value (scan code).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) struct vm_area_struct **vmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) struct vm_area_struct *vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) unsigned long hstart, hend;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) if (unlikely(khugepaged_test_exit(mm)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) return SCAN_ANY_PROCESS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) *vmap = vma = find_vma(mm, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) if (!vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) return SCAN_VMA_NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) hend = vma->vm_end & HPAGE_PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) if (address < hstart || address + HPAGE_PMD_SIZE > hend)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) return SCAN_ADDRESS_RANGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) if (!hugepage_vma_check(vma, vma->vm_flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) return SCAN_VMA_CHECK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) /* Anon VMA expected */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) if (!vma->anon_vma || vma->vm_ops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) return SCAN_VMA_CHECK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) * Bring missing pages in from swap, to complete THP collapse.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) * Only done if khugepaged_scan_pmd believes it is worthwhile.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) * Called and returns without pte mapped or spinlocks held,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) * but with mmap_lock held to protect against vma changes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) static bool __collapse_huge_page_swapin(struct mm_struct *mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) unsigned long haddr, pmd_t *pmd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) int referenced)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) int swapped_in = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) vm_fault_t ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) for (address = haddr; address < end; address += PAGE_SIZE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) struct vm_fault vmf = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) .vma = vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) .address = address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) .pgoff = linear_page_index(vma, haddr),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) .flags = FAULT_FLAG_ALLOW_RETRY,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) .pmd = pmd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) .vma_flags = vma->vm_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) .vma_page_prot = vma->vm_page_prot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) vmf.pte = pte_offset_map(pmd, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) vmf.orig_pte = *vmf.pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) if (!is_swap_pte(vmf.orig_pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) pte_unmap(vmf.pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) swapped_in++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) ret = do_swap_page(&vmf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) if (ret & VM_FAULT_RETRY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) mmap_read_lock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) if (hugepage_vma_revalidate(mm, haddr, &vma)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) /* vma is no longer available, don't continue to swapin */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) /* check if the pmd is still valid */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) if (mm_find_pmd(mm, haddr) != pmd) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) if (ret & VM_FAULT_ERROR) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050)
	/* Drain LRU add pagevec to remove the extra pin on the swapped-in pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) if (swapped_in)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) lru_add_drain();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058)
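/*
 * Attempt the actual collapse of one anonymous, PMD-aligned range:
 *  - allocate and charge the new huge page with mmap_lock dropped;
 *  - revalidate the vma and, if needed, swap the missing pages back in
 *    under the mmap_lock read lock;
 *  - retake mmap_lock for write, clear the pmd, isolate and copy the
 *    small pages into the huge page, then install the huge pmd.
 * Returns with mmap_lock released in all cases.
 */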
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) static void collapse_huge_page(struct mm_struct *mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) struct page **hpage,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) int node, int referenced, int unmapped)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) LIST_HEAD(compound_pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) pmd_t *pmd, _pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) pte_t *pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) pgtable_t pgtable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) struct page *new_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) spinlock_t *pmd_ptl, *pte_ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) int isolated = 0, result = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) struct vm_area_struct *vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) gfp_t gfp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) VM_BUG_ON(address & ~HPAGE_PMD_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) /* Only allocate from the target node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) * Before allocating the hugepage, release the mmap_lock read lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) * The allocation can take potentially a long time if it involves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) * sync compaction, and we do not need to hold the mmap_lock during
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) * that. We will recheck the vma after taking it again in write mode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) new_page = khugepaged_alloc_page(hpage, gfp, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) if (!new_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) result = SCAN_ALLOC_HUGE_PAGE_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) goto out_nolock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) result = SCAN_CGROUP_CHARGE_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) goto out_nolock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) mmap_read_lock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) result = hugepage_vma_revalidate(mm, address, &vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) if (result) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) goto out_nolock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) pmd = mm_find_pmd(mm, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) if (!pmd) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) result = SCAN_PMD_NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) goto out_nolock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) * __collapse_huge_page_swapin always returns with mmap_lock locked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) * If it fails, we release mmap_lock and jump out_nolock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) * Continuing to collapse causes inconsistency.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) pmd, referenced)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) goto out_nolock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) /*
	 * Prevent all access to pagetables with the exception of
	 * gup_fast, which is handled later by the ptep_clear_flush, and
	 * the VM, which is handled by the anon_vma lock + PG_lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) mmap_write_lock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) result = hugepage_vma_revalidate(mm, address, &vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) if (result)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) /* check if the pmd is still valid */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) if (mm_find_pmd(mm, address) != pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) vm_write_begin(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) anon_vma_lock_write(vma->anon_vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) address, address + HPAGE_PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) pte = pte_offset_map(pmd, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) pte_ptl = pte_lockptr(mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) * After this gup_fast can't run anymore. This also removes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) * any huge TLB entry from the CPU so we won't allow
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) * huge and small TLB entries for the same virtual address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) * to avoid the risk of CPU bugs in that area.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) _pmd = pmdp_collapse_flush(vma, address, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) spin_unlock(pmd_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) spin_lock(pte_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) isolated = __collapse_huge_page_isolate(vma, address, pte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) &compound_pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) spin_unlock(pte_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) if (unlikely(!isolated)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) pte_unmap(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) spin_lock(pmd_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) BUG_ON(!pmd_none(*pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) * We can only use set_pmd_at when establishing
		 * hugepmds and never for establishing regular pmds that
		 * point to regular pagetables. Use pmd_populate for that.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) pmd_populate(mm, pmd, pmd_pgtable(_pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) spin_unlock(pmd_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) anon_vma_unlock_write(vma->anon_vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) vm_write_end(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) result = SCAN_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) * All pages are isolated and locked so anon_vma rmap
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) * can't run anymore.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) anon_vma_unlock_write(vma->anon_vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) &compound_pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) pte_unmap(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) __SetPageUptodate(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) pgtable = pmd_pgtable(_pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) * spin_lock() below is not the equivalent of smp_wmb(), so
	 * this is needed to avoid the copy_huge_page writes becoming
	 * visible after the set_pmd_at() write.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) smp_wmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) spin_lock(pmd_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) BUG_ON(!pmd_none(*pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) page_add_new_anon_rmap(new_page, vma, address, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) lru_cache_add_inactive_or_unevictable(new_page, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) pgtable_trans_huge_deposit(mm, pmd, pgtable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) set_pmd_at(mm, address, pmd, _pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) update_mmu_cache_pmd(vma, address, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) spin_unlock(pmd_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) vm_write_end(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) *hpage = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) khugepaged_pages_collapsed++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) result = SCAN_SUCCEED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) out_up_write:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) mmap_write_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) out_nolock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) if (!IS_ERR_OR_NULL(*hpage))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) mem_cgroup_uncharge(*hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) trace_mm_collapse_huge_page(mm, isolated, result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) goto out_up_write;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227)
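/*
 * Scan one PMD-aligned range of an anonymous vma and decide whether it is
 * worth collapsing, enforcing the max_ptes_none, max_ptes_swap and
 * max_ptes_shared limits. Returns 1 after handing the range to
 * collapse_huge_page() (which releases mmap_lock), 0 otherwise.
 */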
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) static int khugepaged_scan_pmd(struct mm_struct *mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) struct page **hpage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) pmd_t *pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) pte_t *pte, *_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) int ret = 0, result = 0, referenced = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) int none_or_zero = 0, shared = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) unsigned long _address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) int node = NUMA_NO_NODE, unmapped = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) bool writable = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) VM_BUG_ON(address & ~HPAGE_PMD_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) pmd = mm_find_pmd(mm, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) if (!pmd) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) result = SCAN_PMD_NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) pte = pte_offset_map_lock(mm, pmd, address, &ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) _pte++, _address += PAGE_SIZE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) pte_t pteval = *_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) if (is_swap_pte(pteval)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) if (++unmapped <= khugepaged_max_ptes_swap) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) * Always be strict with uffd-wp
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) * enabled swap entries. Please see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) * comment below for pte_uffd_wp().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) if (pte_swp_uffd_wp(pteval)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) result = SCAN_PTE_UFFD_WP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) goto out_unmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) result = SCAN_EXCEED_SWAP_PTE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) goto out_unmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) if (!userfaultfd_armed(vma) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) ++none_or_zero <= khugepaged_max_ptes_none) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) result = SCAN_EXCEED_NONE_PTE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) goto out_unmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) if (!pte_present(pteval)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) result = SCAN_PTE_NON_PRESENT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) goto out_unmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) if (pte_uffd_wp(pteval)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) * Don't collapse the page if any of the small
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) * PTEs are armed with uffd write protection.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) * Here we can also mark the new huge pmd as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) * write protected if any of the small ones is
			 * marked, but that could bring unknown
			 * userfault messages that fall outside of
			 * the registered range. So, just be simple.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) result = SCAN_PTE_UFFD_WP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) goto out_unmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) if (pte_write(pteval))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) writable = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) page = vm_normal_page(vma, _address, pteval);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) if (unlikely(!page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) result = SCAN_PAGE_NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) goto out_unmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) if (page_mapcount(page) > 1 &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) ++shared > khugepaged_max_ptes_shared) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) result = SCAN_EXCEED_SHARED_PTE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) goto out_unmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) page = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) * Record which node the original page is from and save this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) * information to khugepaged_node_load[].
		 * Khugepaged will allocate the hugepage from the node that
		 * has the max hit record.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) node = page_to_nid(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) if (khugepaged_scan_abort(node)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) result = SCAN_SCAN_ABORT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) goto out_unmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) khugepaged_node_load[node]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) if (!PageLRU(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) result = SCAN_PAGE_LRU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) goto out_unmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) if (PageLocked(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) result = SCAN_PAGE_LOCK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) goto out_unmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) if (!PageAnon(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) result = SCAN_PAGE_ANON;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) goto out_unmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) * Check if the page has any GUP (or other external) pins.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) *
		 * Here the check is racy: it may see total_mapcount > refcount
		 * in some cases.
		 * For example, one process with one forked child process.
		 * The parent has the PMD split due to MADV_DONTNEED, then
		 * the child is trying to unmap the whole PMD, but khugepaged
		 * may be scanning the parent between the child clearing the
		 * PageDoubleMap flag and decrementing the mapcount. So
		 * khugepaged may see total_mapcount > refcount.
		 *
		 * But such a case is ephemeral; we could always retry collapse
		 * later. However it may report a false positive if the page
		 * has excessive GUP pins (i.e. 512). Anyway the same check
		 * will be done again later, so the risk seems low.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) if (!is_refcount_suitable(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) result = SCAN_PAGE_COUNT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) goto out_unmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) if (pte_young(pteval) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) page_is_young(page) || PageReferenced(page) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) mmu_notifier_test_young(vma->vm_mm, address))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) referenced++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) if (!writable) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) result = SCAN_PAGE_RO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) result = SCAN_LACK_REFERENCED_PAGE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) result = SCAN_SUCCEED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) out_unmap:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) pte_unmap_unlock(pte, ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) node = khugepaged_find_target_node();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) /* collapse_huge_page will return with the mmap_lock released */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) collapse_huge_page(mm, address, hpage, node,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) referenced, unmapped);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) none_or_zero, result, unmapped);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388)
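/*
 * If the mm has exited, drop it from the khugepaged scan list: unhash the
 * slot, free it and put the mm reference held for it. Called with
 * khugepaged_mm_lock held.
 */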
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) static void collect_mm_slot(struct mm_slot *mm_slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) struct mm_struct *mm = mm_slot->mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) lockdep_assert_held(&khugepaged_mm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) if (khugepaged_test_exit(mm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) /* free mm_slot */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) hash_del(&mm_slot->hash);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) list_del(&mm_slot->mm_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) * Not strictly needed because the mm exited already.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) /* khugepaged_mm_lock actually not necessary for the below */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) free_mm_slot(mm_slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) mmdrop(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) #ifdef CONFIG_SHMEM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) /*
 * Notify khugepaged that the given addr of the mm is a pte-mapped THP. Then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) * khugepaged should try to collapse the page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) struct mm_slot *mm_slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) spin_lock(&khugepaged_mm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) mm_slot = get_mm_slot(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) spin_unlock(&khugepaged_mm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) * Try to collapse a pte-mapped THP for mm at address haddr.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) * This function checks whether all the PTEs in the PMD are pointing to the
 * right THP. If so, retract the page table so the THP can refault in
 * as pmd-mapped.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) unsigned long haddr = addr & HPAGE_PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) struct vm_area_struct *vma = find_vma(mm, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) struct page *hpage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) pte_t *start_pte, *pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) pmd_t *pmd, _pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) int count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) if (!vma || !vma->vm_file ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) * This vm_flags may not have VM_HUGEPAGE if the page was not
	 * collapsed by this mm. But we can still collapse if the page is
	 * a valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
	 * will not fail the vma for missing VM_HUGEPAGE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) hpage = find_lock_page(vma->vm_file->f_mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) linear_page_index(vma, haddr));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) if (!hpage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) if (!PageHead(hpage))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) goto drop_hpage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) pmd = mm_find_pmd(mm, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) if (!pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) goto drop_hpage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476)
	/* step 1: check that all mapped PTEs point to the right huge page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) for (i = 0, addr = haddr, pte = start_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) /* empty pte, skip */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) if (pte_none(*pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) /* page swapped out, abort */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) if (!pte_present(*pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) goto abort;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) page = vm_normal_page(vma, addr, *pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) * Note that uprobe, debugger, or MAP_PRIVATE may change the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) * page table, but the new page will not be a subpage of hpage.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) if (hpage + i != page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) goto abort;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) /* step 2: adjust rmap */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) for (i = 0, addr = haddr, pte = start_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) if (pte_none(*pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) page = vm_normal_page(vma, addr, *pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) page_remove_rmap(page, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) pte_unmap_unlock(start_pte, ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) /* step 3: set proper refcount and mm_counters. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) if (count) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) page_ref_sub(hpage, count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) /* step 4: collapse pmd */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) ptl = pmd_lock(vma->vm_mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) _pmd = pmdp_collapse_flush(vma, haddr, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) mm_dec_nr_ptes(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) pte_free(mm, pmd_pgtable(_pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) drop_hpage:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) unlock_page(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) put_page(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) abort:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) pte_unmap_unlock(start_pte, ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) goto drop_hpage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536)
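/*
 * Flush the per-mm list of pte-mapped THP addresses recorded by
 * khugepaged_add_pte_mapped_thp(), attempting to collapse each one.
 * Returns -EBUSY, leaving the list untouched, if mmap_lock cannot be
 * taken right away.
 */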
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) struct mm_struct *mm = mm_slot->mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) if (likely(mm_slot->nr_pte_mapped_thp == 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) if (!mmap_write_trylock(mm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) return -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) if (unlikely(khugepaged_test_exit(mm)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) mm_slot->nr_pte_mapped_thp = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) mmap_write_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559)
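/*
 * For every vma that maps @pgoff of @mapping with a suitably aligned and
 * sized range and no anon_vma, try to retract the page table so the file
 * THP can later be faulted in as pmd-mapped. VMAs whose mmap_lock cannot
 * be taken right now are queued via khugepaged_add_pte_mapped_thp() for a
 * later attempt.
 */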
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) struct vm_area_struct *vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) struct mm_struct *mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) unsigned long addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) pmd_t *pmd, _pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) i_mmap_lock_write(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) /*
		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
		 * got written to. These VMAs are likely not worth investing
		 * mmap_write_lock(mm) in, as the PMD-mapping is likely to be
		 * split later.
		 *
		 * Note that the vma->anon_vma check is racy: it can be set
		 * up after the check but before we take mmap_lock by the
		 * fault path. But the page lock would prevent establishing
		 * any new ptes of the page, so we are safe.
		 *
		 * An alternative would be to drop the check, but check that
		 * the page table is clear before calling
		 * pmdp_collapse_flush() under ptl. It has a higher chance to
		 * recover THP for the VMA, but has higher cost too.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) if (vma->anon_vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) if (addr & ~HPAGE_PMD_MASK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) if (vma->vm_end < addr + HPAGE_PMD_SIZE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) pmd = mm_find_pmd(mm, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) if (!pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) * We need exclusive mmap_lock to retract page table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) * We use trylock due to lock inversion: we need to acquire
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) * mmap_lock while holding page lock. Fault path does it in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) * reverse order. Trylock is a way to avoid deadlock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) if (mmap_write_trylock(mm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) if (!khugepaged_test_exit(mm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) spinlock_t *ptl = pmd_lock(mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) /* assume page table is clear */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) _pmd = pmdp_collapse_flush(vma, addr, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) mm_dec_nr_ptes(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) pte_free(mm, pmd_pgtable(_pmd));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) mmap_write_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) /* Try again later */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) khugepaged_add_pte_mapped_thp(mm, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) i_mmap_unlock_write(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) * collapse_file - collapse filemap/tmpfs/shmem pages into a huge one.
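* @mm: mm whose scan triggered this collapse
* @file: file the page-cache range belongs to
* @start: first page-cache index of the PMD-sized range to collapse
* @hpage: huge page slot passed to khugepaged_alloc_page(), cleared on success
* @node: target NUMA node for the huge page allocation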
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) * Basic scheme is simple, details are more complex:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) * - allocate and lock a new huge page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) * - scan page cache replacing old pages with the new one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) * + swap/gup in pages if necessary;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) * + fill in gaps;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) * + keep old pages around in case rollback is required;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) * - if replacing succeeds:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) * + copy data over;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) * + free old pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) * + unlock huge page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) * - if replacing fails:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) * + put all pages back and unfreeze them;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) * + restore gaps in the page cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) * + unlock and free huge page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) static void collapse_file(struct mm_struct *mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) struct file *file, pgoff_t start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) struct page **hpage, int node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) struct address_space *mapping = file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) gfp_t gfp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) struct page *new_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) pgoff_t index, end = start + HPAGE_PMD_NR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) LIST_HEAD(pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) int nr_none = 0, result = SCAN_SUCCEED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) bool is_shmem = shmem_file(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) /* Only allocate from the target node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) new_page = khugepaged_alloc_page(hpage, gfp, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) if (!new_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) result = SCAN_ALLOC_HUGE_PAGE_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) result = SCAN_CGROUP_CHARGE_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) /* This will be less messy when we use multi-index entries */
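/*
 * Reserve page-cache slots for the whole range up front: if
 * xas_create_range() fails under the lock, drop it and let
 * xas_nomem() allocate XArray nodes with GFP_KERNEL, then retry.
 */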
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) xas_lock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) xas_create_range(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) if (!xas_error(&xas))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) if (!xas_nomem(&xas, GFP_KERNEL)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) result = SCAN_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) } while (1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) __SetPageLocked(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) if (is_shmem)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) __SetPageSwapBacked(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) new_page->index = start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) new_page->mapping = mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) * At this point the new_page is locked and not up-to-date.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) * It's safe to insert it into the page cache, because nobody would
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) * be able to map it or use it in another way until we unlock it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) xas_set(&xas, start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) for (index = start; index < end; index++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) struct page *page = xas_next(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) VM_BUG_ON(index != xas.xa_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) if (is_shmem) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) if (!page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) * Stop if extent has been truncated or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) * hole-punched, and is now completely
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) * empty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) if (index == start) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) if (!xas_next_entry(&xas, end - 1)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) result = SCAN_TRUNCATED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) goto xa_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) xas_set(&xas, index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) if (!shmem_charge(mapping->host, 1)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) result = SCAN_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) goto xa_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) xas_store(&xas, new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) nr_none++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) if (xa_is_value(page) || !PageUptodate(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) /* swap in or instantiate fallocated page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) if (shmem_getpage(mapping->host, index, &page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) SGP_NOHUGE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) result = SCAN_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) goto xa_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) } else if (trylock_page(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) get_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) result = SCAN_PAGE_LOCK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) goto xa_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) } else { /* !is_shmem */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) if (!page || xa_is_value(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) page_cache_sync_readahead(mapping, &file->f_ra,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) file, index,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) end - index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) /* drain pagevecs to help isolate_lru_page() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) lru_add_drain();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) page = find_lock_page(mapping, index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) if (unlikely(page == NULL)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) result = SCAN_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) goto xa_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) } else if (PageDirty(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) * khugepaged only works on read-only fds,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) * so this page is dirty because it hasn't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) * been flushed since its first write. There
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) * will be no new dirty pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) * Trigger an async flush here and hope the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) * writeback is done by the time khugepaged
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) * revisits this page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) * This is a one-off situation. We are not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) * forcing writeback in a loop.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) filemap_flush(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) result = SCAN_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) goto xa_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) } else if (PageWriteback(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) result = SCAN_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) goto xa_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) } else if (trylock_page(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) get_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) result = SCAN_PAGE_LOCK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) goto xa_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) * The page must be locked, so we can drop the i_pages lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) * without racing with truncate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) VM_BUG_ON_PAGE(!PageLocked(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) /* make sure the page is up to date */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) if (unlikely(!PageUptodate(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) result = SCAN_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) * If the file was truncated then extended, or hole-punched, before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) * we locked the first page, then a THP might be there already.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) if (PageTransCompound(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) result = SCAN_PAGE_COMPOUND;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) if (page_mapping(page) != mapping) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) result = SCAN_TRUNCATED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) if (!is_shmem && (PageDirty(page) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) PageWriteback(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) * khugepaged only works on read-only fds, so this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) * page is dirty because it hasn't been flushed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) * since its first write.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) result = SCAN_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) if (isolate_lru_page(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) result = SCAN_DEL_PAGE_LRU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) if (page_has_private(page) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) !try_to_release_page(page, GFP_KERNEL)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) result = SCAN_PAGE_HAS_PRIVATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) putback_lru_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830)
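/*
 * Unmap any remaining ptes of this page so that the references held
 * by those mappings do not defeat the page_ref_freeze(page, 3) below.
 */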
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) if (page_mapped(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) unmap_mapping_pages(mapping, index, 1, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) xas_lock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) xas_set(&xas, index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) VM_BUG_ON_PAGE(page != xas_load(&xas), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) VM_BUG_ON_PAGE(page_mapped(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) * The page is expected to have page_count() == 3:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) *  - one reference from the pin we hold;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) *  - one reference from the page cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) *  - one reference from isolate_lru_page();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) if (!page_ref_freeze(page, 3)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) result = SCAN_PAGE_COUNT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) putback_lru_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) * Add the page to the list so we can undo the collapse if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) * something goes wrong.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) list_add_tail(&page->lru, &pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) /* Finally, replace with the new page. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) xas_store(&xas, new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) goto xa_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) if (is_shmem)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) __inc_node_page_state(new_page, NR_SHMEM_THPS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) __inc_node_page_state(new_page, NR_FILE_THPS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) filemap_nr_thps_inc(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) * Paired with smp_mb() in do_dentry_open() to ensure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) * i_writecount is up to date and the update to nr_thps is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) * visible. This ensures the page cache will be truncated if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) * the file is later opened for writing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) smp_mb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) if (inode_is_open_for_write(mapping->host)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) result = SCAN_FAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) __dec_node_page_state(new_page, NR_FILE_THPS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) filemap_nr_thps_dec(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) goto xa_locked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) if (nr_none) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) if (is_shmem)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) xa_locked:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) xa_unlocked:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) if (result == SCAN_SUCCEED) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) struct page *page, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) * Replacing the old pages with the new one has succeeded;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) * now we need to copy the contents and free the old pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) index = start;
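/*
 * Indices that had no old page (the nr_none holes) are simply
 * left as zero-filled subpages of the new huge page.
 */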
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) list_for_each_entry_safe(page, tmp, &pagelist, lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) while (index < page->index) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) clear_highpage(new_page + (index % HPAGE_PMD_NR));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909) index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) page->mapping = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) page_ref_unfreeze(page, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) ClearPageActive(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) ClearPageUnevictable(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) while (index < end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) clear_highpage(new_page + (index % HPAGE_PMD_NR));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) SetPageUptodate(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) page_ref_add(new_page, HPAGE_PMD_NR - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) if (is_shmem)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930) set_page_dirty(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) lru_cache_add(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) * Remove pte page tables, so we can re-fault the page as huge.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) retract_page_tables(mapping, start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) *hpage = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) khugepaged_pages_collapsed++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) /* Something went wrong: roll back page cache changes */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) xas_lock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) mapping->nrpages -= nr_none;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) if (is_shmem)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) shmem_uncharge(mapping->host, nr_none);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) xas_set(&xas, start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) xas_for_each(&xas, page, end - 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) page = list_first_entry_or_null(&pagelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) struct page, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) if (!page || xas.xa_index < page->index) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) if (!nr_none)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) nr_none--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) /* Put holes back where they were */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) xas_store(&xas, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) /* Unfreeze the page. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) page_ref_unfreeze(page, 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) xas_store(&xas, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) xas_pause(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) putback_lru_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) xas_lock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) VM_BUG_ON(nr_none);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) xas_unlock_irq(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) new_page->mapping = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) unlock_page(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) VM_BUG_ON(!list_empty(&pagelist));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) if (!IS_ERR_OR_NULL(*hpage))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) mem_cgroup_uncharge(*hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) /* TODO: tracepoints */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) static void khugepaged_scan_file(struct mm_struct *mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) struct file *file, pgoff_t start, struct page **hpage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) struct address_space *mapping = file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) XA_STATE(xas, &mapping->i_pages, start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) int present, swap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) int node = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) int result = SCAN_SUCCEED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) present = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) swap = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
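/* Lockless (RCU) pass over the range: count present pages and swap entries. */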
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) if (xas_retry(&xas, page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) if (xa_is_value(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) if (++swap > khugepaged_max_ptes_swap) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) result = SCAN_EXCEED_SWAP_PTE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) if (PageTransCompound(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) result = SCAN_PAGE_COMPOUND;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) node = page_to_nid(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) if (khugepaged_scan_abort(node)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) result = SCAN_SCAN_ABORT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) khugepaged_node_load[node]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) if (!PageLRU(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) result = SCAN_PAGE_LRU;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) if (page_count(page) !=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) 1 + page_mapcount(page) + page_has_private(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) result = SCAN_PAGE_COUNT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) * We probably should check if the page is referenced here, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) * nothing transfers pte_young() to PageReferenced() for us,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) * and an rmap walk here is just too costly...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) present++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) if (need_resched()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) xas_pause(&xas);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) cond_resched_rcu();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) if (result == SCAN_SUCCEED) {
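/*
 * The range has HPAGE_PMD_NR slots; fewer than
 * (HPAGE_PMD_NR - max_ptes_none) present pages means the
 * none-PTE limit would be exceeded.
 */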
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) result = SCAN_EXCEED_NONE_PTE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) node = khugepaged_find_target_node();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) collapse_file(mm, file, start, hpage, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) /* TODO: tracepoints */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) static void khugepaged_scan_file(struct mm_struct *mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) struct file *file, pgoff_t start, struct page **hpage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) BUILD_BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) struct page **hpage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) __releases(&khugepaged_mm_lock)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) __acquires(&khugepaged_mm_lock)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) struct mm_slot *mm_slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) struct mm_struct *mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) struct vm_area_struct *vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) int progress = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) VM_BUG_ON(!pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) lockdep_assert_held(&khugepaged_mm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) if (khugepaged_scan.mm_slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) mm_slot = khugepaged_scan.mm_slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) mm_slot = list_entry(khugepaged_scan.mm_head.next,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) struct mm_slot, mm_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) khugepaged_scan.address = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) khugepaged_scan.mm_slot = mm_slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) spin_unlock(&khugepaged_mm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) khugepaged_collapse_pte_mapped_thps(mm_slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) mm = mm_slot->mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) * Don't wait for the semaphore (to avoid long wait times); just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104) * move to the next mm on the list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) vma = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) if (unlikely(!mmap_read_trylock(mm)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108) goto breakouterloop_mmap_lock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) if (likely(!khugepaged_test_exit(mm)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) vma = find_vma(mm, khugepaged_scan.address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) progress++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) for (; vma; vma = vma->vm_next) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) unsigned long hstart, hend;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) if (unlikely(khugepaged_test_exit(mm))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) progress++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) if (!hugepage_vma_check(vma, vma->vm_flags)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) skip:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) progress++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) }
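/*
 * Round the VMA inward to PMD-aligned boundaries; only fully
 * aligned HPAGE_PMD_SIZE ranges can be collapsed.
 */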
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) hend = vma->vm_end & HPAGE_PMD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) if (hstart >= hend)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) goto skip;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) if (khugepaged_scan.address > hend)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) goto skip;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) if (khugepaged_scan.address < hstart)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) khugepaged_scan.address = hstart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) goto skip;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) while (khugepaged_scan.address < hend) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) if (unlikely(khugepaged_test_exit(mm)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) goto breakouterloop;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) VM_BUG_ON(khugepaged_scan.address < hstart ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) khugepaged_scan.address + HPAGE_PMD_SIZE >
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) hend);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) struct file *file = get_file(vma->vm_file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) pgoff_t pgoff = linear_page_index(vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) khugepaged_scan.address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) khugepaged_scan_file(mm, file, pgoff, hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) fput(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) ret = khugepaged_scan_pmd(mm, vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) khugepaged_scan.address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) /* move to next address */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) khugepaged_scan.address += HPAGE_PMD_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) progress += HPAGE_PMD_NR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) /* we released mmap_lock, so break out of the loop */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) goto breakouterloop_mmap_lock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) if (progress >= pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) goto breakouterloop;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) breakouterloop:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) breakouterloop_mmap_lock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) spin_lock(&khugepaged_mm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) * Release the current mm_slot if this mm is about to die, or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) * if we scanned all vmas of this mm.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) if (khugepaged_test_exit(mm) || !vma) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) * Make sure that, if mm_users reaches zero while
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) * khugepaged is running here, khugepaged_exit() will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) * find mm_slot no longer pointing to the exiting mm.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) khugepaged_scan.mm_slot = list_entry(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) mm_slot->mm_node.next,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) struct mm_slot, mm_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) khugepaged_scan.address = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) khugepaged_scan.mm_slot = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) khugepaged_full_scans++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) collect_mm_slot(mm_slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) return progress;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) static int khugepaged_has_work(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) return !list_empty(&khugepaged_scan.mm_head) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) khugepaged_enabled();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) static int khugepaged_wait_event(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) return !list_empty(&khugepaged_scan.mm_head) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) kthread_should_stop();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) static void khugepaged_do_scan(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) struct page *hpage = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) unsigned int progress = 0, pass_through_head = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) unsigned int pages = khugepaged_pages_to_scan;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) bool wait = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) barrier(); /* make sure khugepaged_pages_to_scan was read into the local above */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) lru_add_drain_all();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) while (progress < pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) if (!khugepaged_prealloc_page(&hpage, &wait))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) if (unlikely(kthread_should_stop() || try_to_freeze()))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234)
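/*
 * A NULL mm_slot means the scan is (re)starting from the head of
 * the mm list; stop after reaching the head a second time so one
 * khugepaged_do_scan() call does not cycle over the list forever.
 */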
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) spin_lock(&khugepaged_mm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) if (!khugepaged_scan.mm_slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) pass_through_head++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) if (khugepaged_has_work() &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) pass_through_head < 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) progress += khugepaged_scan_mm_slot(pages - progress,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) &hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) progress = pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) spin_unlock(&khugepaged_mm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) if (!IS_ERR_OR_NULL(hpage))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) put_page(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) static bool khugepaged_should_wakeup(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) return kthread_should_stop() ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) time_after_eq(jiffies, khugepaged_sleep_expire);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) static void khugepaged_wait_work(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) if (khugepaged_has_work()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) const unsigned long scan_sleep_jiffies =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) if (!scan_sleep_jiffies)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) wait_event_freezable_timeout(khugepaged_wait,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) khugepaged_should_wakeup(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) scan_sleep_jiffies);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) if (khugepaged_enabled())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) static int khugepaged(void *none)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) struct mm_slot *mm_slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) set_freezable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) set_user_nice(current, MAX_NICE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) while (!kthread_should_stop()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) khugepaged_do_scan();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) khugepaged_wait_work();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) spin_lock(&khugepaged_mm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) mm_slot = khugepaged_scan.mm_slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) khugepaged_scan.mm_slot = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) if (mm_slot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) collect_mm_slot(mm_slot);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) spin_unlock(&khugepaged_mm_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) static void set_recommended_min_free_kbytes(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) int nr_zones = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) unsigned long recommended_min;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) for_each_populated_zone(zone) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) * We don't need to worry about fragmentation of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) * ZONE_MOVABLE since it only has movable pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) if (zone_idx(zone) > gfp_zone(GFP_USER))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) nr_zones++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) recommended_min = pageblock_nr_pages * nr_zones * 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) * Make sure that on average at least two pageblocks are almost free
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) * of another type: one for a migratetype to fall back to and a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) * second to avoid subsequent fallbacks of other types. There are 3
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) * MIGRATE_TYPES we care about.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) recommended_min += pageblock_nr_pages * nr_zones *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) /* never allow reserving more than 5% of lowmem */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) recommended_min = min(recommended_min,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) (unsigned long) nr_free_buffer_pages() / 20);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) recommended_min <<= (PAGE_SHIFT-10);
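/*
 * Illustrative example (these values are assumptions, not taken from
 * this file): with 4K pages and 2M pageblocks, pageblock_nr_pages is
 * 512, so for two populated zones and MIGRATE_PCPTYPES == 3:
 *   512 * 2 * 2 + 512 * 2 * 3 * 3 = 11264 pages,
 * i.e. 11264 << (PAGE_SHIFT - 10) = 45056 kB (44 MB), unless the
 * 5%-of-lowmem clamp above reduces it.
 */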
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) if (recommended_min > min_free_kbytes) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) if (user_min_free_kbytes >= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) min_free_kbytes, recommended_min);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) min_free_kbytes = recommended_min;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) setup_per_zone_wmarks();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341)
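/*
 * Start or stop the khugepaged thread to match the current enablement
 * state (typically toggled through the sysfs knobs under
 * /sys/kernel/mm/transparent_hugepage/).
 */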
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) int start_stop_khugepaged(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) int err = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) mutex_lock(&khugepaged_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) if (khugepaged_enabled()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) if (!khugepaged_thread)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) khugepaged_thread = kthread_run(khugepaged, NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) "khugepaged");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) if (IS_ERR(khugepaged_thread)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) pr_err("khugepaged: kthread_run(khugepaged) failed\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) err = PTR_ERR(khugepaged_thread);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) khugepaged_thread = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) if (!list_empty(&khugepaged_scan.mm_head))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) wake_up_interruptible(&khugepaged_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) set_recommended_min_free_kbytes();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) } else if (khugepaged_thread) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) kthread_stop(khugepaged_thread);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) khugepaged_thread = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) fail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) mutex_unlock(&khugepaged_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) void khugepaged_min_free_kbytes_update(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) mutex_lock(&khugepaged_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) if (khugepaged_enabled() && khugepaged_thread)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) set_recommended_min_free_kbytes();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) mutex_unlock(&khugepaged_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) }