// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>

#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
#endif
static unsigned long hugetlb_cma_size __initdata;

/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page. This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
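
/*
 * Usage sketch (editorial, not in the original source): fault paths spread
 * contention across this table by hashing the faulting mapping and page
 * index, roughly
 *
 *     hash = hugetlb_fault_mutex_hash(mapping, idx);
 *     mutex_lock(&hugetlb_fault_mutex_table[hash]);
 *
 * hugetlb_fault_mutex_hash() is defined elsewhere in this file; the lines
 * above are only a sketch of the intended locking pattern, not a verbatim
 * caller.
 */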

static inline bool PageHugeFreed(struct page *head)
{
        return page_private(head + 4) == -1UL;
}

static inline void SetPageHugeFreed(struct page *head)
{
        set_page_private(head + 4, -1UL);
}

static inline void ClearPageHugeFreed(struct page *head)
{
        set_page_private(head + 4, 0);
}
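
/*
 * Editorial note (not in the original source): the helpers above record
 * whether a hugetlb page currently sits on a free list. The flag is kept
 * out of line, in the page_private() field of a tail page (head + 4),
 * presumably because the head page and the first few tail pages already
 * carry other compound-page and hugetlb state.
 */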

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
        bool free = (spool->count == 0) && (spool->used_hpages == 0);

        spin_unlock(&spool->lock);

        /* If no pages are used, and no other handles to the subpool
         * remain, give up any reservations based on minimum size and
         * free the subpool */
        if (free) {
                if (spool->min_hpages != -1)
                        hugetlb_acct_memory(spool->hstate,
                                            -spool->min_hpages);
                kfree(spool);
        }
}

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
                                              long min_hpages)
{
        struct hugepage_subpool *spool;

        spool = kzalloc(sizeof(*spool), GFP_KERNEL);
        if (!spool)
                return NULL;

        spin_lock_init(&spool->lock);
        spool->count = 1;
        spool->max_hpages = max_hpages;
        spool->hstate = h;
        spool->min_hpages = min_hpages;

        if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
                kfree(spool);
                return NULL;
        }
        spool->rsv_hpages = min_hpages;

        return spool;
}

void hugepage_put_subpool(struct hugepage_subpool *spool)
{
        spin_lock(&spool->lock);
        BUG_ON(!spool->count);
        spool->count--;
        unlock_or_release_subpool(spool);
}

/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request. Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward). The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
                                       long delta)
{
        long ret = delta;

        if (!spool)
                return ret;

        spin_lock(&spool->lock);

        if (spool->max_hpages != -1) {          /* maximum size accounting */
                if ((spool->used_hpages + delta) <= spool->max_hpages)
                        spool->used_hpages += delta;
                else {
                        ret = -ENOMEM;
                        goto unlock_ret;
                }
        }

        /* minimum size accounting */
        if (spool->min_hpages != -1 && spool->rsv_hpages) {
                if (delta > spool->rsv_hpages) {
                        /*
                         * Asking for more reserves than those already taken on
                         * behalf of subpool. Return difference.
                         */
                        ret = delta - spool->rsv_hpages;
                        spool->rsv_hpages = 0;
                } else {
                        ret = 0;        /* reserves already accounted for */
                        spool->rsv_hpages -= delta;
                }
        }

unlock_ret:
        spin_unlock(&spool->lock);
        return ret;
}
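
/*
 * Worked example (editorial, not in the original source): assume a subpool
 * with min_hpages = 4 and rsv_hpages = 4 (all of the minimum still held in
 * reserve). A request of delta = 6 consumes the 4 reserved pages and
 * returns 2, i.e. only 2 pages must be newly accounted in the global pool.
 * A subsequent request of delta = 3 finds rsv_hpages == 0 and returns 3
 * unchanged.
 */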

/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
                                       long delta)
{
        long ret = delta;

        if (!spool)
                return delta;

        spin_lock(&spool->lock);

        if (spool->max_hpages != -1)            /* maximum size accounting */
                spool->used_hpages -= delta;

        /* minimum size accounting */
        if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
                if (spool->rsv_hpages + delta <= spool->min_hpages)
                        ret = 0;
                else
                        ret = spool->rsv_hpages + delta - spool->min_hpages;

                spool->rsv_hpages += delta;
                if (spool->rsv_hpages > spool->min_hpages)
                        spool->rsv_hpages = spool->min_hpages;
        }

        /*
         * If hugetlbfs_put_super couldn't free spool due to an outstanding
         * quota reference, free it now.
         */
        unlock_or_release_subpool(spool);

        return ret;
}
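
/*
 * Worked example (editorial, not in the original source): with
 * min_hpages = 4, rsv_hpages = 2 and used_hpages now below the minimum,
 * putting back delta = 3 pages refills the reserve to min(2 + 3, 4) = 4
 * and returns 2 + 3 - 4 = 1, so only 1 global reservation is dropped;
 * the other 2 pages stay reserved to maintain the subpool's minimum.
 */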

static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
        return HUGETLBFS_SB(inode->i_sb)->spool;
}

static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
        return subpool_inode(file_inode(vma->vm_file));
}

/* Helper that removes a struct file_region from the resv_map cache and returns
 * it for use.
 */
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
        struct file_region *nrg = NULL;

        VM_BUG_ON(resv->region_cache_count <= 0);

        resv->region_cache_count--;
        nrg = list_first_entry(&resv->region_cache, struct file_region, link);
        list_del(&nrg->link);

        nrg->from = from;
        nrg->to = to;

        return nrg;
}

static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
                                              struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
        nrg->reservation_counter = rg->reservation_counter;
        nrg->css = rg->css;
        if (rg->css)
                css_get(rg->css);
#endif
}

/* Helper that records hugetlb_cgroup uncharge info. */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
                                                struct hstate *h,
                                                struct resv_map *resv,
                                                struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
        if (h_cg) {
                nrg->reservation_counter =
                        &h_cg->rsvd_hugepage[hstate_index(h)];
                nrg->css = &h_cg->css;
                /*
                 * The caller will hold exactly one h_cg->css reference for the
                 * whole contiguous reservation region. But this area might be
                 * scattered when some file_regions already reside in it. As a
                 * result, many file_regions may share only one css reference.
                 * To ensure that each file_region holds exactly one h_cg->css
                 * reference, do a css_get() for each file_region and leave the
                 * reference held by the caller untouched.
                 */
                css_get(&h_cg->css);
                if (!resv->pages_per_hpage)
                        resv->pages_per_hpage = pages_per_huge_page(h);
                /* pages_per_hpage should be the same for all entries in
                 * a resv_map.
                 */
                VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
        } else {
                nrg->reservation_counter = NULL;
                nrg->css = NULL;
        }
#endif
}

static void put_uncharge_info(struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
        if (rg->css)
                css_put(rg->css);
#endif
}

static bool has_same_uncharge_info(struct file_region *rg,
                                   struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
        return rg && org &&
               rg->reservation_counter == org->reservation_counter &&
               rg->css == org->css;

#else
        return true;
#endif
}

static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
        struct file_region *nrg = NULL, *prg = NULL;

        prg = list_prev_entry(rg, link);
        if (&prg->link != &resv->regions && prg->to == rg->from &&
            has_same_uncharge_info(prg, rg)) {
                prg->to = rg->to;

                list_del(&rg->link);
                put_uncharge_info(rg);
                kfree(rg);

                rg = prg;
        }

        nrg = list_next_entry(rg, link);
        if (&nrg->link != &resv->regions && nrg->from == rg->to &&
            has_same_uncharge_info(nrg, rg)) {
                nrg->from = rg->from;

                list_del(&rg->link);
                put_uncharge_info(rg);
                kfree(rg);
        }
}
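
/*
 * Illustrative example (editorial, not in the original source): if the
 * region list contains [0, 2) and a new region [2, 4) with the same
 * uncharge info is inserted, coalesce_file_region() merges them into a
 * single [0, 4) entry and frees the now-redundant descriptor. The same
 * happens with a following adjacent region, so [0, 2), [2, 4), [4, 8)
 * collapses to [0, 8).
 */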

/*
 * Must be called with resv->lock held.
 *
 * Calling this with regions_needed != NULL will count the number of pages
 * to be added but will not modify the linked list, and regions_needed will
 * indicate the number of file_regions needed in the cache to carry out the
 * addition of regions for this range.
 */
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
                                     struct hugetlb_cgroup *h_cg,
                                     struct hstate *h, long *regions_needed)
{
        long add = 0;
        struct list_head *head = &resv->regions;
        long last_accounted_offset = f;
        struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;

        if (regions_needed)
                *regions_needed = 0;

        /* In this loop, we essentially handle an entry for the range
         * [last_accounted_offset, rg->from), at every iteration, with some
         * bounds checking.
         */
        list_for_each_entry_safe(rg, trg, head, link) {
                /* Skip irrelevant regions that start before our range. */
                if (rg->from < f) {
                        /* If this region ends after the last accounted offset,
                         * then we need to update last_accounted_offset.
                         */
                        if (rg->to > last_accounted_offset)
                                last_accounted_offset = rg->to;
                        continue;
                }

                /* When we find a region that starts beyond our range, we've
                 * finished.
                 */
                if (rg->from > t)
                        break;

                /* Add an entry for last_accounted_offset -> rg->from, and
                 * update last_accounted_offset.
                 */
                if (rg->from > last_accounted_offset) {
                        add += rg->from - last_accounted_offset;
                        if (!regions_needed) {
                                nrg = get_file_region_entry_from_cache(
                                        resv, last_accounted_offset, rg->from);
                                record_hugetlb_cgroup_uncharge_info(h_cg, h,
                                                                    resv, nrg);
                                list_add(&nrg->link, rg->link.prev);
                                coalesce_file_region(resv, nrg);
                        } else
                                *regions_needed += 1;
                }

                last_accounted_offset = rg->to;
        }

        /* Handle the case where our range extends beyond
         * last_accounted_offset.
         */
        if (last_accounted_offset < t) {
                add += t - last_accounted_offset;
                if (!regions_needed) {
                        nrg = get_file_region_entry_from_cache(
                                resv, last_accounted_offset, t);
                        record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
                        list_add(&nrg->link, rg->link.prev);
                        coalesce_file_region(resv, nrg);
                } else
                        *regions_needed += 1;
        }

        VM_BUG_ON(add < 0);
        return add;
}
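
/*
 * Worked example (editorial, not in the original source): with an existing
 * region [2, 4) in the map, add_reservation_in_range(resv, 0, 6, ...) covers
 * the gaps [0, 2) and [4, 6). Called with regions_needed != NULL it reports
 * add = 4 and *regions_needed = 2 without touching the list; called with
 * regions_needed == NULL it pulls two entries from the cache, links them in
 * around [2, 4), and lets coalesce_file_region() merge the three entries
 * into a single [0, 6) region.
 */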

/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
 */
static int allocate_file_region_entries(struct resv_map *resv,
                                        int regions_needed)
        __must_hold(&resv->lock)
{
        struct list_head allocated_regions;
        int to_allocate = 0, i = 0;
        struct file_region *trg = NULL, *rg = NULL;

        VM_BUG_ON(regions_needed < 0);

        INIT_LIST_HEAD(&allocated_regions);

        /*
         * Check for sufficient descriptors in the cache to accommodate
         * the number of in progress add operations plus regions_needed.
         *
         * This is a while loop because when we drop the lock, some other call
         * to region_add or region_del may have consumed some region_entries,
         * so we keep looping here until we finally have enough entries for
         * (adds_in_progress + regions_needed).
         */
        while (resv->region_cache_count <
               (resv->adds_in_progress + regions_needed)) {
                to_allocate = resv->adds_in_progress + regions_needed -
                              resv->region_cache_count;

                /* At this point, we should have enough entries in the cache
                 * for all the existing adds_in_progress. We should only be
                 * needing to allocate for regions_needed.
                 */
                VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);

                spin_unlock(&resv->lock);
                for (i = 0; i < to_allocate; i++) {
                        trg = kmalloc(sizeof(*trg), GFP_KERNEL);
                        if (!trg)
                                goto out_of_memory;
                        list_add(&trg->link, &allocated_regions);
                }

                spin_lock(&resv->lock);

                list_splice(&allocated_regions, &resv->region_cache);
                resv->region_cache_count += to_allocate;
        }

        return 0;

out_of_memory:
        list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
                list_del(&rg->link);
                kfree(rg);
        }
        return -ENOMEM;
}

/*
 * Add the huge page range represented by [f, t) to the reserve
 * map. Regions will be taken from the cache to fill in this range.
 * Sufficient regions should exist in the cache due to the previous
 * call to region_chg with the same range, but in some cases the cache will not
 * have sufficient entries due to races with other code doing region_add or
 * region_del. The extra needed entries will be allocated.
 *
 * regions_needed is the out value provided by a previous call to region_chg.
 *
 * Return the number of new huge pages added to the map. This number is greater
 * than or equal to zero. If file_region entries needed to be allocated for
 * this operation and we were not able to allocate, it returns -ENOMEM.
 * region_add of regions of length 1 never allocate file_regions and cannot
 * fail; region_chg will always allocate at least 1 entry and a region_add for
 * 1 page will only require at most 1 entry.
 */
static long region_add(struct resv_map *resv, long f, long t,
                       long in_regions_needed, struct hstate *h,
                       struct hugetlb_cgroup *h_cg)
{
        long add = 0, actual_regions_needed = 0;

        spin_lock(&resv->lock);
retry:

        /* Count how many regions are actually needed to execute this add. */
        add_reservation_in_range(resv, f, t, NULL, NULL,
                                 &actual_regions_needed);

        /*
         * Check for sufficient descriptors in the cache to accommodate
         * this add operation. Note that actual_regions_needed may be greater
         * than in_regions_needed, as the resv_map may have been modified since
         * the region_chg call. In this case, we need to make sure that we
         * allocate extra entries, such that we have enough for all the
         * existing adds_in_progress, plus the excess needed for this
         * operation.
         */
        if (actual_regions_needed > in_regions_needed &&
            resv->region_cache_count <
                    resv->adds_in_progress +
                            (actual_regions_needed - in_regions_needed)) {
                /* region_add operation of range 1 should never need to
                 * allocate file_region entries.
                 */
                VM_BUG_ON(t - f <= 1);

                if (allocate_file_region_entries(
                            resv, actual_regions_needed - in_regions_needed)) {
                        return -ENOMEM;
                }

                goto retry;
        }

        add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);

        resv->adds_in_progress -= in_regions_needed;

        spin_unlock(&resv->lock);
        VM_BUG_ON(add < 0);
        return add;
}
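
/*
 * Worked example (editorial, not in the original source): a reservation of
 * [0, 3) against an empty map goes through region_chg(), which predicts one
 * needed entry, and then region_add(), which inserts [0, 3) and returns 3.
 * If a racing thread added [1, 2) in between, region_add() now needs two
 * entries ([0, 1) and [2, 3)); the retry loop above allocates the extra
 * descriptor and the call returns 2, the number of pages newly added.
 */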

/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented. This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t). region_chg does
 * not change the number of huge pages represented by the
 * map. A number of new file_region structures are added to the cache as a
 * placeholder, for the subsequent region_add call to use. At least 1
 * file_region structure is added.
 *
 * out_regions_needed is the number of regions added to the
 * resv->adds_in_progress. This value needs to be provided to a follow up call
 * to region_add or region_abort for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t). This number is greater or equal to
 * zero. -ENOMEM is returned if a new file_region structure or cache entry
 * is needed and can not be allocated.
 */
static long region_chg(struct resv_map *resv, long f, long t,
                       long *out_regions_needed)
{
        long chg = 0;

        spin_lock(&resv->lock);

        /* Count how many hugepages in this range are NOT represented. */
        chg = add_reservation_in_range(resv, f, t, NULL, NULL,
                                       out_regions_needed);

        if (*out_regions_needed == 0)
                *out_regions_needed = 1;

        if (allocate_file_region_entries(resv, *out_regions_needed))
                return -ENOMEM;

        resv->adds_in_progress += *out_regions_needed;

        spin_unlock(&resv->lock);
        return chg;
}

/*
 * Abort the in progress add operation. The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add. Operations are sometimes
 * aborted after the call to region_chg. In such cases, region_abort
 * is called to decrement the adds_in_progress counter. regions_needed
 * is the value returned by the region_chg call; it is used to decrement
 * the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine. They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t,
                         long regions_needed)
{
        spin_lock(&resv->lock);
        VM_BUG_ON(!resv->region_cache_count);
        resv->adds_in_progress -= regions_needed;
        spin_unlock(&resv->lock);
}
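
/*
 * Usage sketch (editorial, not in the original source): callers pair these
 * primitives roughly as
 *
 *     chg = region_chg(resv, f, t, &regions_needed);
 *     ... charge the cgroup/subpool for 'chg' pages ...
 *     if (that charging failed)
 *             region_abort(resv, f, t, regions_needed);
 *     else
 *             region_add(resv, f, t, regions_needed, h, h_cg);
 *
 * so that adds_in_progress, bumped by region_chg, is always balanced by
 * exactly one region_add or region_abort call.
 */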

/*
 * Delete the specified range [f, t) from the reserve map. If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted. Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more. In the
 * case where a region must be split, a new region descriptor must
 * be allocated. If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM. Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 */
static long region_del(struct resv_map *resv, long f, long t)
{
        struct list_head *head = &resv->regions;
        struct file_region *rg, *trg;
        struct file_region *nrg = NULL;
        long del = 0;

retry:
        spin_lock(&resv->lock);
        list_for_each_entry_safe(rg, trg, head, link) {
                /*
                 * Skip regions before the range to be deleted. file_region
                 * ranges are normally of the form [from, to). However, there
                 * may be a "placeholder" entry in the map which is of the form
                 * (from, to) with from == to. Check for placeholder entries
                 * at the beginning of the range to be deleted.
                 */
                if (rg->to <= f && (rg->to != rg->from || rg->to != f))
                        continue;

                if (rg->from >= t)
                        break;

                if (f > rg->from && t < rg->to) { /* Must split region */
                        /*
                         * Check for an entry in the cache before dropping
                         * lock and attempting allocation.
                         */
                        if (!nrg &&
                            resv->region_cache_count > resv->adds_in_progress) {
                                nrg = list_first_entry(&resv->region_cache,
                                                       struct file_region,
                                                       link);
                                list_del(&nrg->link);
                                resv->region_cache_count--;
                        }

                        if (!nrg) {
                                spin_unlock(&resv->lock);
                                nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
                                if (!nrg)
                                        return -ENOMEM;
                                goto retry;
                        }

                        del += t - f;
                        hugetlb_cgroup_uncharge_file_region(
                                resv, rg, t - f, false);

                        /* New entry for end of split region */
                        nrg->from = t;
                        nrg->to = rg->to;

                        copy_hugetlb_cgroup_uncharge_info(nrg, rg);

                        INIT_LIST_HEAD(&nrg->link);

                        /* Original entry is trimmed */
                        rg->to = f;

                        list_add(&nrg->link, &rg->link);
                        nrg = NULL;
                        break;
                }

                if (f <= rg->from && t >= rg->to) { /* Remove entire region */
                        del += rg->to - rg->from;
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
                                                            rg->to - rg->from, true);
                        list_del(&rg->link);
                        kfree(rg);
                        continue;
                }

                if (f <= rg->from) {    /* Trim beginning of region */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
                                                            t - rg->from, false);

                        del += t - rg->from;
                        rg->from = t;
                } else {                /* Trim end of region */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
                                                            rg->to - f, false);

                        del += rg->to - f;
                        rg->to = f;
                }
        }

        spin_unlock(&resv->lock);
        kfree(nrg);
        return del;
}
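
/*
 * Worked example (editorial, not in the original source): deleting [4, 6)
 * from a map containing [0, 10) must split the region: the existing entry
 * is trimmed to [0, 4), a descriptor for [6, 10) is inserted after it, and
 * the return value is 2. Deleting [0, LONG_MAX) instead removes or trims
 * every region without ever needing a new descriptor, which is why such
 * callers do not need to handle -ENOMEM.
 */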

/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page. The huge page itself was freed
 * and removed from the page cache. This routine will adjust the subpool
 * usage count, and the global reserve count if needed. By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
        struct hugepage_subpool *spool = subpool_inode(inode);
        long rsv_adjust;
        bool reserved = false;

        rsv_adjust = hugepage_subpool_get_pages(spool, 1);
        if (rsv_adjust > 0) {
                struct hstate *h = hstate_inode(inode);

                if (!hugetlb_acct_memory(h, 1))
                        reserved = true;
        } else if (!rsv_adjust) {
                reserved = true;
        }

        if (!reserved)
                pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}

/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
static long region_count(struct resv_map *resv, long f, long t)
{
        struct list_head *head = &resv->regions;
        struct file_region *rg;
        long chg = 0;

        spin_lock(&resv->lock);
        /* Locate each segment we overlap with, and count that overlap. */
        list_for_each_entry(rg, head, link) {
                long seg_from;
                long seg_to;

                if (rg->to <= f)
                        continue;
                if (rg->from >= t)
                        break;

                seg_from = max(rg->from, f);
                seg_to = min(rg->to, t);

                chg += seg_to - seg_from;
        }
        spin_unlock(&resv->lock);

        return chg;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) * Convert the address within this vma to the page offset within
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) * the mapping, in pagecache page units; huge pages here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) static pgoff_t vma_hugecache_offset(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) struct vm_area_struct *vma, unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) return ((address - vma->vm_start) >> huge_page_shift(h)) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) (vma->vm_pgoff >> huge_page_order(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) return vma_hugecache_offset(hstate_vma(vma), vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) EXPORT_SYMBOL_GPL(linear_hugepage_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) * Return the size of the pages allocated when backing a VMA. In the majority
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) * of cases this will be the same size as that used by the page table entries.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) if (vma->vm_ops && vma->vm_ops->pagesize)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) return vma->vm_ops->pagesize(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) return PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) * Return the page size being used by the MMU to back a VMA. In the majority
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) * of cases, the page size used by the kernel matches the MMU size. On
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) * architectures where it differs, an architecture-specific 'strong'
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) * version of this symbol is required.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) return vma_kernel_pagesize(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) * Flags for MAP_PRIVATE reservations. These are stored in the bottom
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) * bits of the reservation map pointer, which are always clear due to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) * alignment.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) #define HPAGE_RESV_OWNER (1UL << 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) #define HPAGE_RESV_UNMAPPED (1UL << 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) * These helpers are used to track how many pages are reserved for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) * is guaranteed to have its future faults succeed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) * the reserve counters are updated with the hugetlb_lock held. It is safe
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) * to reset the VMA at fork() time as it is not in use yet and there is no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) * chance of the global counters getting corrupted as a result.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) * The private mapping reservation is represented in a subtly different
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) * manner from that of a shared mapping. A shared mapping has a region map
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) * associated with the underlying file; this region map represents the backing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) * file pages which have ever had a reservation assigned, and it persists even
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) * after the page is instantiated. A private mapping has a region map
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) * associated with the original mmap which is attached to all VMAs that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) * reference it; this region map represents those offsets which have consumed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) * a reservation, i.e. where pages have been instantiated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) static unsigned long get_vma_private_data(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) return (unsigned long)vma->vm_private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) static void set_vma_private_data(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) unsigned long value)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) vma->vm_private_data = (void *)value;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) struct hugetlb_cgroup *h_cg,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) struct hstate *h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) #ifdef CONFIG_CGROUP_HUGETLB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) if (!h_cg || !h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) resv_map->reservation_counter = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) resv_map->pages_per_hpage = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) resv_map->css = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) resv_map->reservation_counter =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) &h_cg->rsvd_hugepage[hstate_index(h)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) resv_map->pages_per_hpage = pages_per_huge_page(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) resv_map->css = &h_cg->css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) struct resv_map *resv_map_alloc(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) if (!resv_map || !rg) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) kfree(resv_map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) kfree(rg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) kref_init(&resv_map->refs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) spin_lock_init(&resv_map->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) INIT_LIST_HEAD(&resv_map->regions);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) resv_map->adds_in_progress = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) * Initialize these to 0. On shared mappings, zeroes here indicate that these
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) * fields don't do cgroup accounting. On private mappings, these will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) * re-initialized to the proper values to indicate that hugetlb cgroup
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) * reservations are to be uncharged from here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) INIT_LIST_HEAD(&resv_map->region_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) list_add(&rg->link, &resv_map->region_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) resv_map->region_cache_count = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) return resv_map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) void resv_map_release(struct kref *ref)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) struct list_head *head = &resv_map->region_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) struct file_region *rg, *trg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) /* Clear out any active regions before we release the map. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) region_del(resv_map, 0, LONG_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) /* ... and any entries left in the cache */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) list_for_each_entry_safe(rg, trg, head, link) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) list_del(&rg->link);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) kfree(rg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) VM_BUG_ON(resv_map->adds_in_progress);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) kfree(resv_map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) static inline struct resv_map *inode_resv_map(struct inode *inode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) * At inode evict time, i_mapping may not point to the original
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) * address space within the inode. This original address space
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) * contains the pointer to the resv_map. So, always use the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) * address space embedded within the inode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) * The VERY common case is inode->mapping == &inode->i_data, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) * this may not be true for device special inodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) return (struct resv_map *)(&inode->i_data)->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956)
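/*
 * Return the reservation map for @vma: the map hanging off the inode for
 * shared mappings, or the map stored in vm_private_data (minus the flag
 * bits) for private mappings.
 */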
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) if (vma->vm_flags & VM_MAYSHARE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) struct address_space *mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) return inode_resv_map(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) return (struct resv_map *)(get_vma_private_data(vma) &
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) ~HPAGE_RESV_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) set_vma_private_data(vma, (get_vma_private_data(vma) &
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) HPAGE_RESV_MASK) | (unsigned long)map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) set_vma_private_data(vma, get_vma_private_data(vma) | flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) return (get_vma_private_data(vma) & flag) != 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) if (!(vma->vm_flags & VM_MAYSHARE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) vma->vm_private_data = (void *)0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) /* Returns true if the VMA has associated reserve pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) if (vma->vm_flags & VM_NORESERVE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) * This address is already reserved by another process (chg == 0),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) * so we should decrement the reserved count. Without decrementing,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) * the reserve count would remain after releasing the inode, because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) * the allocated page will go into the page cache and be regarded as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) * coming from the reserved pool in the releasing step. Currently, we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) * don't have any better solution to deal with this situation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) * properly, so add a work-around here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) if (vma->vm_flags & VM_MAYSHARE && chg == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) /* Shared mappings always use reserves */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) if (vma->vm_flags & VM_MAYSHARE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) * We know VM_NORESERVE is not set. Therefore, there SHOULD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) * be a region map for all pages. The only situation where
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) * there is no region map is if a hole was punched via
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) * fallocate. In this case, there really are no reserves to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) * use. This situation is indicated if chg != 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) if (chg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) * Only the process that called mmap() has reserves for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) * private mappings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) * Like the shared case above, a hole punch or truncate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) * could have been performed on the private mapping.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) * Examine the value of chg to determine if reserves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) * actually exist or were previously consumed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) * Very Subtle - The value of chg comes from a previous
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) * call to vma_needs_reserves(). The reserve map for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) * private mappings has different (opposite) semantics
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) * than that of shared mappings. vma_needs_reserves()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) * has already taken this difference in semantics into
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) * account. Therefore, the meaning of chg is the same
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) * as in the shared case above. Code could easily be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) * combined, but keeping it separate draws attention to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) * subtle differences.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) if (chg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066)
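/* Place a free huge page on its node's free list and update the counters. */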
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) static void enqueue_huge_page(struct hstate *h, struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) int nid = page_to_nid(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) list_move(&page->lru, &h->hugepage_freelists[nid]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) h->free_huge_pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) h->free_huge_pages_node[nid]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) SetPageHugeFreed(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075)
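/*
 * Remove the first suitable huge page from @nid's free list and move it to
 * the active list, skipping hardware-poisoned pages and, for tasks running
 * with PF_MEMALLOC_NOCMA set, pages that reside in a CMA area.
 */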
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) if (nocma && is_migrate_cma_page(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) if (PageHWPoison(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) list_move(&page->lru, &h->hugepage_activelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) set_page_refcounted(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) ClearPageHugeFreed(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) h->free_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) h->free_huge_pages_node[nid]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) nodemask_t *nmask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) unsigned int cpuset_mems_cookie;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) struct zonelist *zonelist;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) int node = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) zonelist = node_zonelist(nid, gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) retry_cpuset:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) cpuset_mems_cookie = read_mems_allowed_begin();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) if (!cpuset_zone_allowed(zone, gfp_mask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) * No need to ask again on the same node. The pool is node rather
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) * than zone aware.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) if (zone_to_nid(zone) == node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) node = zone_to_nid(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) page = dequeue_huge_page_node_exact(h, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) goto retry_cpuset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) static struct page *dequeue_huge_page_vma(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) unsigned long address, int avoid_reserve,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) long chg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) struct mempolicy *mpol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) gfp_t gfp_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) nodemask_t *nodemask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) * A child process with MAP_PRIVATE mappings created by its parent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) * has no page reserves. This check ensures that reservations are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) * not "stolen". The child may still get SIGKILLed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) if (!vma_has_reserves(vma, chg) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) h->free_huge_pages - h->resv_huge_pages == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) goto err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) /* If reserves cannot be used, ensure enough pages are in the pool */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) goto err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) gfp_mask = htlb_alloc_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) SetPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) h->resv_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) mpol_cond_put(mpol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) err:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) * Common helper functions for hstate_next_node_to_{alloc|free}.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) * We may have allocated or freed a huge page based on a different
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) * nodes_allowed previously, so h->next_node_to_{alloc|free} might
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) * be outside of *nodes_allowed. Ensure that we use an allowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) * node for alloc or free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) nid = next_node_in(nid, *nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) VM_BUG_ON(nid >= MAX_NUMNODES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) return nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) if (!node_isset(nid, *nodes_allowed))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) nid = next_node_allowed(nid, nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) return nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) * Returns the previously saved node ["this node"] from which to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) * allocate a persistent huge page for the pool, and advances the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) * next node from which to allocate, handling wrap at the end of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) * node mask.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) static int hstate_next_node_to_alloc(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) VM_BUG_ON(!nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) return nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) * Helper for free_pool_huge_page() - return the previously saved
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) * node ["this node"] from which to free a huge page. Advance the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) * next node id whether or not we find a free huge page to free so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) * that the next attempt to free addresses the next node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) VM_BUG_ON(!nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) return nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232)
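/*
 * Iterate over the nodes in @mask at most once each, in the round-robin
 * order maintained by hstate_next_node_to_{alloc,free}().  The "|| 1"
 * keeps the assignment expression truthy even when the chosen node id is
 * 0, so termination is controlled solely by nr_nodes.
 */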
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) for (nr_nodes = nodes_weight(*mask); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) nr_nodes > 0 && \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) nr_nodes--)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) for (nr_nodes = nodes_weight(*mask); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) nr_nodes > 0 && \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) ((node = hstate_next_node_to_free(hs, mask)) || 1); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) nr_nodes--)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
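/*
 * Undo prep_compound_gigantic_page(): clear the compound metadata so the
 * tail pages become individual, refcounted base pages again.
 */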
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) static void destroy_compound_gigantic_page(struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) int nr_pages = 1 << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) struct page *p = page + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) atomic_set(compound_mapcount_ptr(page), 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) atomic_set(compound_pincount_ptr(page), 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) clear_compound_head(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) set_page_refcounted(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) set_compound_order(page, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) page[1].compound_nr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) __ClearPageHead(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) static void free_gigantic_page(struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) * If the page isn't allocated using the cma allocator,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) * cma_release() returns false.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) free_contig_range(page_to_pfn(page), 1 << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) #ifdef CONFIG_CONTIG_ALLOC
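/*
 * Allocate the physically contiguous range backing a gigantic huge page.
 * The CMA area of the requested node is tried first; unless __GFP_THISNODE
 * is set, the CMA areas of the other allowed nodes are tried next, before
 * falling back to a generic contiguous allocation.
 */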
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) int nid, nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) unsigned long nr_pages = 1UL << huge_page_order(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) if (nid == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) nid = numa_mem_id();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) int node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) if (hugetlb_cma[nid]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) page = cma_alloc(hugetlb_cma[nid], nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) huge_page_order(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) GFP_KERNEL | __GFP_NOWARN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) if (!(gfp_mask & __GFP_THISNODE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) for_each_node_mask(node, *nodemask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) if (node == nid || !hugetlb_cma[node])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) page = cma_alloc(hugetlb_cma[node], nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) huge_page_order(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) GFP_KERNEL | __GFP_NOWARN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) #else /* !CONFIG_CONTIG_ALLOC */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) int nid, nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) #endif /* CONFIG_CONTIG_ALLOC */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) int nid, nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) static inline void free_gigantic_page(struct page *page, unsigned int order) { }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) static inline void destroy_compound_gigantic_page(struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) unsigned int order) { }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337)
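/*
 * Remove a huge page from the hstate accounting, clear the hugetlb state
 * from its subpages and return the memory to the allocator.  For gigantic
 * pages, hugetlb_lock is dropped around the potentially blocking
 * free_gigantic_page() call.
 */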
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) static void update_and_free_page(struct hstate *h, struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) struct page *subpage = page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) h->nr_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) h->nr_huge_pages_node[page_to_nid(page)]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) for (i = 0; i < pages_per_huge_page(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) i++, subpage = mem_map_next(subpage, page, i)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) 1 << PG_referenced | 1 << PG_dirty |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) 1 << PG_active | 1 << PG_private |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) 1 << PG_writeback);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) set_page_refcounted(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) if (hstate_is_gigantic(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) * Temporarily drop the hugetlb_lock, because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) * we might block in free_gigantic_page().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) destroy_compound_gigantic_page(page, huge_page_order(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) free_gigantic_page(page, huge_page_order(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) __free_pages(page, huge_page_order(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) struct hstate *size_to_hstate(unsigned long size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) if (huge_page_size(h) == size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) return h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) * to hstate->hugepage_activelist).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) * This function can be called for tail pages, but never returns true for them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) bool page_huge_active(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) return PageHeadHuge(page) && PagePrivate(&page[1]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) /* never called for tail page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) void set_page_huge_active(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) SetPagePrivate(&page[1]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) static void clear_page_huge_active(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) ClearPagePrivate(&page[1]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) * Internal hugetlb-specific page flag, encoded by storing -1 in the mapping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) * field of the second tail page (page[2]). Do not use outside of hugetlb code.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) static inline bool PageHugeTemporary(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) if (!PageHuge(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) return (unsigned long)page[2].mapping == -1U;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) static inline void SetPageHugeTemporary(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) page[2].mapping = (void *)-1U;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) static inline void ClearPageHugeTemporary(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) page[2].mapping = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) static void __free_huge_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) * Can't pass hstate in here because it is called from the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) * compound page destructor.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) struct hstate *h = page_hstate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) int nid = page_to_nid(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) struct hugepage_subpool *spool =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) (struct hugepage_subpool *)page_private(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) bool restore_reserve;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) VM_BUG_ON_PAGE(page_count(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) VM_BUG_ON_PAGE(page_mapcount(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) set_page_private(page, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) page->mapping = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) restore_reserve = PagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) ClearPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) * If PagePrivate() was set on the page, the page allocation consumed a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) * reservation. If the page was associated with a subpool, there
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) * would have been a page reserved in the subpool before allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) * via hugepage_subpool_get_pages(). Since we are 'restoring' the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) * reservation, do not call hugepage_subpool_put_pages() as this will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) * remove the reserved page from the subpool.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) if (!restore_reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) * A return code of zero implies that the subpool will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) * under its minimum size if the reservation is not restored
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) * after the page is freed. Therefore, force the restore_reserve
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) * operation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) if (hugepage_subpool_put_pages(spool, 1) == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) restore_reserve = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) clear_page_huge_active(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) hugetlb_cgroup_uncharge_page(hstate_index(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) pages_per_huge_page(h), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) pages_per_huge_page(h), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) if (restore_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) h->resv_huge_pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) if (PageHugeTemporary(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) ClearPageHugeTemporary(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) update_and_free_page(h, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) } else if (h->surplus_huge_pages_node[nid]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) /* remove the page from active list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) update_and_free_page(h, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) h->surplus_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) h->surplus_huge_pages_node[nid]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) arch_clear_hugepage_flags(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) enqueue_huge_page(h, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) * As free_huge_page() can be called from a non-task context, we have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) * to defer the actual freeing to a workqueue to prevent a potential
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) * hugetlb_lock deadlock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) * free_hpage_workfn() locklessly retrieves the linked list of pages to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) * be freed and frees them one-by-one. As the page->mapping pointer is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) * going to be cleared in __free_huge_page() anyway, it is reused as the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) * llist_node structure of a lockless linked list of huge pages to be freed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) static LLIST_HEAD(hpage_freelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) static void free_hpage_workfn(struct work_struct *work)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) struct llist_node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) node = llist_del_all(&hpage_freelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) page = container_of((struct address_space **)node,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) struct page, mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) node = node->next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) __free_huge_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) void free_huge_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) if (!in_task()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) * Only call schedule_work() if hpage_freelist was previously
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) * empty. Otherwise, schedule_work() has already been called but the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) * workfn hasn't retrieved the list yet.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) if (llist_add((struct llist_node *)&page->mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) &hpage_freelist))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) schedule_work(&free_hpage_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) __free_huge_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542)
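/*
 * Account a freshly allocated huge page to @h: set the hugetlb compound
 * destructor, clear its cgroup state and bump the per-hstate counters.
 */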
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) INIT_LIST_HEAD(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) set_hugetlb_cgroup(page, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) set_hugetlb_cgroup_rsvd(page, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) h->nr_huge_pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) h->nr_huge_pages_node[nid]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) ClearPageHugeFreed(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555)
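/*
 * Turn a freshly allocated contiguous range into a compound page: mark the
 * head page, then clear PG_reserved on each tail page and chain it to the
 * head (see the comment in the loop below for why PG_reserved must not be
 * left set on the tail pages).
 */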
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) static void prep_compound_gigantic_page(struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) int nr_pages = 1 << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) struct page *p = page + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) /* we rely on prep_new_huge_page to set the destructor */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) set_compound_order(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) __ClearPageReserved(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) __SetPageHead(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) * For gigantic hugepages allocated through bootmem at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) * boot, it's safer to be consistent with the not-gigantic
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) * hugepages and clear the PG_reserved bit from all tail pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) * too. Otherwise drivers using get_user_pages() to access tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) * pages may get the reference counting wrong if they see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) * PG_reserved set on a tail page (despite the head page not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) * having PG_reserved set). Enforcing this consistency between
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) * head and tail pages allows drivers to optimize away a check
		 * on the head page when they need to know if put_page() is
		 * needed after get_user_pages().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) __ClearPageReserved(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) set_page_count(p, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) set_compound_head(p, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) atomic_set(compound_mapcount_ptr(page), -1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) atomic_set(compound_pincount_ptr(page), 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) }
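
/*
 * As a concrete example, a 1 GB gigantic page built from 4 KB base pages has
 * order 18, so the loop above walks nr_pages - 1 = 262143 tail pages, giving
 * each one a cleared PG_reserved bit, a zero refcount and a compound_head
 * link back to the head page.
 */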
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) * PageHuge() only returns true for hugetlbfs pages, but not for normal or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) * transparent huge pages. See the PageTransHuge() documentation for more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) * details.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) int PageHuge(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) if (!PageCompound(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) page = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) EXPORT_SYMBOL_GPL(PageHuge);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) /*
 * PageHeadHuge() only returns true for a hugetlbfs head page, but not for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) * normal or transparent huge pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) int PageHeadHuge(struct page *page_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) if (!PageHead(page_head))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) }
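
/*
 * Both predicates above key off the compound destructor stored in the first
 * tail page (page[1].compound_dtor), which prep_new_huge_page() sets to
 * HUGETLB_PAGE_DTOR. A typical caller might look roughly like the following
 * (illustrative sketch, not lifted from an actual caller):
 *
 *	if (PageHuge(page))
 *		h = page_hstate(compound_head(page));
 */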
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) * Find and lock address space (mapping) in write mode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) *
 * Upon entry, the page is locked, which means that page_mapping() is
 * stable. Due to locking order, we can only trylock_write. If we cannot
 * get the lock, simply return NULL to the caller.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) struct address_space *mapping = page_mapping(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) if (!mapping)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) return mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) if (i_mmap_trylock_write(mapping))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) return mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) pgoff_t hugetlb_basepage_index(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) struct page *page_head = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) pgoff_t index = page_index(page_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) unsigned long compound_idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) if (compound_order(page_head) >= MAX_ORDER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) compound_idx = page - page_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) return (index << compound_order(page_head)) + compound_idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) }
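
/*
 * Worked example: for a 2 MB huge page built from 4 KB base pages (order 9),
 * a huge page at file index 3 containing a base page 5 pages past the head
 * yields (3 << 9) + 5 = 1541 as the base-page index.
 */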
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) static struct page *alloc_buddy_huge_page(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) gfp_t gfp_mask, int nid, nodemask_t *nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) nodemask_t *node_alloc_noretry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) int order = huge_page_order(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) bool alloc_try_hard = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) * By default we always try hard to allocate the page with
	 * the __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) * a loop (to adjust global huge page counts) and previous allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) * failed, do not continue to try hard on the same node. Use the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) * node_alloc_noretry bitmap to manage this state information.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) alloc_try_hard = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) gfp_mask |= __GFP_COMP|__GFP_NOWARN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) if (alloc_try_hard)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) gfp_mask |= __GFP_RETRY_MAYFAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) if (nid == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) nid = numa_mem_id();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) __count_vm_event(HTLB_BUDDY_PGALLOC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) /*
	 * If we did not specify __GFP_RETRY_MAYFAIL but still got a page, this
	 * indicates an overall state change. Clear the bit so that we resume
	 * normal 'try hard' allocations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) if (node_alloc_noretry && page && !alloc_try_hard)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) node_clear(nid, *node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) /*
	 * If we tried hard to get a page but failed, set the bit so that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) * subsequent attempts will not try as hard until there is an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) * overall state change.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) if (node_alloc_noretry && !page && alloc_try_hard)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) node_set(nid, *node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) }
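
/*
 * When a node_alloc_noretry mask is supplied, the handling above boils down
 * to the following table (no change is made in the remaining combinations):
 *
 *	alloc_try_hard	result	action on node_alloc_noretry
 *	true		NULL	set the node's bit (stop trying hard)
 *	false		page	clear the node's bit (resume trying hard)
 */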
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) * Common helper to allocate a fresh hugetlb page. All specific allocators
 * should use this function to get new hugetlb pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) static struct page *alloc_fresh_huge_page(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) gfp_t gfp_mask, int nid, nodemask_t *nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) nodemask_t *node_alloc_noretry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) page = alloc_buddy_huge_page(h, gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) nid, nmask, node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) prep_compound_gigantic_page(page, huge_page_order(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) prep_new_huge_page(h, page, page_to_nid(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) /*
 * Allocates a fresh page to the hugetlb allocator pool, interleaving
 * allocations across the allowed nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) nodemask_t *node_alloc_noretry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) int nr_nodes, node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) put_page(page); /* free it into the hugepage allocator */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) }
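
/*
 * The put_page() above works because prep_new_huge_page() installed
 * HUGETLB_PAGE_DTOR and the page returned by alloc_fresh_huge_page() still
 * holds its allocator reference: dropping that last reference therefore lands
 * in free_huge_page(), which (for a fresh, non-surplus page) enqueues the
 * page on the hstate free list rather than returning it to the buddy
 * allocator.
 */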
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) /*
 * Free a huge page from the pool, taking it from the next node in the
 * round-robin freeing sequence. Attempts to keep persistent huge pages
 * more or less balanced over the allowed nodes.
 * Called with hugetlb_lock held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) bool acct_surplus)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) int nr_nodes, node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) * If we're returning unused surplus pages, only examine
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) * nodes with surplus pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) !list_empty(&h->hugepage_freelists[node])) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) struct page *page =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) list_entry(h->hugepage_freelists[node].next,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) struct page, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) h->free_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) h->free_huge_pages_node[node]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) if (acct_surplus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) h->surplus_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) h->surplus_huge_pages_node[node]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) update_and_free_page(h, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) * Dissolve a given free hugepage into free buddy pages. This function does
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) * nothing for in-use hugepages and non-hugepages.
 * This function returns one of the following values:
 *
 *  -EBUSY: failed to dissolve the free hugepage, or the hugepage is in use
 *          (allocated or reserved)
 *  0:      successfully dissolved the free hugepage, or the page is not a
 *          hugepage (considered as already dissolved)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) int dissolve_free_huge_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) int rc = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) /* Not to disrupt normal path by vainly holding hugetlb_lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) if (!PageHuge(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) if (!PageHuge(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) if (!page_count(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) struct page *head = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) struct hstate *h = page_hstate(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) int nid = page_to_nid(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) if (h->free_huge_pages - h->resv_huge_pages == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) * We should make sure that the page is already on the free list
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) * when it is dissolved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) if (unlikely(!PageHugeFreed(head))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) /*
			 * Theoretically, we should return -EBUSY when we
			 * encounter this race. In practice the race window
			 * is quite small, so a retry has a good chance of
			 * dissolving the page successfully, which improves
			 * the overall success rate of dissolving pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) * Move PageHWPoison flag from head page to the raw error page,
		 * which makes any subpages other than the error page reusable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) if (PageHWPoison(head) && page != head) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) SetPageHWPoison(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) ClearPageHWPoison(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) list_del(&head->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) h->free_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) h->free_huge_pages_node[nid]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) h->max_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) update_and_free_page(h, head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) * make specified memory blocks removable from the system.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) * Note that this will dissolve a free gigantic hugepage completely, if any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) * part of it lies within the given range.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) * Also note that if dissolve_free_huge_page() returns with an error, all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) * free hugepages that were dissolved before that error are lost.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) unsigned long pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) int rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) if (!hugepages_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) rc = dissolve_free_huge_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) }
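
/*
 * For example, with minimum_order == 9 (2 MB huge pages on 4 KB base pages),
 * the loop above advances 1 << 9 == 512 pfns per iteration, probing one
 * candidate page per 2 MB step of the hotplug range.
 */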
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) * Allocates a fresh surplus page from the page allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) int nid, nodemask_t *nmask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) /*
	 * We could have raced with a pool size change.
	 * Double check for that and simply deallocate the new page
	 * if we would end up overcommitting the surplus pages. Abuse
	 * the temporary-page flag to work around the nasty free_huge_page
	 * codeflow.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) SetPageHugeTemporary(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) h->surplus_huge_pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) h->surplus_huge_pages_node[page_to_nid(page)]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) }
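
/*
 * For instance, with nr_overcommit_huge_pages == 2 and surplus_huge_pages
 * already at 2, the first check above bails out before allocating anything.
 * If another task instead consumes the headroom while the lock is dropped,
 * the second check marks the fresh page PageHugeTemporary so that
 * free_huge_page() releases it back to the page allocator instead of
 * accounting it as surplus.
 */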
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) int nid, nodemask_t *nmask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) * We do not account these pages as surplus because they are only
	 * temporary and will be released properly on the last reference.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) SetPageHugeTemporary(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) * Use the VMA's mpolicy to allocate a huge page from the buddy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) static
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) struct mempolicy *mpol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) gfp_t gfp_mask = htlb_alloc_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) nodemask_t *nodemask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) mpol_cond_put(mpol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) /* page migration callback function */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) nodemask_t *nmask, gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) if (h->free_huge_pages - h->resv_huge_pages > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) if (page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) /* mempolicy aware migration callback */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) struct mempolicy *mpol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) nodemask_t *nodemask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) gfp_t gfp_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) int node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) gfp_mask = htlb_alloc_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) mpol_cond_put(mpol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) * Increase the hugetlb pool such that it can accommodate a reservation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) * of size 'delta'.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) static int gather_surplus_pages(struct hstate *h, int delta)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) __must_hold(&hugetlb_lock)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) struct list_head surplus_list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) struct page *page, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) int ret, i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) int needed, allocated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) bool alloc_ok = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) if (needed <= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) h->resv_huge_pages += delta;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) allocated = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) INIT_LIST_HEAD(&surplus_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) for (i = 0; i < needed; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) NUMA_NO_NODE, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) if (!page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) alloc_ok = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) list_add(&page->lru, &surplus_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) allocated += i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) * After retaking hugetlb_lock, we need to recalculate 'needed'
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) * because either resv_huge_pages or free_huge_pages may have changed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) needed = (h->resv_huge_pages + delta) -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) (h->free_huge_pages + allocated);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) if (needed > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) if (alloc_ok)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) * We were not able to allocate enough pages to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) * satisfy the entire reservation so we free what
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) * we've allocated so far.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) goto free;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) * The surplus_list now contains _at_least_ the number of extra pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) * needed to accommodate the reservation. Add the appropriate number
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) * of pages to the hugetlb pool and free the extras back to the buddy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) * allocator. Commit the entire reservation here to prevent another
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) * process from stealing the pages as they are added to the pool but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) * before they are reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) needed += allocated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) h->resv_huge_pages += delta;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) /* Free the needed pages to the hugetlb pool */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) if ((--needed) < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) * This page is now managed by the hugetlb allocator and has
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) * no users -- drop the buddy allocator's reference.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) put_page_testzero(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) VM_BUG_ON_PAGE(page_count(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) enqueue_huge_page(h, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) free:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) /* Free unnecessary surplus pages to the buddy allocator */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) list_for_each_entry_safe(page, tmp, &surplus_list, lru)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) }
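
/*
 * Worked example: with resv_huge_pages == 10, free_huge_pages == 12 and
 * delta == 5, needed starts at (10 + 5) - 12 = 3, so three surplus pages are
 * allocated. If nothing changed while the lock was dropped, the recheck
 * yields (10 + 5) - (12 + 3) = 0, the reservation is committed
 * (resv_huge_pages becomes 15), needed is bumped back to 3, and all three
 * pages are enqueued on the hugetlb free lists with none returned to the
 * page allocator.
 */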
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) * This routine has two main purposes:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) * 1) Decrement the reservation count (resv_huge_pages) by the value passed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) * in unused_resv_pages. This corresponds to the prior adjustments made
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) * to the associated reservation map.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) * 2) Free any unused surplus pages that may have been allocated to satisfy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) * the reservation. As many as unused_resv_pages may be freed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) * Called with hugetlb_lock held. However, the lock could be dropped (and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) * reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) * we must make sure nobody else can claim pages we are in the process of
 * freeing. Do this by ensuring resv_huge_pages is always greater than the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) * number of huge pages we plan to free when dropping the lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) static void return_unused_surplus_pages(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108) unsigned long unused_resv_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) unsigned long nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) /* Cannot return gigantic pages currently */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) * Part (or even all) of the reservation could have been backed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) * by pre-allocated pages. Only free surplus pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) * We want to release as many surplus pages as possible, spread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) * evenly across all nodes with memory. Iterate across these nodes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) * until we can no longer free unreserved surplus pages. This occurs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) * when the nodes with surplus pages have no free pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) * free_pool_huge_page() will balance the freed pages across the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) * on-line nodes with memory and will handle the hstate accounting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) * Note that we decrement resv_huge_pages as we free the pages. If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) * we drop the lock, resv_huge_pages will still be sufficiently large
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) * to cover subsequent pages we may free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) while (nr_pages--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) h->resv_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) unused_resv_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) cond_resched_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) /* Fully uncommit the reservation */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) h->resv_huge_pages -= unused_resv_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) }
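
/*
 * For example, with unused_resv_pages == 6 and surplus_huge_pages == 4, the
 * loop above frees four surplus pages (if all four frees succeed),
 * decrementing resv_huge_pages as it goes, and the final adjustment at 'out:'
 * subtracts the remaining 2, so the reservation shrinks by 6 in total.
 */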
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) * are used by the huge page allocation routines to manage reservations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) * vma_needs_reservation is called to determine if the huge page at addr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) * within the vma has an associated reservation. If a reservation is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) * needed, the value 1 is returned. The caller is then responsible for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) * managing the global reservation and subpool usage counts. After
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) * the huge page has been allocated, vma_commit_reservation is called
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) * to add the page to the reservation map. If the page allocation fails,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) * the reservation must be ended instead of committed. vma_end_reservation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) * is called in such cases.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) * In the normal case, vma_commit_reservation returns the same value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) * as the preceding vma_needs_reservation call. The only time this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) * is not the case is if a reserve map was changed between calls. It
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) * is the responsibility of the caller to notice the difference and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) * take appropriate action.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) * vma_add_reservation is used in error paths where a reservation must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) * be restored when a newly allocated huge page must be freed. It is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) * to be called after calling vma_needs_reservation to determine if a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) * reservation exists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) enum vma_resv_mode {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) VMA_NEEDS_RESV,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) VMA_COMMIT_RESV,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) VMA_END_RESV,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) VMA_ADD_RESV,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) };
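
/*
 * A typical allocation path therefore looks roughly like the following
 * (sketch only; alloc_huge_page() below also wraps subpool and cgroup
 * charging around these steps):
 *
 *	chg = vma_needs_reservation(h, vma, addr);
 *	page = ...allocate a huge page...;
 *	if (page)
 *		vma_commit_reservation(h, vma, addr);
 *	else
 *		vma_end_reservation(h, vma, addr);
 */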
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) static long __vma_reservation_common(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) struct vm_area_struct *vma, unsigned long addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) enum vma_resv_mode mode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) struct resv_map *resv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) pgoff_t idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) long ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) long dummy_out_regions_needed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) resv = vma_resv_map(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) if (!resv)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) idx = vma_hugecache_offset(h, vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) switch (mode) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) case VMA_NEEDS_RESV:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) /* We assume that vma_reservation_* routines always operate on
		 * 1 page, and that adding a 1 page entry to the resv map can
		 * only ever require 1 region.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) VM_BUG_ON(dummy_out_regions_needed != 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) case VMA_COMMIT_RESV:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) /* region_add calls of range 1 should never fail. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) VM_BUG_ON(ret < 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) case VMA_END_RESV:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) region_abort(resv, idx, idx + 1, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) case VMA_ADD_RESV:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) if (vma->vm_flags & VM_MAYSHARE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) /* region_add calls of range 1 should never fail. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) VM_BUG_ON(ret < 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) region_abort(resv, idx, idx + 1, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) ret = region_del(resv, idx, idx + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) if (vma->vm_flags & VM_MAYSHARE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) * In most cases, reserves always exist for private mappings.
		 * However, the file associated with the mapping could have been
		 * hole punched or truncated after reserves were consumed, in
		 * which case a subsequent fault on such a range will not use
		 * reserves.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) * Subtle - The reserve map for private mappings has the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) * opposite meaning than that of shared mappings. If NO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) * entry is in the reserve map, it means a reservation exists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) * If an entry exists in the reserve map, it means the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) * reservation has already been consumed. As a result, the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) * return value of this routine is the opposite of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) * value returned from reserve map manipulation routines above.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) return ret < 0 ? ret : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) static long vma_needs_reservation(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) static long vma_commit_reservation(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) static void vma_end_reservation(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) static long vma_add_reservation(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) * This routine is called to restore a reservation on error paths. In the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) * specific error paths, a huge page was allocated (via alloc_huge_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) * and is about to be freed. If a reservation for the page existed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) * alloc_huge_page would have consumed the reservation and set PagePrivate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) * in the newly allocated page. When the page is freed via free_huge_page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) * the global reservation count will be incremented if PagePrivate is set.
 * However, free_huge_page cannot adjust the reserve map. Adjust the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) * reserve map here to be consistent with global reserve count adjustments
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) * to be made by free_huge_page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) static void restore_reserve_on_error(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) struct vm_area_struct *vma, unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) if (unlikely(PagePrivate(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) long rc = vma_needs_reservation(h, vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) if (unlikely(rc < 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) * Rare out of memory condition in reserve map
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) * manipulation. Clear PagePrivate so that
			 * the global reserve count will not be incremented
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) * by free_huge_page. This will make it appear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) * as though the reservation for this page was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) * consumed. This may prevent the task from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) * faulting in the page at a later time. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) * is better than inconsistent global huge page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) * accounting of reserve counts.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) ClearPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) } else if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) rc = vma_add_reservation(h, vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) if (unlikely(rc < 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) * See above comment about rare out of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) * memory condition.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) ClearPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) vma_end_reservation(h, vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) }
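
/*
 * Roughly, the three branches above are:
 *
 *	rc < 0	reserve map allocation failed; clear PagePrivate and accept
 *		that the reservation is lost
 *	rc > 0	the reserve map no longer reflects a reservation for this
 *		address, so vma_add_reservation() adjusts it to stay in sync
 *		with the reserve count free_huge_page() will restore
 *	rc == 0	the map already reflects the reservation, so only the
 *		region_chg() from vma_needs_reservation() has to be undone
 *		via vma_end_reservation()
 */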
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) struct page *alloc_huge_page(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) unsigned long addr, int avoid_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) struct hugepage_subpool *spool = subpool_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) long map_chg, map_commit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) long gbl_chg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) int ret, idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) struct hugetlb_cgroup *h_cg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) bool deferred_reserve;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) idx = hstate_index(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) * Examine the region/reserve map to determine if the process
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) * has a reservation for the page to be allocated. A return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) * code of zero indicates a reservation exists (no change).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) if (map_chg < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) return ERR_PTR(-ENOMEM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) * Processes that did not create the mapping will have no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) * reserves as indicated by the region/reserve map. Check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) * that the allocation will not exceed the subpool limit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) * Allocations for MAP_NORESERVE mappings also need to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) * checked against any subpool limit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) if (map_chg || avoid_reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) gbl_chg = hugepage_subpool_get_pages(spool, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) if (gbl_chg < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) vma_end_reservation(h, vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) return ERR_PTR(-ENOSPC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) * Even though there was no reservation in the region/reserve
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) * map, there could be reservations associated with the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) * subpool that can be used. This is indicated by a return value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) * of zero from hugepage_subpool_get_pages().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) * However, if avoid_reserve is specified we still avoid even
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) * the subpool reservations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) if (avoid_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) gbl_chg = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364)
/*
* If this allocation is not consuming an existing reservation, charge
* the hugetlb cgroup reservation limit for it now (deferred_reserve).
*/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) if (deferred_reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) ret = hugetlb_cgroup_charge_cgroup_rsvd(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) idx, pages_per_huge_page(h), &h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) goto out_subpool_put;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) goto out_uncharge_cgroup_reservation;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) * gbl_chg is passed to indicate whether or not a page must be taken
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) * from the global free pool (global change). gbl_chg == 0 indicates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) * a reservation exists for the allocation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) if (!page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) goto out_uncharge_cgroup;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) SetPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393) h->resv_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) list_add(&page->lru, &h->hugepage_activelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) /* Fall through */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399) hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
/*
* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
*/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) if (deferred_reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405) h_cg, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409)
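/*
* Record the subpool in page private so that free_huge_page() can return
* the page to (and credit) the correct subpool when it is freed.
*/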
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) set_page_private(page, (unsigned long)spool);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) map_commit = vma_commit_reservation(h, vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) if (unlikely(map_chg > map_commit)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) * The page was added to the reservation map between
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) * vma_needs_reservation and vma_commit_reservation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) * This indicates a race with hugetlb_reserve_pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) * Adjust for the subpool count incremented above AND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) * in hugetlb_reserve_pages for the same page. Also,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) * the reservation count added in hugetlb_reserve_pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) * no longer applies.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) long rsv_adjust;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) rsv_adjust = hugepage_subpool_put_pages(spool, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) hugetlb_acct_memory(h, -rsv_adjust);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) if (deferred_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) pages_per_huge_page(h), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) out_uncharge_cgroup:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) out_uncharge_cgroup_reservation:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) if (deferred_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) out_subpool_put:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) if (map_chg || avoid_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) hugepage_subpool_put_pages(spool, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) vma_end_reservation(h, vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) return ERR_PTR(-ENOSPC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445)
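/*
* Allocate a gigantic page from memblock at boot. The weak alias lets an
* architecture provide its own implementation. Returns 1 on success and 0
* on failure; successful allocations are queued on huge_boot_pages and are
* turned into proper huge pages later by gather_bootmem_prealloc().
*/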
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) int alloc_bootmem_huge_page(struct hstate *h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) int __alloc_bootmem_huge_page(struct hstate *h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) struct huge_bootmem_page *m;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451) int nr_nodes, node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) void *addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) addr = memblock_alloc_try_nid_raw(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) huge_page_size(h), huge_page_size(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459) if (addr) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) * Use the beginning of the huge page to store the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462) * huge_bootmem_page struct (until gather_bootmem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) * puts them into the mem_map).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) m = addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466) goto found;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) found:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) /* Put them into a private list first because mem_map is not up yet */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) INIT_LIST_HEAD(&m->list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) list_add(&m->list, &huge_boot_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) m->hstate = h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481) * Put bootmem huge pages into the standard lists after mem_map is up.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) * Note: This only applies to gigantic (order >= MAX_ORDER) pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) static void __init gather_bootmem_prealloc(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) struct huge_bootmem_page *m;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) list_for_each_entry(m, &huge_boot_pages, list) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) struct page *page = virt_to_page(m);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) struct hstate *h = m->hstate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) VM_BUG_ON(!hstate_is_gigantic(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) WARN_ON(page_count(page) != 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) prep_compound_gigantic_page(page, huge_page_order(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) WARN_ON(PageReserved(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) prep_new_huge_page(h, page, page_to_nid(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497) put_page(page); /* free it into the hugepage allocator */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) * We need to restore the 'stolen' pages to totalram_pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) * in order to fix confusing memory reports from free(1) and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) * other side-effects, like CommitLimit going negative.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) adjust_managed_page_count(page, pages_per_huge_page(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) unsigned long i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) nodemask_t *node_alloc_noretry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) if (!hstate_is_gigantic(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) * Bit mask controlling how hard we retry per-node allocations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) * Ignore errors as lower level routines can deal with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) * node_alloc_noretry == NULL. If this kmalloc fails at boot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) * time, we are likely in bigger trouble.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) /* allocations done at boot time */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) node_alloc_noretry = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) /* bit mask controlling how hard we retry per-node allocations */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) if (node_alloc_noretry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) nodes_clear(*node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) for (i = 0; i < h->max_huge_pages; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) if (hstate_is_gigantic(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) if (hugetlb_cma_size) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) goto free;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) if (!alloc_bootmem_huge_page(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) } else if (!alloc_pool_huge_page(h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) &node_states[N_MEMORY],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) node_alloc_noretry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) if (i < h->max_huge_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) char buf[32];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549) string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551) h->max_huge_pages, buf, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) h->max_huge_pages = i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) }
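/*
* kfree(NULL) is a no-op, so the gigantic case (which never allocates
* node_alloc_noretry) can share this exit path.
*/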
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554) free:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) kfree(node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) static void __init hugetlb_init_hstates(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561)
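/*
* Walk every registered hstate: track the smallest huge page order in
* minimum_order and populate the pool now for non-gigantic sizes.
*/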
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) if (minimum_order > huge_page_order(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) minimum_order = huge_page_order(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) /* oversized (gigantic) hugepages were already allocated in early boot */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) if (!hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) hugetlb_hstate_alloc_pages(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) VM_BUG_ON(minimum_order == UINT_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) static void __init report_hugepages(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) char buf[32];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) pr_info("HugeTLB registered %s page size, pre-allocated %lu pages\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) buf, h->free_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) #ifdef CONFIG_HIGHMEM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) static void try_to_free_low(struct hstate *h, unsigned long count,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595) for_each_node_mask(i, *nodes_allowed) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) struct page *page, *next;
struct list_head *freel = &h->hugepage_freelists[i];

list_for_each_entry_safe(page, next, freel, lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599) if (count >= h->nr_huge_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601) if (PageHighMem(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) update_and_free_page(h, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) h->free_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) h->free_huge_pages_node[page_to_nid(page)]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) static inline void try_to_free_low(struct hstate *h, unsigned long count,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612) nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618) * Increment or decrement surplus_huge_pages. Keep node-specific counters
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619) * balanced by operating on them in a round-robin fashion.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) * Returns 1 if an adjustment was made.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) int delta)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) int nr_nodes, node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627) VM_BUG_ON(delta != -1 && delta != 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628)
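/*
* Note the iterator choice below: dropping surplus state (delta < 0)
* effectively adds a persistent page, so the "to_alloc" ordering is used;
* raising surplus state (delta > 0) is the freeing direction and uses the
* "to_free" ordering. This keeps the per-node counters balanced.
*/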
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) if (delta < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) if (h->surplus_huge_pages_node[node])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) goto found;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635) for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636) if (h->surplus_huge_pages_node[node] <
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) h->nr_huge_pages_node[node])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638) goto found;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) found:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644) h->surplus_huge_pages += delta;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645) h->surplus_huge_pages_node[node] += delta;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648)
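/*
* "Persistent" huge pages are the ones managed through nr_hugepages,
* i.e. the whole pool minus any temporary surplus (overcommit) pages.
*/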
#define persistent_huge_pages(h) ((h)->nr_huge_pages - (h)->surplus_huge_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650) static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) unsigned long min_count, ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657) * Bit mask controlling how hard we retry per-node allocations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658) * If we cannot allocate the bit mask, do not attempt to allocate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659) * the requested huge pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) if (node_alloc_noretry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) nodes_clear(*node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) * Check for a node specific request.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670) * Changing node specific huge page count may require a corresponding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) * change to the global count. In any case, the passed node mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) * (nodes_allowed) will restrict alloc/free to the specified node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) if (nid != NUMA_NO_NODE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) unsigned long old_count = count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) * User may have specified a large count value which caused the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) * above calculation to overflow. In this case, they wanted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) * to allocate as many huge pages as possible. Set count to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) * largest possible value to align with their intention.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) if (count < old_count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) count = ULONG_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688) /*
* Runtime allocation of gigantic pages depends on the ability to
* allocate large, contiguous page ranges (CONFIG_CONTIG_ALLOC).
* If the system does not provide this feature, return an error when
* the user tries to allocate gigantic pages, but still let the user
* free gigantic pages that were allocated at boot time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) if (count > persistent_huge_pages(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698) NODEMASK_FREE(node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701) /* Fall through to decrease pool */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) * Increase the pool size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) * First take pages out of surplus state. Then make up the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) * remaining difference by allocating fresh huge pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) * We might race with alloc_surplus_huge_page() here and be unable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) * to convert a surplus huge page to a normal huge page. That is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) * not critical, though, it just means the overall size of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) * pool might be one hugepage larger than it needs to be, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) * within all the constraints specified by the sysctls.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) if (!adjust_pool_surplus(h, nodes_allowed, -1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) while (count > persistent_huge_pages(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) * If this allocation races such that we no longer need the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) * page, free_huge_page will handle it by freeing the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724) * and reducing the surplus.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) /* yield cpu to avoid soft lockup */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) ret = alloc_pool_huge_page(h, nodes_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) /* Bail for signals. Probably ctrl-c from user */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738) if (signal_pending(current))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) * Decrease the pool size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744) * First return free pages to the buddy allocator (being careful
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) * to keep enough around to satisfy reservations). Then place
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) * pages into surplus state as needed so the pool will shrink
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) * to the desired size as pages become free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) * By placing pages into the surplus state independent of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) * overcommit value, we are allowing the surplus pool size to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751) * exceed overcommit. There are few sane options here. Since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) * alloc_surplus_huge_page() is checking the global counter,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) * though, we'll note that we're not allowed to exceed surplus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754) * and won't grow the pool anywhere else. Not until one of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) * sysctls is changed, or the surplus pages go out of use.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757) min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) min_count = max(count, min_count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) try_to_free_low(h, min_count, nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) while (min_count < persistent_huge_pages(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) if (!free_pool_huge_page(h, nodes_allowed, 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) cond_resched_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) while (count < persistent_huge_pages(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) if (!adjust_pool_surplus(h, nodes_allowed, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) h->max_huge_pages = persistent_huge_pages(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) NODEMASK_FREE(node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) #define HSTATE_ATTR_RO(_name) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) #define HSTATE_ATTR(_name) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) static struct kobj_attribute _name##_attr = \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) __ATTR(_name, 0644, _name##_show, _name##_store)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) static struct kobject *hugepages_kobj;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786) static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788) static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790) static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) for (i = 0; i < HUGE_MAX_HSTATE; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795) if (hstate_kobjs[i] == kobj) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796) if (nidp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) *nidp = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) return &hstates[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) return kobj_to_node_hstate(kobj, nidp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) static ssize_t nr_hugepages_show_common(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808) unsigned long nr_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) h = kobj_to_hstate(kobj, &nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812) if (nid == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) nr_huge_pages = h->nr_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) nr_huge_pages = h->nr_huge_pages_node[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) return sprintf(buf, "%lu\n", nr_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) struct hstate *h, int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) unsigned long count, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) nodemask_t nodes_allowed, *n_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) if (nid == NUMA_NO_NODE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832) * global hstate attribute
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) if (!(obey_mempolicy &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) init_nodemask_of_mempolicy(&nodes_allowed)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836) n_mask = &node_states[N_MEMORY];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) n_mask = &nodes_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) * Node specific request. count adjustment happens in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) * set_max_huge_pages() after acquiring hugetlb_lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) init_nodemask_of_node(&nodes_allowed, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) n_mask = &nodes_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) err = set_max_huge_pages(h, count, nid, n_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) return err ? err : len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) struct kobject *kobj, const char *buf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) unsigned long count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) err = kstrtoul(buf, 10, &count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866) h = kobj_to_hstate(kobj, &nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) static ssize_t nr_hugepages_show(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) return nr_hugepages_show_common(kobj, attr, buf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876) static ssize_t nr_hugepages_store(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) struct kobj_attribute *attr, const char *buf, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879) return nr_hugepages_store_common(false, kobj, buf, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) HSTATE_ATTR(nr_hugepages);
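/*
* For example, for a 2 MB hstate this attribute is exposed as
*   /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
* and writes end up in set_max_huge_pages(), growing or shrinking the
* persistent pool.
*/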
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884)
/*
* hstate attribute for an optional, mempolicy-based constraint on
* persistent huge page allocation and freeing.
*/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889) static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) return nr_hugepages_show_common(kobj, attr, buf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) struct kobj_attribute *attr, const char *buf, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) return nr_hugepages_store_common(true, kobj, buf, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900) HSTATE_ATTR(nr_hugepages_mempolicy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907) struct hstate *h = kobj_to_hstate(kobj, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908) return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911) static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) struct kobj_attribute *attr, const char *buf, size_t count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) unsigned long input;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) struct hstate *h = kobj_to_hstate(kobj, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921) err = kstrtoul(buf, 10, &input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) h->nr_overcommit_huge_pages = input;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) return count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) HSTATE_ATTR(nr_overcommit_hugepages);
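/*
* nr_overcommit_hugepages bounds how many surplus huge pages may be
* allocated on demand beyond the persistent pool, e.g.
*   echo 64 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages
*/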
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) static ssize_t free_hugepages_show(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937) unsigned long free_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) h = kobj_to_hstate(kobj, &nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941) if (nid == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) free_huge_pages = h->free_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) free_huge_pages = h->free_huge_pages_node[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946) return sprintf(buf, "%lu\n", free_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) HSTATE_ATTR_RO(free_hugepages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) static ssize_t resv_hugepages_show(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) struct hstate *h = kobj_to_hstate(kobj, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) return sprintf(buf, "%lu\n", h->resv_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) HSTATE_ATTR_RO(resv_hugepages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) static ssize_t surplus_hugepages_show(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962) unsigned long surplus_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) h = kobj_to_hstate(kobj, &nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) if (nid == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) surplus_huge_pages = h->surplus_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) surplus_huge_pages = h->surplus_huge_pages_node[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) return sprintf(buf, "%lu\n", surplus_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) HSTATE_ATTR_RO(surplus_hugepages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) static struct attribute *hstate_attrs[] = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976) &nr_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) &nr_overcommit_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) &free_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979) &resv_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) &surplus_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982) &nr_hugepages_mempolicy_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984) NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987) static const struct attribute_group hstate_attr_group = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) .attrs = hstate_attrs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991) static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) struct kobject **hstate_kobjs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) const struct attribute_group *hstate_attr_group)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) int retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) int hi = hstate_index(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) if (!hstate_kobjs[hi])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003) if (retval) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) kobject_put(hstate_kobjs[hi]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) hstate_kobjs[hi] = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) return retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011) static void __init hugetlb_sysfs_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3016) hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3017) if (!hugepages_kobj)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3018) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3020) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3021) err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3022) hstate_kobjs, &hstate_attr_group);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3023) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3024) pr_err("HugeTLB: Unable to add hstate %s", h->name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3025) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3026) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3027)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3028) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031) * node_hstate/s - associate per node hstate attributes, via their kobjects,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032) * with node devices in node_devices[] using a parallel array. The index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033) * into node_hstates[] (and node_devices[]) is the node id.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034) * This is here to avoid any static dependency of the node device driver, in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035) * the base kernel, on the hugetlb module.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037) struct node_hstate {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038) struct kobject *hugepages_kobj;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039) struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041) static struct node_hstate node_hstates[MAX_NUMNODES];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044) * A subset of global hstate attributes for node devices
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046) static struct attribute *per_node_hstate_attrs[] = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047) &nr_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048) &free_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049) &surplus_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050) NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053) static const struct attribute_group per_node_hstate_attr_group = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054) .attrs = per_node_hstate_attrs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055) };
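/*
* These per-node attributes appear under
*   /sys/devices/system/node/nodeN/hugepages/hugepages-<size>kB/
* and act only on the pool of the corresponding node.
*/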
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058) * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059) * Returns node id via non-NULL nidp.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061) static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065) for (nid = 0; nid < nr_node_ids; nid++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) struct node_hstate *nhs = &node_hstates[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3067) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3068) for (i = 0; i < HUGE_MAX_HSTATE; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3069) if (nhs->hstate_kobjs[i] == kobj) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3070) if (nidp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3071) *nidp = nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3072) return &hstates[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3073) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3074) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3076) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3077) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3078) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3079)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3080) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3081) * Unregister hstate attributes from a single node device.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3082) * No-op if no hstate attributes attached.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3083) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3084) static void hugetlb_unregister_node(struct node *node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3085) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3086) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3087) struct node_hstate *nhs = &node_hstates[node->dev.id];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3088)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3089) if (!nhs->hugepages_kobj)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3090) return; /* no hstate attributes */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3092) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3093) int idx = hstate_index(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3094) if (nhs->hstate_kobjs[idx]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3095) kobject_put(nhs->hstate_kobjs[idx]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3096) nhs->hstate_kobjs[idx] = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3097) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3098) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3099)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3100) kobject_put(nhs->hugepages_kobj);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3101) nhs->hugepages_kobj = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3102) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3103)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3104)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3105) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3106) * Register hstate attributes for a single node device.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3107) * No-op if attributes already registered.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3108) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3109) static void hugetlb_register_node(struct node *node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3110) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3111) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3112) struct node_hstate *nhs = &node_hstates[node->dev.id];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3113) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3115) if (nhs->hugepages_kobj)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3116) return; /* already allocated */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3117)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3118) nhs->hugepages_kobj = kobject_create_and_add("hugepages",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3119) &node->dev.kobj);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3120) if (!nhs->hugepages_kobj)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3121) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3122)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3123) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3124) err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3125) nhs->hstate_kobjs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3126) &per_node_hstate_attr_group);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3127) if (err) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3128) pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3129) h->name, node->dev.id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3130) hugetlb_unregister_node(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3131) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3132) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3133) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3134) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3135)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3136) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3137) * hugetlb init time: register hstate attributes for all registered node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3138) * devices of nodes that have memory. All on-line nodes should have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3139) * registered their associated device by this time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3140) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3141) static void __init hugetlb_register_all_nodes(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3142) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3143) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3145) for_each_node_state(nid, N_MEMORY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3146) struct node *node = node_devices[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3147) if (node->dev.id == nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3148) hugetlb_register_node(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3149) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3151) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3152) * Let the node device driver know we're here so it can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3153) * [un]register hstate attributes on node hotplug.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3154) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3155) register_hugetlbfs_with_node(hugetlb_register_node,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3156) hugetlb_unregister_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3157) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3158) #else /* !CONFIG_NUMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3160) static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3161) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3162) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3163) if (nidp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3164) *nidp = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3165) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3166) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3167)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3168) static void hugetlb_register_all_nodes(void) { }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3170) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3172) static int __init hugetlb_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3173) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3174) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3176) if (!hugepages_supported()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3177) if (hugetlb_max_hstate || default_hstate_max_huge_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3178) pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3179) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3180) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3181)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3182) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3183) * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3184) * architectures depend on setup being done here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3185) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3186) hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3187) if (!parsed_default_hugepagesz) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3188) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3189) * If we did not parse a default huge page size, set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3190) 		 * default_hstate_idx to the HPAGE_SIZE hstate. And, if the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3191) * number of huge pages for this default size was implicitly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3192) * specified, set that here as well.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3193) * Note that the implicit setting will overwrite an explicit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3194) * setting. A warning will be printed in this case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3195) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3196) default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3197) if (default_hstate_max_huge_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3198) if (default_hstate.max_huge_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3199) char buf[32];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3200)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3201) string_get_size(huge_page_size(&default_hstate),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3202) 1, STRING_UNITS_2, buf, 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3203) pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3204) default_hstate.max_huge_pages, buf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3205) pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3206) default_hstate_max_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3207) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3208) default_hstate.max_huge_pages =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3209) default_hstate_max_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3210) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3211) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3212)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3213) hugetlb_cma_check();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3214) hugetlb_init_hstates();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3215) gather_bootmem_prealloc();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3216) report_hugepages();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3218) hugetlb_sysfs_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3219) hugetlb_register_all_nodes();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3220) hugetlb_cgroup_file_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3222) #ifdef CONFIG_SMP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3223) num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3224) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3225) num_fault_mutexes = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3226) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3227) hugetlb_fault_mutex_table =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3228) kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3229) GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3230) BUG_ON(!hugetlb_fault_mutex_table);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3231)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3232) for (i = 0; i < num_fault_mutexes; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3233) mutex_init(&hugetlb_fault_mutex_table[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3234) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3235) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3236) subsys_initcall(hugetlb_init);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3238) /* Overwritten by architectures with more huge page sizes */
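/*
 * (For example, x86's override additionally accepts 1GB when the CPU
 * supports gigantic pages.)
 */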
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3239) bool __init __weak arch_hugetlb_valid_size(unsigned long size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3240) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3241) return size == HPAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3242) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3244) void __init hugetlb_add_hstate(unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3245) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3246) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3247) unsigned long i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3248)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3249) 	if (size_to_hstate(PAGE_SIZE << order))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3250) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3252) BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3253) BUG_ON(order == 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3254) h = &hstates[hugetlb_max_hstate++];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3255) h->order = order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3256) h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3257) h->nr_huge_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3258) h->free_huge_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3259) for (i = 0; i < MAX_NUMNODES; ++i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3260) INIT_LIST_HEAD(&h->hugepage_freelists[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3261) INIT_LIST_HEAD(&h->hugepage_activelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3262) h->next_nid_to_alloc = first_memory_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3263) h->next_nid_to_free = first_memory_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3264) snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3265) huge_page_size(h)/1024);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3267) parsed_hstate = h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3268) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3269)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3270) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3271) * hugepages command line processing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3272)  * hugepages normally follows a valid hugepagesz or default_hugepagesz
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3273) * specification. If not, ignore the hugepages value. hugepages can also
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3274) * be the first huge page command line option in which case it implicitly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3275) * specifies the number of huge pages for the default size.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3276) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3277) static int __init hugepages_setup(char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3278) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3279) unsigned long *mhp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3280) static unsigned long *last_mhp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3281)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3282) if (!parsed_valid_hugepagesz) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3283) pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3284) parsed_valid_hugepagesz = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3285) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3286) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3288) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3289) * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3290) * yet, so this hugepages= parameter goes to the "default hstate".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3291) * Otherwise, it goes with the previously parsed hugepagesz or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3292) * default_hugepagesz.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3293) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3294) else if (!hugetlb_max_hstate)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3295) mhp = &default_hstate_max_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3296) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3297) mhp = &parsed_hstate->max_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3298)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3299) if (mhp == last_mhp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3300) pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3301) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3302) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3303)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3304) if (sscanf(s, "%lu", mhp) <= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3305) *mhp = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3306)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3307) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3308) * Global state is always initialized later in hugetlb_init.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3309) 	 * But for gigantic hstates (order >= MAX_ORDER) we must allocate the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3310) 	 * pages here, early, while the bootmem allocator is still usable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3311) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3312) if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3313) hugetlb_hstate_alloc_pages(parsed_hstate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3315) last_mhp = mhp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3317) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3318) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3319) __setup("hugepages=", hugepages_setup);
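/*
 * Illustrative examples (sizes assume x86-64):
 *   hugepages=512                            512 pages of the default size
 *   hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512
 *                                            4 1GB pages plus 512 2MB pages
 *   hugepages=512 hugepages=1024             the second value is ignored
 *                                            (no hugepagesz= in between)
 */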
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3320)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3321) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3322) * hugepagesz command line processing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3323) * A specific huge page size can only be specified once with hugepagesz.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3324) * hugepagesz is followed by hugepages on the command line. The global
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3325) * variable 'parsed_valid_hugepagesz' is used to determine if prior
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3326) * hugepagesz argument was valid.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3327) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3328) static int __init hugepagesz_setup(char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3329) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3330) unsigned long size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3331) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3332)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3333) parsed_valid_hugepagesz = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3334) size = (unsigned long)memparse(s, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3335)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3336) if (!arch_hugetlb_valid_size(size)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3337) pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3338) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3339) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3341) h = size_to_hstate(size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3342) if (h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3343) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3344) * hstate for this size already exists. This is normally
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3345) * an error, but is allowed if the existing hstate is the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3346) * default hstate. More specifically, it is only allowed if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3347) * the number of huge pages for the default hstate was not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3348) * previously specified.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3349) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3350) if (!parsed_default_hugepagesz || h != &default_hstate ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3351) default_hstate.max_huge_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3352) pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3353) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3354) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3355)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3356) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3357) * No need to call hugetlb_add_hstate() as hstate already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3358) * exists. But, do set parsed_hstate so that a following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3359) * hugepages= parameter will be applied to this hstate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3360) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3361) parsed_hstate = h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3362) parsed_valid_hugepagesz = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3363) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3364) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3365)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3366) hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3367) parsed_valid_hugepagesz = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3368) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3369) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3370) __setup("hugepagesz=", hugepagesz_setup);
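/*
 * Illustrative examples (assuming 2MB pages):
 *   default_hugepagesz=2M hugepagesz=2M hugepages=256
 *       accepted, since the existing hstate is the default one and its
 *       page count has not been set yet
 *   hugepagesz=2M hugepages=256 hugepagesz=2M hugepages=512
 *       the second hugepagesz=2M (and the hugepages= following it) is
 *       ignored with a warning
 */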
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3371)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3372) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3373) * default_hugepagesz command line input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3374) * Only one instance of default_hugepagesz allowed on command line.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3375) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3376) static int __init default_hugepagesz_setup(char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3377) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3378) unsigned long size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3379)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3380) parsed_valid_hugepagesz = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3381) if (parsed_default_hugepagesz) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3382) pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3383) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3384) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3385)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3386) size = (unsigned long)memparse(s, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3387)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3388) if (!arch_hugetlb_valid_size(size)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3389) pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3390) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3391) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3392)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3393) hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3394) parsed_valid_hugepagesz = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3395) parsed_default_hugepagesz = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3396) default_hstate_idx = hstate_index(size_to_hstate(size));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3397)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3398) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3399) * The number of default huge pages (for this size) could have been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3400) * specified as the first hugetlb parameter: hugepages=X. If so,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3401) * then default_hstate_max_huge_pages is set. If the default huge
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3402) * page size is gigantic (>= MAX_ORDER), then the pages must be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3403) 	 * allocated here from the bootmem allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3404) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3405) if (default_hstate_max_huge_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3406) default_hstate.max_huge_pages = default_hstate_max_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3407) if (hstate_is_gigantic(&default_hstate))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3408) hugetlb_hstate_alloc_pages(&default_hstate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3409) default_hstate_max_huge_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3410) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3412) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3413) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3414) __setup("default_hugepagesz=", default_hugepagesz_setup);
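/*
 * Illustrative example (assuming x86-64 with 1GB page support): with
 *   hugepages=16 default_hugepagesz=1G
 * the early hugepages=16 is applied to the 1GB default hstate and, the
 * size being gigantic, the 16 pages are allocated from bootmem right here.
 */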
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3415)
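/*
 * Count the free huge pages of @h on the nodes the current task may use,
 * i.e. the nodes in its cpuset that are also in its memory policy nodemask,
 * if one is in effect.
 */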
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3416) static unsigned int allowed_mems_nr(struct hstate *h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3417) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3418) int node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3419) unsigned int nr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3420) nodemask_t *mpol_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3421) unsigned int *array = h->free_huge_pages_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3422) gfp_t gfp_mask = htlb_alloc_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3423)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3424) mpol_allowed = policy_nodemask_current(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3426) for_each_node_mask(node, cpuset_current_mems_allowed) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3427) 		if (!mpol_allowed || node_isset(node, *mpol_allowed))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3429) nr += array[node];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3430) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3431)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3432) return nr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3433) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3435) #ifdef CONFIG_SYSCTL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3436) static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3437) void *buffer, size_t *length,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3438) loff_t *ppos, unsigned long *out)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3439) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3440) struct ctl_table dup_table;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3442) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3443) 	 * To avoid races with __do_proc_doulongvec_minmax(), duplicate the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3444) 	 * @table and alter the copy rather than the shared table itself.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3445) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3446) dup_table = *table;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3447) dup_table.data = out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3448)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3449) return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3450) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3451)
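/*
 * Common handler behind the nr_hugepages and nr_hugepages_mempolicy vm
 * sysctls: report or, on write, resize the pool of the default hstate.
 */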
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3452) static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3453) struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3454) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3455) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3456) struct hstate *h = &default_hstate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3457) unsigned long tmp = h->max_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3458) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3460) if (!hugepages_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3461) return -EOPNOTSUPP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3463) ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3464) &tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3465) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3466) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3467)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3468) if (write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3469) ret = __nr_hugepages_store_common(obey_mempolicy, h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3470) NUMA_NO_NODE, tmp, *length);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3471) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3472) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3473) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3474)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3475) int hugetlb_sysctl_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3476) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3477) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3479) return hugetlb_sysctl_handler_common(false, table, write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3480) buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3481) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3482)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3483) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3484) int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3485) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3486) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3487) return hugetlb_sysctl_handler_common(true, table, write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3488) buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3489) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3490) #endif /* CONFIG_NUMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3492) int hugetlb_overcommit_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3493) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3494) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3495) struct hstate *h = &default_hstate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3496) unsigned long tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3497) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3498)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3499) if (!hugepages_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3500) return -EOPNOTSUPP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3501)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3502) tmp = h->nr_overcommit_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3504) if (write && hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3505) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3506)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3507) ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3508) &tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3509) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3510) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3512) if (write) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3513) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3514) h->nr_overcommit_huge_pages = tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3515) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3516) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3517) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3518) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3519) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3520)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3521) #endif /* CONFIG_SYSCTL */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3522)
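/* Emit the HugePages_* and Hugetlb lines shown in /proc/meminfo. */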
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3523) void hugetlb_report_meminfo(struct seq_file *m)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3524) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3525) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3526) unsigned long total = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3527)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3528) if (!hugepages_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3529) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3530)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3531) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3532) unsigned long count = h->nr_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3533)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3534) total += (PAGE_SIZE << huge_page_order(h)) * count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3535)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3536) if (h == &default_hstate)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3537) seq_printf(m,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3538) "HugePages_Total: %5lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3539) "HugePages_Free: %5lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3540) "HugePages_Rsvd: %5lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3541) "HugePages_Surp: %5lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3542) "Hugepagesize: %8lu kB\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3543) count,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3544) h->free_huge_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3545) h->resv_huge_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3546) h->surplus_huge_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3547) (PAGE_SIZE << huge_page_order(h)) / 1024);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3548) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3550) seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3551) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3552)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3553) int hugetlb_report_node_meminfo(char *buf, int len, int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3554) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3555) struct hstate *h = &default_hstate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3556)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3557) if (!hugepages_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3558) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3559)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3560) return sysfs_emit_at(buf, len,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3561) "Node %d HugePages_Total: %5u\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3562) "Node %d HugePages_Free: %5u\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3563) "Node %d HugePages_Surp: %5u\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3564) nid, h->nr_huge_pages_node[nid],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3565) nid, h->free_huge_pages_node[nid],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3566) nid, h->surplus_huge_pages_node[nid]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3567) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3568)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3569) void hugetlb_show_meminfo(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3570) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3571) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3572) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3573)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3574) if (!hugepages_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3575) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3576)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3577) for_each_node_state(nid, N_MEMORY)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3578) for_each_hstate(h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3579) pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3580) nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3581) h->nr_huge_pages_node[nid],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3582) h->free_huge_pages_node[nid],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3583) h->surplus_huge_pages_node[nid],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3584) 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3585) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3587) void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3588) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3589) seq_printf(m, "HugetlbPages:\t%8lu kB\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3590) atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3591) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3593) /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3594) unsigned long hugetlb_total_pages(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3595) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3596) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3597) unsigned long nr_total_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3598)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3599) for_each_hstate(h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3600) nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3601) return nr_total_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3602) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3603)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3604) static int hugetlb_acct_memory(struct hstate *h, long delta)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3605) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3606) int ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3607)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3608) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3609) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3610) * When cpuset is configured, it breaks the strict hugetlb page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3611) * reservation as the accounting is done on a global variable. Such
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3612) * reservation is completely rubbish in the presence of cpuset because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3613) * the reservation is not checked against page availability for the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3614) 	 * current cpuset. An application can still potentially be OOM-killed by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3615) 	 * the kernel when there are no free hugetlb pages in the cpuset that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3616) 	 * task is in. Attempting to enforce strict accounting with cpuset is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3617) 	 * almost impossible (or too ugly) because cpusets are so fluid that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3618) 	 * tasks or memory nodes can be dynamically moved between them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3619) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3620) * The change of semantics for shared hugetlb mapping with cpuset is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3621) * undesirable. However, in order to preserve some of the semantics,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3622) 	 * we fall back to checking against current free page availability as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3623) 	 * a best-effort attempt, hopefully minimizing the impact of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3624) 	 * change in cpuset semantics.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3625) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3626) 	 * Apart from cpuset, the memory policy mechanism also determines from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3627) 	 * which node the kernel will allocate memory in a NUMA system. So,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3628) 	 * similar to cpuset, we should also consider the memory policy of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3629) 	 * current task here, for the same reasons described above.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3631) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3632) if (delta > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3633) if (gather_surplus_pages(h, delta) < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3634) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3635)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3636) if (delta > allowed_mems_nr(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3637) return_unused_surplus_pages(h, delta);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3638) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3639) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3640) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3641)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3642) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3643) if (delta < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3644) return_unused_surplus_pages(h, (unsigned long) -delta);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3645)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3646) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3647) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3648) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3649) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3650)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3651) static void hugetlb_vm_op_open(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3652) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3653) struct resv_map *resv = vma_resv_map(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3654)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3655) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3656) 	 * This new VMA should share its sibling's reservation map if present.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3657) * The VMA will only ever have a valid reservation map pointer where
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3658) * it is being copied for another still existing VMA. As that VMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3659) * has a reference to the reservation map it cannot disappear until
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3660) * after this open call completes. It is therefore safe to take a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3661) * new reference here without additional locking.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3662) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3663) if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3664) resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3665) kref_get(&resv->refs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3666) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3667) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3668)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3669) static void hugetlb_vm_op_close(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3670) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3671) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3672) struct resv_map *resv = vma_resv_map(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3673) struct hugepage_subpool *spool = subpool_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3674) unsigned long reserve, start, end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3675) long gbl_reserve;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3677) if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3678) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3679)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3680) start = vma_hugecache_offset(h, vma, vma->vm_start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3681) end = vma_hugecache_offset(h, vma, vma->vm_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3682)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3683) reserve = (end - start) - region_count(resv, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3684) hugetlb_cgroup_uncharge_counter(resv, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3685) if (reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3686) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3687) * Decrement reserve counts. The global reserve count may be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3688) * adjusted if the subpool has a minimum size.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3689) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3690) gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3691) hugetlb_acct_memory(h, -gbl_reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3692) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3693)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3694) kref_put(&resv->refs, resv_map_release);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3695) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3696)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3697) static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3698) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3699) if (addr & ~(huge_page_mask(hstate_vma(vma))))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3700) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3701) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3702) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3703)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3704) static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3705) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3706) struct hstate *hstate = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3707)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3708) return 1UL << huge_page_shift(hstate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3709) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3710)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3711) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3712) * We cannot handle pagefaults against hugetlb pages at all. They cause
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3713) * handle_mm_fault() to try to instantiate regular-sized pages in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3714)  * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3715) * this far.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3716) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3717) static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3718) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3719) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3720) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3721) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3722)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3723) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3724) * When a new function is introduced to vm_operations_struct and added
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3725) * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3726) * This is because under System V memory model, mappings created via
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3727) * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3728) * their original vm_ops are overwritten with shm_vm_ops.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3729) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3730) const struct vm_operations_struct hugetlb_vm_ops = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3731) .fault = hugetlb_vm_op_fault,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3732) .open = hugetlb_vm_op_open,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3733) .close = hugetlb_vm_op_close,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3734) .split = hugetlb_vm_op_split,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3735) .pagesize = hugetlb_vm_op_pagesize,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3736) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3737)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3738) static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3739) int writable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3740) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3741) pte_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3742)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3743) if (writable) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3744) entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3745) vma->vm_page_prot)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3746) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3747) entry = huge_pte_wrprotect(mk_huge_pte(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3748) vma->vm_page_prot));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3749) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3750) entry = pte_mkyoung(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3751) entry = pte_mkhuge(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3752) entry = arch_make_huge_pte(entry, vma, page, writable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3753)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3754) return entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3755) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3756)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3757) static void set_huge_ptep_writable(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3758) unsigned long address, pte_t *ptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3759) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3760) pte_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3761)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3762) entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3763) if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3764) update_mmu_cache(vma, address, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3765) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3766)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3767) bool is_hugetlb_entry_migration(pte_t pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3768) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3769) swp_entry_t swp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3770)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3771) if (huge_pte_none(pte) || pte_present(pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3772) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3773) swp = pte_to_swp_entry(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3774) if (is_migration_entry(swp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3775) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3776) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3777) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3778) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3779)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3780) static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3781) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3782) swp_entry_t swp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3784) if (huge_pte_none(pte) || pte_present(pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3785) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3786) swp = pte_to_swp_entry(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3787) if (is_hwpoison_entry(swp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3788) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3789) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3790) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3791) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3792)
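/*
 * Copy the hugetlb page table entries of @vma from the parent mm @src to
 * the child mm @dst, typically at fork time. For private copy-on-write
 * mappings, both the parent and child entries end up write protected.
 */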
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3793) int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3794) struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3795) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3796) pte_t *src_pte, *dst_pte, entry, dst_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3797) struct page *ptepage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3798) unsigned long addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3799) int cow;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3800) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3801) unsigned long sz = huge_page_size(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3802) struct address_space *mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3803) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3804) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3805)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3806) cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3808) if (cow) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3809) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3810) vma->vm_start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3811) vma->vm_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3812) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3813) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3814) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3815) * For shared mappings i_mmap_rwsem must be held to call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3816) * huge_pte_alloc, otherwise the returned ptep could go
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3817) * away if part of a shared pmd and another thread calls
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3818) * huge_pmd_unshare.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3819) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3820) i_mmap_lock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3821) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3823) for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3824) spinlock_t *src_ptl, *dst_ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3825) src_pte = huge_pte_offset(src, addr, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3826) if (!src_pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3827) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3828) dst_pte = huge_pte_alloc(dst, vma, addr, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3829) if (!dst_pte) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3830) ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3831) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3832) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3834) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3835) * If the pagetables are shared don't copy or take references.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3836) * dst_pte == src_pte is the common case of src/dest sharing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3837) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3838) * However, src could have 'unshared' and dst shares with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3839) * another vma. If dst_pte !none, this implies sharing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3840) * Check here before taking page table lock, and once again
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3841) * after taking the lock below.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3842) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3843) dst_entry = huge_ptep_get(dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3844) if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3845) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3846)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3847) dst_ptl = huge_pte_lock(h, dst, dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3848) src_ptl = huge_pte_lockptr(h, src, src_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3849) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3850) entry = huge_ptep_get(src_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3851) dst_entry = huge_ptep_get(dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3852) if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3853) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3854) * Skip if src entry none. Also, skip in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3855) * unlikely case dst entry !none as this implies
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3856) * sharing with another vma.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3857) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3858) ;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3859) } else if (unlikely(is_hugetlb_entry_migration(entry) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3860) is_hugetlb_entry_hwpoisoned(entry))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3861) swp_entry_t swp_entry = pte_to_swp_entry(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3862)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3863) if (is_write_migration_entry(swp_entry) && cow) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3864) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3865) * COW mappings require pages in both
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3866) * parent and child to be set to read.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3867) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3868) make_migration_entry_read(&swp_entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3869) entry = swp_entry_to_pte(swp_entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3870) set_huge_swap_pte_at(src, addr, src_pte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3871) entry, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3872) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3873) set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3874) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3875) if (cow) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3876) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3877) * No need to notify as we are downgrading page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3878) * table protection not changing it to point
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3879) * to a new page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3880) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3881) * See Documentation/vm/mmu_notifier.rst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3882) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3883) huge_ptep_set_wrprotect(src, addr, src_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3884) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3885) entry = huge_ptep_get(src_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3886) ptepage = pte_page(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3887) get_page(ptepage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3888) page_dup_rmap(ptepage, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3889) set_huge_pte_at(dst, addr, dst_pte, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3890) hugetlb_count_add(pages_per_huge_page(h), dst);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3891) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3892) spin_unlock(src_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3893) spin_unlock(dst_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3894) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3896) if (cow)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3897) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3898) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3899) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3900)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3901) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3902) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3903)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3904) void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3905) unsigned long start, unsigned long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3906) struct page *ref_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3907) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3908) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3909) unsigned long address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3910) pte_t *ptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3911) pte_t pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3912) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3913) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3914) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3915) unsigned long sz = huge_page_size(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3916) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3917) bool force_flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3919) WARN_ON(!is_vm_hugetlb_page(vma));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3920) BUG_ON(start & ~huge_page_mask(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3921) BUG_ON(end & ~huge_page_mask(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3922)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3923) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3924) * This is a hugetlb vma; all the pte entries should point
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3925) * to a huge page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3926) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3927) tlb_change_page_size(tlb, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3928) tlb_start_vma(tlb, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3929)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3930) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3931) * If sharing is possible, alert mmu notifiers of the worst case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3932) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3933) mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3934) end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3935) adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3936) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3937) address = start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3938) for (; address < end; address += sz) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3939) ptep = huge_pte_offset(mm, address, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3940) if (!ptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3941) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3942)
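/*
 * If this PMD turns out to be shared, huge_pmd_unshare() unmaps it
 * right here; note the PUD-sized range and force a TLB flush before
 * returning (see the force_flush comment at the end of this function).
 */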
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3943) ptl = huge_pte_lock(h, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3944) if (huge_pmd_unshare(mm, vma, &address, ptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3945) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3946) tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3947) force_flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3948) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3949) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3950)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3951) pte = huge_ptep_get(ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3952) if (huge_pte_none(pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3953) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3954) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3955) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3956)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3957) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3958) * A migrating or HWPoisoned hugepage is already unmapped
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3959) * and its refcount dropped, so just clear the pte here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3960) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3961) if (unlikely(!pte_present(pte))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3962) huge_pte_clear(mm, address, ptep, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3963) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3964) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3965) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3966)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3967) page = pte_page(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3968) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3969) * If a reference page is supplied, it is because a specific
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3970) * page is being unmapped, not a range. Ensure the page we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3971) * are about to unmap is the actual page of interest.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3972) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3973) if (ref_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3974) if (page != ref_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3975) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3976) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3977) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3978) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3979) * Mark the VMA as having unmapped its page so that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3980) * future faults in this VMA will fail rather than
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3981) * looking like data was lost.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3982) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3983) set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3984) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3986) pte = huge_ptep_get_and_clear(mm, address, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3987) tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3988) if (huge_pte_dirty(pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3989) set_page_dirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3990)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3991) hugetlb_count_sub(pages_per_huge_page(h), mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3992) page_remove_rmap(page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3993)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3994) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3995) tlb_remove_page_size(tlb, page, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3996) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3997) * Bail out after unmapping the reference page, if one was supplied.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3998) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3999) if (ref_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4000) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4001) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4002) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4003) tlb_end_vma(tlb, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4005) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4006) * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4007) * could defer the flush until now, since by holding i_mmap_rwsem we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4008) * guaranteed that the last reference would not be dropped. But we must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4009) * do the flushing before we return, as otherwise i_mmap_rwsem will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4010) * dropped and the last reference to the shared PMDs page might be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4011) * dropped as well.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4012) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4013) * In theory we could defer the freeing of the PMD pages as well, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4014) * huge_pmd_unshare() relies on the exact page_count for the PMD page to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4015) * detect sharing, so we cannot defer the release of the page either.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4016) * Instead, do flush now.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4017) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4018) if (force_flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4019) tlb_flush_mmu_tlbonly(tlb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4020) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4021)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4022) void __unmap_hugepage_range_final(struct mmu_gather *tlb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4023) struct vm_area_struct *vma, unsigned long start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4024) unsigned long end, struct page *ref_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4025) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4026) __unmap_hugepage_range(tlb, vma, start, end, ref_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4027)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4028) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4029) * Clear this flag so that x86's huge_pmd_share page_table_shareable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4030) * test will fail on a vma being torn down, and not grab a page table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4031) * on its way out. We're lucky that the flag has such an appropriate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4032) * name, and can in fact be safely cleared here. We could clear it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4033) * before the __unmap_hugepage_range above, but all that's necessary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4034) * is to clear it before releasing the i_mmap_rwsem. This works
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4035) * because in the context this is called, the VMA is about to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4036) * destroyed and the i_mmap_rwsem is held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4037) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4038) vma->vm_flags &= ~VM_MAYSHARE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4039) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4040)
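/*
 * Unmap a hugetlb page range within a single VMA. This wrapper sets up
 * its own mmu_gather for __unmap_hugepage_range(); the gather range is
 * widened up front in case shared PMDs get unshared during the walk.
 */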
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4041) void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4042) unsigned long end, struct page *ref_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4043) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4044) struct mm_struct *mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4045) struct mmu_gather tlb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4046) unsigned long tlb_start = start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4047) unsigned long tlb_end = end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4049) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4050) * If shared PMDs were possibly used within this vma range, adjust
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4051) * start/end for worst case tlb flushing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4052) * Note that we cannot be sure if PMDs are shared until we try to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4053) * unmap pages. However, we want to make sure TLB flushing covers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4054) * the largest possible range.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4055) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4056) adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4057)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4058) mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4059)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4060) tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4061) __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4062) tlb_finish_mmu(&tlb, tlb_start, tlb_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4063) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4065) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4066) * This is called when the original mapper is failing to COW a MAP_PRIVATE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4067) * mapping it owns the reserve page for. The intention is to unmap the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4068) * from other VMAs and let the children be SIGKILLed if they are faulting the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4069) * same region.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4070) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4071) static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4072) struct page *page, unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4073) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4074) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4075) struct vm_area_struct *iter_vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4076) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4077) pgoff_t pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4078)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4079) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4080) * vm_pgoff is in PAGE_SIZE units, hence the different calculation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4081) * from the page cache lookup, which is in HPAGE_SIZE units.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4082) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4083) address = address & huge_page_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4084) pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4085) vma->vm_pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4086) mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4087)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4088) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4089) * Take the mapping lock for the duration of the table walk. As
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4090) * this mapping should be shared between all the VMAs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4091) * __unmap_hugepage_range() is called while the lock is already held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4092) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4093) i_mmap_lock_write(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4094) vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4095) /* Do not unmap the current VMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4096) if (iter_vma == vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4097) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4098)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4099) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4100) * Shared VMAs have their own reserves and do not affect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4101) * MAP_PRIVATE accounting, but it is possible that a shared
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4102) * VMA is using the same page, so check and skip such VMAs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4103) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4104) if (iter_vma->vm_flags & VM_MAYSHARE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4105) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4106)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4107) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4108) * Unmap the page from other VMAs without their own reserves.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4109) * They get marked to be SIGKILLed if they fault in these
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4110) * areas. This is because a future no-page fault on this VMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4111) * could insert a zeroed page instead of the data existing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4112) * from the time of fork. This would look like data corruption.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4113) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4114) if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4115) unmap_hugepage_range(iter_vma, address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4116) address + huge_page_size(h), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4117) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4118) i_mmap_unlock_write(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4119) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4120)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4121) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4122) * hugetlb_cow() should be called with the page lock of the original hugepage held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4123) * Called with hugetlb_instantiation_mutex held and pte_page locked, so we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4124) * cannot race with other handlers or page migration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4125) * Keep the pte_same checks anyway to make the transition from the mutex easier.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4126) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4127) static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4128) unsigned long address, pte_t *ptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4129) struct page *pagecache_page, spinlock_t *ptl)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4130) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4131) pte_t pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4132) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4133) struct page *old_page, *new_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4134) int outside_reserve = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4135) vm_fault_t ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4136) unsigned long haddr = address & huge_page_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4137) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4138)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4139) pte = huge_ptep_get(ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4140) old_page = pte_page(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4142) retry_avoidcopy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4143) /* If no-one else is actually using this page, avoid the copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4144) * and just make the page writable */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4145) if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4146) page_move_anon_rmap(old_page, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4147) set_huge_ptep_writable(vma, haddr, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4148) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4149) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4151) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4152) * If the process that created a MAP_PRIVATE mapping is about to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4153) * perform a COW due to a shared page count, attempt to satisfy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4154) * the allocation without using the existing reserves. The pagecache
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4155) * page is used to determine if the reserve at this address was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4156) * consumed or not. If reserves were used, a partially faulted mapping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4157) * at the time of fork() could consume its reserves on COW instead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4158) * of the full address range.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4159) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4160) if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4161) old_page != pagecache_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4162) outside_reserve = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4163)
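/*
 * Hold a reference on old_page across the unlock below so the page
 * cannot be freed while a replacement page is being allocated.
 */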
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4164) get_page(old_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4166) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4167) * Drop the page table lock as the buddy allocator may be called. It will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4168) * be acquired again before returning to the caller, as expected.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4169) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4170) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4171) new_page = alloc_huge_page(vma, haddr, outside_reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4172)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4173) if (IS_ERR(new_page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4174) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4175) * If a process owning a MAP_PRIVATE mapping fails to COW,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4176) * it is due to references held by a child and an insufficient
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4177) * huge page pool. To guarantee the original mapper's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4178) * reliability, unmap the page from child processes. The child
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4179) * may get SIGKILLed if it later faults.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4180) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4181) if (outside_reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4182) struct address_space *mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4183) pgoff_t idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4184) u32 hash;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4186) put_page(old_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4187) BUG_ON(huge_pte_none(pte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4188) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4189) * Drop hugetlb_fault_mutex and i_mmap_rwsem before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4190) * unmapping. Unmapping needs to hold i_mmap_rwsem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4191) * in write mode. Dropping i_mmap_rwsem in read mode
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4192) * here is OK as COW mappings do not interact with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4193) * PMD sharing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4194) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4195) * Reacquire both after unmap operation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4196) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4197) idx = vma_hugecache_offset(h, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4198) hash = hugetlb_fault_mutex_hash(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4199) mutex_unlock(&hugetlb_fault_mutex_table[hash]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4200) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4201)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4202) unmap_ref_private(mm, vma, old_page, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4203)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4204) i_mmap_lock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4205) mutex_lock(&hugetlb_fault_mutex_table[hash]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4206) spin_lock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4207) ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4208) if (likely(ptep &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4209) pte_same(huge_ptep_get(ptep), pte)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4210) goto retry_avoidcopy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4211) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4212) * A race occurred while re-acquiring the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4213) * table lock, and our job is done.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4214) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4215) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4216) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4218) ret = vmf_error(PTR_ERR(new_page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4219) goto out_release_old;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4220) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4222) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4223) * When the original hugepage is a shared one, it does not have an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4224) * anon_vma prepared.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4225) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4226) if (unlikely(anon_vma_prepare(vma))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4227) ret = VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4228) goto out_release_all;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4229) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4230)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4231) copy_user_huge_page(new_page, old_page, address, vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4232) pages_per_huge_page(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4233) __SetPageUptodate(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4234)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4235) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4236) haddr + huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4237) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4238)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4239) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4240) * Retake the page table lock to check for racing updates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4241) * before the page tables are altered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4242) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4243) spin_lock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4244) ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4245) if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4246) ClearPagePrivate(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4247)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4248) /* Break COW */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4249) huge_ptep_clear_flush(vma, haddr, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4250) mmu_notifier_invalidate_range(mm, range.start, range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4251) set_huge_pte_at(mm, haddr, ptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4252) make_huge_pte(vma, new_page, 1));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4253) page_remove_rmap(old_page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4254) hugepage_add_new_anon_rmap(new_page, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4255) set_page_huge_active(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4256) /* Make the old page be freed below */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4257) new_page = old_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4258) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4259) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4260) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4261) out_release_all:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4262) restore_reserve_on_error(h, vma, haddr, new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4263) put_page(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4264) out_release_old:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4265) put_page(old_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4267) spin_lock(ptl); /* Caller expects lock to be held */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4268) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4269) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4270)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4271) /* Return the pagecache page at a given address within a VMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4272) static struct page *hugetlbfs_pagecache_page(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4273) struct vm_area_struct *vma, unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4274) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4275) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4276) pgoff_t idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4277)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4278) mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4279) idx = vma_hugecache_offset(h, vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4280)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4281) return find_lock_page(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4282) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4284) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4285) * Return whether there is a pagecache page to back the given address within the VMA.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4286) * The caller, follow_hugetlb_page(), holds page_table_lock, so we cannot lock_page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4287) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4288) static bool hugetlbfs_pagecache_present(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4289) struct vm_area_struct *vma, unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4290) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4291) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4292) pgoff_t idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4293) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4294)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4295) mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4296) idx = vma_hugecache_offset(h, vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4298) page = find_get_page(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4299) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4300) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4301) return page != NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4302) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4303)
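/*
 * Add a freshly allocated huge page to the page cache at index idx and
 * charge the inode's block count. Returns 0 on success or the
 * add_to_page_cache() error, e.g. -EEXIST if another task beat us to it.
 */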
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4304) int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4305) pgoff_t idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4306) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4307) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4308) struct hstate *h = hstate_inode(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4309) int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4310)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4311) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4312) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4313) ClearPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4315) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4316) * Set the page dirty so that it will not be removed from the cache/file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4317) * by non-hugetlbfs specific code paths.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4318) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4319) set_page_dirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4320)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4321) spin_lock(&inode->i_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4322) inode->i_blocks += blocks_per_huge_page(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4323) spin_unlock(&inode->i_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4324) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4325) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4326)
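/*
 * Hand a missing/minor hugetlb fault over to userfaultfd. The fault
 * mutex and i_mmap_rwsem are dropped around handle_userfault() and
 * reacquired afterwards, so the caller sees the same locking state.
 */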
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4327) static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4328) struct address_space *mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4329) pgoff_t idx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4330) unsigned int flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4331) unsigned long haddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4332) unsigned long reason)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4333) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4334) vm_fault_t ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4335) u32 hash;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4336) struct vm_fault vmf = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4337) .vma = vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4338) .address = haddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4339) .flags = flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4340) .vma_flags = vma->vm_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4341) .vma_page_prot = vma->vm_page_prot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4342)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4343) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4344) * Hard to debug if it ends up being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4345) * used by a callee that assumes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4346) * something about the other
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4347) * uninitialized fields... same as in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4348) * memory.c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4349) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4350) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4352) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4353) * hugetlb_fault_mutex and i_mmap_rwsem must be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4354) * dropped before handling userfault. Reacquire
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4355) * after handling fault to make calling code simpler.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4356) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4357) hash = hugetlb_fault_mutex_hash(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4358) mutex_unlock(&hugetlb_fault_mutex_table[hash]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4359) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4360) ret = handle_userfault(&vmf, reason);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4361) i_mmap_lock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4362) mutex_lock(&hugetlb_fault_mutex_table[hash]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4363)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4364) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4365) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4366)
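/*
 * Handle a hugetlb fault with no pte present: look the page up in (or
 * add it to) the page cache for shared mappings, or allocate an
 * anonymous huge page for private ones, then install the new pte.
 * Called with the fault mutex and i_mmap_rwsem held.
 */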
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4367) static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4368) struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4369) struct address_space *mapping, pgoff_t idx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4370) unsigned long address, pte_t *ptep, unsigned int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4371) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4372) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4373) vm_fault_t ret = VM_FAULT_SIGBUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4374) int anon_rmap = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4375) unsigned long size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4376) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4377) pte_t new_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4378) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4379) unsigned long haddr = address & huge_page_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4380) bool new_page = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4381)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4382) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4383) * Currently, we are forced to kill the process in the event the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4384) * original mapper has unmapped pages from the child due to a failed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4385) * COW. Warn that such a situation has occurred, as it may not be obvious.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4386) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4387) if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4388) pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4389) current->pid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4390) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4391) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4392)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4393) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4394) * We cannot race with truncation due to holding i_mmap_rwsem.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4395) * i_size is modified while holding i_mmap_rwsem, so check here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4396) * once for faults beyond the end of the file.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4397) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4398) size = i_size_read(mapping->host) >> huge_page_shift(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4399) if (idx >= size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4400) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4402) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4403) page = find_lock_page(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4404) if (!page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4405) /* Check for page in userfault range */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4406) if (userfaultfd_missing(vma)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4407) ret = hugetlb_handle_userfault(vma, mapping, idx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4408) flags, haddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4409) VM_UFFD_MISSING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4410) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4411) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4412)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4413) page = alloc_huge_page(vma, haddr, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4414) if (IS_ERR(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4415) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4416) * Returning an error will result in the faulting task being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4417) * sent SIGBUS. The hugetlb fault mutex prevents two
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4418) * tasks from racing to fault in the same page, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4419) * could result in spurious "unable to allocate" errors.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4420) * Page migration does not take the fault mutex, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4421) * does a clear-then-write of ptes under the page table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4422) * lock. Page fault code could race with migration,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4423) * notice the cleared pte and try to allocate a page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4424) * here. Before returning an error, take the ptl and make
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4425) * sure there really is no pte entry.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4426) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4427) ptl = huge_pte_lock(h, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4428) if (!huge_pte_none(huge_ptep_get(ptep))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4429) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4430) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4431) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4432) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4433) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4434) ret = vmf_error(PTR_ERR(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4435) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4436) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4437) clear_huge_page(page, address, pages_per_huge_page(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4438) __SetPageUptodate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4439) new_page = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4440)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4441) if (vma->vm_flags & VM_MAYSHARE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4442) int err = huge_add_to_page_cache(page, mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4443) if (err) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4444) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4445) if (err == -EEXIST)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4446) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4447) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4448) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4449) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4450) lock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4451) if (unlikely(anon_vma_prepare(vma))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4452) ret = VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4453) goto backout_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4454) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4455) anon_rmap = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4456) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4457) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4458) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4459) * If a memory error occurs between mmap() and fault, some processes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4460) * don't have a hwpoisoned swap entry for the errored virtual address.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4461) * So we need to block the hugepage fault with a PG_hwpoison bit check.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4462) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4463) if (unlikely(PageHWPoison(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4464) ret = VM_FAULT_HWPOISON_LARGE |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4465) VM_FAULT_SET_HINDEX(hstate_index(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4466) goto backout_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4467) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4469) /* Check for page in userfault range. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4470) if (userfaultfd_minor(vma)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4471) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4472) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4473) ret = hugetlb_handle_userfault(vma, mapping, idx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4474) flags, haddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4475) VM_UFFD_MINOR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4476) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4477) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4478) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4479)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4480) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4481) * If we are going to COW a private mapping later, we examine the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4482) * pending reservations for this page now. This will ensure that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4483) * any allocations necessary to record that reservation occur outside
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4484) * the spinlock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4485) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4486) if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4487) if (vma_needs_reservation(h, vma, haddr) < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4488) ret = VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4489) goto backout_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4490) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4491) /* Just decrements count, does not deallocate */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4492) vma_end_reservation(h, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4493) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4494)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4495) ptl = huge_pte_lock(h, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4496) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4497) if (!huge_pte_none(huge_ptep_get(ptep)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4498) goto backout;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4499)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4500) if (anon_rmap) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4501) ClearPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4502) hugepage_add_new_anon_rmap(page, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4503) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4504) page_dup_rmap(page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4505) new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4506) && (vma->vm_flags & VM_SHARED)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4507) set_huge_pte_at(mm, haddr, ptep, new_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4509) hugetlb_count_add(pages_per_huge_page(h), mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4510) if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4511) /* Optimization, do the COW without a second fault */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4512) ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4513) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4515) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4516)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4517) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4518) * Only make newly allocated pages active. Existing pages found
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4519) * in the pagecache could be !page_huge_active() if they have been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4520) * isolated for migration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4521) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4522) if (new_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4523) set_page_huge_active(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4525) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4526) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4527) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4528)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4529) backout:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4530) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4531) backout_unlocked:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4532) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4533) restore_reserve_on_error(h, vma, haddr, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4534) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4535) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4536) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4537)
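/*
 * Map a (mapping, index) pair onto one of the fault mutexes. On SMP the
 * jhash2() value is masked with (num_fault_mutexes - 1), a cheap modulo
 * that relies on num_fault_mutexes being a power of two.
 */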
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4538) #ifdef CONFIG_SMP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4539) u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4540) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4541) unsigned long key[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4542) u32 hash;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4544) key[0] = (unsigned long) mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4545) key[1] = idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4546)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4547) hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4548)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4549) return hash & (num_fault_mutexes - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4550) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4551) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4552) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4553) * For uniprocessor systems we always use a single mutex, so just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4554) * return 0 and avoid the hashing overhead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4555) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4556) u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4557) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4558) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4559) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4560) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4561)
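/*
 * Top-level hugetlb fault handler. It takes i_mmap_rwsem to keep ptep
 * stable against PMD unsharing and truncation, serializes with other
 * faults on the same page via the fault mutex table, and then hands off
 * to hugetlb_no_page() or hugetlb_cow() as needed.
 */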
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4562) vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4563) unsigned long address, unsigned int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4564) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4565) pte_t *ptep, entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4566) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4567) vm_fault_t ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4568) u32 hash;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4569) pgoff_t idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4570) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4571) struct page *pagecache_page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4572) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4573) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4574) int need_wait_lock = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4575) unsigned long haddr = address & huge_page_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4576)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4577) ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4578) if (ptep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4579) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4580) * Since we hold no locks, ptep could be stale. That is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4581) * OK as we are only making decisions based on content and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4582) * not actually modifying content here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4583) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4584) entry = huge_ptep_get(ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4585) if (unlikely(is_hugetlb_entry_migration(entry))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4586) migration_entry_wait_huge(vma, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4587) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4588) } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4589) return VM_FAULT_HWPOISON_LARGE |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4590) VM_FAULT_SET_HINDEX(hstate_index(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4591) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4593) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4594) * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4595) * until finished with ptep. This serves two purposes:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4596) * 1) It prevents huge_pmd_unshare from being called elsewhere
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4597) * and making the ptep no longer valid.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4598) * 2) It synchronizes us with i_size modifications during truncation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4599) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4600) * ptep could have already been assigned via huge_pte_offset. That
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4601) * is OK, as huge_pte_alloc will return the same value unless
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4602) * something has changed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4603) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4604) mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4605) i_mmap_lock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4606) ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4607) if (!ptep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4608) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4609) return VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4610) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4612) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4613) * Serialize hugepage allocation and instantiation, so that we don't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4614) * get spurious allocation failures if two CPUs race to instantiate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4615) * the same page in the page cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4616) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4617) idx = vma_hugecache_offset(h, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4618) hash = hugetlb_fault_mutex_hash(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4619) mutex_lock(&hugetlb_fault_mutex_table[hash]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4620)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4621) entry = huge_ptep_get(ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4622) if (huge_pte_none(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4623) ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4624) goto out_mutex;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4625) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4626)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4627) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4628)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4629) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4630) * entry could be a migration/hwpoison entry at this point, so this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4631) * check prevents the kernel from going further below assuming that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4632) * we have an active hugepage in the pagecache. This goto expects a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4633) * second page fault, where the is_hugetlb_entry_(migration|hwpoisoned)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4634) * check will handle it properly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4635) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4636) if (!pte_present(entry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4637) goto out_mutex;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4638)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4639) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4640) * If we are going to COW the mapping later, we examine the pending
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4641) * reservations for this page now. This will ensure that any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4642) * allocations necessary to record that reservation occur outside the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4643) * spinlock. For private mappings, we also look up the pagecache
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4644) * page now as it is used to determine if a reservation has been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4645) * consumed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4646) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4647) if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4648) if (vma_needs_reservation(h, vma, haddr) < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4649) ret = VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4650) goto out_mutex;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4651) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4652) /* Just decrements count, does not deallocate */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4653) vma_end_reservation(h, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4654)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4655) if (!(vma->vm_flags & VM_MAYSHARE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4656) pagecache_page = hugetlbfs_pagecache_page(h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4657) vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4658) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4660) ptl = huge_pte_lock(h, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4661)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4662) /* Check for a racing update before calling hugetlb_cow */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4663) if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4664) goto out_ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4665)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4666) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4667) * hugetlb_cow() requires the page locks of pte_page(entry) and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4668) * pagecache_page, so here we need to take the former
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4669) * when page != pagecache_page or !pagecache_page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4670) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4671) page = pte_page(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4672) if (page != pagecache_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4673) if (!trylock_page(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4674) need_wait_lock = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4675) goto out_ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4676) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4677)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4678) get_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4679)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4680) if (flags & FAULT_FLAG_WRITE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4681) if (!huge_pte_write(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4682) ret = hugetlb_cow(mm, vma, address, ptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4683) pagecache_page, ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4684) goto out_put_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4685) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4686) entry = huge_pte_mkdirty(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4687) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4688) entry = pte_mkyoung(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4689) if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4690) flags & FAULT_FLAG_WRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4691) update_mmu_cache(vma, haddr, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4692) out_put_page:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4693) if (page != pagecache_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4694) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4695) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4696) out_ptl:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4697) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4699) if (pagecache_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4700) unlock_page(pagecache_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4701) put_page(pagecache_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4702) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4703) out_mutex:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4704) mutex_unlock(&hugetlb_fault_mutex_table[hash]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4705) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4706) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4707) * Generally it is safe to hold a refcount while waiting for the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4708) * lock. Here, however, we only wait in order to defer the next page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4709) * fault and avoid a busy loop, and the page is not used after it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4710) * unlocked before the current page fault returns. So we are safe from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4711) * accessing a freed page even though we wait without taking a refcount.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4712) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4713) if (need_wait_lock)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4714) wait_on_page_locked(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4715) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4716) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4717)
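/*
* Illustrative sketch (userspace, not kernel code) of one way the fault
* path above gets exercised.  Touching an anonymous MAP_HUGETLB mapping
* and then writing to it from a child after fork() makes the child's
* write fault take the !huge_pte_write() branch and reach hugetlb_cow().
* Sizes and flags are assumptions for the example only:
*
*	size_t len = 2UL << 20;			// assuming 2MB huge pages
*	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
*		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
*	p[0] = 1;				// first touch: hugetlb_no_page()
*	if (fork() == 0)
*		p[0] = 2;			// write fault -> hugetlb_cow()
*/
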
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4718) #ifdef CONFIG_USERFAULTFD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4719) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4720) * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4721) * modifications for huge pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4722) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4723) int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4724) pte_t *dst_pte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4725) struct vm_area_struct *dst_vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4726) unsigned long dst_addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4727) unsigned long src_addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4728) enum mcopy_atomic_mode mode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4729) struct page **pagep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4730) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4731) bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4732) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4733) pgoff_t idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4734) unsigned long size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4735) int vm_shared = dst_vma->vm_flags & VM_SHARED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4736) struct hstate *h = hstate_vma(dst_vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4737) pte_t _dst_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4738) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4739) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4740) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4741) int writable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4742)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4743) mapping = dst_vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4744) idx = vma_hugecache_offset(h, dst_vma, dst_addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4746) if (is_continue) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4747) ret = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4748) page = find_lock_page(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4749) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4750) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4751) } else if (!*pagep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4752) /* If a page already exists, then it's UFFDIO_COPY for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4753) * a non-missing case. Return -EEXIST.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4754) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4755) if (vm_shared &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4756) hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4757) ret = -EEXIST;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4758) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4759) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4760)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4761) page = alloc_huge_page(dst_vma, dst_addr, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4762) if (IS_ERR(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4763) ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4764) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4765) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4766)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4767) ret = copy_huge_page_from_user(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4768) (const void __user *) src_addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4769) pages_per_huge_page(h), false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4770)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4771) /* fallback to copy_from_user outside mmap_lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4772) if (unlikely(ret)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4773) ret = -ENOENT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4774) *pagep = page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4775) /* don't free the page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4776) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4777) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4778) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4779) page = *pagep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4780) *pagep = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4781) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4782)
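/*
* Note on the -ENOENT path above: returning -ENOENT with *pagep set hands
* the freshly allocated (but not yet filled) page back to the caller,
* which is expected to drop mmap_lock, redo the copy from user space
* where page faults are allowed, and then call this function again with
* *pagep still set so the operation completes on the retry.  (This is a
* summary of how the userfaultfd copy loop in mm/userfaultfd.c uses the
* return value, kept here as guidance only.)
*/
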
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4783) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4784) * The memory barrier inside __SetPageUptodate makes sure that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4785) * preceding stores to the page contents become visible before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4786) * the set_pte_at() write.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4787) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4788) __SetPageUptodate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4790) /* Add shared, newly allocated pages to the page cache. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4791) if (vm_shared && !is_continue) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4792) size = i_size_read(mapping->host) >> huge_page_shift(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4793) ret = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4794) if (idx >= size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4795) goto out_release_nounlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4796)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4797) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4798) * Serialization between remove_inode_hugepages() and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4799) * huge_add_to_page_cache() below happens through the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4800) * hugetlb_fault_mutex_table, which must be held by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4801) * the caller.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4802) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4803) ret = huge_add_to_page_cache(page, mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4804) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4805) goto out_release_nounlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4806) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4808) ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4809) spin_lock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4811) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4812) * Recheck i_size after taking the PT lock to make sure we do
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4813) * not leave any page mapped (as page_mapped()) beyond the end
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4814) * of i_size (remove_inode_hugepages() is strict about
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4815) * enforcing that). If we bail out here, in the vm_shared case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4816) * we will also leave a page in the radix tree beyond the end
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4817) * of i_size, but remove_inode_hugepages() will take care
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4818) * of it as soon as we drop the hugetlb_fault_mutex_table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4819) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4820) size = i_size_read(mapping->host) >> huge_page_shift(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4821) ret = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4822) if (idx >= size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4823) goto out_release_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4825) ret = -EEXIST;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4826) if (!huge_pte_none(huge_ptep_get(dst_pte)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4827) goto out_release_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4828)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4829) if (vm_shared) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4830) page_dup_rmap(page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4831) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4832) ClearPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4833) hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4834) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4835)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4836) /* For CONTINUE on a non-shared VMA, map read-only so a later write does CoW. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4837) if (is_continue && !vm_shared)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4838) writable = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4839) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4840) writable = dst_vma->vm_flags & VM_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4842) _dst_pte = make_huge_pte(dst_vma, page, writable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4843) if (writable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4844) _dst_pte = huge_pte_mkdirty(_dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4845) _dst_pte = pte_mkyoung(_dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4846)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4847) set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4848)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4849) (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4850) dst_vma->vm_flags & VM_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4851) hugetlb_count_add(pages_per_huge_page(h), dst_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4853) /* No need to invalidate - it was non-present before */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4854) update_mmu_cache(dst_vma, dst_addr, dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4855)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4856) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4857) if (!is_continue)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4858) set_page_huge_active(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4859) if (vm_shared || is_continue)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4860) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4861) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4862) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4863) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4864) out_release_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4865) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4866) if (vm_shared || is_continue)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4867) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4868) out_release_nounlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4869) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4870) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4871) }
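
/*
* Illustrative sketch (userspace, not kernel code) of how the routine
* above is normally driven.  A userfaultfd monitor that registered a
* hugetlb range resolves a missing-page fault with UFFDIO_COPY, or maps
* an already-present page cache page with UFFDIO_CONTINUE when minor
* fault handling was requested.  Values below are placeholders; for
* hugetlb ranges the destination and length must be aligned to the huge
* page size:
*
*	struct uffdio_copy copy = {
*		.dst = fault_addr,		// huge page aligned
*		.src = (unsigned long)src_buf,
*		.len = huge_page_size,
*		.mode = 0,
*	};
*	ioctl(uffd, UFFDIO_COPY, &copy);
*/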
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4872) #endif /* CONFIG_USERFAULTFD */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4874) long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4875) struct page **pages, struct vm_area_struct **vmas,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4876) unsigned long *position, unsigned long *nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4877) long i, unsigned int flags, int *locked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4878) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4879) unsigned long pfn_offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4880) unsigned long vaddr = *position;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4881) unsigned long remainder = *nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4882) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4883) int err = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4884)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4885) while (vaddr < vma->vm_end && remainder) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4886) pte_t *pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4887) spinlock_t *ptl = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4888) int absent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4889) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4890)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4891) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4892) * If we have a pending SIGKILL, don't keep faulting pages and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4893) * potentially allocating memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4894) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4895) if (fatal_signal_pending(current)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4896) remainder = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4897) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4898) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4900) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4901) * Some archs (sparc64, sh*) have multiple pte_t entries
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4902) * for each hugepage. We have to make sure we get the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4903) * first, for the page indexing below to work.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4904) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4905) * Note that page table lock is not held when pte is null.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4906) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4907) pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4908) huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4909) if (pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4910) ptl = huge_pte_lock(h, mm, pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4911) absent = !pte || huge_pte_none(huge_ptep_get(pte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4912)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4913) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4914) * When coredumping, it suits get_dump_page if we just return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4915) * an error where there's an empty slot with no huge pagecache
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4916) * to back it. This way, we avoid allocating a hugepage, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4917) * the sparse dumpfile avoids allocating disk blocks, but its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4918) * huge holes still show up with zeroes where they need to be.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4919) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4920) if (absent && (flags & FOLL_DUMP) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4921) !hugetlbfs_pagecache_present(h, vma, vaddr)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4922) if (pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4923) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4924) remainder = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4925) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4926) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4927)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4928) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4929) * We need to call hugetlb_fault both for hugepages under migration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4930) * (in which case hugetlb_fault waits for the migration) and for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4931) * hwpoisoned hugepages (in which case we need to prevent the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4932) * caller from accessing them). To do this we use is_swap_pte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4933) * here instead of is_hugetlb_entry_migration and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4934) * is_hugetlb_entry_hwpoisoned, because it simply covers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4935) * both cases, and because we cannot follow correct pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4936) * directly from any kind of swap entry.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4937) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4938) if (absent || is_swap_pte(huge_ptep_get(pte)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4939) ((flags & FOLL_WRITE) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4940) !huge_pte_write(huge_ptep_get(pte)))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4941) vm_fault_t ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4942) unsigned int fault_flags = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4944) if (pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4945) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4946) if (flags & FOLL_WRITE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4947) fault_flags |= FAULT_FLAG_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4948) if (locked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4949) fault_flags |= FAULT_FLAG_ALLOW_RETRY |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4950) FAULT_FLAG_KILLABLE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4951) if (flags & FOLL_NOWAIT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4952) fault_flags |= FAULT_FLAG_ALLOW_RETRY |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4953) FAULT_FLAG_RETRY_NOWAIT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4954) if (flags & FOLL_TRIED) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4955) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4956) * Note: FAULT_FLAG_ALLOW_RETRY and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4957) * FAULT_FLAG_TRIED can co-exist
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4958) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4959) fault_flags |= FAULT_FLAG_TRIED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4960) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4961) ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4962) if (ret & VM_FAULT_ERROR) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4963) err = vm_fault_to_errno(ret, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4964) remainder = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4965) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4966) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4967) if (ret & VM_FAULT_RETRY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4968) if (locked &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4969) !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4970) *locked = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4971) *nr_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4972) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4973) * VM_FAULT_RETRY must not be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4974) * turned into an error; zero is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4975) * returned instead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4976) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4977) * No need to update "position" as the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4978) * caller will not check it after
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4979) * *nr_pages is set to 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4980) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4981) return i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4982) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4983) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4984) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4986) pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4987) page = pte_page(huge_ptep_get(pte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4988)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4989) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4990) * If subpage information is not requested, update counters
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4991) * and skip the same_page loop below.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4992) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4993) if (!pages && !vmas && !pfn_offset &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4994) (vaddr + huge_page_size(h) < vma->vm_end) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4995) (remainder >= pages_per_huge_page(h))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4996) vaddr += huge_page_size(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4997) remainder -= pages_per_huge_page(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4998) i += pages_per_huge_page(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4999) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5000) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5001) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5002)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5003) same_page:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5004) if (pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5005) pages[i] = mem_map_offset(page, pfn_offset);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5006) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5007) * try_grab_page() should always succeed here, because:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5008) * a) we hold the ptl lock, and b) we've just checked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5009) * that the huge page is present in the page tables. If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5010) * the huge page is present, then the tail pages must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5011) * also be present. The ptl prevents the head page and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5012) * tail pages from being rearranged in any way. So this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5013) * page must be available at this point, unless the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5014) * refcount overflowed:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5015) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5016) if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5017) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5018) remainder = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5019) err = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5020) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5021) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5022) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5023)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5024) if (vmas)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5025) vmas[i] = vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5027) vaddr += PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5028) ++pfn_offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5029) --remainder;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5030) ++i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5031) if (vaddr < vma->vm_end && remainder &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5032) pfn_offset < pages_per_huge_page(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5033) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5034) * We use pfn_offset to avoid touching the pageframes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5035) * of this compound page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5036) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5037) goto same_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5038) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5039) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5040) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5041) *nr_pages = remainder;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5042) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5043) * setting position is actually required only if remainder is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5044) * not zero, but it's faster not to add an "if (remainder)"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5045) * branch.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5046) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5047) *position = vaddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5049) return i ? i : err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5050) }
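
/*
* Note: follow_hugetlb_page() above is the hugetlb backend of
* get_user_pages().  When the caller asks for no subpage information
* (pages == NULL and vmas == NULL), whole huge pages are skipped in a
* single step; otherwise every base page of the compound page is
* reported.  Illustrative arithmetic, assuming x86_64 with 2MB huge
* pages (512 base pages each): pinning 4MB in the middle of a larger
* mapping advances i by 1024 one page at a time when pages[] is
* supplied, but in just two steps of 512 when it is not.
*/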
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5051)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5052) unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5053) unsigned long address, unsigned long end, pgprot_t newprot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5054) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5055) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5056) unsigned long start = address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5057) pte_t *ptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5058) pte_t pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5059) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5060) unsigned long pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5061) bool shared_pmd = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5062) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5063)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5064) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5065) * In the case of shared PMDs, the area to flush could be beyond
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5066) * start/end. Set range.start/range.end to cover the maximum possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5067) * range if PMD sharing is possible.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5068) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5069) mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5070) 0, vma, mm, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5071) adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5072)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5073) BUG_ON(address >= end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5074) flush_cache_range(vma, range.start, range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5076) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5077) i_mmap_lock_write(vma->vm_file->f_mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5078) for (; address < end; address += huge_page_size(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5079) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5080) ptep = huge_pte_offset(mm, address, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5081) if (!ptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5082) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5083) ptl = huge_pte_lock(h, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5084) if (huge_pmd_unshare(mm, vma, &address, ptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5085) pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5086) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5087) shared_pmd = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5088) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5089) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5090) pte = huge_ptep_get(ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5091) if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5092) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5093) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5094) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5095) if (unlikely(is_hugetlb_entry_migration(pte))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5096) swp_entry_t entry = pte_to_swp_entry(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5097)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5098) if (is_write_migration_entry(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5099) pte_t newpte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5101) make_migration_entry_read(&entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5102) newpte = swp_entry_to_pte(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5103) set_huge_swap_pte_at(mm, address, ptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5104) newpte, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5105) pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5106) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5107) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5108) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5109) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5110) if (!huge_pte_none(pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5111) pte_t old_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5112)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5113) old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5114) pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5115) pte = arch_make_huge_pte(pte, vma, NULL, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5116) huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5117) pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5118) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5119) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5120) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5121) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5122) * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5123) * may have cleared our pud entry and done put_page on the page table:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5124) * once we release i_mmap_rwsem, another task can do the final put_page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5125) * and the page table can then be reused and filled with junk. If we did
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5126) * unshare a page of pmds, flush the range corresponding to the pud.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5127) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5128) if (shared_pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5129) flush_hugetlb_tlb_range(vma, range.start, range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5130) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5131) flush_hugetlb_tlb_range(vma, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5132) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5133) * No need to call mmu_notifier_invalidate_range(): we are downgrading
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5134) * page table protection, not changing it to point to a new page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5135) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5136) * See Documentation/vm/mmu_notifier.rst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5137) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5138) i_mmap_unlock_write(vma->vm_file->f_mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5139) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5140)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5141) return pages << h->order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5142) }
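
/*
* Note: the routine above is what mprotect() ends up using for hugetlb
* VMAs, and the return value is expressed in base pages
* (pages << h->order).  Illustrative example, assuming x86_64 with 2MB
* huge pages (h->order == 9): changing the protection of two huge pages
* updates two huge PTEs and reports 2 << 9 == 1024 base pages.
*/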
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5144) int hugetlb_reserve_pages(struct inode *inode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5145) long from, long to,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5146) struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5147) vm_flags_t vm_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5148) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5149) long ret, chg, add = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5150) struct hstate *h = hstate_inode(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5151) struct hugepage_subpool *spool = subpool_inode(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5152) struct resv_map *resv_map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5153) struct hugetlb_cgroup *h_cg = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5154) long gbl_reserve, regions_needed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5155)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5156) /* This should never happen */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5157) if (from > to) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5158) VM_WARN(1, "%s called with a negative range\n", __func__);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5159) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5160) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5162) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5163) * Only apply hugepage reservation if asked. At fault time, an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5164) * attempt will be made for VM_NORESERVE to allocate a page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5165) * without using reserves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5166) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5167) if (vm_flags & VM_NORESERVE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5168) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5170) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5171) * Shared mappings base their reservation on the number of pages that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5172) * are already allocated on behalf of the file. Private mappings need
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5173) * to reserve the full area even if read-only as mprotect() may be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5174) * called to make the mapping read-write. Assume !vma is a shm mapping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5175) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5176) if (!vma || vma->vm_flags & VM_MAYSHARE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5177) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5178) * resv_map can not be NULL as hugetlb_reserve_pages is only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5179) * called for inodes for which resv_maps were created (see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5180) * hugetlbfs_get_inode).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5181) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5182) resv_map = inode_resv_map(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5183)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5184) chg = region_chg(resv_map, from, to, &regions_needed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5186) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5187) /* Private mapping. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5188) resv_map = resv_map_alloc();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5189) if (!resv_map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5190) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5191)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5192) chg = to - from;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5194) set_vma_resv_map(vma, resv_map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5195) set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5196) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5198) if (chg < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5199) ret = chg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5200) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5201) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5203) ret = hugetlb_cgroup_charge_cgroup_rsvd(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5204) hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5206) if (ret < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5207) ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5208) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5209) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5210)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5211) if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5212) /* For private mappings, the hugetlb_cgroup uncharge info hangs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5213) * off the resv_map.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5214) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5215) resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5216) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5218) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5219) * There must be enough pages in the subpool for the mapping. If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5220) * the subpool has a minimum size, there may be some global
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5221) * reservations already in place (gbl_reserve).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5222) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5223) gbl_reserve = hugepage_subpool_get_pages(spool, chg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5224) if (gbl_reserve < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5225) ret = -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5226) goto out_uncharge_cgroup;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5227) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5228)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5229) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5230) * Check that enough hugepages are available for the reservation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5231) * Hand the pages back to the subpool if there are not.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5232) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5233) ret = hugetlb_acct_memory(h, gbl_reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5234) if (ret < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5235) goto out_put_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5238) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5239) * Account for the reservations made. Shared mappings record regions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5240) * that have reservations as they are shared by multiple VMAs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5241) * When the last VMA disappears, the region map says how much
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5242) * the reservation was and the page cache tells how much of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5243) * the reservation was consumed. Private mappings are per-VMA and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5244) * only the consumed reservations are tracked. When the VMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5245) * disappears, the original reservation is the VMA size and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5246) * consumed reservations are stored in the map. Hence, nothing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5247) * else has to be done for private mappings here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5248) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5249) if (!vma || vma->vm_flags & VM_MAYSHARE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5250) add = region_add(resv_map, from, to, regions_needed, h, h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5251)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5252) if (unlikely(add < 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5253) hugetlb_acct_memory(h, -gbl_reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5254) ret = add;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5255) goto out_put_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5256) } else if (unlikely(chg > add)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5257) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5258) * pages in this range were added to the reserve
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5259) * map between region_chg and region_add. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5260) * indicates a race with alloc_huge_page. Adjust
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5261) * the subpool and reserve counts modified above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5262) * based on the difference.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5263) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5264) long rsv_adjust;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5266) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5267) * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5268) * reference to h_cg->css. See comment below for detail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5269) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5270) hugetlb_cgroup_uncharge_cgroup_rsvd(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5271) hstate_index(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5272) (chg - add) * pages_per_huge_page(h), h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5274) rsv_adjust = hugepage_subpool_put_pages(spool,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5275) chg - add);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5276) hugetlb_acct_memory(h, -rsv_adjust);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5277) } else if (h_cg) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5278) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5279) * The file_regions will hold their own reference to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5280) * h_cg->css. So we should release the reference held
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5281) * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5282) * done.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5283) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5284) hugetlb_cgroup_put_rsvd_cgroup(h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5285) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5286) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5287) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5288) out_put_pages:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5289) /* put back original number of pages, chg */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5290) (void)hugepage_subpool_put_pages(spool, chg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5291) out_uncharge_cgroup:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5292) hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5293) chg * pages_per_huge_page(h), h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5294) out_err:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5295) if (!vma || vma->vm_flags & VM_MAYSHARE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5296) /* Only call region_abort if the region_chg succeeded but the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5297) * region_add failed or didn't run.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5298) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5299) if (chg >= 0 && add < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5300) region_abort(resv_map, from, to, regions_needed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5301) if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5302) kref_put(&resv_map->refs, resv_map_release);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5303) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5304) }
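
/*
* Illustrative accounting example for hugetlb_reserve_pages() above (the
* numbers are made up): a shared mapping asks to reserve huge pages
* [0, 10) while the file's reserve map already covers [0, 4).
* region_chg() then reports chg == 6, the subpool and
* hugetlb_acct_memory() are charged for those 6 pages (assuming the
* subpool has no minimum size, so gbl_reserve == chg), and region_add()
* records the new regions.  If a racing alloc_huge_page() already added
* one of those pages to the map in the meantime, add comes back as 5 and
* the chg - add == 1 surplus is handed back to the subpool, the global
* counters and the reservation cgroup, as done in the chg > add branch.
*/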
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5305)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5306) long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5307) long freed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5308) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5309) struct hstate *h = hstate_inode(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5310) struct resv_map *resv_map = inode_resv_map(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5311) long chg = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5312) struct hugepage_subpool *spool = subpool_inode(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5313) long gbl_reserve;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5315) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5316) * Since this routine can be called in the evict inode path for all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5317) * hugetlbfs inodes, resv_map could be NULL.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5318) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5319) if (resv_map) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5320) chg = region_del(resv_map, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5321) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5322) * region_del() can fail in the rare case where a region
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5323) * must be split and another region descriptor cannot be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5324) * allocated. If end == LONG_MAX, it will not fail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5325) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5326) if (chg < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5327) return chg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5328) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5329)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5330) spin_lock(&inode->i_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5331) inode->i_blocks -= (blocks_per_huge_page(h) * freed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5332) spin_unlock(&inode->i_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5333)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5334) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5335) * If the subpool has a minimum size, the number of global
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5336) * reservations to be released may be adjusted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5337) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5338) gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5339) hugetlb_acct_memory(h, -gbl_reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5341) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5342) }
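
/*
* Note on the (chg - freed) adjustment above: chg is how many huge pages
* of reservation region_del() dropped for [start, end), while freed is
* how many pages the caller actually removed.  The difference is
* reservation that was never consumed, and that is what gets returned to
* the subpool and, via hugetlb_acct_memory(), to the global reserve.
* (Reading of the code above for clarity, not an additional guarantee.)
*/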
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5343)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5344) #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5345) static unsigned long page_table_shareable(struct vm_area_struct *svma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5346) struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5347) unsigned long addr, pgoff_t idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5348) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5349) unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5350) svma->vm_start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5351) unsigned long sbase = saddr & PUD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5352) unsigned long s_end = sbase + PUD_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5354) /* Allow segments to share if only one is marked locked */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5355) unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5356) unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5357)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5358) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5359) * match the virtual addresses, permissions and the alignment of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5360) * page table page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5361) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5362) if (pmd_index(addr) != pmd_index(saddr) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5363) vm_flags != svm_flags ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5364) sbase < svma->vm_start || svma->vm_end < s_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5365) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5366)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5367) return saddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5368) }
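
/*
* Worked example for page_table_shareable(), with made-up addresses and
* assuming x86_64 with 2MB huge pages and a 1GB PUD_SIZE: two tasks map
* the same hugetlbfs file with vm_pgoff == 0, vma at 0x40000000 and svma
* at 0x80000000.  For a fault at addr == 0x40200000 the caller computes
* idx == 512 (base pages into the file), so
* saddr == (512 << PAGE_SHIFT) + 0x80000000 == 0x80200000.  Both
* addresses use the same pmd_index() within their PUD slot, so if the
* flags match and svma covers the whole [0x80000000, 0xC0000000) slot,
* saddr is returned and the PMD page can be shared.
*/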
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5369)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5370) static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5371) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5372) unsigned long base = addr & PUD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5373) unsigned long end = base + PUD_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5375) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5376) * check on proper vm_flags and page table alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5377) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5378) if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5379) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5380) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5381) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5382)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5383) bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5384) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5385) #ifdef CONFIG_USERFAULTFD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5386) if (uffd_disable_huge_pmd_share(vma))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5387) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5388) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5389) return vma_shareable(vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5390) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5391)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5392) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5393) * Determine if start,end range within vma could be mapped by shared pmd.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5394) * If yes, adjust start and end to cover range associated with possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5395) * shared pmd mappings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5396) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5397) void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5398) unsigned long *start, unsigned long *end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5399) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5400) unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5401) v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5402)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5403) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5404) * The vma needs to span at least one aligned PUD-sized area, and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5405) * start,end range must lie at least partially within it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5406) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5407) if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5408) (*end <= v_start) || (*start >= v_end))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5409) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5410)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5411) /* Extend the range to be PUD aligned for a worst case scenario */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5412) if (*start > v_start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5413) *start = ALIGN_DOWN(*start, PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5414)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5415) if (*end < v_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5416) *end = ALIGN(*end, PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5417) }
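
/*
* Worked example for the adjustment above, assuming a 1GB PUD_SIZE and a
* VM_MAYSHARE vma spanning [1GB, 3GB): a caller flushing [1.5GB, 1.75GB)
* could hit PMDs shared for the whole PUD slot, so the range is widened
* to the enclosing PUD-aligned area [1GB, 2GB).  A vma that does not
* span any fully PUD-aligned area is left untouched.
*/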
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5418)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5419) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5420) * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5421) * and returns the corresponding pte. While this is not necessary for the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5422) * !shared pmd case because we can allocate the pmd later as well, it makes the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5423) * code much cleaner.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5424) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5425) * This routine must be called with i_mmap_rwsem held in at least read mode if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5426) * sharing is possible. For hugetlbfs, this prevents removal of any page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5427) * table entries associated with the address space. This is important as we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5428) * are setting up sharing based on existing page table entries (mappings).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5429) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5430) * NOTE: This routine is only called from huge_pte_alloc. Some callers of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5431) * huge_pte_alloc know that sharing is not possible and do not take
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5432) * i_mmap_rwsem as a performance optimization. This is handled by the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5433) * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5434) * only required for subsequent processing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5435) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5436) pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5437) unsigned long addr, pud_t *pud)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5438) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5439) struct address_space *mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5440) pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5441) vma->vm_pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5442) struct vm_area_struct *svma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5443) unsigned long saddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5444) pte_t *spte = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5445) pte_t *pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5446) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5448) i_mmap_assert_locked(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5449) vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5450) if (svma == vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5451) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5452)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5453) saddr = page_table_shareable(svma, vma, addr, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5454) if (saddr) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5455) spte = huge_pte_offset(svma->vm_mm, saddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5456) vma_mmu_pagesize(svma));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5457) if (spte) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5458) get_page(virt_to_page(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5459) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5460) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5461) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5462) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5463)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5464) if (!spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5465) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5466)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5467) ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5468) if (pud_none(*pud)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5469) pud_populate(mm, pud,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5470) (pmd_t *)((unsigned long)spte & PAGE_MASK));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5471) mm_inc_nr_pmds(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5472) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5473) put_page(virt_to_page(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5474) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5475) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5476) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5477) pte = (pte_t *)pmd_alloc(mm, pud, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5478) return pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5479) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5480)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5481) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5482) * unmap huge page backed by shared pte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5483) *
 * The hugetlb pte page is refcounted at the time of mapping. If the pte is
 * shared (indicated by page_count > 1), the unmap is achieved by clearing the
 * pud and decrementing the refcount. If count == 1, the pte page is not shared.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5487) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5488) * Called with page table lock held and i_mmap_rwsem held in write mode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5489) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5490) * returns: 1 successfully unmapped a shared pte page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5491) * 0 the underlying pte page is not shared, or it is the last user
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5492) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5493) int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5494) unsigned long *addr, pte_t *ptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5495) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5496) pgd_t *pgd = pgd_offset(mm, *addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5497) p4d_t *p4d = p4d_offset(pgd, *addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5498) pud_t *pud = pud_offset(p4d, *addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5499)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5500) i_mmap_assert_write_locked(vma->vm_file->f_mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5501) BUG_ON(page_count(virt_to_page(ptep)) == 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5502) if (page_count(virt_to_page(ptep)) == 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5503) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5504)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5505) pud_clear(pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5506) put_page(virt_to_page(ptep));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5507) mm_dec_nr_pmds(mm);
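	/*
	 * Round *addr up to the next PUD boundary and step back one huge
	 * page, so that a caller iterating in huge-page-size steps resumes
	 * at that boundary after its own increment.
	 */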
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5508) *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5509) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5510) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5512) #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5513) pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5514) unsigned long addr, pud_t *pud)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5515) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5516) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5517) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5518)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5519) int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5520) unsigned long *addr, pte_t *ptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5521) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5522) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5523) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5525) void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5526) unsigned long *start, unsigned long *end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5527) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5528) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5529)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5530) bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5531) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5532) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5533) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5534) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5535)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5536) #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
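/*
 * Allocate the page table entries needed to map a huge page of size @sz at
 * @addr. For PUD_SIZE pages the pud entry itself is returned; for PMD_SIZE
 * pages, either join an existing shared pmd page via huge_pmd_share() (when
 * sharing is possible and the pud is still empty) or allocate a private pmd.
 * Returns NULL if an intermediate page table cannot be allocated.
 */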
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5537) pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5538) unsigned long addr, unsigned long sz)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5539) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5540) pgd_t *pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5541) p4d_t *p4d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5542) pud_t *pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5543) pte_t *pte = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5544)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5545) pgd = pgd_offset(mm, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5546) p4d = p4d_alloc(mm, pgd, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5547) if (!p4d)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5548) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5549) pud = pud_alloc(mm, p4d, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5550) if (pud) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5551) if (sz == PUD_SIZE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5552) pte = (pte_t *)pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5553) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5554) BUG_ON(sz != PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5555) if (want_pmd_share(vma, addr) && pud_none(*pud))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5556) pte = huge_pmd_share(mm, vma, addr, pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5557) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5558) pte = (pte_t *)pmd_alloc(mm, pud, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5559) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5560) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5561) BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5562)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5563) return pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5564) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5565)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5566) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5567) * huge_pte_offset() - Walk the page table to resolve the hugepage
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5568) * entry at address @addr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5569) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5570) * Return: Pointer to page table entry (PUD or PMD) for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5571) * address @addr, or NULL if a !p*d_present() entry is encountered and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5572) * size @sz doesn't match the hugepage size at this level of the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5573) * table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5574) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5575) pte_t *huge_pte_offset(struct mm_struct *mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5576) unsigned long addr, unsigned long sz)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5577) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5578) pgd_t *pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5579) p4d_t *p4d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5580) pud_t *pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5581) pmd_t *pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5582)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5583) pgd = pgd_offset(mm, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5584) if (!pgd_present(*pgd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5585) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5586) p4d = p4d_offset(pgd, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5587) if (!p4d_present(*p4d))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5588) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5590) pud = pud_offset(p4d, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5591) if (sz == PUD_SIZE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5592) /* must be pud huge, non-present or none */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5593) return (pte_t *)pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5594) if (!pud_present(*pud))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5595) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5596) /* must have a valid entry and size to go further */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5597)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5598) pmd = pmd_offset(pud, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5599) /* must be pmd huge, non-present or none */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5600) return (pte_t *)pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5601) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5602)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5603) #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5604)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5605) /*
 * These functions can be overridden if your architecture needs its own
 * behavior.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5608) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5609) struct page * __weak
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5610) follow_huge_addr(struct mm_struct *mm, unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5611) int write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5612) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5613) return ERR_PTR(-EINVAL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5614) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5615)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5616) struct page * __weak
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5617) follow_huge_pd(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5618) unsigned long address, hugepd_t hpd, int flags, int pdshift)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5619) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5620) WARN(1, "hugepd follow called with no support for hugepage directory format\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5621) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5622) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5623)
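/*
 * Generic lookup of the page backing a PMD-sized huge mapping. A reference is
 * taken according to @flags (FOLL_GET or FOLL_PIN, but not both); if the entry
 * is a migration entry, wait for the migration to complete and retry.
 */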
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5624) struct page * __weak
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5625) follow_huge_pmd(struct mm_struct *mm, unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5626) pmd_t *pmd, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5627) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5628) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5629) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5630) pte_t pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5631)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5632) /* FOLL_GET and FOLL_PIN are mutually exclusive. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5633) if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5634) (FOLL_PIN | FOLL_GET)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5635) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5636)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5637) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5638) ptl = pmd_lockptr(mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5639) spin_lock(ptl);
	/*
	 * Make sure that the address range covered by this pmd is not
	 * unmapped by other threads.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5644) if (!pmd_huge(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5645) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5646) pte = huge_ptep_get((pte_t *)pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5647) if (pte_present(pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5648) page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5649) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5650) * try_grab_page() should always succeed here, because: a) we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5651) * hold the pmd (ptl) lock, and b) we've just checked that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5652) * huge pmd (head) page is present in the page tables. The ptl
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5653) * prevents the head page and tail pages from being rearranged
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5654) * in any way. So this page must be available at this point,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5655) * unless the page refcount overflowed:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5656) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5657) if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5658) page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5659) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5660) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5661) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5662) if (is_hugetlb_entry_migration(pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5663) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5664) __migration_entry_wait(mm, (pte_t *)pmd, ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5665) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5666) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5667) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5668) * hwpoisoned entry is treated as no_page_table in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5669) * follow_page_mask().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5670) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5671) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5672) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5673) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5674) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5675) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5676)
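/*
 * Generic lookup for a PUD-sized huge mapping. This default implementation
 * cannot take a reference on the page, so requests with FOLL_GET or FOLL_PIN
 * get NULL.
 */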
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5677) struct page * __weak
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5678) follow_huge_pud(struct mm_struct *mm, unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5679) pud_t *pud, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5680) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5681) if (flags & (FOLL_GET | FOLL_PIN))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5682) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5683)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5684) return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5685) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5686)
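/*
 * Same as follow_huge_pud() above, but for a huge mapping at the PGD level:
 * no reference can be taken here, so FOLL_GET/FOLL_PIN requests get NULL.
 */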
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5687) struct page * __weak
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5688) follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5689) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5690) if (flags & (FOLL_GET | FOLL_PIN))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5691) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5693) return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5694) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5695)
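/*
 * Isolate an in-use (active) huge page for migration: take a reference, clear
 * its "active" flag and move it from the hstate active list onto @list.
 * Returns false if @page is not an active huge page head or its refcount has
 * already dropped to zero.
 */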
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5696) bool isolate_huge_page(struct page *page, struct list_head *list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5697) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5698) bool ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5699)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5700) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5701) if (!PageHeadHuge(page) || !page_huge_active(page) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5702) !get_page_unless_zero(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5703) ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5704) goto unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5705) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5706) clear_page_huge_active(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5707) list_move_tail(&page->lru, list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5708) unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5709) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5710) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5711) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5712)
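/*
 * Undo isolate_huge_page(): mark the page active again, move it back onto its
 * hstate's active list and drop the reference taken at isolation time.
 */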
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5713) void putback_active_hugepage(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5714) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5715) VM_BUG_ON_PAGE(!PageHead(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5716) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5717) set_page_huge_active(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5718) list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5719) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5720) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5721) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5722)
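/*
 * Called when a huge page has been migrated: migrate the cgroup charge to
 * @newpage, record the migration reason for page_owner and, for temporary
 * pages, transfer the temporary status together with the per-node surplus
 * accounting.
 */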
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5723) void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5724) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5725) struct hstate *h = page_hstate(oldpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5726)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5727) hugetlb_cgroup_migrate(oldpage, newpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5728) set_page_owner_migrate_reason(newpage, reason);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5729)
	/*
	 * Transfer the temporary state of the new huge page. This is the
	 * reverse of other transitions because the new page is going to
	 * be final while the old one will be freed, so the old page takes
	 * over the temporary status.
	 *
	 * Also note that we have to transfer the per-node surplus state
	 * here as well, otherwise the global surplus count will not match
	 * the per-node counts.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5740) if (PageHugeTemporary(newpage)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5741) int old_nid = page_to_nid(oldpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5742) int new_nid = page_to_nid(newpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5743)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5744) SetPageHugeTemporary(oldpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5745) ClearPageHugeTemporary(newpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5746)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5747) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5748) if (h->surplus_huge_pages_node[old_nid]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5749) h->surplus_huge_pages_node[old_nid]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5750) h->surplus_huge_pages_node[new_nid]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5751) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5752) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5753) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5754) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5755)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5756) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5757) * This function will unconditionally remove all the shared pmd pgtable entries
 * within the given vma for a hugetlbfs memory range.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5759) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5760) void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5761) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5762) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5763) unsigned long sz = huge_page_size(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5764) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5765) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5766) unsigned long address, start, end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5767) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5768) pte_t *ptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5769)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5770) if (!(vma->vm_flags & VM_MAYSHARE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5771) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5772)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5773) start = ALIGN(vma->vm_start, PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5774) end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5775)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5776) if (start >= end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5777) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5778)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5779) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5780) * No need to call adjust_range_if_pmd_sharing_possible(), because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5781) * we have already done the PUD_SIZE alignment.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5782) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5783) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5784) start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5785) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5786) i_mmap_lock_write(vma->vm_file->f_mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5787) for (address = start; address < end; address += PUD_SIZE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5788) unsigned long tmp = address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5790) ptep = huge_pte_offset(mm, address, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5791) if (!ptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5792) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5793) ptl = huge_pte_lock(h, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5794) /* We don't want 'address' to be changed */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5795) huge_pmd_unshare(mm, vma, &tmp, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5796) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5797) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5798) flush_hugetlb_tlb_range(vma, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5799) i_mmap_unlock_write(vma->vm_file->f_mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5800) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5801) * No need to call mmu_notifier_invalidate_range(), see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5802) * Documentation/vm/mmu_notifier.rst.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5803) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5804) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5805) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5806)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5807) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5808) static bool cma_reserve_called __initdata;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5809)
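/*
 * Parse the "hugetlb_cma=" early parameter. The size goes through memparse(),
 * so the usual K/M/G suffixes are accepted, e.g. "hugetlb_cma=4G" on the
 * kernel command line (example value only).
 */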
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5810) static int __init cmdline_parse_hugetlb_cma(char *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5811) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5812) hugetlb_cma_size = memparse(p, &p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5813) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5814) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5815)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5816) early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5817)
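/*
 * Reserve CMA areas for gigantic pages of the given @order, spreading
 * hugetlb_cma_size roughly evenly across the online NUMA nodes. This is
 * expected to be called once, early during boot, from architecture setup code.
 */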
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5818) void __init hugetlb_cma_reserve(int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5819) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5820) unsigned long size, reserved, per_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5821) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5823) cma_reserve_called = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5825) if (!hugetlb_cma_size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5826) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5827)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5828) if (hugetlb_cma_size < (PAGE_SIZE << order)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5829) pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5830) (PAGE_SIZE << order) / SZ_1M);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5831) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5832) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5834) /*
	 * If a 3 GB area is requested on a machine with 4 NUMA nodes,
	 * allocate 1 GB on each of the first three nodes and ignore the last one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5837) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5838) per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5839) pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5840) hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5842) reserved = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5843) for_each_node_state(nid, N_ONLINE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5844) int res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5845) char name[CMA_MAX_NAME];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5846)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5847) size = min(per_node, hugetlb_cma_size - reserved);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5848) size = round_up(size, PAGE_SIZE << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5850) snprintf(name, sizeof(name), "hugetlb%d", nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5851) res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5852) 0, false, name,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5853) &hugetlb_cma[nid], nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5854) if (res) {
			pr_warn("hugetlb_cma: reservation failed: err %d, node %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5856) res, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5857) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5858) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5859)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5860) reserved += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5861) pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5862) size / SZ_1M, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5863)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5864) if (reserved >= hugetlb_cma_size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5865) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5866) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5867) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5868)
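/*
 * Warn if "hugetlb_cma=" was given on the command line but the architecture
 * never called hugetlb_cma_reserve(), i.e. the option has no effect here.
 */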
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5869) void __init hugetlb_cma_check(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5870) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5871) if (!hugetlb_cma_size || cma_reserve_called)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5872) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5874) pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5875) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5876)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5877) #endif /* CONFIG_CMA */