// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>

#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
#endif
static unsigned long hugetlb_cma_size __initdata;

/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page. This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
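
/*
 * Usage sketch (editorial, not in the original source): fault paths spread
 * contention across this table by hashing the faulting mapping and page
 * index, roughly
 *
 *     hash = hugetlb_fault_mutex_hash(mapping, idx);
 *     mutex_lock(&hugetlb_fault_mutex_table[hash]);
 *
 * hugetlb_fault_mutex_hash() is defined elsewhere in this file; the lines
 * above are only a sketch of the intended locking pattern, not a verbatim
 * caller.
 */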

static inline bool PageHugeFreed(struct page *head)
{
        return page_private(head + 4) == -1UL;
}

static inline void SetPageHugeFreed(struct page *head)
{
        set_page_private(head + 4, -1UL);
}

static inline void ClearPageHugeFreed(struct page *head)
{
        set_page_private(head + 4, 0);
}
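
/*
 * Editorial note (not in the original source): the helpers above record
 * whether a hugetlb page currently sits on a free list. The flag is kept
 * out of line, in the page_private() field of a tail page (head + 4),
 * presumably because the head page and the first few tail pages already
 * carry other compound-page and hugetlb state.
 */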

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
        bool free = (spool->count == 0) && (spool->used_hpages == 0);

        spin_unlock(&spool->lock);

        /* If no pages are used, and no other handles to the subpool
         * remain, give up any reservations based on minimum size and
         * free the subpool */
        if (free) {
                if (spool->min_hpages != -1)
                        hugetlb_acct_memory(spool->hstate,
                                            -spool->min_hpages);
                kfree(spool);
        }
}

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
                                              long min_hpages)
{
        struct hugepage_subpool *spool;

        spool = kzalloc(sizeof(*spool), GFP_KERNEL);
        if (!spool)
                return NULL;

        spin_lock_init(&spool->lock);
        spool->count = 1;
        spool->max_hpages = max_hpages;
        spool->hstate = h;
        spool->min_hpages = min_hpages;

        if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
                kfree(spool);
                return NULL;
        }
        spool->rsv_hpages = min_hpages;

        return spool;
}

void hugepage_put_subpool(struct hugepage_subpool *spool)
{
        spin_lock(&spool->lock);
        BUG_ON(!spool->count);
        spool->count--;
        unlock_or_release_subpool(spool);
}

/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request. Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward). The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
                                       long delta)
{
        long ret = delta;

        if (!spool)
                return ret;

        spin_lock(&spool->lock);

        if (spool->max_hpages != -1) {          /* maximum size accounting */
                if ((spool->used_hpages + delta) <= spool->max_hpages)
                        spool->used_hpages += delta;
                else {
                        ret = -ENOMEM;
                        goto unlock_ret;
                }
        }

        /* minimum size accounting */
        if (spool->min_hpages != -1 && spool->rsv_hpages) {
                if (delta > spool->rsv_hpages) {
                        /*
                         * Asking for more reserves than those already taken on
                         * behalf of subpool. Return difference.
                         */
                        ret = delta - spool->rsv_hpages;
                        spool->rsv_hpages = 0;
                } else {
                        ret = 0;        /* reserves already accounted for */
                        spool->rsv_hpages -= delta;
                }
        }

unlock_ret:
        spin_unlock(&spool->lock);
        return ret;
}
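
/*
 * Worked example (editorial, not in the original source): assume a subpool
 * with min_hpages = 4 and rsv_hpages = 4 (all of the minimum still held in
 * reserve). A request of delta = 6 consumes the 4 reserved pages and
 * returns 2, i.e. only 2 pages must be newly accounted in the global pool.
 * A subsequent request of delta = 3 finds rsv_hpages == 0 and returns 3
 * unchanged.
 */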

/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
                                       long delta)
{
        long ret = delta;

        if (!spool)
                return delta;

        spin_lock(&spool->lock);

        if (spool->max_hpages != -1)            /* maximum size accounting */
                spool->used_hpages -= delta;

        /* minimum size accounting */
        if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
                if (spool->rsv_hpages + delta <= spool->min_hpages)
                        ret = 0;
                else
                        ret = spool->rsv_hpages + delta - spool->min_hpages;

                spool->rsv_hpages += delta;
                if (spool->rsv_hpages > spool->min_hpages)
                        spool->rsv_hpages = spool->min_hpages;
        }

        /*
         * If hugetlbfs_put_super couldn't free spool due to an outstanding
         * quota reference, free it now.
         */
        unlock_or_release_subpool(spool);

        return ret;
}
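
/*
 * Worked example (editorial, not in the original source): with
 * min_hpages = 4, rsv_hpages = 2 and used_hpages now below the minimum,
 * putting back delta = 3 pages refills the reserve to min(2 + 3, 4) = 4
 * and returns 2 + 3 - 4 = 1, so only 1 global reservation is dropped;
 * the other 2 pages stay reserved to maintain the subpool's minimum.
 */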

static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
        return HUGETLBFS_SB(inode->i_sb)->spool;
}

static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
        return subpool_inode(file_inode(vma->vm_file));
}

/* Helper that removes a struct file_region from the resv_map cache and returns
 * it for use.
 */
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
        struct file_region *nrg = NULL;

        VM_BUG_ON(resv->region_cache_count <= 0);

        resv->region_cache_count--;
        nrg = list_first_entry(&resv->region_cache, struct file_region, link);
        list_del(&nrg->link);

        nrg->from = from;
        nrg->to = to;

        return nrg;
}

static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
                                              struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
        nrg->reservation_counter = rg->reservation_counter;
        nrg->css = rg->css;
        if (rg->css)
                css_get(rg->css);
#endif
}

/* Helper that records hugetlb_cgroup uncharge info. */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
                                                struct hstate *h,
                                                struct resv_map *resv,
                                                struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
        if (h_cg) {
                nrg->reservation_counter =
                        &h_cg->rsvd_hugepage[hstate_index(h)];
                nrg->css = &h_cg->css;
                /*
                 * The caller will hold exactly one h_cg->css reference for the
                 * whole contiguous reservation region. But this area might be
                 * scattered when some file_regions already reside in it. As a
                 * result, many file_regions may share only one css reference.
                 * To ensure that each file_region holds exactly one h_cg->css
                 * reference, do a css_get() for each file_region and leave the
                 * reference held by the caller untouched.
                 */
                css_get(&h_cg->css);
                if (!resv->pages_per_hpage)
                        resv->pages_per_hpage = pages_per_huge_page(h);
                /* pages_per_hpage should be the same for all entries in
                 * a resv_map.
                 */
                VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
        } else {
                nrg->reservation_counter = NULL;
                nrg->css = NULL;
        }
#endif
}

static void put_uncharge_info(struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
        if (rg->css)
                css_put(rg->css);
#endif
}

static bool has_same_uncharge_info(struct file_region *rg,
                                   struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
        return rg && org &&
               rg->reservation_counter == org->reservation_counter &&
               rg->css == org->css;

#else
        return true;
#endif
}

static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
        struct file_region *nrg = NULL, *prg = NULL;

        prg = list_prev_entry(rg, link);
        if (&prg->link != &resv->regions && prg->to == rg->from &&
            has_same_uncharge_info(prg, rg)) {
                prg->to = rg->to;

                list_del(&rg->link);
                put_uncharge_info(rg);
                kfree(rg);

                rg = prg;
        }

        nrg = list_next_entry(rg, link);
        if (&nrg->link != &resv->regions && nrg->from == rg->to &&
            has_same_uncharge_info(nrg, rg)) {
                nrg->from = rg->from;

                list_del(&rg->link);
                put_uncharge_info(rg);
                kfree(rg);
        }
}
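
/*
 * Illustrative example (editorial, not in the original source): if the
 * region list contains [0, 2) and a new region [2, 4) with the same
 * uncharge info is inserted, coalesce_file_region() merges them into a
 * single [0, 4) entry and frees the now-redundant descriptor. The same
 * happens with a following adjacent region, so [0, 2), [2, 4), [4, 8)
 * collapses to [0, 8).
 */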

/*
 * Must be called with resv->lock held.
 *
 * Calling this with regions_needed != NULL will count the number of pages
 * to be added but will not modify the linked list, and regions_needed will
 * indicate the number of file_regions needed in the cache to carry out the
 * addition of regions for this range.
 */
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
                                     struct hugetlb_cgroup *h_cg,
                                     struct hstate *h, long *regions_needed)
{
        long add = 0;
        struct list_head *head = &resv->regions;
        long last_accounted_offset = f;
        struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;

        if (regions_needed)
                *regions_needed = 0;

        /* In this loop, we essentially handle an entry for the range
         * [last_accounted_offset, rg->from), at every iteration, with some
         * bounds checking.
         */
        list_for_each_entry_safe(rg, trg, head, link) {
                /* Skip irrelevant regions that start before our range. */
                if (rg->from < f) {
                        /* If this region ends after the last accounted offset,
                         * then we need to update last_accounted_offset.
                         */
                        if (rg->to > last_accounted_offset)
                                last_accounted_offset = rg->to;
                        continue;
                }

                /* When we find a region that starts beyond our range, we've
                 * finished.
                 */
                if (rg->from > t)
                        break;

                /* Add an entry for last_accounted_offset -> rg->from, and
                 * update last_accounted_offset.
                 */
                if (rg->from > last_accounted_offset) {
                        add += rg->from - last_accounted_offset;
                        if (!regions_needed) {
                                nrg = get_file_region_entry_from_cache(
                                        resv, last_accounted_offset, rg->from);
                                record_hugetlb_cgroup_uncharge_info(h_cg, h,
                                                                    resv, nrg);
                                list_add(&nrg->link, rg->link.prev);
                                coalesce_file_region(resv, nrg);
                        } else
                                *regions_needed += 1;
                }

                last_accounted_offset = rg->to;
        }

        /* Handle the case where our range extends beyond
         * last_accounted_offset.
         */
        if (last_accounted_offset < t) {
                add += t - last_accounted_offset;
                if (!regions_needed) {
                        nrg = get_file_region_entry_from_cache(
                                resv, last_accounted_offset, t);
                        record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
                        list_add(&nrg->link, rg->link.prev);
                        coalesce_file_region(resv, nrg);
                } else
                        *regions_needed += 1;
        }

        VM_BUG_ON(add < 0);
        return add;
}
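
/*
 * Worked example (editorial, not in the original source): with an existing
 * region [2, 4) in the map, add_reservation_in_range(resv, 0, 6, ...) covers
 * the gaps [0, 2) and [4, 6). Called with regions_needed != NULL it reports
 * add = 4 and *regions_needed = 2 without touching the list; called with
 * regions_needed == NULL it pulls two entries from the cache, links them in
 * around [2, 4), and lets coalesce_file_region() merge the three entries
 * into a single [0, 6) region.
 */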

/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
 */
static int allocate_file_region_entries(struct resv_map *resv,
                                        int regions_needed)
        __must_hold(&resv->lock)
{
        struct list_head allocated_regions;
        int to_allocate = 0, i = 0;
        struct file_region *trg = NULL, *rg = NULL;

        VM_BUG_ON(regions_needed < 0);

        INIT_LIST_HEAD(&allocated_regions);

        /*
         * Check for sufficient descriptors in the cache to accommodate
         * the number of in progress add operations plus regions_needed.
         *
         * This is a while loop because when we drop the lock, some other call
         * to region_add or region_del may have consumed some region_entries,
         * so we keep looping here until we finally have enough entries for
         * (adds_in_progress + regions_needed).
         */
        while (resv->region_cache_count <
               (resv->adds_in_progress + regions_needed)) {
                to_allocate = resv->adds_in_progress + regions_needed -
                              resv->region_cache_count;

                /* At this point, we should have enough entries in the cache
                 * for all the existing adds_in_progress. We should only be
                 * needing to allocate for regions_needed.
                 */
                VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);

                spin_unlock(&resv->lock);
                for (i = 0; i < to_allocate; i++) {
                        trg = kmalloc(sizeof(*trg), GFP_KERNEL);
                        if (!trg)
                                goto out_of_memory;
                        list_add(&trg->link, &allocated_regions);
                }

                spin_lock(&resv->lock);

                list_splice(&allocated_regions, &resv->region_cache);
                resv->region_cache_count += to_allocate;
        }

        return 0;

out_of_memory:
        list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
                list_del(&rg->link);
                kfree(rg);
        }
        return -ENOMEM;
}

/*
 * Add the huge page range represented by [f, t) to the reserve
 * map. Regions will be taken from the cache to fill in this range.
 * Sufficient regions should exist in the cache due to the previous
 * call to region_chg with the same range, but in some cases the cache will not
 * have sufficient entries due to races with other code doing region_add or
 * region_del. The extra needed entries will be allocated.
 *
 * regions_needed is the out value provided by a previous call to region_chg.
 *
 * Return the number of new huge pages added to the map. This number is greater
 * than or equal to zero. If file_region entries needed to be allocated for
 * this operation and we were not able to allocate, it returns -ENOMEM.
 * region_add of regions of length 1 never allocate file_regions and cannot
 * fail; region_chg will always allocate at least 1 entry and a region_add for
 * 1 page will only require at most 1 entry.
 */
static long region_add(struct resv_map *resv, long f, long t,
                       long in_regions_needed, struct hstate *h,
                       struct hugetlb_cgroup *h_cg)
{
        long add = 0, actual_regions_needed = 0;

        spin_lock(&resv->lock);
retry:

        /* Count how many regions are actually needed to execute this add. */
        add_reservation_in_range(resv, f, t, NULL, NULL,
                                 &actual_regions_needed);

        /*
         * Check for sufficient descriptors in the cache to accommodate
         * this add operation. Note that actual_regions_needed may be greater
         * than in_regions_needed, as the resv_map may have been modified since
         * the region_chg call. In this case, we need to make sure that we
         * allocate extra entries, such that we have enough for all the
         * existing adds_in_progress, plus the excess needed for this
         * operation.
         */
        if (actual_regions_needed > in_regions_needed &&
            resv->region_cache_count <
                    resv->adds_in_progress +
                            (actual_regions_needed - in_regions_needed)) {
                /* region_add operation of range 1 should never need to
                 * allocate file_region entries.
                 */
                VM_BUG_ON(t - f <= 1);

                if (allocate_file_region_entries(
                            resv, actual_regions_needed - in_regions_needed)) {
                        return -ENOMEM;
                }

                goto retry;
        }

        add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);

        resv->adds_in_progress -= in_regions_needed;

        spin_unlock(&resv->lock);
        VM_BUG_ON(add < 0);
        return add;
}
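
/*
 * Worked example (editorial, not in the original source): a reservation of
 * [0, 3) against an empty map goes through region_chg(), which predicts one
 * needed entry, and then region_add(), which inserts [0, 3) and returns 3.
 * If a racing thread added [1, 2) in between, region_add() now needs two
 * entries ([0, 1) and [2, 3)); the retry loop above allocates the extra
 * descriptor and the call returns 2, the number of pages newly added.
 */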

/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented. This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t). region_chg does
 * not change the number of huge pages represented by the
 * map. A number of new file_region structures are added to the cache as a
 * placeholder, for the subsequent region_add call to use. At least 1
 * file_region structure is added.
 *
 * out_regions_needed is the number of regions added to the
 * resv->adds_in_progress. This value needs to be provided to a follow up call
 * to region_add or region_abort for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t). This number is greater or equal to
 * zero. -ENOMEM is returned if a new file_region structure or cache entry
 * is needed and can not be allocated.
 */
static long region_chg(struct resv_map *resv, long f, long t,
                       long *out_regions_needed)
{
        long chg = 0;

        spin_lock(&resv->lock);

        /* Count how many hugepages in this range are NOT represented. */
        chg = add_reservation_in_range(resv, f, t, NULL, NULL,
                                       out_regions_needed);

        if (*out_regions_needed == 0)
                *out_regions_needed = 1;

        if (allocate_file_region_entries(resv, *out_regions_needed))
                return -ENOMEM;

        resv->adds_in_progress += *out_regions_needed;

        spin_unlock(&resv->lock);
        return chg;
}

/*
 * Abort the in progress add operation. The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add. Operations are sometimes
 * aborted after the call to region_chg. In such cases, region_abort
 * is called to decrement the adds_in_progress counter. regions_needed
 * is the value returned by the region_chg call; it is used to decrement
 * the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine. They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t,
                         long regions_needed)
{
        spin_lock(&resv->lock);
        VM_BUG_ON(!resv->region_cache_count);
        resv->adds_in_progress -= regions_needed;
        spin_unlock(&resv->lock);
}
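
/*
 * Usage sketch (editorial, not in the original source): callers pair these
 * primitives roughly as
 *
 *     chg = region_chg(resv, f, t, &regions_needed);
 *     ... charge the cgroup/subpool for 'chg' pages ...
 *     if (that charging failed)
 *             region_abort(resv, f, t, regions_needed);
 *     else
 *             region_add(resv, f, t, regions_needed, h, h_cg);
 *
 * so that adds_in_progress, bumped by region_chg, is always balanced by
 * exactly one region_add or region_abort call.
 */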

/*
 * Delete the specified range [f, t) from the reserve map. If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted. Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more. In the
 * case where a region must be split, a new region descriptor must
 * be allocated. If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM. Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 */
static long region_del(struct resv_map *resv, long f, long t)
{
        struct list_head *head = &resv->regions;
        struct file_region *rg, *trg;
        struct file_region *nrg = NULL;
        long del = 0;

retry:
        spin_lock(&resv->lock);
        list_for_each_entry_safe(rg, trg, head, link) {
                /*
                 * Skip regions before the range to be deleted. file_region
                 * ranges are normally of the form [from, to). However, there
                 * may be a "placeholder" entry in the map which is of the form
                 * (from, to) with from == to. Check for placeholder entries
                 * at the beginning of the range to be deleted.
                 */
                if (rg->to <= f && (rg->to != rg->from || rg->to != f))
                        continue;

                if (rg->from >= t)
                        break;

                if (f > rg->from && t < rg->to) { /* Must split region */
                        /*
                         * Check for an entry in the cache before dropping
                         * lock and attempting allocation.
                         */
                        if (!nrg &&
                            resv->region_cache_count > resv->adds_in_progress) {
                                nrg = list_first_entry(&resv->region_cache,
                                                       struct file_region,
                                                       link);
                                list_del(&nrg->link);
                                resv->region_cache_count--;
                        }

                        if (!nrg) {
                                spin_unlock(&resv->lock);
                                nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
                                if (!nrg)
                                        return -ENOMEM;
                                goto retry;
                        }

                        del += t - f;
                        hugetlb_cgroup_uncharge_file_region(
                                resv, rg, t - f, false);

                        /* New entry for end of split region */
                        nrg->from = t;
                        nrg->to = rg->to;

                        copy_hugetlb_cgroup_uncharge_info(nrg, rg);

                        INIT_LIST_HEAD(&nrg->link);

                        /* Original entry is trimmed */
                        rg->to = f;

                        list_add(&nrg->link, &rg->link);
                        nrg = NULL;
                        break;
                }

                if (f <= rg->from && t >= rg->to) { /* Remove entire region */
                        del += rg->to - rg->from;
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
                                                            rg->to - rg->from, true);
                        list_del(&rg->link);
                        kfree(rg);
                        continue;
                }

                if (f <= rg->from) {    /* Trim beginning of region */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
                                                            t - rg->from, false);

                        del += t - rg->from;
                        rg->from = t;
                } else {                /* Trim end of region */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
                                                            rg->to - f, false);

                        del += rg->to - f;
                        rg->to = f;
                }
        }

        spin_unlock(&resv->lock);
        kfree(nrg);
        return del;
}
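
/*
 * Worked example (editorial, not in the original source): deleting [4, 6)
 * from a map containing [0, 10) must split the region: the existing entry
 * is trimmed to [0, 4), a descriptor for [6, 10) is inserted after it, and
 * the return value is 2. Deleting [0, LONG_MAX) instead removes or trims
 * every region without ever needing a new descriptor, which is why such
 * callers do not need to handle -ENOMEM.
 */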

/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page. The huge page itself was freed
 * and removed from the page cache. This routine will adjust the subpool
 * usage count, and the global reserve count if needed. By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
        struct hugepage_subpool *spool = subpool_inode(inode);
        long rsv_adjust;
        bool reserved = false;

        rsv_adjust = hugepage_subpool_get_pages(spool, 1);
        if (rsv_adjust > 0) {
                struct hstate *h = hstate_inode(inode);

                if (!hugetlb_acct_memory(h, 1))
                        reserved = true;
        } else if (!rsv_adjust) {
                reserved = true;
        }

        if (!reserved)
                pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}

/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
static long region_count(struct resv_map *resv, long f, long t)
{
        struct list_head *head = &resv->regions;
        struct file_region *rg;
        long chg = 0;

        spin_lock(&resv->lock);
        /* Locate each segment we overlap with, and count that overlap. */
        list_for_each_entry(rg, head, link) {
                long seg_from;
                long seg_to;

                if (rg->to <= f)
                        continue;
                if (rg->from >= t)
                        break;

                seg_from = max(rg->from, f);
                seg_to = min(rg->to, t);

                chg += seg_to - seg_from;
        }
        spin_unlock(&resv->lock);

        return chg;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) * Convert the address within this vma to the page offset within
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) * the mapping, in pagecache page units; huge pages here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) static pgoff_t vma_hugecache_offset(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) struct vm_area_struct *vma, unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) return ((address - vma->vm_start) >> huge_page_shift(h)) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) (vma->vm_pgoff >> huge_page_order(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) return vma_hugecache_offset(hstate_vma(vma), vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) EXPORT_SYMBOL_GPL(linear_hugepage_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) * Return the size of the pages allocated when backing a VMA. In the majority
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) * of cases this will be the same size as that used by the page table entries.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) if (vma->vm_ops && vma->vm_ops->pagesize)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) return vma->vm_ops->pagesize(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) return PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) * Return the page size being used by the MMU to back a VMA. In the majority
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) * of cases, the page size used by the kernel matches the MMU size. On
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) * architectures where it differs, an architecture-specific 'strong'
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) * version of this symbol is required.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) return vma_kernel_pagesize(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) * Flags for MAP_PRIVATE reservations. These are stored in the bottom
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) * bits of the reservation map pointer, which are always clear due to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) * alignment.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) #define HPAGE_RESV_OWNER (1UL << 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) #define HPAGE_RESV_UNMAPPED (1UL << 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) * These helpers are used to track how many pages are reserved for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) * is guaranteed to have its future faults succeed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) * the reserve counters are updated with the hugetlb_lock held. It is safe
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) * to reset the VMA at fork() time as it is not in use yet and there is no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) * chance of the global counters getting corrupted as a result.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) * The private mapping reservation is represented in a subtly different
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) * manner from that of a shared mapping. A shared mapping has a region map
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) * associated with the underlying file; this region map represents the backing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) * file pages which have ever had a reservation assigned, and it persists even
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) * after the page is instantiated. A private mapping has a region map
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) * associated with the original mmap which is attached to all VMAs that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) * reference it; this region map represents those offsets which have consumed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) * a reservation, i.e. where pages have been instantiated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) static unsigned long get_vma_private_data(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) return (unsigned long)vma->vm_private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) static void set_vma_private_data(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) unsigned long value)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) vma->vm_private_data = (void *)value;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) struct hugetlb_cgroup *h_cg,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) struct hstate *h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) #ifdef CONFIG_CGROUP_HUGETLB
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) if (!h_cg || !h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) resv_map->reservation_counter = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) resv_map->pages_per_hpage = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) resv_map->css = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) resv_map->reservation_counter =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) &h_cg->rsvd_hugepage[hstate_index(h)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) resv_map->pages_per_hpage = pages_per_huge_page(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) resv_map->css = &h_cg->css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) struct resv_map *resv_map_alloc(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) if (!resv_map || !rg) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) kfree(resv_map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) kfree(rg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) kref_init(&resv_map->refs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) spin_lock_init(&resv_map->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) INIT_LIST_HEAD(&resv_map->regions);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) resv_map->adds_in_progress = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) * Initialize these to 0. On shared mappings, zeroes here indicate that these
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) * fields don't do cgroup accounting. On private mappings, these will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) * re-initialized to the proper values to indicate that hugetlb cgroup
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) * reservations are to be uncharged from here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) INIT_LIST_HEAD(&resv_map->region_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) list_add(&rg->link, &resv_map->region_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) resv_map->region_cache_count = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) return resv_map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) void resv_map_release(struct kref *ref)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) struct list_head *head = &resv_map->region_cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) struct file_region *rg, *trg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) /* Clear out any active regions before we release the map. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) region_del(resv_map, 0, LONG_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) /* ... and any entries left in the cache */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) list_for_each_entry_safe(rg, trg, head, link) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) list_del(&rg->link);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) kfree(rg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) VM_BUG_ON(resv_map->adds_in_progress);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) kfree(resv_map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) static inline struct resv_map *inode_resv_map(struct inode *inode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) * At inode evict time, i_mapping may not point to the original
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) * address space within the inode. This original address space
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) * contains the pointer to the resv_map. So, always use the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) * address space embedded within the inode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) * The VERY common case is inode->mapping == &inode->i_data, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) * this may not be true for device special inodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) return (struct resv_map *)(&inode->i_data)->private_data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956)
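/*
 * Return the reservation map for @vma: the map hanging off the inode for
 * shared mappings, or the map stored in vm_private_data (minus the flag
 * bits) for private mappings.
 */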
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) if (vma->vm_flags & VM_MAYSHARE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) struct address_space *mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) return inode_resv_map(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) return (struct resv_map *)(get_vma_private_data(vma) &
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) ~HPAGE_RESV_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) set_vma_private_data(vma, (get_vma_private_data(vma) &
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) HPAGE_RESV_MASK) | (unsigned long)map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) set_vma_private_data(vma, get_vma_private_data(vma) | flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) return (get_vma_private_data(vma) & flag) != 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) if (!(vma->vm_flags & VM_MAYSHARE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) vma->vm_private_data = (void *)0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) /* Returns true if the VMA has associated reserve pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) if (vma->vm_flags & VM_NORESERVE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) * This address is already reserved by another process (chg == 0),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) * so we should decrement the reserved count. Without decrementing,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) * the reserve count would remain after releasing the inode, because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) * the allocated page will go into the page cache and be regarded as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) * coming from the reserved pool in the releasing step. Currently, we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) * don't have any better solution to deal with this situation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) * properly, so add a work-around here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) if (vma->vm_flags & VM_MAYSHARE && chg == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) /* Shared mappings always use reserves */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) if (vma->vm_flags & VM_MAYSHARE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) * We know VM_NORESERVE is not set. Therefore, there SHOULD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) * be a region map for all pages. The only situation where
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) * there is no region map is if a hole was punched via
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) * fallocate. In this case, there really are no reserves to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) * use. This situation is indicated if chg != 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) if (chg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) * Only the process that called mmap() has reserves for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) * private mappings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) * Like the shared case above, a hole punch or truncate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) * could have been performed on the private mapping.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) * Examine the value of chg to determine if reserves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) * actually exist or were previously consumed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) * Very Subtle - The value of chg comes from a previous
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) * call to vma_needs_reserves(). The reserve map for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) * private mappings has different (opposite) semantics
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) * than that of shared mappings. vma_needs_reserves()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) * has already taken this difference in semantics into
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) * account. Therefore, the meaning of chg is the same
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) * as in the shared case above. Code could easily be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) * combined, but keeping it separate draws attention to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) * subtle differences.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) if (chg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066)
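/* Place a free huge page on its node's free list and update the counters. */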
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) static void enqueue_huge_page(struct hstate *h, struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) int nid = page_to_nid(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) list_move(&page->lru, &h->hugepage_freelists[nid]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) h->free_huge_pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) h->free_huge_pages_node[nid]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) SetPageHugeFreed(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075)
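/*
 * Remove the first suitable huge page from @nid's free list and move it to
 * the active list, skipping hardware-poisoned pages and, for tasks running
 * with PF_MEMALLOC_NOCMA set, pages that reside in a CMA area.
 */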
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) if (nocma && is_migrate_cma_page(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) if (PageHWPoison(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) list_move(&page->lru, &h->hugepage_activelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) set_page_refcounted(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) ClearPageHugeFreed(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) h->free_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) h->free_huge_pages_node[nid]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) nodemask_t *nmask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) unsigned int cpuset_mems_cookie;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) struct zonelist *zonelist;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) int node = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) zonelist = node_zonelist(nid, gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) retry_cpuset:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) cpuset_mems_cookie = read_mems_allowed_begin();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) if (!cpuset_zone_allowed(zone, gfp_mask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) * No need to ask again on the same node. The pool is node rather
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) * than zone aware.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) if (zone_to_nid(zone) == node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) node = zone_to_nid(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) page = dequeue_huge_page_node_exact(h, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) goto retry_cpuset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) static struct page *dequeue_huge_page_vma(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) unsigned long address, int avoid_reserve,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) long chg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) struct mempolicy *mpol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) gfp_t gfp_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) nodemask_t *nodemask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) * A child process with MAP_PRIVATE mappings created by its parent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) * has no page reserves. This check ensures that reservations are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) * not "stolen". The child may still get SIGKILLed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) if (!vma_has_reserves(vma, chg) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) h->free_huge_pages - h->resv_huge_pages == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) goto err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) /* If reserves cannot be used, ensure enough pages are in the pool */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) goto err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) gfp_mask = htlb_alloc_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) SetPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) h->resv_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) mpol_cond_put(mpol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) err:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) * Common helper functions for hstate_next_node_to_{alloc|free}.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) * We may have allocated or freed a huge page based on a different
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) * nodes_allowed previously, so h->next_node_to_{alloc|free} might
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) * be outside of *nodes_allowed. Ensure that we use an allowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) * node for alloc or free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) nid = next_node_in(nid, *nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) VM_BUG_ON(nid >= MAX_NUMNODES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) return nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) if (!node_isset(nid, *nodes_allowed))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) nid = next_node_allowed(nid, nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) return nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) * Returns the previously saved node ["this node"] from which to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) * allocate a persistent huge page for the pool, and advances the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) * next node from which to allocate, handling wrap at the end of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) * node mask.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) static int hstate_next_node_to_alloc(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) VM_BUG_ON(!nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) return nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) * Helper for free_pool_huge_page() - return the previously saved
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) * node ["this node"] from which to free a huge page. Advance the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) * next node id whether or not we find a free huge page to free so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) * that the next attempt to free addresses the next node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) VM_BUG_ON(!nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) return nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232)
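/*
 * Iterate over the nodes in @mask at most once each, in the round-robin
 * order maintained by hstate_next_node_to_{alloc,free}().  The "|| 1"
 * keeps the assignment expression truthy even when the chosen node id is
 * 0, so termination is controlled solely by nr_nodes.
 */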
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) for (nr_nodes = nodes_weight(*mask); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) nr_nodes > 0 && \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) nr_nodes--)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) for (nr_nodes = nodes_weight(*mask); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) nr_nodes > 0 && \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) ((node = hstate_next_node_to_free(hs, mask)) || 1); \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) nr_nodes--)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
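/*
 * Undo prep_compound_gigantic_page(): clear the compound metadata so the
 * tail pages become individual, refcounted base pages again.
 */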
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) static void destroy_compound_gigantic_page(struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) int nr_pages = 1 << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) struct page *p = page + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) atomic_set(compound_mapcount_ptr(page), 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) atomic_set(compound_pincount_ptr(page), 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) clear_compound_head(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) set_page_refcounted(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) set_compound_order(page, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) page[1].compound_nr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) __ClearPageHead(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) static void free_gigantic_page(struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) * If the page isn't allocated using the cma allocator,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) * cma_release() returns false.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) free_contig_range(page_to_pfn(page), 1 << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) #ifdef CONFIG_CONTIG_ALLOC
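/*
 * Allocate the physically contiguous range backing a gigantic huge page.
 * The CMA area of the requested node is tried first; unless __GFP_THISNODE
 * is set, the CMA areas of the other allowed nodes are tried next, before
 * falling back to a generic contiguous allocation.
 */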
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) int nid, nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) unsigned long nr_pages = 1UL << huge_page_order(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) if (nid == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) nid = numa_mem_id();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) int node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) if (hugetlb_cma[nid]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) page = cma_alloc(hugetlb_cma[nid], nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) huge_page_order(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) GFP_KERNEL | __GFP_NOWARN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) if (!(gfp_mask & __GFP_THISNODE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) for_each_node_mask(node, *nodemask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) if (node == nid || !hugetlb_cma[node])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) page = cma_alloc(hugetlb_cma[node], nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) huge_page_order(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) GFP_KERNEL | __GFP_NOWARN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) #else /* !CONFIG_CONTIG_ALLOC */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) int nid, nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) #endif /* CONFIG_CONTIG_ALLOC */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) int nid, nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) static inline void free_gigantic_page(struct page *page, unsigned int order) { }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) static inline void destroy_compound_gigantic_page(struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) unsigned int order) { }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337)
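/*
 * Remove a huge page from the hstate accounting, clear the hugetlb state
 * from its subpages and return the memory to the allocator.  For gigantic
 * pages, hugetlb_lock is dropped around the potentially blocking
 * free_gigantic_page() call.
 */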
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) static void update_and_free_page(struct hstate *h, struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) struct page *subpage = page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) h->nr_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) h->nr_huge_pages_node[page_to_nid(page)]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) for (i = 0; i < pages_per_huge_page(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) i++, subpage = mem_map_next(subpage, page, i)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) 1 << PG_referenced | 1 << PG_dirty |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) 1 << PG_active | 1 << PG_private |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) 1 << PG_writeback);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) set_page_refcounted(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) if (hstate_is_gigantic(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) * Temporarily drop the hugetlb_lock, because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) * we might block in free_gigantic_page().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) destroy_compound_gigantic_page(page, huge_page_order(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) free_gigantic_page(page, huge_page_order(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) __free_pages(page, huge_page_order(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) struct hstate *size_to_hstate(unsigned long size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) if (huge_page_size(h) == size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) return h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) * to hstate->hugepage_activelist).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) * This function can be called for tail pages, but never returns true for them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) bool page_huge_active(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) return PageHeadHuge(page) && PagePrivate(&page[1]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) /* never called for tail page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) void set_page_huge_active(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) SetPagePrivate(&page[1]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) static void clear_page_huge_active(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) ClearPagePrivate(&page[1]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) * Internal hugetlb-specific page flag, encoded by storing -1 in the mapping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) * field of the second tail page (page[2]). Do not use outside of hugetlb code.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) static inline bool PageHugeTemporary(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) if (!PageHuge(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) return (unsigned long)page[2].mapping == -1U;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) static inline void SetPageHugeTemporary(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) page[2].mapping = (void *)-1U;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) static inline void ClearPageHugeTemporary(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) page[2].mapping = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) static void __free_huge_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) * Can't pass hstate in here because it is called from the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) * compound page destructor.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) struct hstate *h = page_hstate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) int nid = page_to_nid(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) struct hugepage_subpool *spool =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) (struct hugepage_subpool *)page_private(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) bool restore_reserve;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) VM_BUG_ON_PAGE(page_count(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) VM_BUG_ON_PAGE(page_mapcount(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) set_page_private(page, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) page->mapping = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) restore_reserve = PagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) ClearPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) * If PagePrivate() was set on the page, the page allocation consumed a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) * reservation. If the page was associated with a subpool, there
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) * would have been a page reserved in the subpool before allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) * via hugepage_subpool_get_pages(). Since we are 'restoring' the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) * reservation, do not call hugepage_subpool_put_pages() as this will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) * remove the reserved page from the subpool.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) if (!restore_reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) * A return code of zero implies that the subpool will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) * under its minimum size if the reservation is not restored
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) * after the page is freed. Therefore, force the restore_reserve
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) * operation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) if (hugepage_subpool_put_pages(spool, 1) == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) restore_reserve = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) clear_page_huge_active(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) hugetlb_cgroup_uncharge_page(hstate_index(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) pages_per_huge_page(h), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) pages_per_huge_page(h), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) if (restore_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) h->resv_huge_pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) if (PageHugeTemporary(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) ClearPageHugeTemporary(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) update_and_free_page(h, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) } else if (h->surplus_huge_pages_node[nid]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) /* remove the page from active list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) update_and_free_page(h, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) h->surplus_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) h->surplus_huge_pages_node[nid]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) arch_clear_hugepage_flags(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) enqueue_huge_page(h, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) * As free_huge_page() can be called from a non-task context, we have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) * to defer the actual freeing to a workqueue to prevent a potential
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) * hugetlb_lock deadlock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) * free_hpage_workfn() locklessly retrieves the linked list of pages to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) * be freed and frees them one-by-one. As the page->mapping pointer is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) * going to be cleared in __free_huge_page() anyway, it is reused as the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) * llist_node structure of a lockless linked list of huge pages to be freed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) static LLIST_HEAD(hpage_freelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) static void free_hpage_workfn(struct work_struct *work)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) struct llist_node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) node = llist_del_all(&hpage_freelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) page = container_of((struct address_space **)node,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) struct page, mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) node = node->next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) __free_huge_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) void free_huge_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) if (!in_task()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) * Only call schedule_work() if hpage_freelist was previously
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) * empty. Otherwise, schedule_work() has already been called but the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) * workfn hasn't retrieved the list yet.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) if (llist_add((struct llist_node *)&page->mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) &hpage_freelist))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) schedule_work(&free_hpage_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) __free_huge_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542)
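/*
 * Account a freshly allocated huge page to @h: set the hugetlb compound
 * destructor, clear its cgroup state and bump the per-hstate counters.
 */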
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) INIT_LIST_HEAD(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) set_hugetlb_cgroup(page, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) set_hugetlb_cgroup_rsvd(page, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) h->nr_huge_pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) h->nr_huge_pages_node[nid]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) ClearPageHugeFreed(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555)
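/*
 * Turn a freshly allocated contiguous range into a compound page: mark the
 * head page, then clear PG_reserved on each tail page and chain it to the
 * head (see the comment in the loop below for why PG_reserved must not be
 * left set on the tail pages).
 */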
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) static void prep_compound_gigantic_page(struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) int nr_pages = 1 << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) struct page *p = page + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) /* we rely on prep_new_huge_page to set the destructor */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) set_compound_order(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) __ClearPageReserved(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) __SetPageHead(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) * For gigantic hugepages allocated through bootmem at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) * boot, it's safer to be consistent with the not-gigantic
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) * hugepages and clear the PG_reserved bit from all tail pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) * too. Otherwise drivers using get_user_pages() to access tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) * pages may get the reference counting wrong if they see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) * PG_reserved set on a tail page (despite the head page not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) * having PG_reserved set). Enforcing this consistency between
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) * head and tail pages allows drivers to optimize away a check
		 * on the head page when they need to know if put_page() is
		 * needed after get_user_pages().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) __ClearPageReserved(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) set_page_count(p, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) set_compound_head(p, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) atomic_set(compound_mapcount_ptr(page), -1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) atomic_set(compound_pincount_ptr(page), 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) }
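
/*
 * As a concrete example, a 1 GB gigantic page built from 4 KB base pages has
 * order 18, so the loop above walks nr_pages - 1 = 262143 tail pages, giving
 * each one a cleared PG_reserved bit, a zero refcount and a compound_head
 * link back to the head page.
 */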
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) * PageHuge() only returns true for hugetlbfs pages, but not for normal or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) * transparent huge pages. See the PageTransHuge() documentation for more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) * details.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) int PageHuge(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) if (!PageCompound(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) page = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) EXPORT_SYMBOL_GPL(PageHuge);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) /*
 * PageHeadHuge() only returns true for a hugetlbfs head page, but not for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) * normal or transparent huge pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) int PageHeadHuge(struct page *page_head)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) if (!PageHead(page_head))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) }
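
/*
 * Both predicates above key off the compound destructor stored in the first
 * tail page (page[1].compound_dtor), which prep_new_huge_page() sets to
 * HUGETLB_PAGE_DTOR. A typical caller might look roughly like the following
 * (illustrative sketch, not lifted from an actual caller):
 *
 *	if (PageHuge(page))
 *		h = page_hstate(compound_head(page));
 */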
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) * Find and lock address space (mapping) in write mode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) *
 * Upon entry, the page is locked, which means that page_mapping() is
 * stable. Due to locking order, we can only trylock_write. If we cannot
 * get the lock, simply return NULL to the caller.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) struct address_space *mapping = page_mapping(hpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) if (!mapping)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) return mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) if (i_mmap_trylock_write(mapping))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) return mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) pgoff_t hugetlb_basepage_index(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) struct page *page_head = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) pgoff_t index = page_index(page_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) unsigned long compound_idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) if (compound_order(page_head) >= MAX_ORDER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) compound_idx = page - page_head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) return (index << compound_order(page_head)) + compound_idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) }
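
/*
 * Worked example: for a 2 MB huge page built from 4 KB base pages (order 9),
 * a huge page at file index 3 containing a base page 5 pages past the head
 * yields (3 << 9) + 5 = 1541 as the base-page index.
 */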
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) static struct page *alloc_buddy_huge_page(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) gfp_t gfp_mask, int nid, nodemask_t *nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) nodemask_t *node_alloc_noretry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) int order = huge_page_order(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) bool alloc_try_hard = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) * By default we always try hard to allocate the page with
	 * the __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) * a loop (to adjust global huge page counts) and previous allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) * failed, do not continue to try hard on the same node. Use the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) * node_alloc_noretry bitmap to manage this state information.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) alloc_try_hard = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) gfp_mask |= __GFP_COMP|__GFP_NOWARN;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) if (alloc_try_hard)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) gfp_mask |= __GFP_RETRY_MAYFAIL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) if (nid == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) nid = numa_mem_id();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) __count_vm_event(HTLB_BUDDY_PGALLOC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) /*
	 * If we did not specify __GFP_RETRY_MAYFAIL but still got a page, this
	 * indicates an overall state change. Clear the bit so that we resume
	 * normal 'try hard' allocations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) if (node_alloc_noretry && page && !alloc_try_hard)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) node_clear(nid, *node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) /*
	 * If we tried hard to get a page but failed, set the bit so that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) * subsequent attempts will not try as hard until there is an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) * overall state change.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) if (node_alloc_noretry && !page && alloc_try_hard)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) node_set(nid, *node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) }
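
/*
 * When a node_alloc_noretry mask is supplied, the handling above boils down
 * to the following table (no change is made in the remaining combinations):
 *
 *	alloc_try_hard	result	action on node_alloc_noretry
 *	true		NULL	set the node's bit (stop trying hard)
 *	false		page	clear the node's bit (resume trying hard)
 */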
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) * Common helper to allocate a fresh hugetlb page. All specific allocators
 * should use this function to get new hugetlb pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) static struct page *alloc_fresh_huge_page(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) gfp_t gfp_mask, int nid, nodemask_t *nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) nodemask_t *node_alloc_noretry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) page = alloc_buddy_huge_page(h, gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) nid, nmask, node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) prep_compound_gigantic_page(page, huge_page_order(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) prep_new_huge_page(h, page, page_to_nid(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) /*
 * Allocates a fresh page to the hugetlb allocator pool, interleaving
 * allocations across the allowed nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) nodemask_t *node_alloc_noretry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) int nr_nodes, node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) put_page(page); /* free it into the hugepage allocator */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) }
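
/*
 * The put_page() above works because prep_new_huge_page() installed
 * HUGETLB_PAGE_DTOR and the page returned by alloc_fresh_huge_page() still
 * holds its allocator reference: dropping that last reference therefore lands
 * in free_huge_page(), which (for a fresh, non-surplus page) enqueues the
 * page on the hstate free list rather than returning it to the buddy
 * allocator.
 */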
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) /*
 * Free a huge page from the pool, taking it from the next node in the
 * round-robin freeing sequence. Attempts to keep persistent huge pages
 * more or less balanced over the allowed nodes.
 * Called with hugetlb_lock held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) bool acct_surplus)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) int nr_nodes, node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) * If we're returning unused surplus pages, only examine
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) * nodes with surplus pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) !list_empty(&h->hugepage_freelists[node])) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) struct page *page =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) list_entry(h->hugepage_freelists[node].next,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) struct page, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) h->free_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) h->free_huge_pages_node[node]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) if (acct_surplus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) h->surplus_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) h->surplus_huge_pages_node[node]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) update_and_free_page(h, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) * Dissolve a given free hugepage into free buddy pages. This function does
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) * nothing for in-use hugepages and non-hugepages.
 * This function returns one of the following values:
 *
 *  -EBUSY: failed to dissolve the free hugepage, or the hugepage is in use
 *          (allocated or reserved)
 *  0:      successfully dissolved the free hugepage, or the page is not a
 *          hugepage (considered as already dissolved)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) int dissolve_free_huge_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) int rc = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) /* Not to disrupt normal path by vainly holding hugetlb_lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) if (!PageHuge(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) if (!PageHuge(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) if (!page_count(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) struct page *head = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) struct hstate *h = page_hstate(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) int nid = page_to_nid(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) if (h->free_huge_pages - h->resv_huge_pages == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) * We should make sure that the page is already on the free list
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) * when it is dissolved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) if (unlikely(!PageHugeFreed(head))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) /*
			 * Theoretically, we should return -EBUSY when we
			 * encounter this race. In practice the race window
			 * is quite small, so a retry has a good chance of
			 * dissolving the page successfully, which improves
			 * the overall success rate of dissolving pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) * Move PageHWPoison flag from head page to the raw error page,
		 * which makes any subpages other than the error page reusable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) if (PageHWPoison(head) && page != head) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) SetPageHWPoison(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) ClearPageHWPoison(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) list_del(&head->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) h->free_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) h->free_huge_pages_node[nid]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) h->max_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) update_and_free_page(h, head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) * make specified memory blocks removable from the system.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) * Note that this will dissolve a free gigantic hugepage completely, if any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) * part of it lies within the given range.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) * Also note that if dissolve_free_huge_page() returns with an error, all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) * free hugepages that were dissolved before that error are lost.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) unsigned long pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) int rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) if (!hugepages_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) rc = dissolve_free_huge_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) }
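
/*
 * For example, with minimum_order == 9 (2 MB huge pages on 4 KB base pages),
 * the loop above advances 1 << 9 == 512 pfns per iteration, probing one
 * candidate page per 2 MB step of the hotplug range.
 */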
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) * Allocates a fresh surplus page from the page allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) int nid, nodemask_t *nmask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) /*
	 * We could have raced with a pool size change.
	 * Double check for that and simply deallocate the new page
	 * if we would end up overcommitting the surplus pages. Abuse
	 * the temporary-page flag to work around the nasty free_huge_page
	 * codeflow.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) SetPageHugeTemporary(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) h->surplus_huge_pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) h->surplus_huge_pages_node[page_to_nid(page)]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) }
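
/*
 * For instance, with nr_overcommit_huge_pages == 2 and surplus_huge_pages
 * already at 2, the first check above bails out before allocating anything.
 * If another task instead consumes the headroom while the lock is dropped,
 * the second check marks the fresh page PageHugeTemporary so that
 * free_huge_page() releases it back to the page allocator instead of
 * accounting it as surplus.
 */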
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) int nid, nodemask_t *nmask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) * We do not account these pages as surplus because they are only
	 * temporary and will be released properly on the last reference.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) SetPageHugeTemporary(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) * Use the VMA's mpolicy to allocate a huge page from the buddy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) static
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) struct mempolicy *mpol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) gfp_t gfp_mask = htlb_alloc_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) nodemask_t *nodemask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) mpol_cond_put(mpol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) /* page migration callback function */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) nodemask_t *nmask, gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) if (h->free_huge_pages - h->resv_huge_pages > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) if (page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) /* mempolicy aware migration callback */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) struct mempolicy *mpol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) nodemask_t *nodemask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) gfp_t gfp_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) int node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) gfp_mask = htlb_alloc_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) mpol_cond_put(mpol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) * Increase the hugetlb pool such that it can accommodate a reservation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) * of size 'delta'.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) static int gather_surplus_pages(struct hstate *h, int delta)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) __must_hold(&hugetlb_lock)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) struct list_head surplus_list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) struct page *page, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) int ret, i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) int needed, allocated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) bool alloc_ok = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) if (needed <= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) h->resv_huge_pages += delta;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) allocated = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) INIT_LIST_HEAD(&surplus_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) for (i = 0; i < needed; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) NUMA_NO_NODE, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) if (!page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) alloc_ok = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) list_add(&page->lru, &surplus_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) allocated += i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) * After retaking hugetlb_lock, we need to recalculate 'needed'
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) * because either resv_huge_pages or free_huge_pages may have changed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) needed = (h->resv_huge_pages + delta) -
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) (h->free_huge_pages + allocated);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) if (needed > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) if (alloc_ok)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) * We were not able to allocate enough pages to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) * satisfy the entire reservation so we free what
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) * we've allocated so far.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) goto free;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) * The surplus_list now contains _at_least_ the number of extra pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) * needed to accommodate the reservation. Add the appropriate number
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) * of pages to the hugetlb pool and free the extras back to the buddy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) * allocator. Commit the entire reservation here to prevent another
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) * process from stealing the pages as they are added to the pool but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) * before they are reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) needed += allocated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) h->resv_huge_pages += delta;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) /* Free the needed pages to the hugetlb pool */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) if ((--needed) < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) * This page is now managed by the hugetlb allocator and has
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) * no users -- drop the buddy allocator's reference.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) put_page_testzero(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) VM_BUG_ON_PAGE(page_count(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) enqueue_huge_page(h, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) free:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) /* Free unnecessary surplus pages to the buddy allocator */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) list_for_each_entry_safe(page, tmp, &surplus_list, lru)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) }
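
/*
 * Worked example: with resv_huge_pages == 10, free_huge_pages == 12 and
 * delta == 5, needed starts at (10 + 5) - 12 = 3, so three surplus pages are
 * allocated. If nothing changed while the lock was dropped, the recheck
 * yields (10 + 5) - (12 + 3) = 0, the reservation is committed
 * (resv_huge_pages becomes 15), needed is bumped back to 3, and all three
 * pages are enqueued on the hugetlb free lists with none returned to the
 * page allocator.
 */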
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) * This routine has two main purposes:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) * 1) Decrement the reservation count (resv_huge_pages) by the value passed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) * in unused_resv_pages. This corresponds to the prior adjustments made
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) * to the associated reservation map.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) * 2) Free any unused surplus pages that may have been allocated to satisfy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) * the reservation. As many as unused_resv_pages may be freed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) * Called with hugetlb_lock held. However, the lock could be dropped (and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) * reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) * we must make sure nobody else can claim pages we are in the process of
 * freeing. Do this by ensuring resv_huge_pages is always greater than the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) * number of huge pages we plan to free when dropping the lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) static void return_unused_surplus_pages(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108) unsigned long unused_resv_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) unsigned long nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) /* Cannot return gigantic pages currently */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) * Part (or even all) of the reservation could have been backed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) * by pre-allocated pages. Only free surplus pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) * We want to release as many surplus pages as possible, spread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) * evenly across all nodes with memory. Iterate across these nodes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) * until we can no longer free unreserved surplus pages. This occurs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) * when the nodes with surplus pages have no free pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) * free_pool_huge_page() will balance the freed pages across the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) * on-line nodes with memory and will handle the hstate accounting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) * Note that we decrement resv_huge_pages as we free the pages. If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) * we drop the lock, resv_huge_pages will still be sufficiently large
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) * to cover subsequent pages we may free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) while (nr_pages--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) h->resv_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) unused_resv_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) cond_resched_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) /* Fully uncommit the reservation */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) h->resv_huge_pages -= unused_resv_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) }
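
/*
 * For example, with unused_resv_pages == 6 and surplus_huge_pages == 4, the
 * loop above frees four surplus pages (if all four frees succeed),
 * decrementing resv_huge_pages as it goes, and the final adjustment at 'out:'
 * subtracts the remaining 2, so the reservation shrinks by 6 in total.
 */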
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) * are used by the huge page allocation routines to manage reservations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) * vma_needs_reservation is called to determine if the huge page at addr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) * within the vma has an associated reservation. If a reservation is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) * needed, the value 1 is returned. The caller is then responsible for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) * managing the global reservation and subpool usage counts. After
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) * the huge page has been allocated, vma_commit_reservation is called
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) * to add the page to the reservation map. If the page allocation fails,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) * the reservation must be ended instead of committed. vma_end_reservation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) * is called in such cases.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) * In the normal case, vma_commit_reservation returns the same value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) * as the preceding vma_needs_reservation call. The only time this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) * is not the case is if a reserve map was changed between calls. It
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) * is the responsibility of the caller to notice the difference and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) * take appropriate action.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) * vma_add_reservation is used in error paths where a reservation must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) * be restored when a newly allocated huge page must be freed. It is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) * to be called after calling vma_needs_reservation to determine if a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) * reservation exists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) enum vma_resv_mode {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) VMA_NEEDS_RESV,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) VMA_COMMIT_RESV,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) VMA_END_RESV,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) VMA_ADD_RESV,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) };
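
/*
 * A typical allocation path therefore looks roughly like the following
 * (sketch only; alloc_huge_page() below also wraps subpool and cgroup
 * charging around these steps):
 *
 *	chg = vma_needs_reservation(h, vma, addr);
 *	page = ...allocate a huge page...;
 *	if (page)
 *		vma_commit_reservation(h, vma, addr);
 *	else
 *		vma_end_reservation(h, vma, addr);
 */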
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) static long __vma_reservation_common(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) struct vm_area_struct *vma, unsigned long addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) enum vma_resv_mode mode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) struct resv_map *resv;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) pgoff_t idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) long ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) long dummy_out_regions_needed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) resv = vma_resv_map(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) if (!resv)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) idx = vma_hugecache_offset(h, vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) switch (mode) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) case VMA_NEEDS_RESV:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) /* We assume that vma_reservation_* routines always operate on
		 * 1 page, and that adding a 1 page entry to the resv map can
		 * only ever require 1 region.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) VM_BUG_ON(dummy_out_regions_needed != 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) case VMA_COMMIT_RESV:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) /* region_add calls of range 1 should never fail. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) VM_BUG_ON(ret < 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) case VMA_END_RESV:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) region_abort(resv, idx, idx + 1, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) case VMA_ADD_RESV:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) if (vma->vm_flags & VM_MAYSHARE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) /* region_add calls of range 1 should never fail. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) VM_BUG_ON(ret < 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) region_abort(resv, idx, idx + 1, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) ret = region_del(resv, idx, idx + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) if (vma->vm_flags & VM_MAYSHARE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) * In most cases, reserves always exist for private mappings.
		 * However, the file associated with the mapping could have been
		 * hole punched or truncated after reserves were consumed, in
		 * which case a subsequent fault on such a range will not use
		 * reserves.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) * Subtle - The reserve map for private mappings has the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) * opposite meaning than that of shared mappings. If NO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) * entry is in the reserve map, it means a reservation exists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) * If an entry exists in the reserve map, it means the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) * reservation has already been consumed. As a result, the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) * return value of this routine is the opposite of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) * value returned from reserve map manipulation routines above.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) return ret < 0 ? ret : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) static long vma_needs_reservation(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) static long vma_commit_reservation(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) static void vma_end_reservation(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) static long vma_add_reservation(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) * This routine is called to restore a reservation on error paths. In the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) * specific error paths, a huge page was allocated (via alloc_huge_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) * and is about to be freed. If a reservation for the page existed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) * alloc_huge_page would have consumed the reservation and set PagePrivate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) * in the newly allocated page. When the page is freed via free_huge_page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) * the global reservation count will be incremented if PagePrivate is set.
 * However, free_huge_page cannot adjust the reserve map. Adjust the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) * reserve map here to be consistent with global reserve count adjustments
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) * to be made by free_huge_page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) static void restore_reserve_on_error(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) struct vm_area_struct *vma, unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) if (unlikely(PagePrivate(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) long rc = vma_needs_reservation(h, vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) if (unlikely(rc < 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) * Rare out of memory condition in reserve map
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) * manipulation. Clear PagePrivate so that
			 * the global reserve count will not be incremented
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) * by free_huge_page. This will make it appear
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) * as though the reservation for this page was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) * consumed. This may prevent the task from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) * faulting in the page at a later time. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) * is better than inconsistent global huge page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) * accounting of reserve counts.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) ClearPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) } else if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) rc = vma_add_reservation(h, vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) if (unlikely(rc < 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) * See above comment about rare out of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) * memory condition.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) ClearPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) vma_end_reservation(h, vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) }
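
/*
 * Roughly, the three branches above are:
 *
 *	rc < 0	reserve map allocation failed; clear PagePrivate and accept
 *		that the reservation is lost
 *	rc > 0	the reserve map no longer reflects a reservation for this
 *		address, so vma_add_reservation() adjusts it to stay in sync
 *		with the reserve count free_huge_page() will restore
 *	rc == 0	the map already reflects the reservation, so only the
 *		region_chg() from vma_needs_reservation() has to be undone
 *		via vma_end_reservation()
 */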
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) struct page *alloc_huge_page(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) unsigned long addr, int avoid_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) struct hugepage_subpool *spool = subpool_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) long map_chg, map_commit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) long gbl_chg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) int ret, idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) struct hugetlb_cgroup *h_cg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) bool deferred_reserve;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) idx = hstate_index(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) * Examine the region/reserve map to determine if the process
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) * has a reservation for the page to be allocated. A return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) * code of zero indicates a reservation exists (no change).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) if (map_chg < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) return ERR_PTR(-ENOMEM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) * Processes that did not create the mapping will have no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) * reserves as indicated by the region/reserve map. Check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) * that the allocation will not exceed the subpool limit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) * Allocations for MAP_NORESERVE mappings also need to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) * checked against any subpool limit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) if (map_chg || avoid_reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) gbl_chg = hugepage_subpool_get_pages(spool, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) if (gbl_chg < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) vma_end_reservation(h, vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) return ERR_PTR(-ENOSPC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) * Even though there was no reservation in the region/reserve
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) * map, there could be reservations associated with the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) * subpool that can be used. This is indicated by a return value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) * of zero from hugepage_subpool_get_pages().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) * However, if avoid_reserve is specified we still avoid even
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) * the subpool reservations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) if (avoid_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) gbl_chg = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364)
/*
* If this allocation is not consuming an existing reservation, charge
* the hugetlb cgroup reservation limit for it now (deferred_reserve).
*/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) if (deferred_reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) ret = hugetlb_cgroup_charge_cgroup_rsvd(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) idx, pages_per_huge_page(h), &h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) goto out_subpool_put;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) goto out_uncharge_cgroup_reservation;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) * gbl_chg is passed to indicate whether or not a page must be taken
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) * from the global free pool (global change). gbl_chg == 0 indicates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) * a reservation exists for the allocation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) if (!page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) goto out_uncharge_cgroup;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) SetPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393) h->resv_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) list_add(&page->lru, &h->hugepage_activelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) /* Fall through */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399) hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
/*
* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
*/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) if (deferred_reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405) h_cg, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409)
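/*
* Record the subpool in page private so that free_huge_page() can return
* the page to (and credit) the correct subpool when it is freed.
*/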
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) set_page_private(page, (unsigned long)spool);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) map_commit = vma_commit_reservation(h, vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) if (unlikely(map_chg > map_commit)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) * The page was added to the reservation map between
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) * vma_needs_reservation and vma_commit_reservation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) * This indicates a race with hugetlb_reserve_pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) * Adjust for the subpool count incremented above AND
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) * in hugetlb_reserve_pages for the same page. Also,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) * the reservation count added in hugetlb_reserve_pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) * no longer applies.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) long rsv_adjust;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) rsv_adjust = hugepage_subpool_put_pages(spool, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) hugetlb_acct_memory(h, -rsv_adjust);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) if (deferred_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) pages_per_huge_page(h), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) out_uncharge_cgroup:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) out_uncharge_cgroup_reservation:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) if (deferred_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) out_subpool_put:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) if (map_chg || avoid_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) hugepage_subpool_put_pages(spool, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) vma_end_reservation(h, vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) return ERR_PTR(-ENOSPC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445)
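/*
* Allocate a gigantic page from memblock at boot. The weak alias lets an
* architecture provide its own implementation. Returns 1 on success and 0
* on failure; successful allocations are queued on huge_boot_pages and are
* turned into proper huge pages later by gather_bootmem_prealloc().
*/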
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) int alloc_bootmem_huge_page(struct hstate *h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) int __alloc_bootmem_huge_page(struct hstate *h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) struct huge_bootmem_page *m;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451) int nr_nodes, node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) void *addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) addr = memblock_alloc_try_nid_raw(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) huge_page_size(h), huge_page_size(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459) if (addr) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) * Use the beginning of the huge page to store the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462) * huge_bootmem_page struct (until gather_bootmem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) * puts them into the mem_map).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) m = addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466) goto found;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) found:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) /* Put them into a private list first because mem_map is not up yet */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) INIT_LIST_HEAD(&m->list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) list_add(&m->list, &huge_boot_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) m->hstate = h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481) * Put bootmem huge pages into the standard lists after mem_map is up.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) * Note: This only applies to gigantic (order >= MAX_ORDER) pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) static void __init gather_bootmem_prealloc(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) struct huge_bootmem_page *m;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) list_for_each_entry(m, &huge_boot_pages, list) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) struct page *page = virt_to_page(m);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) struct hstate *h = m->hstate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) VM_BUG_ON(!hstate_is_gigantic(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) WARN_ON(page_count(page) != 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) prep_compound_gigantic_page(page, huge_page_order(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) WARN_ON(PageReserved(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) prep_new_huge_page(h, page, page_to_nid(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497) put_page(page); /* free it into the hugepage allocator */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) * We need to restore the 'stolen' pages to totalram_pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) * in order to fix confusing memory reports from free(1) and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) * other side-effects, like CommitLimit going negative.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) adjust_managed_page_count(page, pages_per_huge_page(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) unsigned long i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) nodemask_t *node_alloc_noretry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) if (!hstate_is_gigantic(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) * Bit mask controlling how hard we retry per-node allocations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) * Ignore errors as lower level routines can deal with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) * node_alloc_noretry == NULL. If this kmalloc fails at boot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) * time, we are likely in bigger trouble.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) /* allocations done at boot time */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) node_alloc_noretry = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) /* bit mask controlling how hard we retry per-node allocations */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) if (node_alloc_noretry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) nodes_clear(*node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) for (i = 0; i < h->max_huge_pages; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) if (hstate_is_gigantic(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) if (hugetlb_cma_size) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) goto free;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) if (!alloc_bootmem_huge_page(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) } else if (!alloc_pool_huge_page(h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) &node_states[N_MEMORY],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) node_alloc_noretry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) if (i < h->max_huge_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) char buf[32];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549) string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551) h->max_huge_pages, buf, i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) h->max_huge_pages = i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) }
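/*
* kfree(NULL) is a no-op, so the gigantic case (which never allocates
* node_alloc_noretry) can share this exit path.
*/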
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554) free:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) kfree(node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) static void __init hugetlb_init_hstates(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561)
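/*
* Walk every registered hstate: track the smallest huge page order in
* minimum_order and populate the pool now for non-gigantic sizes.
*/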
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) if (minimum_order > huge_page_order(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) minimum_order = huge_page_order(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) /* oversized (gigantic) hugepages were already allocated in early boot */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) if (!hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) hugetlb_hstate_alloc_pages(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) VM_BUG_ON(minimum_order == UINT_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) static void __init report_hugepages(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) char buf[32];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) pr_info("HugeTLB registered %s page size, pre-allocated %lu pages\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) buf, h->free_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) #ifdef CONFIG_HIGHMEM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) static void try_to_free_low(struct hstate *h, unsigned long count,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595) for_each_node_mask(i, *nodes_allowed) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) struct page *page, *next;
struct list_head *freel = &h->hugepage_freelists[i];

list_for_each_entry_safe(page, next, freel, lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599) if (count >= h->nr_huge_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601) if (PageHighMem(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) update_and_free_page(h, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) h->free_huge_pages--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) h->free_huge_pages_node[page_to_nid(page)]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) static inline void try_to_free_low(struct hstate *h, unsigned long count,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612) nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618) * Increment or decrement surplus_huge_pages. Keep node-specific counters
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619) * balanced by operating on them in a round-robin fashion.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) * Returns 1 if an adjustment was made.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) int delta)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) int nr_nodes, node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627) VM_BUG_ON(delta != -1 && delta != 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628)
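/*
* Note the iterator choice below: dropping surplus state (delta < 0)
* effectively adds a persistent page, so the "to_alloc" ordering is used;
* raising surplus state (delta > 0) is the freeing direction and uses the
* "to_free" ordering. This keeps the per-node counters balanced.
*/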
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) if (delta < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) if (h->surplus_huge_pages_node[node])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) goto found;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635) for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636) if (h->surplus_huge_pages_node[node] <
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) h->nr_huge_pages_node[node])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638) goto found;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) found:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644) h->surplus_huge_pages += delta;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645) h->surplus_huge_pages_node[node] += delta;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648)
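/*
* "Persistent" huge pages are the ones managed through nr_hugepages,
* i.e. the whole pool minus any temporary surplus (overcommit) pages.
*/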
#define persistent_huge_pages(h) ((h)->nr_huge_pages - (h)->surplus_huge_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650) static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) nodemask_t *nodes_allowed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) unsigned long min_count, ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657) * Bit mask controlling how hard we retry per-node allocations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658) * If we cannot allocate the bit mask, do not attempt to allocate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659) * the requested huge pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) if (node_alloc_noretry)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) nodes_clear(*node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) * Check for a node specific request.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670) * Changing node specific huge page count may require a corresponding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) * change to the global count. In any case, the passed node mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) * (nodes_allowed) will restrict alloc/free to the specified node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) if (nid != NUMA_NO_NODE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) unsigned long old_count = count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) * User may have specified a large count value which caused the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) * above calculation to overflow. In this case, they wanted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) * to allocate as many huge pages as possible. Set count to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) * largest possible value to align with their intention.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) if (count < old_count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) count = ULONG_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688) /*
* Runtime allocation of gigantic pages depends on the ability to
* allocate large, contiguous page ranges (CONFIG_CONTIG_ALLOC).
* If the system does not provide this feature, return an error when
* the user tries to allocate gigantic pages, but still let the user
* free gigantic pages that were allocated at boot time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) if (count > persistent_huge_pages(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698) NODEMASK_FREE(node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701) /* Fall through to decrease pool */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) * Increase the pool size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) * First take pages out of surplus state. Then make up the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) * remaining difference by allocating fresh huge pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) * We might race with alloc_surplus_huge_page() here and be unable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) * to convert a surplus huge page to a normal huge page. That is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) * not critical, though, it just means the overall size of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) * pool might be one hugepage larger than it needs to be, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) * within all the constraints specified by the sysctls.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) if (!adjust_pool_surplus(h, nodes_allowed, -1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) while (count > persistent_huge_pages(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) * If this allocation races such that we no longer need the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) * page, free_huge_page will handle it by freeing the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724) * and reducing the surplus.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) /* yield cpu to avoid soft lockup */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) ret = alloc_pool_huge_page(h, nodes_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) /* Bail for signals. Probably ctrl-c from user */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738) if (signal_pending(current))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) * Decrease the pool size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744) * First return free pages to the buddy allocator (being careful
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) * to keep enough around to satisfy reservations). Then place
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) * pages into surplus state as needed so the pool will shrink
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) * to the desired size as pages become free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) * By placing pages into the surplus state independent of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) * overcommit value, we are allowing the surplus pool size to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751) * exceed overcommit. There are few sane options here. Since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) * alloc_surplus_huge_page() is checking the global counter,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) * though, we'll note that we're not allowed to exceed surplus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754) * and won't grow the pool anywhere else. Not until one of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) * sysctls is changed, or the surplus pages go out of use.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757) min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) min_count = max(count, min_count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) try_to_free_low(h, min_count, nodes_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) while (min_count < persistent_huge_pages(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) if (!free_pool_huge_page(h, nodes_allowed, 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) cond_resched_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) while (count < persistent_huge_pages(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) if (!adjust_pool_surplus(h, nodes_allowed, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) h->max_huge_pages = persistent_huge_pages(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) NODEMASK_FREE(node_alloc_noretry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) #define HSTATE_ATTR_RO(_name) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) #define HSTATE_ATTR(_name) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) static struct kobj_attribute _name##_attr = \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) __ATTR(_name, 0644, _name##_show, _name##_store)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) static struct kobject *hugepages_kobj;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786) static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788) static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790) static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) for (i = 0; i < HUGE_MAX_HSTATE; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795) if (hstate_kobjs[i] == kobj) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796) if (nidp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) *nidp = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) return &hstates[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) return kobj_to_node_hstate(kobj, nidp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) static ssize_t nr_hugepages_show_common(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808) unsigned long nr_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) h = kobj_to_hstate(kobj, &nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812) if (nid == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) nr_huge_pages = h->nr_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) nr_huge_pages = h->nr_huge_pages_node[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) return sprintf(buf, "%lu\n", nr_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) struct hstate *h, int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) unsigned long count, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) nodemask_t nodes_allowed, *n_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) if (nid == NUMA_NO_NODE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832) * global hstate attribute
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) if (!(obey_mempolicy &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) init_nodemask_of_mempolicy(&nodes_allowed)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836) n_mask = &node_states[N_MEMORY];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) n_mask = &nodes_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) * Node specific request. count adjustment happens in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) * set_max_huge_pages() after acquiring hugetlb_lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) init_nodemask_of_node(&nodes_allowed, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) n_mask = &nodes_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) err = set_max_huge_pages(h, count, nid, n_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) return err ? err : len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) struct kobject *kobj, const char *buf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) unsigned long count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) err = kstrtoul(buf, 10, &count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866) h = kobj_to_hstate(kobj, &nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) static ssize_t nr_hugepages_show(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) return nr_hugepages_show_common(kobj, attr, buf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876) static ssize_t nr_hugepages_store(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) struct kobj_attribute *attr, const char *buf, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879) return nr_hugepages_store_common(false, kobj, buf, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) HSTATE_ATTR(nr_hugepages);
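/*
* For example, for a 2 MB hstate this attribute is exposed as
*   /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
* and writes end up in set_max_huge_pages(), growing or shrinking the
* persistent pool.
*/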
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884)
/*
* hstate attribute for an optional, mempolicy-based constraint on
* persistent huge page allocation and freeing.
*/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889) static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) return nr_hugepages_show_common(kobj, attr, buf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) struct kobj_attribute *attr, const char *buf, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) return nr_hugepages_store_common(true, kobj, buf, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900) HSTATE_ATTR(nr_hugepages_mempolicy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907) struct hstate *h = kobj_to_hstate(kobj, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908) return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911) static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) struct kobj_attribute *attr, const char *buf, size_t count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) unsigned long input;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) struct hstate *h = kobj_to_hstate(kobj, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918) if (hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921) err = kstrtoul(buf, 10, &input);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) h->nr_overcommit_huge_pages = input;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) return count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) HSTATE_ATTR(nr_overcommit_hugepages);
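/*
* nr_overcommit_hugepages bounds how many surplus huge pages may be
* allocated on demand beyond the persistent pool, e.g.
*   echo 64 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages
*/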
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) static ssize_t free_hugepages_show(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937) unsigned long free_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) h = kobj_to_hstate(kobj, &nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941) if (nid == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) free_huge_pages = h->free_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) free_huge_pages = h->free_huge_pages_node[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946) return sprintf(buf, "%lu\n", free_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) HSTATE_ATTR_RO(free_hugepages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) static ssize_t resv_hugepages_show(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) struct hstate *h = kobj_to_hstate(kobj, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) return sprintf(buf, "%lu\n", h->resv_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) HSTATE_ATTR_RO(resv_hugepages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) static ssize_t surplus_hugepages_show(struct kobject *kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) struct kobj_attribute *attr, char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962) unsigned long surplus_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) h = kobj_to_hstate(kobj, &nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) if (nid == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) surplus_huge_pages = h->surplus_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) surplus_huge_pages = h->surplus_huge_pages_node[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) return sprintf(buf, "%lu\n", surplus_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) HSTATE_ATTR_RO(surplus_hugepages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) static struct attribute *hstate_attrs[] = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976) &nr_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) &nr_overcommit_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) &free_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979) &resv_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) &surplus_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982) &nr_hugepages_mempolicy_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984) NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987) static const struct attribute_group hstate_attr_group = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) .attrs = hstate_attrs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991) static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) struct kobject **hstate_kobjs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) const struct attribute_group *hstate_attr_group)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) int retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) int hi = hstate_index(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) if (!hstate_kobjs[hi])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003) if (retval) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) kobject_put(hstate_kobjs[hi]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) hstate_kobjs[hi] = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) return retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011) static void __init hugetlb_sysfs_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3016) hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3017) if (!hugepages_kobj)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3018) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3020) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3021) err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3022) hstate_kobjs, &hstate_attr_group);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3023) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3024) pr_err("HugeTLB: Unable to add hstate %s", h->name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3025) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3026) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3027)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3028) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031) * node_hstate/s - associate per node hstate attributes, via their kobjects,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032) * with node devices in node_devices[] using a parallel array. The index
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033) * into node_hstates[] (and node_devices[]) is the node id.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034) * This is here to avoid any static dependency of the node device driver, in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035) * the base kernel, on the hugetlb module.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037) struct node_hstate {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038) struct kobject *hugepages_kobj;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039) struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041) static struct node_hstate node_hstates[MAX_NUMNODES];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044) * A subset of global hstate attributes for node devices
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046) static struct attribute *per_node_hstate_attrs[] = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047) &nr_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048) &free_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049) &surplus_hugepages_attr.attr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050) NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053) static const struct attribute_group per_node_hstate_attr_group = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054) .attrs = per_node_hstate_attrs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055) };
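/*
* These per-node attributes appear under
*   /sys/devices/system/node/nodeN/hugepages/hugepages-<size>kB/
* and act only on the pool of the corresponding node.
*/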
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058) * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059) * Returns node id via non-NULL nidp.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061) static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065) for (nid = 0; nid < nr_node_ids; nid++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) struct node_hstate *nhs = &node_hstates[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3067) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3068) for (i = 0; i < HUGE_MAX_HSTATE; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3069) if (nhs->hstate_kobjs[i] == kobj) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3070) if (nidp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3071) *nidp = nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3072) return &hstates[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3073) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3074) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3076) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3077) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3078) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3079)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3080) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3081) * Unregister hstate attributes from a single node device.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3082) * No-op if no hstate attributes attached.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3083) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3084) static void hugetlb_unregister_node(struct node *node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3085) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3086) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3087) struct node_hstate *nhs = &node_hstates[node->dev.id];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3088)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3089) if (!nhs->hugepages_kobj)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3090) return; /* no hstate attributes */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3092) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3093) int idx = hstate_index(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3094) if (nhs->hstate_kobjs[idx]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3095) kobject_put(nhs->hstate_kobjs[idx]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3096) nhs->hstate_kobjs[idx] = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3097) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3098) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3099)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3100) kobject_put(nhs->hugepages_kobj);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3101) nhs->hugepages_kobj = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3102) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3103)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3104)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3105) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3106) * Register hstate attributes for a single node device.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3107) * No-op if attributes already registered.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3108) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3109) static void hugetlb_register_node(struct node *node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3110) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3111) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3112) struct node_hstate *nhs = &node_hstates[node->dev.id];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3113) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3115) if (nhs->hugepages_kobj)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3116) return; /* already allocated */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3117)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3118) nhs->hugepages_kobj = kobject_create_and_add("hugepages",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3119) &node->dev.kobj);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3120) if (!nhs->hugepages_kobj)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3121) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3122)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3123) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3124) err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3125) nhs->hstate_kobjs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3126) &per_node_hstate_attr_group);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3127) if (err) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3128) pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3129) h->name, node->dev.id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3130) hugetlb_unregister_node(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3131) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3132) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3133) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3134) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3135)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3136) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3137) * hugetlb init time: register hstate attributes for all registered node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3138) * devices of nodes that have memory. All on-line nodes should have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3139) * registered their associated device by this time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3140) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3141) static void __init hugetlb_register_all_nodes(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3142) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3143) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3145) for_each_node_state(nid, N_MEMORY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3146) struct node *node = node_devices[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3147) if (node->dev.id == nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3148) hugetlb_register_node(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3149) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3151) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3152) * Let the node device driver know we're here so it can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3153) * [un]register hstate attributes on node hotplug.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3154) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3155) register_hugetlbfs_with_node(hugetlb_register_node,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3156) hugetlb_unregister_node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3157) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3158) #else /* !CONFIG_NUMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3159)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3160) static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3161) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3162) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3163) if (nidp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3164) *nidp = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3165) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3166) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3167)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3168) static void hugetlb_register_all_nodes(void) { }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3170) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3172) static int __init hugetlb_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3173) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3174) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3176) if (!hugepages_supported()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3177) if (hugetlb_max_hstate || default_hstate_max_huge_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3178) pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3179) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3180) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3181)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3182) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3183) * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3184) * architectures depend on setup being done here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3185) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3186) hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3187) if (!parsed_default_hugepagesz) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3188) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3189) * If we did not parse a default huge page size, set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3190) 		 * default_hstate_idx to the HPAGE_SIZE hstate. And, if the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3191) * number of huge pages for this default size was implicitly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3192) * specified, set that here as well.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3193) * Note that the implicit setting will overwrite an explicit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3194) * setting. A warning will be printed in this case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3195) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3196) default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3197) if (default_hstate_max_huge_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3198) if (default_hstate.max_huge_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3199) char buf[32];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3200)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3201) string_get_size(huge_page_size(&default_hstate),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3202) 1, STRING_UNITS_2, buf, 32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3203) pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3204) default_hstate.max_huge_pages, buf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3205) pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3206) default_hstate_max_huge_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3207) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3208) default_hstate.max_huge_pages =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3209) default_hstate_max_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3210) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3211) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3212)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3213) hugetlb_cma_check();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3214) hugetlb_init_hstates();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3215) gather_bootmem_prealloc();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3216) report_hugepages();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3218) hugetlb_sysfs_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3219) hugetlb_register_all_nodes();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3220) hugetlb_cgroup_file_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3222) #ifdef CONFIG_SMP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3223) num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3224) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3225) num_fault_mutexes = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3226) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3227) hugetlb_fault_mutex_table =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3228) kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3229) GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3230) BUG_ON(!hugetlb_fault_mutex_table);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3231)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3232) for (i = 0; i < num_fault_mutexes; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3233) mutex_init(&hugetlb_fault_mutex_table[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3234) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3235) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3236) subsys_initcall(hugetlb_init);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3238) /* Overwritten by architectures with more huge page sizes */
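/*
 * (For example, x86's override additionally accepts 1GB when the CPU
 * supports gigantic pages.)
 */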
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3239) bool __init __weak arch_hugetlb_valid_size(unsigned long size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3240) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3241) return size == HPAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3242) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3244) void __init hugetlb_add_hstate(unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3245) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3246) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3247) unsigned long i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3248)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3249) 	if (size_to_hstate(PAGE_SIZE << order))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3250) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3252) BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3253) BUG_ON(order == 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3254) h = &hstates[hugetlb_max_hstate++];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3255) h->order = order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3256) h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3257) h->nr_huge_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3258) h->free_huge_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3259) for (i = 0; i < MAX_NUMNODES; ++i)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3260) INIT_LIST_HEAD(&h->hugepage_freelists[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3261) INIT_LIST_HEAD(&h->hugepage_activelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3262) h->next_nid_to_alloc = first_memory_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3263) h->next_nid_to_free = first_memory_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3264) snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3265) huge_page_size(h)/1024);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3267) parsed_hstate = h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3268) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3269)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3270) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3271) * hugepages command line processing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3272)  * hugepages normally follows a valid hugepagesz or default_hugepagesz
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3273) * specification. If not, ignore the hugepages value. hugepages can also
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3274) * be the first huge page command line option in which case it implicitly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3275) * specifies the number of huge pages for the default size.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3276) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3277) static int __init hugepages_setup(char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3278) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3279) unsigned long *mhp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3280) static unsigned long *last_mhp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3281)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3282) if (!parsed_valid_hugepagesz) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3283) pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3284) parsed_valid_hugepagesz = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3285) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3286) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3288) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3289) * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3290) * yet, so this hugepages= parameter goes to the "default hstate".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3291) * Otherwise, it goes with the previously parsed hugepagesz or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3292) * default_hugepagesz.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3293) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3294) else if (!hugetlb_max_hstate)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3295) mhp = &default_hstate_max_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3296) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3297) mhp = &parsed_hstate->max_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3298)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3299) if (mhp == last_mhp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3300) pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3301) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3302) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3303)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3304) if (sscanf(s, "%lu", mhp) <= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3305) *mhp = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3306)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3307) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3308) * Global state is always initialized later in hugetlb_init.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3309) 	 * But for gigantic hstates (order >= MAX_ORDER) we must allocate the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3310) 	 * pages here, early, while the bootmem allocator is still usable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3311) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3312) if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3313) hugetlb_hstate_alloc_pages(parsed_hstate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3315) last_mhp = mhp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3317) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3318) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3319) __setup("hugepages=", hugepages_setup);
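/*
 * Illustrative examples (sizes assume x86-64):
 *   hugepages=512                            512 pages of the default size
 *   hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512
 *                                            4 1GB pages plus 512 2MB pages
 *   hugepages=512 hugepages=1024             the second value is ignored
 *                                            (no hugepagesz= in between)
 */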
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3320)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3321) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3322) * hugepagesz command line processing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3323) * A specific huge page size can only be specified once with hugepagesz.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3324) * hugepagesz is followed by hugepages on the command line. The global
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3325) * variable 'parsed_valid_hugepagesz' is used to determine if prior
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3326) * hugepagesz argument was valid.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3327) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3328) static int __init hugepagesz_setup(char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3329) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3330) unsigned long size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3331) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3332)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3333) parsed_valid_hugepagesz = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3334) size = (unsigned long)memparse(s, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3335)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3336) if (!arch_hugetlb_valid_size(size)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3337) pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3338) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3339) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3341) h = size_to_hstate(size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3342) if (h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3343) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3344) * hstate for this size already exists. This is normally
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3345) * an error, but is allowed if the existing hstate is the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3346) * default hstate. More specifically, it is only allowed if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3347) * the number of huge pages for the default hstate was not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3348) * previously specified.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3349) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3350) if (!parsed_default_hugepagesz || h != &default_hstate ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3351) default_hstate.max_huge_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3352) pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3353) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3354) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3355)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3356) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3357) * No need to call hugetlb_add_hstate() as hstate already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3358) * exists. But, do set parsed_hstate so that a following
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3359) * hugepages= parameter will be applied to this hstate.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3360) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3361) parsed_hstate = h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3362) parsed_valid_hugepagesz = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3363) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3364) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3365)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3366) hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3367) parsed_valid_hugepagesz = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3368) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3369) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3370) __setup("hugepagesz=", hugepagesz_setup);
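/*
 * Illustrative examples (assuming 2MB pages):
 *   default_hugepagesz=2M hugepagesz=2M hugepages=256
 *       accepted, since the existing hstate is the default one and its
 *       page count has not been set yet
 *   hugepagesz=2M hugepages=256 hugepagesz=2M hugepages=512
 *       the second hugepagesz=2M (and the hugepages= following it) is
 *       ignored with a warning
 */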
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3371)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3372) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3373) * default_hugepagesz command line input
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3374) * Only one instance of default_hugepagesz allowed on command line.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3375) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3376) static int __init default_hugepagesz_setup(char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3377) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3378) unsigned long size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3379)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3380) parsed_valid_hugepagesz = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3381) if (parsed_default_hugepagesz) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3382) pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3383) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3384) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3385)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3386) size = (unsigned long)memparse(s, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3387)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3388) if (!arch_hugetlb_valid_size(size)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3389) pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3390) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3391) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3392)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3393) hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3394) parsed_valid_hugepagesz = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3395) parsed_default_hugepagesz = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3396) default_hstate_idx = hstate_index(size_to_hstate(size));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3397)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3398) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3399) * The number of default huge pages (for this size) could have been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3400) * specified as the first hugetlb parameter: hugepages=X. If so,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3401) * then default_hstate_max_huge_pages is set. If the default huge
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3402) * page size is gigantic (>= MAX_ORDER), then the pages must be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3403) 	 * allocated here from the bootmem allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3404) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3405) if (default_hstate_max_huge_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3406) default_hstate.max_huge_pages = default_hstate_max_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3407) if (hstate_is_gigantic(&default_hstate))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3408) hugetlb_hstate_alloc_pages(&default_hstate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3409) default_hstate_max_huge_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3410) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3412) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3413) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3414) __setup("default_hugepagesz=", default_hugepagesz_setup);
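/*
 * Illustrative example (assuming x86-64 with 1GB page support): with
 *   hugepages=16 default_hugepagesz=1G
 * the early hugepages=16 is applied to the 1GB default hstate and, the
 * size being gigantic, the 16 pages are allocated from bootmem right here.
 */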
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3415)
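/*
 * Count the free huge pages of @h on the nodes the current task may use,
 * i.e. the nodes in its cpuset that are also in its memory policy nodemask,
 * if one is in effect.
 */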
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3416) static unsigned int allowed_mems_nr(struct hstate *h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3417) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3418) int node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3419) unsigned int nr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3420) nodemask_t *mpol_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3421) unsigned int *array = h->free_huge_pages_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3422) gfp_t gfp_mask = htlb_alloc_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3423)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3424) mpol_allowed = policy_nodemask_current(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3426) for_each_node_mask(node, cpuset_current_mems_allowed) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3427) 		if (!mpol_allowed || node_isset(node, *mpol_allowed))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3429) nr += array[node];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3430) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3431)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3432) return nr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3433) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3434)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3435) #ifdef CONFIG_SYSCTL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3436) static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3437) void *buffer, size_t *length,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3438) loff_t *ppos, unsigned long *out)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3439) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3440) struct ctl_table dup_table;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3442) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3443) 	 * To avoid races with __do_proc_doulongvec_minmax(), duplicate the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3444) 	 * @table and alter the copy rather than the shared table itself.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3445) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3446) dup_table = *table;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3447) dup_table.data = out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3448)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3449) return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3450) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3451)
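/*
 * Common handler behind the nr_hugepages and nr_hugepages_mempolicy vm
 * sysctls: report or, on write, resize the pool of the default hstate.
 */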
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3452) static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3453) struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3454) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3455) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3456) struct hstate *h = &default_hstate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3457) unsigned long tmp = h->max_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3458) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3460) if (!hugepages_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3461) return -EOPNOTSUPP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3463) ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3464) &tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3465) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3466) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3467)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3468) if (write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3469) ret = __nr_hugepages_store_common(obey_mempolicy, h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3470) NUMA_NO_NODE, tmp, *length);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3471) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3472) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3473) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3474)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3475) int hugetlb_sysctl_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3476) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3477) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3479) return hugetlb_sysctl_handler_common(false, table, write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3480) buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3481) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3482)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3483) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3484) int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3485) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3486) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3487) return hugetlb_sysctl_handler_common(true, table, write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3488) buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3489) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3490) #endif /* CONFIG_NUMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3492) int hugetlb_overcommit_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3493) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3494) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3495) struct hstate *h = &default_hstate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3496) unsigned long tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3497) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3498)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3499) if (!hugepages_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3500) return -EOPNOTSUPP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3501)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3502) tmp = h->nr_overcommit_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3504) if (write && hstate_is_gigantic(h))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3505) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3506)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3507) ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3508) &tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3509) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3510) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3512) if (write) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3513) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3514) h->nr_overcommit_huge_pages = tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3515) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3516) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3517) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3518) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3519) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3520)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3521) #endif /* CONFIG_SYSCTL */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3522)
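/* Emit the HugePages_* and Hugetlb lines shown in /proc/meminfo. */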
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3523) void hugetlb_report_meminfo(struct seq_file *m)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3524) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3525) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3526) unsigned long total = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3527)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3528) if (!hugepages_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3529) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3530)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3531) for_each_hstate(h) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3532) unsigned long count = h->nr_huge_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3533)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3534) total += (PAGE_SIZE << huge_page_order(h)) * count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3535)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3536) if (h == &default_hstate)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3537) seq_printf(m,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3538) "HugePages_Total: %5lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3539) "HugePages_Free: %5lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3540) "HugePages_Rsvd: %5lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3541) "HugePages_Surp: %5lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3542) "Hugepagesize: %8lu kB\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3543) count,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3544) h->free_huge_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3545) h->resv_huge_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3546) h->surplus_huge_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3547) (PAGE_SIZE << huge_page_order(h)) / 1024);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3548) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3550) seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3551) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3552)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3553) int hugetlb_report_node_meminfo(char *buf, int len, int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3554) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3555) struct hstate *h = &default_hstate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3556)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3557) if (!hugepages_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3558) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3559)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3560) return sysfs_emit_at(buf, len,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3561) "Node %d HugePages_Total: %5u\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3562) "Node %d HugePages_Free: %5u\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3563) "Node %d HugePages_Surp: %5u\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3564) nid, h->nr_huge_pages_node[nid],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3565) nid, h->free_huge_pages_node[nid],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3566) nid, h->surplus_huge_pages_node[nid]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3567) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3568)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3569) void hugetlb_show_meminfo(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3570) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3571) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3572) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3573)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3574) if (!hugepages_supported())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3575) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3576)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3577) for_each_node_state(nid, N_MEMORY)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3578) for_each_hstate(h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3579) pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3580) nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3581) h->nr_huge_pages_node[nid],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3582) h->free_huge_pages_node[nid],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3583) h->surplus_huge_pages_node[nid],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3584) 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3585) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3587) void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3588) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3589) seq_printf(m, "HugetlbPages:\t%8lu kB\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3590) atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3591) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3593) /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3594) unsigned long hugetlb_total_pages(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3595) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3596) struct hstate *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3597) unsigned long nr_total_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3598)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3599) for_each_hstate(h)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3600) nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3601) return nr_total_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3602) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3603)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3604) static int hugetlb_acct_memory(struct hstate *h, long delta)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3605) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3606) int ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3607)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3608) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3609) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3610) * When cpuset is configured, it breaks the strict hugetlb page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3611) * reservation as the accounting is done on a global variable. Such
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3612) * reservation is completely rubbish in the presence of cpuset because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3613) * the reservation is not checked against page availability for the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3614) 	 * current cpuset. An application can still potentially be OOM-killed by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3615) 	 * the kernel when there are no free hugetlb pages in the cpuset that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3616) 	 * task is in. Attempting to enforce strict accounting with cpuset is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3617) 	 * almost impossible (or too ugly) because cpusets are so fluid that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3618) 	 * tasks or memory nodes can be dynamically moved between them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3619) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3620) * The change of semantics for shared hugetlb mapping with cpuset is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3621) * undesirable. However, in order to preserve some of the semantics,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3622) 	 * we fall back to checking against current free page availability as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3623) 	 * a best-effort attempt, hopefully minimizing the impact of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3624) 	 * change in cpuset semantics.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3625) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3626) 	 * Apart from cpuset, the memory policy mechanism also determines from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3627) 	 * which node the kernel will allocate memory in a NUMA system. So,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3628) 	 * similar to cpuset, we should also consider the memory policy of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3629) 	 * current task here, for the same reasons described above.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3631) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3632) if (delta > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3633) if (gather_surplus_pages(h, delta) < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3634) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3635)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3636) if (delta > allowed_mems_nr(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3637) return_unused_surplus_pages(h, delta);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3638) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3639) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3640) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3641)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3642) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3643) if (delta < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3644) return_unused_surplus_pages(h, (unsigned long) -delta);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3645)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3646) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3647) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3648) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3649) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3650)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3651) static void hugetlb_vm_op_open(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3652) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3653) struct resv_map *resv = vma_resv_map(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3654)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3655) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3656) 	 * This new VMA should share its sibling's reservation map if present.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3657) * The VMA will only ever have a valid reservation map pointer where
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3658) * it is being copied for another still existing VMA. As that VMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3659) * has a reference to the reservation map it cannot disappear until
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3660) * after this open call completes. It is therefore safe to take a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3661) * new reference here without additional locking.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3662) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3663) if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3664) resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3665) kref_get(&resv->refs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3666) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3667) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3668)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3669) static void hugetlb_vm_op_close(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3670) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3671) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3672) struct resv_map *resv = vma_resv_map(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3673) struct hugepage_subpool *spool = subpool_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3674) unsigned long reserve, start, end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3675) long gbl_reserve;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3677) if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3678) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3679)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3680) start = vma_hugecache_offset(h, vma, vma->vm_start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3681) end = vma_hugecache_offset(h, vma, vma->vm_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3682)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3683) reserve = (end - start) - region_count(resv, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3684) hugetlb_cgroup_uncharge_counter(resv, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3685) if (reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3686) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3687) * Decrement reserve counts. The global reserve count may be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3688) * adjusted if the subpool has a minimum size.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3689) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3690) gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3691) hugetlb_acct_memory(h, -gbl_reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3692) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3693)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3694) kref_put(&resv->refs, resv_map_release);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3695) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3696)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3697) static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3698) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3699) if (addr & ~(huge_page_mask(hstate_vma(vma))))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3700) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3701) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3702) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3703)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3704) static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3705) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3706) struct hstate *hstate = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3707)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3708) return 1UL << huge_page_shift(hstate);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3709) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3710)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3711) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3712) * We cannot handle pagefaults against hugetlb pages at all. They cause
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3713) * handle_mm_fault() to try to instantiate regular-sized pages in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3714)  * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3715) * this far.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3716) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3717) static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3718) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3719) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3720) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3721) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3722)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3723) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3724) * When a new function is introduced to vm_operations_struct and added
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3725) * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3726) * This is because under System V memory model, mappings created via
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3727) * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3728) * their original vm_ops are overwritten with shm_vm_ops.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3729) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3730) const struct vm_operations_struct hugetlb_vm_ops = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3731) .fault = hugetlb_vm_op_fault,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3732) .open = hugetlb_vm_op_open,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3733) .close = hugetlb_vm_op_close,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3734) .split = hugetlb_vm_op_split,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3735) .pagesize = hugetlb_vm_op_pagesize,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3736) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3737)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3738) static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3739) int writable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3740) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3741) pte_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3742)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3743) if (writable) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3744) entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3745) vma->vm_page_prot)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3746) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3747) entry = huge_pte_wrprotect(mk_huge_pte(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3748) vma->vm_page_prot));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3749) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3750) entry = pte_mkyoung(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3751) entry = pte_mkhuge(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3752) entry = arch_make_huge_pte(entry, vma, page, writable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3753)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3754) return entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3755) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3756)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3757) static void set_huge_ptep_writable(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3758) unsigned long address, pte_t *ptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3759) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3760) pte_t entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3761)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3762) entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3763) if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3764) update_mmu_cache(vma, address, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3765) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3766)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3767) bool is_hugetlb_entry_migration(pte_t pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3768) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3769) swp_entry_t swp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3770)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3771) if (huge_pte_none(pte) || pte_present(pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3772) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3773) swp = pte_to_swp_entry(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3774) if (is_migration_entry(swp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3775) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3776) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3777) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3778) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3779)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3780) static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3781) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3782) swp_entry_t swp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3784) if (huge_pte_none(pte) || pte_present(pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3785) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3786) swp = pte_to_swp_entry(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3787) if (is_hwpoison_entry(swp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3788) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3789) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3790) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3791) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3792)
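/*
 * Copy the hugetlb page table entries of @vma from the parent mm @src to
 * the child mm @dst, typically at fork time. For private copy-on-write
 * mappings, both the parent and child entries end up write protected.
 */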
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3793) int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3794) struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3795) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3796) pte_t *src_pte, *dst_pte, entry, dst_entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3797) struct page *ptepage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3798) unsigned long addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3799) int cow;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3800) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3801) unsigned long sz = huge_page_size(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3802) struct address_space *mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3803) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3804) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3805)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3806) cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3808) if (cow) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3809) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3810) vma->vm_start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3811) vma->vm_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3812) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3813) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3814) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3815) * For shared mappings i_mmap_rwsem must be held to call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3816) * huge_pte_alloc, otherwise the returned ptep could go
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3817) * away if part of a shared pmd and another thread calls
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3818) * huge_pmd_unshare.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3819) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3820) i_mmap_lock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3821) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3823) for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3824) spinlock_t *src_ptl, *dst_ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3825) src_pte = huge_pte_offset(src, addr, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3826) if (!src_pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3827) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3828) dst_pte = huge_pte_alloc(dst, vma, addr, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3829) if (!dst_pte) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3830) ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3831) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3832) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3834) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3835) * If the pagetables are shared don't copy or take references.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3836) * dst_pte == src_pte is the common case of src/dest sharing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3837) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3838) * However, src could have 'unshared' and dst shares with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3839) * another vma. If dst_pte !none, this implies sharing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3840) * Check here before taking page table lock, and once again
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3841) * after taking the lock below.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3842) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3843) dst_entry = huge_ptep_get(dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3844) if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3845) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3846)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3847) dst_ptl = huge_pte_lock(h, dst, dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3848) src_ptl = huge_pte_lockptr(h, src, src_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3849) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3850) entry = huge_ptep_get(src_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3851) dst_entry = huge_ptep_get(dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3852) if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3853) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3854) * Skip if src entry none. Also, skip in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3855) * unlikely case dst entry !none as this implies
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3856) * sharing with another vma.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3857) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3858) ;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3859) } else if (unlikely(is_hugetlb_entry_migration(entry) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3860) is_hugetlb_entry_hwpoisoned(entry))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3861) swp_entry_t swp_entry = pte_to_swp_entry(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3862)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3863) if (is_write_migration_entry(swp_entry) && cow) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3864) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3865) * COW mappings require pages in both
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3866) * parent and child to be set to read.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3867) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3868) make_migration_entry_read(&swp_entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3869) entry = swp_entry_to_pte(swp_entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3870) set_huge_swap_pte_at(src, addr, src_pte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3871) entry, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3872) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3873) set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3874) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3875) if (cow) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3876) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3877) * No need to notify as we are downgrading page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3878) * table protection not changing it to point
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3879) * to a new page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3880) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3881) * See Documentation/vm/mmu_notifier.rst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3882) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3883) huge_ptep_set_wrprotect(src, addr, src_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3884) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3885) entry = huge_ptep_get(src_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3886) ptepage = pte_page(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3887) get_page(ptepage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3888) page_dup_rmap(ptepage, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3889) set_huge_pte_at(dst, addr, dst_pte, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3890) hugetlb_count_add(pages_per_huge_page(h), dst);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3891) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3892) spin_unlock(src_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3893) spin_unlock(dst_ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3894) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3896) if (cow)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3897) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3898) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3899) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3900)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3901) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3902) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3903)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3904) void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3905) unsigned long start, unsigned long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3906) struct page *ref_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3907) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3908) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3909) unsigned long address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3910) pte_t *ptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3911) pte_t pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3912) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3913) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3914) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3915) unsigned long sz = huge_page_size(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3916) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3917) bool force_flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3919) WARN_ON(!is_vm_hugetlb_page(vma));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3920) BUG_ON(start & ~huge_page_mask(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3921) BUG_ON(end & ~huge_page_mask(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3922)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3923) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3924) * This is a hugetlb vma; all the pte entries should point
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3925) * to a huge page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3926) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3927) tlb_change_page_size(tlb, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3928) tlb_start_vma(tlb, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3929)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3930) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3931) * If sharing is possible, alert mmu notifiers of the worst case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3932) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3933) mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3934) end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3935) adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3936) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3937) address = start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3938) for (; address < end; address += sz) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3939) ptep = huge_pte_offset(mm, address, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3940) if (!ptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3941) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3942)
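/*
 * If this PMD turns out to be shared, huge_pmd_unshare() unmaps it
 * right here; note the PUD-sized range and force a TLB flush before
 * returning (see the force_flush comment at the end of this function).
 */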
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3943) ptl = huge_pte_lock(h, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3944) if (huge_pmd_unshare(mm, vma, &address, ptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3945) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3946) tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3947) force_flush = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3948) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3949) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3950)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3951) pte = huge_ptep_get(ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3952) if (huge_pte_none(pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3953) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3954) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3955) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3956)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3957) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3958) * A migrating or HWPoisoned hugepage is already unmapped
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3959) * and its refcount dropped, so just clear the pte here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3960) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3961) if (unlikely(!pte_present(pte))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3962) huge_pte_clear(mm, address, ptep, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3963) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3964) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3965) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3966)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3967) page = pte_page(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3968) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3969) * If a reference page is supplied, it is because a specific
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3970) * page is being unmapped, not a range. Ensure the page we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3971) * are about to unmap is the actual page of interest.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3972) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3973) if (ref_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3974) if (page != ref_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3975) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3976) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3977) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3978) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3979) * Mark the VMA as having unmapped its page so that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3980) * future faults in this VMA will fail rather than
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3981) * looking like data was lost.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3982) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3983) set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3984) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3986) pte = huge_ptep_get_and_clear(mm, address, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3987) tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3988) if (huge_pte_dirty(pte))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3989) set_page_dirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3990)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3991) hugetlb_count_sub(pages_per_huge_page(h), mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3992) page_remove_rmap(page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3993)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3994) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3995) tlb_remove_page_size(tlb, page, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3996) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3997) * Bail out after unmapping the reference page, if one was supplied.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3998) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3999) if (ref_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4000) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4001) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4002) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4003) tlb_end_vma(tlb, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4005) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4006) * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4007) * could defer the flush until now, since by holding i_mmap_rwsem we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4008) * guaranteed that the last reference would not be dropped. But we must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4009) * do the flushing before we return, as otherwise i_mmap_rwsem will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4010) * dropped and the last reference to the shared PMDs page might be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4011) * dropped as well.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4012) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4013) * In theory we could defer the freeing of the PMD pages as well, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4014) * huge_pmd_unshare() relies on the exact page_count for the PMD page to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4015) * detect sharing, so we cannot defer the release of the page either.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4016) * Instead, do flush now.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4017) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4018) if (force_flush)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4019) tlb_flush_mmu_tlbonly(tlb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4020) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4021)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4022) void __unmap_hugepage_range_final(struct mmu_gather *tlb,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4023) struct vm_area_struct *vma, unsigned long start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4024) unsigned long end, struct page *ref_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4025) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4026) __unmap_hugepage_range(tlb, vma, start, end, ref_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4027)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4028) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4029) * Clear this flag so that x86's huge_pmd_share page_table_shareable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4030) * test will fail on a vma being torn down, and not grab a page table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4031) * on its way out. We're lucky that the flag has such an appropriate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4032) * name, and can in fact be safely cleared here. We could clear it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4033) * before the __unmap_hugepage_range above, but all that's necessary
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4034) * is to clear it before releasing the i_mmap_rwsem. This works
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4035) * because in the context this is called, the VMA is about to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4036) * destroyed and the i_mmap_rwsem is held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4037) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4038) vma->vm_flags &= ~VM_MAYSHARE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4039) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4040)
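/*
 * Unmap a hugetlb page range within a single VMA. This wrapper sets up
 * its own mmu_gather for __unmap_hugepage_range(); the gather range is
 * widened up front in case shared PMDs get unshared during the walk.
 */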
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4041) void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4042) unsigned long end, struct page *ref_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4043) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4044) struct mm_struct *mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4045) struct mmu_gather tlb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4046) unsigned long tlb_start = start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4047) unsigned long tlb_end = end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4049) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4050) * If shared PMDs were possibly used within this vma range, adjust
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4051) * start/end for worst case tlb flushing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4052) * Note that we cannot be sure if PMDs are shared until we try to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4053) * unmap pages. However, we want to make sure TLB flushing covers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4054) * the largest possible range.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4055) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4056) adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4057)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4058) mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4059)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4060) tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4061) __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4062) tlb_finish_mmu(&tlb, tlb_start, tlb_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4063) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4065) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4066) * This is called when the original mapper is failing to COW a MAP_PRIVATE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4067) * mapping it owns the reserve page for. The intention is to unmap the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4068) * from other VMAs and let the children be SIGKILLed if they are faulting the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4069) * same region.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4070) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4071) static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4072) struct page *page, unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4073) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4074) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4075) struct vm_area_struct *iter_vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4076) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4077) pgoff_t pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4078)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4079) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4080) * vm_pgoff is in PAGE_SIZE units, hence the different calculation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4081) * from the page cache lookup, which is in HPAGE_SIZE units.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4082) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4083) address = address & huge_page_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4084) pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4085) vma->vm_pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4086) mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4087)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4088) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4089) * Take the mapping lock for the duration of the table walk. As
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4090) * this mapping should be shared between all the VMAs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4091) * __unmap_hugepage_range() is called while the lock is already held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4092) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4093) i_mmap_lock_write(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4094) vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4095) /* Do not unmap the current VMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4096) if (iter_vma == vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4097) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4098)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4099) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4100) * Shared VMAs have their own reserves and do not affect
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4101) * MAP_PRIVATE accounting, but it is possible that a shared
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4102) * VMA is using the same page, so check and skip such VMAs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4103) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4104) if (iter_vma->vm_flags & VM_MAYSHARE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4105) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4106)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4107) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4108) * Unmap the page from other VMAs without their own reserves.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4109) * They get marked to be SIGKILLed if they fault in these
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4110) * areas. This is because a future no-page fault on this VMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4111) * could insert a zeroed page instead of the data existing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4112) * from the time of fork. This would look like data corruption.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4113) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4114) if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4115) unmap_hugepage_range(iter_vma, address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4116) address + huge_page_size(h), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4117) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4118) i_mmap_unlock_write(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4119) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4120)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4121) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4122) * hugetlb_cow() should be called with the page lock of the original hugepage held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4123) * Called with hugetlb_instantiation_mutex held and pte_page locked, so we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4124) * cannot race with other handlers or page migration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4125) * Keep the pte_same checks anyway to make the transition from the mutex easier.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4126) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4127) static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4128) unsigned long address, pte_t *ptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4129) struct page *pagecache_page, spinlock_t *ptl)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4130) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4131) pte_t pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4132) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4133) struct page *old_page, *new_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4134) int outside_reserve = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4135) vm_fault_t ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4136) unsigned long haddr = address & huge_page_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4137) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4138)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4139) pte = huge_ptep_get(ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4140) old_page = pte_page(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4142) retry_avoidcopy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4143) /* If no-one else is actually using this page, avoid the copy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4144) * and just make the page writable */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4145) if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4146) page_move_anon_rmap(old_page, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4147) set_huge_ptep_writable(vma, haddr, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4148) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4149) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4151) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4152) * If the process that created a MAP_PRIVATE mapping is about to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4153) * perform a COW due to a shared page count, attempt to satisfy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4154) * the allocation without using the existing reserves. The pagecache
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4155) * page is used to determine if the reserve at this address was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4156) * consumed or not. If reserves were used, a partially faulted mapping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4157) * at the time of fork() could consume its reserves on COW instead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4158) * of the full address range.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4159) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4160) if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4161) old_page != pagecache_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4162) outside_reserve = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4163)
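/*
 * Hold a reference on old_page across the unlock below so the page
 * cannot be freed while a replacement page is being allocated.
 */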
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4164) get_page(old_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4166) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4167) * Drop the page table lock as the buddy allocator may be called. It will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4168) * be acquired again before returning to the caller, as expected.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4169) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4170) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4171) new_page = alloc_huge_page(vma, haddr, outside_reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4172)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4173) if (IS_ERR(new_page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4174) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4175) * If a process owning a MAP_PRIVATE mapping fails to COW,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4176) * it is due to references held by a child and an insufficient
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4177) * huge page pool. To guarantee the original mapper's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4178) * reliability, unmap the page from child processes. The child
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4179) * may get SIGKILLed if it later faults.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4180) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4181) if (outside_reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4182) struct address_space *mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4183) pgoff_t idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4184) u32 hash;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4186) put_page(old_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4187) BUG_ON(huge_pte_none(pte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4188) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4189) * Drop hugetlb_fault_mutex and i_mmap_rwsem before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4190) * unmapping. Unmapping needs to hold i_mmap_rwsem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4191) * in write mode. Dropping i_mmap_rwsem in read mode
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4192) * here is OK as COW mappings do not interact with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4193) * PMD sharing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4194) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4195) * Reacquire both after unmap operation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4196) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4197) idx = vma_hugecache_offset(h, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4198) hash = hugetlb_fault_mutex_hash(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4199) mutex_unlock(&hugetlb_fault_mutex_table[hash]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4200) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4201)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4202) unmap_ref_private(mm, vma, old_page, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4203)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4204) i_mmap_lock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4205) mutex_lock(&hugetlb_fault_mutex_table[hash]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4206) spin_lock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4207) ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4208) if (likely(ptep &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4209) pte_same(huge_ptep_get(ptep), pte)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4210) goto retry_avoidcopy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4211) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4212) * A race occurred while re-acquiring the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4213) * table lock, and our job is done.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4214) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4215) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4216) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4218) ret = vmf_error(PTR_ERR(new_page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4219) goto out_release_old;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4220) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4222) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4223) * When the original hugepage is a shared one, it does not have an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4224) * anon_vma prepared.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4225) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4226) if (unlikely(anon_vma_prepare(vma))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4227) ret = VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4228) goto out_release_all;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4229) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4230)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4231) copy_user_huge_page(new_page, old_page, address, vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4232) pages_per_huge_page(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4233) __SetPageUptodate(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4234)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4235) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4236) haddr + huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4237) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4238)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4239) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4240) * Retake the page table lock to check for racing updates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4241) * before the page tables are altered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4242) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4243) spin_lock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4244) ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4245) if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4246) ClearPagePrivate(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4247)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4248) /* Break COW */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4249) huge_ptep_clear_flush(vma, haddr, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4250) mmu_notifier_invalidate_range(mm, range.start, range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4251) set_huge_pte_at(mm, haddr, ptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4252) make_huge_pte(vma, new_page, 1));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4253) page_remove_rmap(old_page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4254) hugepage_add_new_anon_rmap(new_page, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4255) set_page_huge_active(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4256) /* Make the old page be freed below */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4257) new_page = old_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4258) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4259) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4260) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4261) out_release_all:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4262) restore_reserve_on_error(h, vma, haddr, new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4263) put_page(new_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4264) out_release_old:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4265) put_page(old_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4267) spin_lock(ptl); /* Caller expects lock to be held */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4268) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4269) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4270)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4271) /* Return the pagecache page at a given address within a VMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4272) static struct page *hugetlbfs_pagecache_page(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4273) struct vm_area_struct *vma, unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4274) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4275) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4276) pgoff_t idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4277)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4278) mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4279) idx = vma_hugecache_offset(h, vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4280)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4281) return find_lock_page(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4282) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4284) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4285) * Return whether there is a pagecache page to back the given address within the VMA.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4286) * The caller, follow_hugetlb_page(), holds page_table_lock, so we cannot lock_page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4287) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4288) static bool hugetlbfs_pagecache_present(struct hstate *h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4289) struct vm_area_struct *vma, unsigned long address)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4290) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4291) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4292) pgoff_t idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4293) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4294)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4295) mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4296) idx = vma_hugecache_offset(h, vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4298) page = find_get_page(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4299) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4300) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4301) return page != NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4302) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4303)
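/*
 * Add a freshly allocated huge page to the page cache at index idx and
 * charge the inode's block count. Returns 0 on success or the
 * add_to_page_cache() error, e.g. -EEXIST if another task beat us to it.
 */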
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4304) int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4305) pgoff_t idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4306) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4307) struct inode *inode = mapping->host;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4308) struct hstate *h = hstate_inode(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4309) int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4310)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4311) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4312) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4313) ClearPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4315) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4316) * Set the page dirty so that it will not be removed from the cache/file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4317) * by non-hugetlbfs specific code paths.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4318) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4319) set_page_dirty(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4320)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4321) spin_lock(&inode->i_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4322) inode->i_blocks += blocks_per_huge_page(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4323) spin_unlock(&inode->i_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4324) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4325) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4326)
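/*
 * Hand a missing/minor hugetlb fault over to userfaultfd. The fault
 * mutex and i_mmap_rwsem are dropped around handle_userfault() and
 * reacquired afterwards, so the caller sees the same locking state.
 */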
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4327) static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4328) struct address_space *mapping,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4329) pgoff_t idx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4330) unsigned int flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4331) unsigned long haddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4332) unsigned long reason)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4333) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4334) vm_fault_t ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4335) u32 hash;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4336) struct vm_fault vmf = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4337) .vma = vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4338) .address = haddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4339) .flags = flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4340) .vma_flags = vma->vm_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4341) .vma_page_prot = vma->vm_page_prot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4342)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4343) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4344) * Hard to debug if it ends up being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4345) * used by a callee that assumes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4346) * something about the other
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4347) * uninitialized fields... same as in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4348) * memory.c
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4349) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4350) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4352) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4353) * hugetlb_fault_mutex and i_mmap_rwsem must be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4354) * dropped before handling userfault. Reacquire
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4355) * after handling fault to make calling code simpler.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4356) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4357) hash = hugetlb_fault_mutex_hash(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4358) mutex_unlock(&hugetlb_fault_mutex_table[hash]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4359) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4360) ret = handle_userfault(&vmf, reason);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4361) i_mmap_lock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4362) mutex_lock(&hugetlb_fault_mutex_table[hash]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4363)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4364) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4365) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4366)
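/*
 * Handle a hugetlb fault with no pte present: look the page up in (or
 * add it to) the page cache for shared mappings, or allocate an
 * anonymous huge page for private ones, then install the new pte.
 * Called with the fault mutex and i_mmap_rwsem held.
 */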
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4367) static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4368) struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4369) struct address_space *mapping, pgoff_t idx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4370) unsigned long address, pte_t *ptep, unsigned int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4371) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4372) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4373) vm_fault_t ret = VM_FAULT_SIGBUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4374) int anon_rmap = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4375) unsigned long size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4376) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4377) pte_t new_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4378) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4379) unsigned long haddr = address & huge_page_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4380) bool new_page = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4381)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4382) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4383) * Currently, we are forced to kill the process in the event the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4384) * original mapper has unmapped pages from the child due to a failed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4385) * COW. Warn that such a situation has occurred, as it may not be obvious.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4386) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4387) if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4388) pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4389) current->pid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4390) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4391) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4392)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4393) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4394) * We cannot race with truncation due to holding i_mmap_rwsem.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4395) * i_size is modified while holding i_mmap_rwsem, so check here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4396) * once for faults beyond the end of the file.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4397) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4398) size = i_size_read(mapping->host) >> huge_page_shift(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4399) if (idx >= size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4400) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4402) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4403) page = find_lock_page(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4404) if (!page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4405) /* Check for page in userfault range */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4406) if (userfaultfd_missing(vma)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4407) ret = hugetlb_handle_userfault(vma, mapping, idx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4408) flags, haddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4409) VM_UFFD_MISSING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4410) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4411) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4412)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4413) page = alloc_huge_page(vma, haddr, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4414) if (IS_ERR(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4415) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4416) * Returning an error will result in the faulting task being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4417) * sent SIGBUS. The hugetlb fault mutex prevents two
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4418) * tasks from racing to fault in the same page, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4419) * could result in spurious "unable to allocate" errors.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4420) * Page migration does not take the fault mutex, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4421) * does a clear-then-write of ptes under the page table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4422) * lock. Page fault code could race with migration,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4423) * notice the cleared pte and try to allocate a page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4424) * here. Before returning an error, take the ptl and make
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4425) * sure there really is no pte entry.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4426) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4427) ptl = huge_pte_lock(h, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4428) if (!huge_pte_none(huge_ptep_get(ptep))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4429) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4430) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4431) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4432) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4433) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4434) ret = vmf_error(PTR_ERR(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4435) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4436) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4437) clear_huge_page(page, address, pages_per_huge_page(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4438) __SetPageUptodate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4439) new_page = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4440)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4441) if (vma->vm_flags & VM_MAYSHARE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4442) int err = huge_add_to_page_cache(page, mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4443) if (err) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4444) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4445) if (err == -EEXIST)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4446) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4447) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4448) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4449) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4450) lock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4451) if (unlikely(anon_vma_prepare(vma))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4452) ret = VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4453) goto backout_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4454) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4455) anon_rmap = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4456) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4457) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4458) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4459) * If a memory error occurs between mmap() and fault, some processes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4460) * don't have a hwpoisoned swap entry for the errored virtual address.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4461) * So we need to block the hugepage fault with a PG_hwpoison bit check.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4462) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4463) if (unlikely(PageHWPoison(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4464) ret = VM_FAULT_HWPOISON_LARGE |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4465) VM_FAULT_SET_HINDEX(hstate_index(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4466) goto backout_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4467) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4469) /* Check for page in userfault range. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4470) if (userfaultfd_minor(vma)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4471) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4472) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4473) ret = hugetlb_handle_userfault(vma, mapping, idx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4474) flags, haddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4475) VM_UFFD_MINOR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4476) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4477) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4478) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4479)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4480) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4481) * If we are going to COW a private mapping later, we examine the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4482) * pending reservations for this page now. This will ensure that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4483) * any allocations necessary to record that reservation occur outside
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4484) * the spinlock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4485) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4486) if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4487) if (vma_needs_reservation(h, vma, haddr) < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4488) ret = VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4489) goto backout_unlocked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4490) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4491) /* Just decrements count, does not deallocate */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4492) vma_end_reservation(h, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4493) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4494)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4495) ptl = huge_pte_lock(h, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4496) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4497) if (!huge_pte_none(huge_ptep_get(ptep)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4498) goto backout;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4499)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4500) if (anon_rmap) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4501) ClearPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4502) hugepage_add_new_anon_rmap(page, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4503) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4504) page_dup_rmap(page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4505) new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4506) && (vma->vm_flags & VM_SHARED)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4507) set_huge_pte_at(mm, haddr, ptep, new_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4509) hugetlb_count_add(pages_per_huge_page(h), mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4510) if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4511) /* Optimization, do the COW without a second fault */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4512) ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4513) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4515) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4516)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4517) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4518) * Only make newly allocated pages active. Existing pages found
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4519) * in the pagecache could be !page_huge_active() if they have been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4520) * isolated for migration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4521) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4522) if (new_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4523) set_page_huge_active(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4525) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4526) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4527) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4528)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4529) backout:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4530) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4531) backout_unlocked:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4532) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4533) restore_reserve_on_error(h, vma, haddr, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4534) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4535) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4536) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4537)
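/*
 * Map a (mapping, index) pair onto one of the fault mutexes. On SMP the
 * jhash2() value is masked with (num_fault_mutexes - 1), a cheap modulo
 * that relies on num_fault_mutexes being a power of two.
 */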
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4538) #ifdef CONFIG_SMP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4539) u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4540) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4541) unsigned long key[2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4542) u32 hash;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4544) key[0] = (unsigned long) mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4545) key[1] = idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4546)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4547) hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4548)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4549) return hash & (num_fault_mutexes - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4550) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4551) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4552) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4553) * For uniprocessor systems we always use a single mutex, so just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4554) * return 0 and avoid the hashing overhead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4555) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4556) u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4557) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4558) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4559) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4560) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4561)
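/*
 * Top-level hugetlb fault handler. It takes i_mmap_rwsem to keep ptep
 * stable against PMD unsharing and truncation, serializes with other
 * faults on the same page via the fault mutex table, and then hands off
 * to hugetlb_no_page() or hugetlb_cow() as needed.
 */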
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4562) vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4563) unsigned long address, unsigned int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4564) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4565) pte_t *ptep, entry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4566) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4567) vm_fault_t ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4568) u32 hash;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4569) pgoff_t idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4570) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4571) struct page *pagecache_page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4572) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4573) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4574) int need_wait_lock = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4575) unsigned long haddr = address & huge_page_mask(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4576)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4577) ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4578) if (ptep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4579) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4580) * Since we hold no locks, ptep could be stale. That is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4581) * OK as we are only making decisions based on content and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4582) * not actually modifying content here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4583) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4584) entry = huge_ptep_get(ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4585) if (unlikely(is_hugetlb_entry_migration(entry))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4586) migration_entry_wait_huge(vma, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4587) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4588) } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4589) return VM_FAULT_HWPOISON_LARGE |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4590) VM_FAULT_SET_HINDEX(hstate_index(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4591) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4593) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4594) * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4595) * until finished with ptep. This serves two purposes:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4596) * 1) It prevents huge_pmd_unshare from being called elsewhere
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4597) * and making the ptep no longer valid.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4598) * 2) It synchronizes us with i_size modifications during truncation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4599) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4600) * ptep could have already been assigned via huge_pte_offset. That
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4601) * is OK, as huge_pte_alloc will return the same value unless
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4602) * something has changed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4603) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4604) mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4605) i_mmap_lock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4606) ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4607) if (!ptep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4608) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4609) return VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4610) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4612) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4613) * Serialize hugepage allocation and instantiation, so that we don't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4614) * get spurious allocation failures if two CPUs race to instantiate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4615) * the same page in the page cache.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4616) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4617) idx = vma_hugecache_offset(h, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4618) hash = hugetlb_fault_mutex_hash(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4619) mutex_lock(&hugetlb_fault_mutex_table[hash]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4620)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4621) entry = huge_ptep_get(ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4622) if (huge_pte_none(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4623) ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4624) goto out_mutex;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4625) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4626)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4627) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4628)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4629) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4630) * entry could be a migration/hwpoison entry at this point, so this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4631) * check prevents the kernel from going further below assuming that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4632) * we have an active hugepage in the pagecache. This goto expects a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4633) * second page fault, where the is_hugetlb_entry_(migration|hwpoisoned)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4634) * check will handle it properly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4635) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4636) if (!pte_present(entry))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4637) goto out_mutex;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4638)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4639) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4640) * If we are going to COW the mapping later, we examine the pending
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4641) * reservations for this page now. This will ensure that any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4642) * allocations necessary to record that reservation occur outside the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4643) * spinlock. For private mappings, we also look up the pagecache
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4644) * page now as it is used to determine if a reservation has been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4645) * consumed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4646) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4647) if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4648) if (vma_needs_reservation(h, vma, haddr) < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4649) ret = VM_FAULT_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4650) goto out_mutex;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4651) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4652) /* Just decrements count, does not deallocate */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4653) vma_end_reservation(h, vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4654)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4655) if (!(vma->vm_flags & VM_MAYSHARE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4656) pagecache_page = hugetlbfs_pagecache_page(h,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4657) vma, haddr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4658) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4660) ptl = huge_pte_lock(h, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4661)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4662) /* Check for a racing update before calling hugetlb_cow */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4663) if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4664) goto out_ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4665)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4666) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4667) * hugetlb_cow() requires the page locks of pte_page(entry) and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4668) * pagecache_page, so here we need to take the former
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4669) * when page != pagecache_page or !pagecache_page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4670) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4671) page = pte_page(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4672) if (page != pagecache_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4673) if (!trylock_page(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4674) need_wait_lock = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4675) goto out_ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4676) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4677)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4678) get_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4679)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4680) if (flags & FAULT_FLAG_WRITE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4681) if (!huge_pte_write(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4682) ret = hugetlb_cow(mm, vma, address, ptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4683) pagecache_page, ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4684) goto out_put_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4685) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4686) entry = huge_pte_mkdirty(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4687) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4688) entry = pte_mkyoung(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4689) if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4690) flags & FAULT_FLAG_WRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4691) update_mmu_cache(vma, haddr, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4692) out_put_page:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4693) if (page != pagecache_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4694) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4695) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4696) out_ptl:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4697) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4699) if (pagecache_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4700) unlock_page(pagecache_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4701) put_page(pagecache_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4702) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4703) out_mutex:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4704) mutex_unlock(&hugetlb_fault_mutex_table[hash]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4705) i_mmap_unlock_read(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4706) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4707) * Generally it is safe to hold a refcount while waiting for the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4708) * lock. Here, however, we only wait in order to defer the next page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4709) * fault and avoid a busy loop, and the page is not used after it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4710) * unlocked before the current page fault returns. So we are safe from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4711) * accessing a freed page even though we wait without taking a refcount.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4712) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4713) if (need_wait_lock)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4714) wait_on_page_locked(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4715) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4716) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4717)
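/*
* Illustrative sketch (userspace, not kernel code) of one way the fault
* path above gets exercised.  Touching an anonymous MAP_HUGETLB mapping
* and then writing to it from a child after fork() makes the child's
* write fault take the !huge_pte_write() branch and reach hugetlb_cow().
* Sizes and flags are assumptions for the example only:
*
*	size_t len = 2UL << 20;			// assuming 2MB huge pages
*	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
*		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
*	p[0] = 1;				// first touch: hugetlb_no_page()
*	if (fork() == 0)
*		p[0] = 2;			// write fault -> hugetlb_cow()
*/
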
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4718) #ifdef CONFIG_USERFAULTFD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4719) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4720) * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4721) * modifications for huge pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4722) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4723) int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4724) pte_t *dst_pte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4725) struct vm_area_struct *dst_vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4726) unsigned long dst_addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4727) unsigned long src_addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4728) enum mcopy_atomic_mode mode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4729) struct page **pagep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4730) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4731) bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4732) struct address_space *mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4733) pgoff_t idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4734) unsigned long size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4735) int vm_shared = dst_vma->vm_flags & VM_SHARED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4736) struct hstate *h = hstate_vma(dst_vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4737) pte_t _dst_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4738) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4739) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4740) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4741) int writable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4742)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4743) mapping = dst_vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4744) idx = vma_hugecache_offset(h, dst_vma, dst_addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4746) if (is_continue) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4747) ret = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4748) page = find_lock_page(mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4749) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4750) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4751) } else if (!*pagep) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4752) /* If a page already exists, then it's UFFDIO_COPY for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4753) * a non-missing case. Return -EEXIST.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4754) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4755) if (vm_shared &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4756) hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4757) ret = -EEXIST;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4758) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4759) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4760)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4761) page = alloc_huge_page(dst_vma, dst_addr, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4762) if (IS_ERR(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4763) ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4764) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4765) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4766)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4767) ret = copy_huge_page_from_user(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4768) (const void __user *) src_addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4769) pages_per_huge_page(h), false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4770)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4771) /* fallback to copy_from_user outside mmap_lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4772) if (unlikely(ret)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4773) ret = -ENOENT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4774) *pagep = page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4775) /* don't free the page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4776) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4777) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4778) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4779) page = *pagep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4780) *pagep = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4781) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4782)
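/*
* Note on the -ENOENT path above: returning -ENOENT with *pagep set hands
* the freshly allocated (but not yet filled) page back to the caller,
* which is expected to drop mmap_lock, redo the copy from user space
* where page faults are allowed, and then call this function again with
* *pagep still set so the operation completes on the retry.  (This is a
* summary of how the userfaultfd copy loop in mm/userfaultfd.c uses the
* return value, kept here as guidance only.)
*/
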
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4783) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4784) * The memory barrier inside __SetPageUptodate makes sure that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4785) * preceding stores to the page contents become visible before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4786) * the set_pte_at() write.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4787) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4788) __SetPageUptodate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4790) /* Add shared, newly allocated pages to the page cache. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4791) if (vm_shared && !is_continue) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4792) size = i_size_read(mapping->host) >> huge_page_shift(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4793) ret = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4794) if (idx >= size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4795) goto out_release_nounlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4796)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4797) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4798) * Serialization between remove_inode_hugepages() and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4799) * huge_add_to_page_cache() below happens through the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4800) * hugetlb_fault_mutex_table, which must be held by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4801) * the caller.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4802) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4803) ret = huge_add_to_page_cache(page, mapping, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4804) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4805) goto out_release_nounlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4806) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4808) ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4809) spin_lock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4811) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4812) * Recheck i_size after taking the PT lock to make sure we do
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4813) * not leave any page mapped (as page_mapped()) beyond the end
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4814) * of i_size (remove_inode_hugepages() is strict about
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4815) * enforcing that). If we bail out here, in the vm_shared case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4816) * we will also leave a page in the radix tree beyond the end
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4817) * of i_size, but remove_inode_hugepages() will take care
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4818) * of it as soon as we drop the hugetlb_fault_mutex_table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4819) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4820) size = i_size_read(mapping->host) >> huge_page_shift(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4821) ret = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4822) if (idx >= size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4823) goto out_release_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4825) ret = -EEXIST;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4826) if (!huge_pte_none(huge_ptep_get(dst_pte)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4827) goto out_release_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4828)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4829) if (vm_shared) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4830) page_dup_rmap(page, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4831) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4832) ClearPagePrivate(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4833) hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4834) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4835)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4836) /* For CONTINUE on a non-shared VMA, map read-only so a later write does CoW. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4837) if (is_continue && !vm_shared)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4838) writable = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4839) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4840) writable = dst_vma->vm_flags & VM_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4842) _dst_pte = make_huge_pte(dst_vma, page, writable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4843) if (writable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4844) _dst_pte = huge_pte_mkdirty(_dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4845) _dst_pte = pte_mkyoung(_dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4846)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4847) set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4848)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4849) (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4850) dst_vma->vm_flags & VM_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4851) hugetlb_count_add(pages_per_huge_page(h), dst_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4853) /* No need to invalidate - it was non-present before */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4854) update_mmu_cache(dst_vma, dst_addr, dst_pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4855)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4856) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4857) if (!is_continue)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4858) set_page_huge_active(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4859) if (vm_shared || is_continue)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4860) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4861) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4862) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4863) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4864) out_release_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4865) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4866) if (vm_shared || is_continue)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4867) unlock_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4868) out_release_nounlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4869) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4870) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4871) }
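
/*
* Illustrative sketch (userspace, not kernel code) of how the routine
* above is normally driven.  A userfaultfd monitor that registered a
* hugetlb range resolves a missing-page fault with UFFDIO_COPY, or maps
* an already-present page cache page with UFFDIO_CONTINUE when minor
* fault handling was requested.  Values below are placeholders; for
* hugetlb ranges the destination and length must be aligned to the huge
* page size:
*
*	struct uffdio_copy copy = {
*		.dst = fault_addr,		// huge page aligned
*		.src = (unsigned long)src_buf,
*		.len = huge_page_size,
*		.mode = 0,
*	};
*	ioctl(uffd, UFFDIO_COPY, &copy);
*/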
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4872) #endif /* CONFIG_USERFAULTFD */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4874) long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4875) struct page **pages, struct vm_area_struct **vmas,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4876) unsigned long *position, unsigned long *nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4877) long i, unsigned int flags, int *locked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4878) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4879) unsigned long pfn_offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4880) unsigned long vaddr = *position;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4881) unsigned long remainder = *nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4882) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4883) int err = -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4884)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4885) while (vaddr < vma->vm_end && remainder) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4886) pte_t *pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4887) spinlock_t *ptl = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4888) int absent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4889) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4890)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4891) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4892) * If we have a pending SIGKILL, don't keep faulting pages and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4893) * potentially allocating memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4894) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4895) if (fatal_signal_pending(current)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4896) remainder = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4897) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4898) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4900) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4901) * Some archs (sparc64, sh*) have multiple pte_t entries
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4902) * for each hugepage. We have to make sure we get the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4903) * first, for the page indexing below to work.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4904) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4905) * Note that page table lock is not held when pte is null.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4906) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4907) pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4908) huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4909) if (pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4910) ptl = huge_pte_lock(h, mm, pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4911) absent = !pte || huge_pte_none(huge_ptep_get(pte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4912)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4913) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4914) * When coredumping, it suits get_dump_page if we just return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4915) * an error where there's an empty slot with no huge pagecache
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4916) * to back it. This way, we avoid allocating a hugepage, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4917) * the sparse dumpfile avoids allocating disk blocks, but its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4918) * huge holes still show up with zeroes where they need to be.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4919) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4920) if (absent && (flags & FOLL_DUMP) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4921) !hugetlbfs_pagecache_present(h, vma, vaddr)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4922) if (pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4923) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4924) remainder = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4925) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4926) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4927)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4928) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4929) * We need to call hugetlb_fault both for hugepages under migration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4930) * (in which case hugetlb_fault waits for the migration) and for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4931) * hwpoisoned hugepages (in which case we need to prevent the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4932) * caller from accessing them). To do this we use is_swap_pte
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4933) * here instead of is_hugetlb_entry_migration and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4934) * is_hugetlb_entry_hwpoisoned, because it simply covers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4935) * both cases, and because we cannot follow correct pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4936) * directly from any kind of swap entry.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4937) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4938) if (absent || is_swap_pte(huge_ptep_get(pte)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4939) ((flags & FOLL_WRITE) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4940) !huge_pte_write(huge_ptep_get(pte)))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4941) vm_fault_t ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4942) unsigned int fault_flags = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4944) if (pte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4945) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4946) if (flags & FOLL_WRITE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4947) fault_flags |= FAULT_FLAG_WRITE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4948) if (locked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4949) fault_flags |= FAULT_FLAG_ALLOW_RETRY |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4950) FAULT_FLAG_KILLABLE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4951) if (flags & FOLL_NOWAIT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4952) fault_flags |= FAULT_FLAG_ALLOW_RETRY |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4953) FAULT_FLAG_RETRY_NOWAIT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4954) if (flags & FOLL_TRIED) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4955) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4956) * Note: FAULT_FLAG_ALLOW_RETRY and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4957) * FAULT_FLAG_TRIED can co-exist
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4958) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4959) fault_flags |= FAULT_FLAG_TRIED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4960) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4961) ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4962) if (ret & VM_FAULT_ERROR) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4963) err = vm_fault_to_errno(ret, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4964) remainder = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4965) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4966) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4967) if (ret & VM_FAULT_RETRY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4968) if (locked &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4969) !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4970) *locked = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4971) *nr_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4972) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4973) * VM_FAULT_RETRY must not be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4974) * turned into an error; zero is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4975) * returned instead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4976) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4977) * No need to update "position" as the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4978) * caller will not check it after
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4979) * *nr_pages is set to 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4980) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4981) return i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4982) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4983) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4984) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4986) pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4987) page = pte_page(huge_ptep_get(pte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4988)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4989) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4990) * If subpage information is not requested, update counters
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4991) * and skip the same_page loop below.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4992) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4993) if (!pages && !vmas && !pfn_offset &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4994) (vaddr + huge_page_size(h) < vma->vm_end) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4995) (remainder >= pages_per_huge_page(h))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4996) vaddr += huge_page_size(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4997) remainder -= pages_per_huge_page(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4998) i += pages_per_huge_page(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4999) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5000) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5001) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5002)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5003) same_page:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5004) if (pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5005) pages[i] = mem_map_offset(page, pfn_offset);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5006) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5007) * try_grab_page() should always succeed here, because:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5008) * a) we hold the ptl lock, and b) we've just checked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5009) * that the huge page is present in the page tables. If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5010) * the huge page is present, then the tail pages must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5011) * also be present. The ptl prevents the head page and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5012) * tail pages from being rearranged in any way. So this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5013) * page must be available at this point, unless the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5014) * refcount overflowed:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5015) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5016) if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5017) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5018) remainder = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5019) err = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5020) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5021) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5022) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5023)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5024) if (vmas)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5025) vmas[i] = vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5027) vaddr += PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5028) ++pfn_offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5029) --remainder;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5030) ++i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5031) if (vaddr < vma->vm_end && remainder &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5032) pfn_offset < pages_per_huge_page(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5033) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5034) * We use pfn_offset to avoid touching the pageframes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5035) * of this compound page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5036) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5037) goto same_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5038) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5039) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5040) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5041) *nr_pages = remainder;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5042) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5043) * setting position is actually required only if remainder is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5044) * not zero, but it's faster not to add an "if (remainder)"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5045) * branch.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5046) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5047) *position = vaddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5049) return i ? i : err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5050) }
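
/*
* Note: follow_hugetlb_page() above is the hugetlb backend of
* get_user_pages().  When the caller asks for no subpage information
* (pages == NULL and vmas == NULL), whole huge pages are skipped in a
* single step; otherwise every base page of the compound page is
* reported.  Illustrative arithmetic, assuming x86_64 with 2MB huge
* pages (512 base pages each): pinning 4MB in the middle of a larger
* mapping advances i by 1024 one page at a time when pages[] is
* supplied, but in just two steps of 512 when it is not.
*/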
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5051)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5052) unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5053) unsigned long address, unsigned long end, pgprot_t newprot)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5054) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5055) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5056) unsigned long start = address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5057) pte_t *ptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5058) pte_t pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5059) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5060) unsigned long pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5061) bool shared_pmd = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5062) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5063)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5064) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5065) * In the case of shared PMDs, the area to flush could be beyond
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5066) * start/end. Set range.start/range.end to cover the maximum possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5067) * range if PMD sharing is possible.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5068) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5069) mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5070) 0, vma, mm, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5071) adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5072)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5073) BUG_ON(address >= end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5074) flush_cache_range(vma, range.start, range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5076) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5077) i_mmap_lock_write(vma->vm_file->f_mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5078) for (; address < end; address += huge_page_size(h)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5079) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5080) ptep = huge_pte_offset(mm, address, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5081) if (!ptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5082) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5083) ptl = huge_pte_lock(h, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5084) if (huge_pmd_unshare(mm, vma, &address, ptep)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5085) pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5086) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5087) shared_pmd = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5088) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5089) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5090) pte = huge_ptep_get(ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5091) if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5092) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5093) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5094) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5095) if (unlikely(is_hugetlb_entry_migration(pte))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5096) swp_entry_t entry = pte_to_swp_entry(pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5097)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5098) if (is_write_migration_entry(entry)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5099) pte_t newpte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5101) make_migration_entry_read(&entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5102) newpte = swp_entry_to_pte(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5103) set_huge_swap_pte_at(mm, address, ptep,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5104) newpte, huge_page_size(h));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5105) pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5106) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5107) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5108) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5109) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5110) if (!huge_pte_none(pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5111) pte_t old_pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5112)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5113) old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5114) pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5115) pte = arch_make_huge_pte(pte, vma, NULL, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5116) huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5117) pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5118) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5119) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5120) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5121) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5122) * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5123) * may have cleared our pud entry and done put_page on the page table:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5124) * once we release i_mmap_rwsem, another task can do the final put_page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5125) * and the page table can then be reused and filled with junk. If we did
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5126) * unshare a page of pmds, flush the range corresponding to the pud.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5127) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5128) if (shared_pmd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5129) flush_hugetlb_tlb_range(vma, range.start, range.end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5130) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5131) flush_hugetlb_tlb_range(vma, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5132) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5133) * No need to call mmu_notifier_invalidate_range(): we are downgrading
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5134) * page table protection, not changing it to point to a new page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5135) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5136) * See Documentation/vm/mmu_notifier.rst
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5137) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5138) i_mmap_unlock_write(vma->vm_file->f_mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5139) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5140)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5141) return pages << h->order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5142) }
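
/*
* Note: the routine above is what mprotect() ends up using for hugetlb
* VMAs, and the return value is expressed in base pages
* (pages << h->order).  Illustrative example, assuming x86_64 with 2MB
* huge pages (h->order == 9): changing the protection of two huge pages
* updates two huge PTEs and reports 2 << 9 == 1024 base pages.
*/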
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5144) int hugetlb_reserve_pages(struct inode *inode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5145) long from, long to,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5146) struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5147) vm_flags_t vm_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5148) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5149) long ret, chg, add = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5150) struct hstate *h = hstate_inode(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5151) struct hugepage_subpool *spool = subpool_inode(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5152) struct resv_map *resv_map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5153) struct hugetlb_cgroup *h_cg = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5154) long gbl_reserve, regions_needed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5155)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5156) /* This should never happen */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5157) if (from > to) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5158) VM_WARN(1, "%s called with a negative range\n", __func__);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5159) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5160) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5162) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5163) * Only apply hugepage reservation if asked. At fault time, an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5164) * attempt will be made for VM_NORESERVE to allocate a page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5165) * without using reserves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5166) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5167) if (vm_flags & VM_NORESERVE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5168) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5170) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5171) * Shared mappings base their reservation on the number of pages that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5172) * are already allocated on behalf of the file. Private mappings need
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5173) * to reserve the full area even if read-only as mprotect() may be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5174) * called to make the mapping read-write. Assume !vma is a shm mapping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5175) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5176) if (!vma || vma->vm_flags & VM_MAYSHARE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5177) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5178) * resv_map can not be NULL as hugetlb_reserve_pages is only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5179) * called for inodes for which resv_maps were created (see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5180) * hugetlbfs_get_inode).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5181) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5182) resv_map = inode_resv_map(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5183)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5184) chg = region_chg(resv_map, from, to, &regions_needed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5186) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5187) /* Private mapping. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5188) resv_map = resv_map_alloc();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5189) if (!resv_map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5190) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5191)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5192) chg = to - from;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5194) set_vma_resv_map(vma, resv_map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5195) set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5196) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5198) if (chg < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5199) ret = chg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5200) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5201) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5203) ret = hugetlb_cgroup_charge_cgroup_rsvd(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5204) hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5206) if (ret < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5207) ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5208) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5209) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5210)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5211) if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5212) /* For private mappings, the hugetlb_cgroup uncharge info hangs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5213) * off the resv_map.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5214) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5215) resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5216) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5218) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5219) * There must be enough pages in the subpool for the mapping. If
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5220) * the subpool has a minimum size, there may be some global
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5221) * reservations already in place (gbl_reserve).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5222) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5223) gbl_reserve = hugepage_subpool_get_pages(spool, chg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5224) if (gbl_reserve < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5225) ret = -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5226) goto out_uncharge_cgroup;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5227) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5228)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5229) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5230) * Check that enough hugepages are available for the reservation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5231) * Hand the pages back to the subpool if there are not.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5232) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5233) ret = hugetlb_acct_memory(h, gbl_reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5234) if (ret < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5235) goto out_put_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5238) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5239) * Account for the reservations made. Shared mappings record regions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5240) * that have reservations as they are shared by multiple VMAs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5241) * When the last VMA disappears, the region map says how much
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5242) * the reservation was and the page cache tells how much of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5243) * the reservation was consumed. Private mappings are per-VMA and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5244) * only the consumed reservations are tracked. When the VMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5245) * disappears, the original reservation is the VMA size and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5246) * consumed reservations are stored in the map. Hence, nothing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5247) * else has to be done for private mappings here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5248) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5249) if (!vma || vma->vm_flags & VM_MAYSHARE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5250) add = region_add(resv_map, from, to, regions_needed, h, h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5251)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5252) if (unlikely(add < 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5253) hugetlb_acct_memory(h, -gbl_reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5254) ret = add;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5255) goto out_put_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5256) } else if (unlikely(chg > add)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5257) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5258) * pages in this range were added to the reserve
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5259) * map between region_chg and region_add. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5260) * indicates a race with alloc_huge_page. Adjust
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5261) * the subpool and reserve counts modified above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5262) * based on the difference.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5263) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5264) long rsv_adjust;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5266) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5267) * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5268) * reference to h_cg->css. See comment below for detail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5269) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5270) hugetlb_cgroup_uncharge_cgroup_rsvd(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5271) hstate_index(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5272) (chg - add) * pages_per_huge_page(h), h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5274) rsv_adjust = hugepage_subpool_put_pages(spool,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5275) chg - add);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5276) hugetlb_acct_memory(h, -rsv_adjust);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5277) } else if (h_cg) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5278) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5279) * The file_regions will hold their own reference to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5280) * h_cg->css. So we should release the reference held
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5281) * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5282) * done.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5283) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5284) hugetlb_cgroup_put_rsvd_cgroup(h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5285) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5286) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5287) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5288) out_put_pages:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5289) /* put back original number of pages, chg */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5290) (void)hugepage_subpool_put_pages(spool, chg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5291) out_uncharge_cgroup:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5292) hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5293) chg * pages_per_huge_page(h), h_cg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5294) out_err:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5295) if (!vma || vma->vm_flags & VM_MAYSHARE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5296) /* Only call region_abort if the region_chg succeeded but the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5297) * region_add failed or didn't run.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5298) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5299) if (chg >= 0 && add < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5300) region_abort(resv_map, from, to, regions_needed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5301) if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5302) kref_put(&resv_map->refs, resv_map_release);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5303) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5304) }
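
/*
* Illustrative accounting example for hugetlb_reserve_pages() above (the
* numbers are made up): a shared mapping asks to reserve huge pages
* [0, 10) while the file's reserve map already covers [0, 4).
* region_chg() then reports chg == 6, the subpool and
* hugetlb_acct_memory() are charged for those 6 pages (assuming the
* subpool has no minimum size, so gbl_reserve == chg), and region_add()
* records the new regions.  If a racing alloc_huge_page() already added
* one of those pages to the map in the meantime, add comes back as 5 and
* the chg - add == 1 surplus is handed back to the subpool, the global
* counters and the reservation cgroup, as done in the chg > add branch.
*/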
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5305)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5306) long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5307) long freed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5308) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5309) struct hstate *h = hstate_inode(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5310) struct resv_map *resv_map = inode_resv_map(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5311) long chg = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5312) struct hugepage_subpool *spool = subpool_inode(inode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5313) long gbl_reserve;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5315) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5316) * Since this routine can be called in the evict inode path for all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5317) * hugetlbfs inodes, resv_map could be NULL.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5318) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5319) if (resv_map) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5320) chg = region_del(resv_map, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5321) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5322) * region_del() can fail in the rare case where a region
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5323) * must be split and another region descriptor cannot be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5324) * allocated. If end == LONG_MAX, it will not fail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5325) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5326) if (chg < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5327) return chg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5328) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5329)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5330) spin_lock(&inode->i_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5331) inode->i_blocks -= (blocks_per_huge_page(h) * freed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5332) spin_unlock(&inode->i_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5333)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5334) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5335) * If the subpool has a minimum size, the number of global
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5336) * reservations to be released may be adjusted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5337) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5338) gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5339) hugetlb_acct_memory(h, -gbl_reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5341) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5342) }
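
/*
* Note on the (chg - freed) adjustment above: chg is how many huge pages
* of reservation region_del() dropped for [start, end), while freed is
* how many pages the caller actually removed.  The difference is
* reservation that was never consumed, and that is what gets returned to
* the subpool and, via hugetlb_acct_memory(), to the global reserve.
* (Reading of the code above for clarity, not an additional guarantee.)
*/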
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5343)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5344) #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5345) static unsigned long page_table_shareable(struct vm_area_struct *svma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5346) struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5347) unsigned long addr, pgoff_t idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5348) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5349) unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5350) svma->vm_start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5351) unsigned long sbase = saddr & PUD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5352) unsigned long s_end = sbase + PUD_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5354) /* Allow segments to share if only one is marked locked */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5355) unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5356) unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5357)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5358) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5359) * match the virtual addresses, permissions and the alignment of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5360) * page table page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5361) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5362) if (pmd_index(addr) != pmd_index(saddr) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5363) vm_flags != svm_flags ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5364) sbase < svma->vm_start || svma->vm_end < s_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5365) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5366)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5367) return saddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5368) }
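
/*
* Worked example for page_table_shareable(), with made-up addresses and
* assuming x86_64 with 2MB huge pages and a 1GB PUD_SIZE: two tasks map
* the same hugetlbfs file with vm_pgoff == 0, vma at 0x40000000 and svma
* at 0x80000000.  For a fault at addr == 0x40200000 the caller computes
* idx == 512 (base pages into the file), so
* saddr == (512 << PAGE_SHIFT) + 0x80000000 == 0x80200000.  Both
* addresses use the same pmd_index() within their PUD slot, so if the
* flags match and svma covers the whole [0x80000000, 0xC0000000) slot,
* saddr is returned and the PMD page can be shared.
*/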
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5369)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5370) static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5371) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5372) unsigned long base = addr & PUD_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5373) unsigned long end = base + PUD_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5375) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5376) * check on proper vm_flags and page table alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5377) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5378) if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5379) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5380) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5381) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5382)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5383) bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5384) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5385) #ifdef CONFIG_USERFAULTFD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5386) if (uffd_disable_huge_pmd_share(vma))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5387) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5388) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5389) return vma_shareable(vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5390) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5391)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5392) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5393) * Determine if start,end range within vma could be mapped by shared pmd.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5394) * If yes, adjust start and end to cover range associated with possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5395) * shared pmd mappings.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5396) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5397) void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5398) unsigned long *start, unsigned long *end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5399) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5400) unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5401) v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5402)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5403) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5404) * The vma needs to span at least one aligned PUD-sized area, and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5405) * start,end range must lie at least partially within it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5406) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5407) if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5408) (*end <= v_start) || (*start >= v_end))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5409) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5410)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5411) /* Extend the range to be PUD aligned for a worst case scenario */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5412) if (*start > v_start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5413) *start = ALIGN_DOWN(*start, PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5414)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5415) if (*end < v_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5416) *end = ALIGN(*end, PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5417) }
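
/*
* Worked example for the adjustment above, assuming a 1GB PUD_SIZE and a
* VM_MAYSHARE vma spanning [1GB, 3GB): a caller flushing [1.5GB, 1.75GB)
* could hit PMDs shared for the whole PUD slot, so the range is widened
* to the enclosing PUD-aligned area [1GB, 2GB).  A vma that does not
* span any fully PUD-aligned area is left untouched.
*/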
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5418)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5419) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5420) * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5421) * and returns the corresponding pte. While this is not necessary for the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5422) * !shared pmd case because we can allocate the pmd later as well, it makes the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5423) * code much cleaner.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5424) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5425) * This routine must be called with i_mmap_rwsem held in at least read mode if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5426) * sharing is possible. For hugetlbfs, this prevents removal of any page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5427) * table entries associated with the address space. This is important as we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5428) * are setting up sharing based on existing page table entries (mappings).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5429) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5430) * NOTE: This routine is only called from huge_pte_alloc. Some callers of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5431) * huge_pte_alloc know that sharing is not possible and do not take
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5432) * i_mmap_rwsem as a performance optimization. This is handled by the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5433) * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5434) * only required for subsequent processing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5435) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5436) pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5437) unsigned long addr, pud_t *pud)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5438) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5439) struct address_space *mapping = vma->vm_file->f_mapping;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5440) pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5441) vma->vm_pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5442) struct vm_area_struct *svma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5443) unsigned long saddr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5444) pte_t *spte = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5445) pte_t *pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5446) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5448) i_mmap_assert_locked(mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5449) vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5450) if (svma == vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5451) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5452)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5453) saddr = page_table_shareable(svma, vma, addr, idx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5454) if (saddr) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5455) spte = huge_pte_offset(svma->vm_mm, saddr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5456) vma_mmu_pagesize(svma));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5457) if (spte) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5458) get_page(virt_to_page(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5459) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5460) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5461) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5462) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5463)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5464) if (!spte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5465) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5466)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5467) ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5468) if (pud_none(*pud)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5469) pud_populate(mm, pud,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5470) (pmd_t *)((unsigned long)spte & PAGE_MASK));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5471) mm_inc_nr_pmds(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5472) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5473) put_page(virt_to_page(spte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5474) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5475) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5476) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5477) pte = (pte_t *)pmd_alloc(mm, pud, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5478) return pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5479) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5480)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5481) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5482) * unmap huge page backed by shared pte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5483) *
 * The hugetlb pte page is refcounted at the time of mapping. If the pte is
 * shared (indicated by page_count > 1), the unmap is achieved by clearing the
 * pud and decrementing the refcount. If count == 1, the pte page is not shared.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5487) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5488) * Called with page table lock held and i_mmap_rwsem held in write mode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5489) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5490) * returns: 1 successfully unmapped a shared pte page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5491) * 0 the underlying pte page is not shared, or it is the last user
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5492) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5493) int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5494) unsigned long *addr, pte_t *ptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5495) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5496) pgd_t *pgd = pgd_offset(mm, *addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5497) p4d_t *p4d = p4d_offset(pgd, *addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5498) pud_t *pud = pud_offset(p4d, *addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5499)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5500) i_mmap_assert_write_locked(vma->vm_file->f_mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5501) BUG_ON(page_count(virt_to_page(ptep)) == 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5502) if (page_count(virt_to_page(ptep)) == 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5503) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5504)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5505) pud_clear(pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5506) put_page(virt_to_page(ptep));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5507) mm_dec_nr_pmds(mm);
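	/*
	 * Round *addr up to the next PUD boundary and step back one huge
	 * page, so that a caller iterating in huge-page-size steps resumes
	 * at that boundary after its own increment.
	 */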
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5508) *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5509) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5510) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5512) #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5513) pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5514) unsigned long addr, pud_t *pud)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5515) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5516) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5517) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5518)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5519) int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5520) unsigned long *addr, pte_t *ptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5521) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5522) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5523) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5525) void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5526) unsigned long *start, unsigned long *end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5527) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5528) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5529)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5530) bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5531) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5532) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5533) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5534) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5535)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5536) #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
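/*
 * Allocate the page table entries needed to map a huge page of size @sz at
 * @addr. For PUD_SIZE pages the pud entry itself is returned; for PMD_SIZE
 * pages, either join an existing shared pmd page via huge_pmd_share() (when
 * sharing is possible and the pud is still empty) or allocate a private pmd.
 * Returns NULL if an intermediate page table cannot be allocated.
 */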
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5537) pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5538) unsigned long addr, unsigned long sz)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5539) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5540) pgd_t *pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5541) p4d_t *p4d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5542) pud_t *pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5543) pte_t *pte = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5544)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5545) pgd = pgd_offset(mm, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5546) p4d = p4d_alloc(mm, pgd, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5547) if (!p4d)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5548) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5549) pud = pud_alloc(mm, p4d, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5550) if (pud) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5551) if (sz == PUD_SIZE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5552) pte = (pte_t *)pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5553) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5554) BUG_ON(sz != PMD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5555) if (want_pmd_share(vma, addr) && pud_none(*pud))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5556) pte = huge_pmd_share(mm, vma, addr, pud);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5557) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5558) pte = (pte_t *)pmd_alloc(mm, pud, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5559) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5560) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5561) BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5562)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5563) return pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5564) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5565)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5566) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5567) * huge_pte_offset() - Walk the page table to resolve the hugepage
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5568) * entry at address @addr
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5569) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5570) * Return: Pointer to page table entry (PUD or PMD) for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5571) * address @addr, or NULL if a !p*d_present() entry is encountered and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5572) * size @sz doesn't match the hugepage size at this level of the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5573) * table.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5574) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5575) pte_t *huge_pte_offset(struct mm_struct *mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5576) unsigned long addr, unsigned long sz)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5577) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5578) pgd_t *pgd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5579) p4d_t *p4d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5580) pud_t *pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5581) pmd_t *pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5582)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5583) pgd = pgd_offset(mm, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5584) if (!pgd_present(*pgd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5585) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5586) p4d = p4d_offset(pgd, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5587) if (!p4d_present(*p4d))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5588) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5590) pud = pud_offset(p4d, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5591) if (sz == PUD_SIZE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5592) /* must be pud huge, non-present or none */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5593) return (pte_t *)pud;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5594) if (!pud_present(*pud))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5595) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5596) /* must have a valid entry and size to go further */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5597)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5598) pmd = pmd_offset(pud, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5599) /* must be pmd huge, non-present or none */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5600) return (pte_t *)pmd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5601) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5602)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5603) #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5604)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5605) /*
 * These functions can be overridden if your architecture needs its own
 * behavior.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5608) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5609) struct page * __weak
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5610) follow_huge_addr(struct mm_struct *mm, unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5611) int write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5612) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5613) return ERR_PTR(-EINVAL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5614) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5615)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5616) struct page * __weak
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5617) follow_huge_pd(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5618) unsigned long address, hugepd_t hpd, int flags, int pdshift)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5619) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5620) WARN(1, "hugepd follow called with no support for hugepage directory format\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5621) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5622) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5623)
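/*
 * Generic lookup of the page backing a PMD-sized huge mapping. A reference is
 * taken according to @flags (FOLL_GET or FOLL_PIN, but not both); if the entry
 * is a migration entry, wait for the migration to complete and retry.
 */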
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5624) struct page * __weak
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5625) follow_huge_pmd(struct mm_struct *mm, unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5626) pmd_t *pmd, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5627) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5628) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5629) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5630) pte_t pte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5631)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5632) /* FOLL_GET and FOLL_PIN are mutually exclusive. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5633) if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5634) (FOLL_PIN | FOLL_GET)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5635) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5636)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5637) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5638) ptl = pmd_lockptr(mm, pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5639) spin_lock(ptl);
	/*
	 * Make sure that the address range covered by this pmd is not
	 * unmapped by other threads.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5644) if (!pmd_huge(*pmd))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5645) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5646) pte = huge_ptep_get((pte_t *)pmd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5647) if (pte_present(pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5648) page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5649) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5650) * try_grab_page() should always succeed here, because: a) we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5651) * hold the pmd (ptl) lock, and b) we've just checked that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5652) * huge pmd (head) page is present in the page tables. The ptl
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5653) * prevents the head page and tail pages from being rearranged
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5654) * in any way. So this page must be available at this point,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5655) * unless the page refcount overflowed:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5656) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5657) if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5658) page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5659) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5660) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5661) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5662) if (is_hugetlb_entry_migration(pte)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5663) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5664) __migration_entry_wait(mm, (pte_t *)pmd, ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5665) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5666) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5667) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5668) * hwpoisoned entry is treated as no_page_table in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5669) * follow_page_mask().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5670) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5671) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5672) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5673) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5674) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5675) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5676)
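/*
 * Generic lookup for a PUD-sized huge mapping. This default implementation
 * cannot take a reference on the page, so requests with FOLL_GET or FOLL_PIN
 * get NULL.
 */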
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5677) struct page * __weak
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5678) follow_huge_pud(struct mm_struct *mm, unsigned long address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5679) pud_t *pud, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5680) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5681) if (flags & (FOLL_GET | FOLL_PIN))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5682) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5683)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5684) return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5685) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5686)
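/*
 * Same as follow_huge_pud() above, but for a huge mapping at the PGD level:
 * no reference can be taken here, so FOLL_GET/FOLL_PIN requests get NULL.
 */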
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5687) struct page * __weak
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5688) follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5689) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5690) if (flags & (FOLL_GET | FOLL_PIN))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5691) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5693) return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5694) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5695)
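/*
 * Isolate an in-use (active) huge page for migration: take a reference, clear
 * its "active" flag and move it from the hstate active list onto @list.
 * Returns false if @page is not an active huge page head or its refcount has
 * already dropped to zero.
 */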
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5696) bool isolate_huge_page(struct page *page, struct list_head *list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5697) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5698) bool ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5699)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5700) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5701) if (!PageHeadHuge(page) || !page_huge_active(page) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5702) !get_page_unless_zero(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5703) ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5704) goto unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5705) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5706) clear_page_huge_active(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5707) list_move_tail(&page->lru, list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5708) unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5709) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5710) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5711) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5712)
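/*
 * Undo isolate_huge_page(): mark the page active again, move it back onto its
 * hstate's active list and drop the reference taken at isolation time.
 */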
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5713) void putback_active_hugepage(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5714) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5715) VM_BUG_ON_PAGE(!PageHead(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5716) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5717) set_page_huge_active(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5718) list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5719) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5720) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5721) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5722)
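/*
 * Called when a huge page has been migrated: migrate the cgroup charge to
 * @newpage, record the migration reason for page_owner and, for temporary
 * pages, transfer the temporary status together with the per-node surplus
 * accounting.
 */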
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5723) void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5724) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5725) struct hstate *h = page_hstate(oldpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5726)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5727) hugetlb_cgroup_migrate(oldpage, newpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5728) set_page_owner_migrate_reason(newpage, reason);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5729)
	/*
	 * Transfer the temporary state of the new huge page. This is the
	 * reverse of other transitions because the new page is going to
	 * be final while the old one will be freed, so the old page takes
	 * over the temporary status.
	 *
	 * Also note that we have to transfer the per-node surplus state
	 * here as well, otherwise the global surplus count will not match
	 * the per-node counts.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5740) if (PageHugeTemporary(newpage)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5741) int old_nid = page_to_nid(oldpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5742) int new_nid = page_to_nid(newpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5743)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5744) SetPageHugeTemporary(oldpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5745) ClearPageHugeTemporary(newpage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5746)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5747) spin_lock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5748) if (h->surplus_huge_pages_node[old_nid]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5749) h->surplus_huge_pages_node[old_nid]--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5750) h->surplus_huge_pages_node[new_nid]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5751) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5752) spin_unlock(&hugetlb_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5753) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5754) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5755)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5756) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5757) * This function will unconditionally remove all the shared pmd pgtable entries
 * within the given vma for a hugetlbfs memory range.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5759) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5760) void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5761) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5762) struct hstate *h = hstate_vma(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5763) unsigned long sz = huge_page_size(h);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5764) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5765) struct mmu_notifier_range range;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5766) unsigned long address, start, end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5767) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5768) pte_t *ptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5769)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5770) if (!(vma->vm_flags & VM_MAYSHARE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5771) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5772)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5773) start = ALIGN(vma->vm_start, PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5774) end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5775)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5776) if (start >= end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5777) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5778)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5779) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5780) * No need to call adjust_range_if_pmd_sharing_possible(), because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5781) * we have already done the PUD_SIZE alignment.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5782) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5783) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5784) start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5785) mmu_notifier_invalidate_range_start(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5786) i_mmap_lock_write(vma->vm_file->f_mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5787) for (address = start; address < end; address += PUD_SIZE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5788) unsigned long tmp = address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5790) ptep = huge_pte_offset(mm, address, sz);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5791) if (!ptep)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5792) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5793) ptl = huge_pte_lock(h, mm, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5794) /* We don't want 'address' to be changed */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5795) huge_pmd_unshare(mm, vma, &tmp, ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5796) spin_unlock(ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5797) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5798) flush_hugetlb_tlb_range(vma, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5799) i_mmap_unlock_write(vma->vm_file->f_mapping);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5800) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5801) * No need to call mmu_notifier_invalidate_range(), see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5802) * Documentation/vm/mmu_notifier.rst.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5803) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5804) mmu_notifier_invalidate_range_end(&range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5805) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5806)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5807) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5808) static bool cma_reserve_called __initdata;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5809)
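/*
 * Parse the "hugetlb_cma=" early parameter. The size goes through memparse(),
 * so the usual K/M/G suffixes are accepted, e.g. "hugetlb_cma=4G" on the
 * kernel command line (example value only).
 */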
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5810) static int __init cmdline_parse_hugetlb_cma(char *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5811) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5812) hugetlb_cma_size = memparse(p, &p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5813) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5814) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5815)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5816) early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5817)
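/*
 * Reserve CMA areas for gigantic pages of the given @order, spreading
 * hugetlb_cma_size roughly evenly across the online NUMA nodes. This is
 * expected to be called once, early during boot, from architecture setup code.
 */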
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5818) void __init hugetlb_cma_reserve(int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5819) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5820) unsigned long size, reserved, per_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5821) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5823) cma_reserve_called = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5825) if (!hugetlb_cma_size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5826) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5827)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5828) if (hugetlb_cma_size < (PAGE_SIZE << order)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5829) pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5830) (PAGE_SIZE << order) / SZ_1M);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5831) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5832) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5834) /*
	 * If a 3 GB area is requested on a machine with 4 NUMA nodes,
	 * allocate 1 GB on each of the first three nodes and ignore the last one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5837) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5838) per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5839) pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5840) hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5842) reserved = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5843) for_each_node_state(nid, N_ONLINE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5844) int res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5845) char name[CMA_MAX_NAME];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5846)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5847) size = min(per_node, hugetlb_cma_size - reserved);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5848) size = round_up(size, PAGE_SIZE << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5850) snprintf(name, sizeof(name), "hugetlb%d", nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5851) res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5852) 0, false, name,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5853) &hugetlb_cma[nid], nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5854) if (res) {
			pr_warn("hugetlb_cma: reservation failed: err %d, node %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5856) res, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5857) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5858) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5859)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5860) reserved += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5861) pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5862) size / SZ_1M, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5863)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5864) if (reserved >= hugetlb_cma_size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5865) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5866) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5867) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5868)
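/*
 * Warn if "hugetlb_cma=" was given on the command line but the architecture
 * never called hugetlb_cma_reserve(), i.e. the option has no effect here.
 */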
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5869) void __init hugetlb_cma_check(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5870) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5871) if (!hugetlb_cma_size || cma_reserve_called)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5872) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5874) pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5875) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5876)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5877) #endif /* CONFIG_CMA */