// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/mm/page_alloc.c
 *
 * Manages the free list; the system allocates free pages here.
 * Note that kmalloc() lives in slab.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
#include <linux/random.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_pinner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>
#include <linux/padata.h>
#include <linux/khugepaged.h>
#include <trace/hooks/mm.h>

#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"

/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;

/* No special request */
#define FPI_NONE		((__force fpi_t)0)

/*
 * Skip free page reporting notification for the (possibly merged) page.
 * This does not hinder free page reporting from grabbing the page,
 * reporting it and marking it "reported" - it only skips notifying
 * the free page reporting infrastructure about a newly freed page. For
 * example, used when temporarily pulling a page from a freelist and
 * putting it back unmodified.
 */
#define FPI_SKIP_REPORT_NOTIFY	((__force fpi_t)BIT(0))

/*
 * Place the (possibly merged) page to the tail of the freelist. Will ignore
 * page shuffling (relevant code - e.g., memory onlining - is expected to
 * shuffle the whole zone).
 *
 * Note: No code should rely on this flag for correctness - it's purely
 * to allow for optimizations when handing back either fresh pages
 * (memory onlining) or untouched pages (page isolation, free page
 * reporting).
 */
#define FPI_TO_TAIL		((__force fpi_t)BIT(1))

/*
 * Don't poison memory with KASAN (only for the tag-based modes).
 * During boot, all non-reserved memblock memory is exposed to page_alloc.
 * Poisoning all that memory lengthens boot time, especially on systems with
 * a large amount of RAM. This flag is used to skip that poisoning.
 * This is only done for the tag-based KASAN modes, as those are able to
 * detect memory corruptions with the memory tags assigned by default.
 * All memory allocated normally after boot gets poisoned as usual.
 */
#define FPI_SKIP_KASAN_POISON	((__force fpi_t)BIT(2))
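
/*
 * Illustrative only (not a definitive API contract): fpi_t values are plain
 * bit flags and may be OR'ed together when calling the internal free path,
 * e.g. __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_REPORT_NOTIFY)
 * for a page that is being handed straight back to the tail of a buddy
 * freelist without notifying free page reporting.
 */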

/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION	(8)

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif

/* work_structs for global per-cpu drains */
struct pcpu_drain {
	struct zone *zone;
	struct work_struct work;
};
static DEFINE_MUTEX(pcpu_drain_mutex);
static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_MEMORY] = { { [0] = 1UL } },
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

atomic_long_t _totalram_pages __read_mostly;
EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;

int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
DEFINE_STATIC_KEY_FALSE(init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);

DEFINE_STATIC_KEY_FALSE(init_on_free);
EXPORT_SYMBOL(init_on_free);

static bool _init_on_alloc_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
{
	return kstrtobool(buf, &_init_on_alloc_enabled_early);
}
early_param("init_on_alloc", early_init_on_alloc);

static bool _init_on_free_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
static int __init early_init_on_free(char *buf)
{
	return kstrtobool(buf, &_init_on_free_enabled_early);
}
early_param("init_on_free", early_init_on_free);

/*
 * A cached value of the page's pageblock's migratetype, used when the page is
 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
 * Also the migratetype set in the page does not necessarily match the pcplist
 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
 * other index - this ensures that it will be put on the correct CMA freelist.
 */
static inline int get_pcppage_migratetype(struct page *page)
{
	return page->index;
}

static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
	page->index = migratetype;
}
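
/*
 * Sketch of the intended pairing (the pcplist code lives elsewhere in this
 * file): the pageblock migratetype is looked up once and cached with
 * set_pcppage_migratetype() when a page is buffered on a pcplist, and read
 * back cheaply with get_pcppage_migratetype() when the pcplist is drained
 * into the buddy freelists.
 */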

#ifdef CONFIG_PM_SLEEP
/*
 * The following functions are used by the suspend/hibernate code to temporarily
 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
 * while devices are suspended. To avoid races with the suspend/hibernate code,
 * they should always be called with system_transition_mutex held
 * (gfp_allowed_mask also should only be modified with system_transition_mutex
 * held, unless the suspend/hibernate code is guaranteed not to run in parallel
 * with that modification).
 */

static gfp_t saved_gfp_mask;

void pm_restore_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&system_transition_mutex));
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

void pm_restrict_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&system_transition_mutex));
	WARN_ON(saved_gfp_mask);
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
}

bool pm_suspended_storage(void)
{
	if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
		return false;
	return true;
}
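
/*
 * Typical usage by the suspend path (a sketch, based on the comment above):
 * pm_restrict_gfp_mask() is called with system_transition_mutex held before
 * devices are suspended, and pm_restore_gfp_mask() undoes it after resume,
 * so allocations made in between cannot issue I/O to suspended devices.
 */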
#endif /* CONFIG_PM_SLEEP */

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
};
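
/*
 * Worked example of the ratios above (numbers are illustrative): with
 * [ZONE_DMA] = 256, a ZONE_NORMAL allocation falling back into ZONE_DMA must
 * leave roughly managed_pages(DMA)/256 pages of ZONE_DMA free; with
 * [ZONE_NORMAL] = 32, a highmem/movable allocation falling back into
 * ZONE_NORMAL must leave about managed_pages(NORMAL)/32 pages free there.
 */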

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	"DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	"DMA32",
#endif
	"Normal",
#ifdef CONFIG_HIGHMEM
	"HighMem",
#endif
	"Movable",
#ifdef CONFIG_ZONE_DEVICE
	"Device",
#endif
};

const char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Movable",
	"Reclaimable",
#ifdef CONFIG_CMA
	"CMA",
#endif
	"HighAtomic",
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
	[NULL_COMPOUND_DTOR] = NULL,
	[COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
	[HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	[TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};
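
/*
 * The index into this array is stored in the first tail page's
 * ->compound_dtor (see the compound page comment further down), and the
 * matching destructor is invoked when the compound page is freed.
 */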

/*
 * Try to keep at least this much lowmem free. Do not allow normal
 * allocations below this point, only high priority ones. Automatically
 * tuned according to the amount of memory in the system.
 */
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
#ifdef CONFIG_DISCONTIGMEM
/*
 * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
 * are not on separate NUMA nodes. Functionally this works but with
 * watermark_boost_factor, it can reclaim prematurely as the ranges can be
 * quite small. By default, do not boost watermarks on discontigmem as in
 * many cases very high-order allocations like THP are likely to be
 * unsupported and the premature reclaim offsets the advantage of long-term
 * fragmentation avoidance.
 */
int watermark_boost_factor __read_mostly;
#else
int watermark_boost_factor __read_mostly = 15000;
#endif
int watermark_scale_factor = 10;

/*
 * Extra memory for the system to try freeing. Used to temporarily
 * free memory, to make space for new workloads. Anyone can allocate
 * down to the min watermarks controlled by min_free_kbytes above.
 */
int extra_free_kbytes = 0;

static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;

static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
static bool mirrored_kernelcore __meminitdata;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);

#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * During boot we initialize deferred pages on-demand, as needed, but once
 * page_alloc_init_late() has finished, the deferred pages are all initialized,
 * and we can permanently disable that path.
 */
static DEFINE_STATIC_KEY_TRUE(deferred_pages);

/*
 * Call kasan_poison_pages() only after deferred memory initialization
 * has completed. Poisoning pages during deferred memory init will greatly
 * lengthen the process and cause problems on large memory systems, as the
 * deferred pages initialization is done with interrupts disabled.
 *
 * Assuming that there will be no reference to those newly initialized
 * pages before they are ever allocated, this should have no effect on
 * KASAN memory tracking as the poison will be properly inserted at page
 * allocation time. The only corner case is when pages are allocated by
 * on-demand allocation and then freed again before the deferred pages
 * initialization is done, but this is not likely to happen.
 */
static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
	return static_branch_unlikely(&deferred_pages) ||
	       (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
		(fpi_flags & FPI_SKIP_KASAN_POISON)) ||
	       PageSkipKASanPoison(page);
}

/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
	int nid = early_pfn_to_nid(pfn);

	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return true;

	return false;
}

/*
 * Returns true when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 */
static bool __meminit
defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	static unsigned long prev_end_pfn, nr_initialised;

	/*
	 * The prev_end_pfn static holds the end of the previous zone.
	 * No need to protect it: this is called very early in boot,
	 * before smp_init().
	 */
	if (prev_end_pfn != end_pfn) {
		prev_end_pfn = end_pfn;
		nr_initialised = 0;
	}

	/* Always populate low zones for address-constrained allocations */
	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
		return false;

	if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
		return true;
	/*
	 * We start with only one section of pages; more pages are added as
	 * needed until the rest of the deferred pages are initialized.
	 */
	nr_initialised++;
	if ((nr_initialised > PAGES_PER_SECTION) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		NODE_DATA(nid)->first_deferred_pfn = pfn;
		return true;
	}
	return false;
}
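
/*
 * Illustrative summary of the function above: every node eagerly initialises
 * its address-constrained low zones plus at least one full section
 * (PAGES_PER_SECTION pages) of the remaining memory; once that budget is
 * exceeded and pfn reaches a section boundary, first_deferred_pfn is
 * recorded and everything from there on is left for page_alloc_init_late().
 */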
#else
static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
	return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
		(fpi_flags & FPI_SKIP_KASAN_POISON)) ||
	       PageSkipKASanPoison(page);
}

static inline bool early_page_uninitialised(unsigned long pfn)
{
	return false;
}

static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	return false;
}
#endif

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(struct page *page,
							unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	return section_to_usemap(__pfn_to_section(pfn));
#else
	return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}

static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	pfn &= (PAGES_PER_SECTION-1);
#else
	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
#endif /* CONFIG_SPARSEMEM */
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
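
/*
 * Example of the arithmetic above (values are illustrative): each pageblock
 * owns NR_PAGEBLOCK_BITS (4) consecutive bits, so a pfn in the third
 * pageblock of its section/zone yields bitidx = 2 * 4 = 8, i.e. bits 8-11
 * of the bitmap returned by get_pageblock_bitmap().
 */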

/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
static __always_inline
unsigned long __get_pfnblock_flags_mask(struct page *page,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	word = bitmap[word_bitidx];
	return (word >> bitidx) & mask;
}

unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
					unsigned long mask)
{
	return __get_pfnblock_flags_mask(page, pfn, mask);
}
EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask);

int isolate_anon_lru_page(struct page *page)
{
	int ret;

	if (!PageLRU(page) || !PageAnon(page))
		return -EINVAL;

	if (!get_page_unless_zero(page))
		return -EINVAL;

	ret = isolate_lru_page(page);
	put_page(page);

	return ret;
}
EXPORT_SYMBOL_GPL(isolate_anon_lru_page);

static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
	return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}

/**
 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 */
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long old_word, word;

	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

	mask <<= bitidx;
	flags <<= bitidx;

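	/*
	 * Lock-free read-modify-write: re-read the word and retry the
	 * cmpxchg() until no other updater has raced with us, so concurrent
	 * updates to other pageblocks sharing this word are not lost.
	 */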
	word = READ_ONCE(bitmap[word_bitidx]);
	for (;;) {
		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
		if (word == old_word)
			break;
		word = old_word;
	}
}

void set_pageblock_migratetype(struct page *page, int migratetype)
{
	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

	set_pfnblock_flags_mask(page, (unsigned long)migratetype,
				page_to_pfn(page), MIGRATETYPE_MASK);
}
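
/*
 * Example usage (illustrative): CMA setup marks its pageblocks with
 * set_pageblock_migratetype(page, MIGRATE_CMA), and memory isolation
 * temporarily switches blocks to MIGRATE_ISOLATE through the same helper.
 */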

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);
	unsigned long sp, start_pfn;

	do {
		seq = zone_span_seqbegin(zone);
		start_pfn = zone->zone_start_pfn;
		sp = zone->spanned_pages;
		if (!zone_spans_pfn(zone, pfn))
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	if (ret)
		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
			pfn, zone_to_nid(zone), zone->name,
			start_pfn, start_pfn + sp);

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page, const char *reason)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			pr_alert(
			      "BUG: Bad page state: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	__dump_page(page, reason);
	dump_page_owner(page);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	page_mapcount_reset(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
/*
 * Higher-order pages are called "compound pages". They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits form a pointer to the
 * head page.
 *
 * The first tail page's ->compound_dtor holds the offset in the array of
 * compound page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */
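
/*
 * Illustrative layout for an order-2 compound page (4 pages): page[0] is the
 * head page with PG_head set; page[1..3] are tail pages whose ->compound_head
 * points back at page[0] with bit 0 set; page[1], the first tail page, also
 * carries ->compound_dtor and ->compound_order (== 2).
 */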
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) void free_compound_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) mem_cgroup_uncharge(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) __free_pages_ok(page, compound_order(page), FPI_NONE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) void prep_compound_page(struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) int nr_pages = 1 << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) __SetPageHead(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) for (i = 1; i < nr_pages; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) struct page *p = page + i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) set_page_count(p, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) p->mapping = TAIL_MAPPING;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) set_compound_head(p, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) set_compound_order(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) atomic_set(compound_mapcount_ptr(page), -1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) if (hpage_pincount_available(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) atomic_set(compound_pincount_ptr(page), 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) #ifdef CONFIG_DEBUG_PAGEALLOC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) unsigned int _debug_guardpage_minorder;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) bool _debug_pagealloc_enabled_early __read_mostly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) EXPORT_SYMBOL(_debug_pagealloc_enabled);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) static int __init early_debug_pagealloc(char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) return kstrtobool(buf, &_debug_pagealloc_enabled_early);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) early_param("debug_pagealloc", early_debug_pagealloc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) static int __init debug_guardpage_minorder_setup(char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) unsigned long res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) pr_err("Bad debug_guardpage_minorder value\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) _debug_guardpage_minorder = res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) pr_info("Setting debug_guardpage_minorder to %lu\n", res);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
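
/*
 * Example (sketch): booting a CONFIG_DEBUG_PAGEALLOC kernel with
 * "debug_pagealloc=on debug_guardpage_minorder=1" makes
 * init_mem_debugging_and_hardening() below flip _debug_guardpage_enabled,
 * and set_page_guard() will then only ever guard order-0 buddies; guard
 * pages drop out of the freepage accounting until clear_page_guard()
 * hands them back.
 */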
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) static inline bool set_page_guard(struct zone *zone, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) unsigned int order, int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) if (!debug_guardpage_enabled())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) if (order >= debug_guardpage_minorder())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) __SetPageGuard(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) INIT_LIST_HEAD(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) set_page_private(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) /* Guard pages are not available for any usage */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) __mod_zone_freepage_state(zone, -(1 << order), migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) static inline void clear_page_guard(struct zone *zone, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) unsigned int order, int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) if (!debug_guardpage_enabled())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) __ClearPageGuard(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) set_page_private(page, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) if (!is_migrate_isolate(migratetype))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) __mod_zone_freepage_state(zone, (1 << order), migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) static inline bool set_page_guard(struct zone *zone, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) unsigned int order, int migratetype) { return false; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) static inline void clear_page_guard(struct zone *zone, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) unsigned int order, int migratetype) {}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) * Enable static keys related to various memory debugging and hardening options.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) * Some override others, and depend on early params that are evaluated in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) * order of appearance. So we need to first gather the full picture of what was
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) * enabled, and then make decisions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) void init_mem_debugging_and_hardening(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) bool page_poisoning_requested = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) #ifdef CONFIG_PAGE_POISONING
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) * Page poisoning doubles as debug page alloc for arches that don't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) * support the latter. If either of those options is enabled, enable poisoning.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) if (page_poisoning_enabled() ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) debug_pagealloc_enabled())) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) static_branch_enable(&_page_poisoning_enabled);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) page_poisoning_requested = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) if (_init_on_alloc_enabled_early) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) if (page_poisoning_requested)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) "will take precedence over init_on_alloc\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) static_branch_enable(&init_on_alloc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) if (_init_on_free_enabled_early) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) if (page_poisoning_requested)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) "will take precedence over init_on_free\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) static_branch_enable(&init_on_free);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) #ifdef CONFIG_DEBUG_PAGEALLOC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) if (!debug_pagealloc_enabled())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) static_branch_enable(&_debug_pagealloc_enabled);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) if (!debug_guardpage_minorder())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) static_branch_enable(&_debug_guardpage_enabled);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) }
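
/*
 * Illustration of the precedence above (a sketch, assuming the usual
 * early params behind these knobs): booting with
 * "page_poison=1 init_on_alloc=1 init_on_free=1" enables page poisoning
 * only and prints the two "will take precedence" messages, while the
 * same line without page_poison=1 enables both the init_on_alloc and
 * init_on_free static keys.
 */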
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) static inline void set_buddy_order(struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) set_page_private(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) __SetPageBuddy(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) * This function checks whether a page is free && is the buddy of the given page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) * We can coalesce a page and its buddy if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) * (a) the buddy is not in a hole (check before calling!) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) * (b) the buddy is in the buddy system &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) * (c) a page and its buddy have the same order &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) * (d) a page and its buddy are in the same zone.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) * For recording whether a page is in the buddy system, we set PageBuddy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) * For recording page's order, we use page_private(page).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) static inline bool page_is_buddy(struct page *page, struct page *buddy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) if (!page_is_guard(buddy) && !PageBuddy(buddy))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) if (buddy_order(buddy) != order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) * zone check is done late to avoid uselessly calculating
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) * zone/node ids for pages that could never merge.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) if (page_zone_id(page) != page_zone_id(buddy))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) }
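
/*
 * Note that a guard page (page_is_guard()) passes the test above as well;
 * that is what lets __free_one_page() merge across a guarded buddy by
 * calling clear_page_guard() instead of del_page_from_free_list().
 */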
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) #ifdef CONFIG_COMPACTION
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) static inline struct capture_control *task_capc(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) struct capture_control *capc = current->capture_control;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) return unlikely(capc) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) !(current->flags & PF_KTHREAD) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) !capc->page &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) capc->cc->zone == zone ? capc : NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) static inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) compaction_capture(struct capture_control *capc, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) int order, int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) if (!capc || order != capc->cc->order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) /* Do not accidentally pollute CMA or isolated regions */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) if (is_migrate_cma(migratetype) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) is_migrate_isolate(migratetype))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) * Do not let lower-order allocations pollute a movable pageblock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) * This might let an unmovable request use a reclaimable pageblock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) * and vice versa, but no more than the normal fallback logic, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) * can have trouble finding a high-order free page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) capc->page = page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) }
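
/*
 * In short: a task doing direct compaction publishes a capture_control
 * via current->capture_control. When __free_one_page() below is freeing
 * a block of exactly the order that task is compacting for, and the
 * migratetype checks above allow it, the page is handed straight to the
 * compacting task via capc->page instead of going back onto the free
 * lists. The consumer side of this handshake lives in mm/compaction.c.
 */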
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) static inline struct capture_control *task_capc(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) static inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) compaction_capture(struct capture_control *capc, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) int order, int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) #endif /* CONFIG_COMPACTION */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) /* Used for pages not on another list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) static inline void add_to_free_list(struct page *page, struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) unsigned int order, int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) struct free_area *area = &zone->free_area[order];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) list_add(&page->lru, &area->free_list[migratetype]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) area->nr_free++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) /* Used for pages not on another list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) unsigned int order, int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) struct free_area *area = &zone->free_area[order];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) list_add_tail(&page->lru, &area->free_list[migratetype]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) area->nr_free++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) * Used for pages which are on another list. Move the pages to the tail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) * of the list - so the moved pages won't immediately be considered for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) * allocation again (e.g., optimization for memory onlining).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) static inline void move_to_free_list(struct page *page, struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) unsigned int order, int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) struct free_area *area = &zone->free_area[order];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) list_move_tail(&page->lru, &area->free_list[migratetype]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) static inline void del_page_from_free_list(struct page *page, struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) /* clear reported state and update reported page count */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) if (page_reported(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) __ClearPageReported(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) __ClearPageBuddy(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) set_page_private(page, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) zone->free_area[order].nr_free--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) * If this is not the largest possible page, check if the buddy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) * of the next-highest order is free. If it is, it's possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) * that pages are being freed that will coalesce soon. In case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) * that is happening, add the free page to the tail of the list
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) * so it's less likely to be used soon and more likely to be merged
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) * as a higher-order page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) static inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) struct page *higher_page, *higher_buddy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) unsigned long combined_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) if (order >= MAX_ORDER - 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) if (!pfn_valid_within(buddy_pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) combined_pfn = buddy_pfn & pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) higher_page = page + (combined_pfn - pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) higher_buddy = higher_page + (buddy_pfn - combined_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) return pfn_valid_within(buddy_pfn) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) page_is_buddy(higher_page, higher_buddy, order + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) * Freeing function for a buddy system allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) * The concept of a buddy system is to maintain a direct-mapped table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) * (containing bit values) for memory blocks of various "orders".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) * The bottom level table contains the map for the smallest allocatable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) * units of memory (here, pages), and each level above it describes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) * pairs of units from the levels below, hence, "buddies".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) * At a high level, all that happens here is marking the table entry
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) * at the bottom level available, and propagating the changes upward
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) * as necessary, plus some accounting needed to play nicely with other
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) * parts of the VM system.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) * At each level, we keep a list of pages, which are heads of contiguous
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) * free page runs of length (1 << order) and marked with PageBuddy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) * A page's order is recorded in the page_private(page) field.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) * So when we are allocating or freeing one, we can derive the state of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) * other. That is, if we allocate a small block, and both were
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) * free, the remainder of the region must be split into blocks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) * If a block is freed, and its buddy is also free, then this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) * triggers coalescing into a block of larger size.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) * -- nyc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) */
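
/*
 * Worked example of the pfn arithmetic used below (a sketch; the real
 * helper is __find_buddy_pfn()): the buddy of a block is found by
 * flipping bit 'order' of its pfn, and the merged block starts at the
 * lower pfn of the pair:
 *
 *	buddy_pfn    = pfn ^ (1 << order);	e.g. pfn 8, order 1 -> buddy_pfn 10
 *	combined_pfn = buddy_pfn & pfn;		-> 8, the head of the order-2 block
 */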
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) static inline void __free_one_page(struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) unsigned long pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) struct zone *zone, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) int migratetype, fpi_t fpi_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) struct capture_control *capc = task_capc(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) unsigned long buddy_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) unsigned long combined_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) unsigned int max_order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) struct page *buddy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) bool to_tail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) VM_BUG_ON(!zone_is_initialized(zone));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) VM_BUG_ON(migratetype == -1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) if (likely(!is_migrate_isolate(migratetype)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) __mod_zone_freepage_state(zone, 1 << order, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) VM_BUG_ON_PAGE(bad_range(zone, page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) continue_merging:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) while (order < max_order) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) if (compaction_capture(capc, page, order, migratetype)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) __mod_zone_freepage_state(zone, -(1 << order),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) buddy_pfn = __find_buddy_pfn(pfn, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) buddy = page + (buddy_pfn - pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) if (!pfn_valid_within(buddy_pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) goto done_merging;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) if (!page_is_buddy(page, buddy, order))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) goto done_merging;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) * merge with it and move up one order.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) if (page_is_guard(buddy))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) clear_page_guard(zone, buddy, order, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) del_page_from_free_list(buddy, zone, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) combined_pfn = buddy_pfn & pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) page = page + (combined_pfn - pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) pfn = combined_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) order++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) if (order < MAX_ORDER - 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) /* If we are here, it means order is >= pageblock_order.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) * We want to prevent merge between freepages on isolate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) * pageblock and normal pageblock. Without this, pageblock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) * isolation could cause incorrect freepage or CMA accounting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) * We don't want to hit this code for the more frequent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) * low-order merging.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) if (unlikely(has_isolate_pageblock(zone))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) int buddy_mt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) buddy_pfn = __find_buddy_pfn(pfn, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) buddy = page + (buddy_pfn - pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) buddy_mt = get_pageblock_migratetype(buddy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) if (migratetype != buddy_mt
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) && (is_migrate_isolate(migratetype) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) is_migrate_isolate(buddy_mt)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) goto done_merging;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) max_order = order + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) goto continue_merging;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) done_merging:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) set_buddy_order(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) if (fpi_flags & FPI_TO_TAIL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) to_tail = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) else if (is_shuffle_order(order))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) to_tail = shuffle_pick_tail();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) if (to_tail)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) add_to_free_list_tail(page, zone, order, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) add_to_free_list(page, zone, order, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) /* Notify page reporting subsystem of freed page */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) page_reporting_notify_free(order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) * A bad page could be caused by any of a number of fields. Instead of multiple branches,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) * try to check multiple fields with one check. The caller must do a detailed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) * check if necessary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) static inline bool page_expected_state(struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) unsigned long check_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) if (unlikely(atomic_read(&page->_mapcount) != -1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) if (unlikely((unsigned long)page->mapping |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) page_ref_count(page) |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) #ifdef CONFIG_MEMCG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) (unsigned long)page->mem_cgroup |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) (page->flags & check_flags)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) }
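
/*
 * E.g. for a genuinely free page we expect mapping == NULL, _refcount == 0,
 * no mem_cgroup and none of the check_flags bits set; OR-ing them together
 * yields 0, so a single branch covers the common "all fine" case and the
 * detailed per-field diagnosis in page_bad_reason() below only runs when
 * something is off. (_mapcount is tested separately above because its
 * "unused" value is -1, not 0.)
 */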
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) static const char *page_bad_reason(struct page *page, unsigned long flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) const char *bad_reason = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) if (unlikely(atomic_read(&page->_mapcount) != -1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) bad_reason = "nonzero mapcount";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) if (unlikely(page->mapping != NULL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) bad_reason = "non-NULL mapping";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) if (unlikely(page_ref_count(page) != 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) bad_reason = "nonzero _refcount";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) if (unlikely(page->flags & flags)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) if (flags == PAGE_FLAGS_CHECK_AT_PREP)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) #ifdef CONFIG_MEMCG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) if (unlikely(page->mem_cgroup))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) bad_reason = "page still charged to cgroup";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) return bad_reason;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) static void check_free_page_bad(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) bad_page(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) static inline int check_free_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) /* Something has gone sideways, find it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) check_free_page_bad(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) static int free_tail_pages_check(struct page *head_page, struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) int ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) * We rely on page->lru.next never having bit 0 set, unless the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) * is PageTail(). Let's make sure that's true even for poisoned ->lru.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) switch (page - head_page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) case 1:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) /* the first tail page: ->mapping may be compound_mapcount() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) if (unlikely(compound_mapcount(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) bad_page(page, "nonzero compound_mapcount");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) case 2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) * the second tail page: ->mapping is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) * deferred_list.next -- ignore value.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) if (page->mapping != TAIL_MAPPING) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) bad_page(page, "corrupted mapping in tail page");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) if (unlikely(!PageTail(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) bad_page(page, "PageTail not set");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) if (unlikely(compound_head(page) != head_page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) bad_page(page, "compound_head not consistent");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) page->mapping = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) clear_compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) if (zero_tags) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) for (i = 0; i < numpages; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) tag_clear_highpage(page + i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) /* s390's use of memset() could override KASAN redzones. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) kasan_disable_current();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) for (i = 0; i < numpages; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) u8 tag = page_kasan_tag(page + i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) page_kasan_tag_reset(page + i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) clear_highpage(page + i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) page_kasan_tag_set(page + i, tag);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) kasan_enable_current();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) static __always_inline bool free_pages_prepare(struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) unsigned int order, bool check_free, fpi_t fpi_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) int bad = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) VM_BUG_ON_PAGE(PageTail(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) trace_mm_page_free(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) if (unlikely(PageHWPoison(page)) && !order) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) * Do not let hwpoison pages hit pcplists/buddy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) * Untie memcg state and reset the page's owner.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) if (memcg_kmem_enabled() && PageKmemcg(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) __memcg_kmem_uncharge_page(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) reset_page_owner(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) free_page_pinner(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) * Check tail pages before head page information is cleared to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) * avoid checking PageCompound for order-0 pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) if (unlikely(order)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) bool compound = PageCompound(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) if (compound)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) ClearPageDoubleMap(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) for (i = 1; i < (1 << order); i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) if (compound)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) bad += free_tail_pages_check(page, page + i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) if (unlikely(check_free_page(page + i))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) bad++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) if (PageMappingFlags(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) page->mapping = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) if (memcg_kmem_enabled() && PageKmemcg(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) __memcg_kmem_uncharge_page(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) if (check_free)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) bad += check_free_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) if (bad)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) page_cpupid_reset_last(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) reset_page_owner(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) free_page_pinner(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) if (!PageHighMem(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) debug_check_no_locks_freed(page_address(page),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) PAGE_SIZE << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) debug_check_no_obj_freed(page_address(page),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) PAGE_SIZE << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) kernel_poison_pages(page, 1 << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) * As memory initialization might be integrated into KASAN,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) * kasan_free_pages and kernel_init_free_pages must be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) * kept together to avoid discrepancies in behavior.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) * With hardware tag-based KASAN, memory tags must be set before the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) * page becomes unavailable via debug_pagealloc or arch_free_page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) if (kasan_has_integrated_init()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) if (!skip_kasan_poison)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) kasan_free_pages(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) bool init = want_init_on_free();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) if (init)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) kernel_init_free_pages(page, 1 << order, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) if (!skip_kasan_poison)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) kasan_poison_pages(page, order, init);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) * arch_free_page() can make the page's contents inaccessible. s390
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) * does this. So nothing which can access the page's contents should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) * happen after this.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) arch_free_page(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) debug_pagealloc_unmap_pages(page, 1 << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) #ifdef CONFIG_DEBUG_VM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) * moved from pcp lists to free lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) static bool free_pcp_prepare(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) return free_pages_prepare(page, 0, true, FPI_NONE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) static bool bulkfree_pcp_prepare(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) if (debug_pagealloc_enabled_static())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) return check_free_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) * With DEBUG_VM disabled, order-0 pages being freed are checked only when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) * moving from pcp lists to the free list, in order to reduce overhead. With
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) * debug_pagealloc enabled, they are checked also immediately when being freed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) * to the pcp lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) static bool free_pcp_prepare(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) if (debug_pagealloc_enabled_static())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) return free_pages_prepare(page, 0, true, FPI_NONE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) return free_pages_prepare(page, 0, false, FPI_NONE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) static bool bulkfree_pcp_prepare(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) return check_free_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) #endif /* CONFIG_DEBUG_VM */
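
/*
 * Summary of the two variants above - where an order-0 page freed to the
 * pcplists gets sanity-checked (sketch):
 *
 *			on free to pcp			on pcp -> buddy drain
 *  DEBUG_VM		always				only with debug_pagealloc
 *  !DEBUG_VM		only with debug_pagealloc	always
 */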
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413)
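/*
 * Prefetch the order-0 buddy of @page (pfn ^ 1, i.e. the other page of the
 * pair) so that its struct page is already cache-warm when
 * free_pcppages_bulk() merges it under zone->lock (see the comment there).
 */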
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) static inline void prefetch_buddy(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) unsigned long pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) struct page *buddy = page + (buddy_pfn - pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) prefetch(buddy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) * Frees a number of pages from the PCP lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) * Assumes all pages on the list are in the same zone, and of the same order.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) * count is the number of pages to free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) * If the zone was previously in an "all pages pinned" state then look to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) * see if this freeing clears that state.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) * And clear the zone's pages_scanned counter, to hold off the "all pages are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) * pinned" detection logic.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) */
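/*
 * Sketch of the round-robin below, assuming the usual three pcp
 * migratetypes: if the first list probed is empty, batch_free reaches 2
 * before a non-empty list is found and up to two pages are then taken
 * from that list in this pass; if batch_free reaches MIGRATE_PCPTYPES,
 * every other list was empty, so the remaining 'count' pages all come
 * from the single non-empty list.
 */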
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) static void free_pcppages_bulk(struct zone *zone, int count,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) struct per_cpu_pages *pcp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) int migratetype = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) int batch_free = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) int prefetch_nr = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) bool isolated_pageblocks;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) struct page *page, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) LIST_HEAD(head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) * Ensure a proper count is passed; otherwise we would get stuck in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) * while (list_empty(list)) loop below.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) count = min(pcp->count, count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) while (count) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) struct list_head *list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) * Remove pages from lists in a round-robin fashion. A
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) * batch_free count is maintained that is incremented when an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) * empty list is encountered. This is so more pages are freed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) * off fuller lists instead of spinning excessively around empty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) * lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) batch_free++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) if (++migratetype == MIGRATE_PCPTYPES)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) migratetype = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) list = &pcp->lists[migratetype];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) } while (list_empty(list));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) /* This is the only non-empty list. Free them all. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) if (batch_free == MIGRATE_PCPTYPES)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) batch_free = count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) page = list_last_entry(list, struct page, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) /* must delete to avoid corrupting pcp list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) pcp->count--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) if (bulkfree_pcp_prepare(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) list_add_tail(&page->lru, &head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) * We are going to put the page back to the global
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) * pool, prefetch its buddy to speed up later access
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) * under zone->lock. It is believed the overhead of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) * an additional test and calculating buddy_pfn here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) * can be offset by reduced memory latency later. To
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) * avoid excessive prefetching due to large count, only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) * prefetch buddy for the first pcp->batch nr of pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) if (prefetch_nr++ < pcp->batch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) prefetch_buddy(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) } while (--count && --batch_free && !list_empty(list));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) spin_lock(&zone->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) isolated_pageblocks = has_isolate_pageblock(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) * Use safe version since after __free_one_page(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) * page->lru.next will not point to the original list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) list_for_each_entry_safe(page, tmp, &head, lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) int mt = get_pcppage_migratetype(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) /* MIGRATE_ISOLATE page should not go to pcplists */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) /* Pageblock could have been isolated meanwhile */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) if (unlikely(isolated_pageblocks))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) mt = get_pageblock_migratetype(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) trace_mm_page_pcpu_drain(page, 0, mt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) spin_unlock(&zone->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) static void free_one_page(struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) struct page *page, unsigned long pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) int migratetype, fpi_t fpi_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) spin_lock(&zone->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) if (unlikely(has_isolate_pageblock(zone) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) is_migrate_isolate(migratetype))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) migratetype = get_pfnblock_migratetype(page, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) spin_unlock(&zone->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) static void __meminit __init_single_page(struct page *page, unsigned long pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) unsigned long zone, int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) bool zero_page_struct __maybe_unused)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) #ifdef CONFIG_ROCKCHIP_THUNDER_BOOT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) if (zero_page_struct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) mm_zero_struct_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) mm_zero_struct_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) set_page_links(page, zone, nid, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) init_page_count(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) page_mapcount_reset(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) page_cpupid_reset_last(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) page_kasan_tag_reset(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) INIT_LIST_HEAD(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) #ifdef WANT_PAGE_VIRTUAL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) /* The shift won't overflow because ZONE_NORMAL is below 4G. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) if (!is_highmem_idx(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) set_page_address(page, __va(pfn << PAGE_SHIFT));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) static void __meminit init_reserved_page(unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) pg_data_t *pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) int nid, zid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) if (!early_page_uninitialised(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) nid = early_pfn_to_nid(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) pgdat = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) for (zid = 0; zid < MAX_NR_ZONES; zid++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) struct zone *zone = &pgdat->node_zones[zid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) __init_single_page(pfn_to_page(pfn), pfn, zid, nid, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) static inline void init_reserved_page(unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) * Initialised pages do not have PageReserved set. This function is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) * called for each range allocated by the bootmem allocator and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) * marks the pages PageReserved. The remaining valid pages are later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) * sent to the buddy page allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) unsigned long start_pfn = PFN_DOWN(start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) unsigned long end_pfn = PFN_UP(end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) for (; start_pfn < end_pfn; start_pfn++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) if (pfn_valid(start_pfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) struct page *page = pfn_to_page(start_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) init_reserved_page(start_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) /* Avoid false-positive PageTail() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) INIT_LIST_HEAD(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) * No need for an atomic set_bit because the struct
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) * page is not visible yet, so nobody else should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) * be accessing it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) __SetPageReserved(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) }
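/*
 * A worked example of the rounding above, assuming a 4 KiB PAGE_SIZE
 * (hypothetical addresses): reserve_bootmem_region(0x1234, 0x5678) gives
 * start_pfn = PFN_DOWN(0x1234) = 1 and end_pfn = PFN_UP(0x5678) = 6, so the
 * struct pages for pfns 1..5 are marked PageReserved -- the rounding always
 * covers the partially used pages at both edges of the region.
 */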
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) static void __free_pages_ok(struct page *page, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) fpi_t fpi_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) int migratetype;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) unsigned long pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) if (!free_pages_prepare(page, order, true, fpi_flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) migratetype = get_pfnblock_migratetype(page, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) __count_vm_events(PGFREE, 1 << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) free_one_page(page_zone(page), page, pfn, order, migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) fpi_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) void __free_pages_core(struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) unsigned int nr_pages = 1 << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) struct page *p = page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) unsigned int loop;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) * When initializing the memmap, __init_single_page() sets the refcount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) * of all pages to 1 ("allocated"/"not free"). We have to set the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) * refcount of all involved pages to 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) prefetchw(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) prefetchw(p + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) __ClearPageReserved(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) set_page_count(p, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) __ClearPageReserved(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) set_page_count(p, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) * Bypass PCP and place fresh pages right to the tail, primarily
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) * relevant for memory onlining.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) #ifdef CONFIG_NEED_MULTIPLE_NODES
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) int __meminit __early_pfn_to_nid(unsigned long pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) struct mminit_pfnnid_cache *state)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) unsigned long start_pfn, end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) if (state->last_start <= pfn && pfn < state->last_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) return state->last_nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) if (nid != NUMA_NO_NODE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) state->last_start = start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) state->last_end = end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) state->last_nid = nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) return nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) int __meminit early_pfn_to_nid(unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) static DEFINE_SPINLOCK(early_pfn_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) spin_lock(&early_pfn_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) if (nid < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) nid = first_online_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) spin_unlock(&early_pfn_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) return nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) #endif /* CONFIG_NEED_MULTIPLE_NODES */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) void __init memblock_free_pages(struct page *page, unsigned long pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) if (early_page_uninitialised(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) __free_pages_core(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) * Check that the whole (or subset of) a pageblock given by the interval of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) * with the migration or free compaction scanner. The scanners then need to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) * use only the pfn_valid_within() check for arches that allow holes within
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) * pageblocks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) * Return the struct page pointer for start_pfn, or NULL if the checks do not pass.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) * It's possible on some configurations to have a setup like node0 node1 node0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) * i.e. it's possible that not all pages within a zone's range of pfns
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) * belong to a single zone. We assume that a border between node0 and node1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) * can occur within a single pageblock, but not a node0 node1 node0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) * interleaving within a single pageblock. It is therefore sufficient to check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) * the first and last page of a pageblock and avoid checking each individual
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) * page in a pageblock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) unsigned long end_pfn, struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) struct page *start_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) struct page *end_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) /* end_pfn is one past the range we are checking */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) end_pfn--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) start_page = pfn_to_online_page(start_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) if (!start_page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) if (page_zone(start_page) != zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) end_page = pfn_to_page(end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) /* This gives shorter code than deriving page_zone(end_page) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) if (page_zone_id(start_page) != page_zone_id(end_page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) return start_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) }
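/*
 * An illustrative example of the node-border assumption (hypothetical pfns,
 * assuming pageblock_nr_pages == 512): with a node0/node1 border at pfn
 * 0x18100, the naturally aligned pageblock [0x18000, 0x18200) straddles the
 * border. Its first page then lies in a node0 zone and its last page in a
 * node1 zone, so the page_zone_id() comparison above fails and NULL is
 * returned without inspecting the 510 pages in between.
 */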
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) void set_zone_contiguous(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) unsigned long block_start_pfn = zone->zone_start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) unsigned long block_end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) for (; block_start_pfn < zone_end_pfn(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) block_start_pfn = block_end_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) block_end_pfn += pageblock_nr_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) if (!__pageblock_pfn_to_page(block_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) block_end_pfn, zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) /* We confirm that there are no holes */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) zone->contiguous = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) }
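/*
 * A worked example of the walk above (hypothetical values, assuming
 * pageblock_nr_pages == 512): for a zone starting at pfn 0x10070,
 * ALIGN(0x10070 + 1, 512) = 0x10200, so the first step checks the partial
 * block [0x10070, 0x10200) and each later step checks one full, naturally
 * aligned pageblock, with the last one clamped to zone_end_pfn() by the min().
 */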
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) void clear_zone_contiguous(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) zone->contiguous = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) static void __init deferred_free_range(unsigned long pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) unsigned long nr_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) unsigned long i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) if (!nr_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) /* Free a large naturally-aligned chunk if possible */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) if (nr_pages == pageblock_nr_pages &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) (pfn & (pageblock_nr_pages - 1)) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) set_pageblock_migratetype(page, MIGRATE_MOVABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) __free_pages_core(page, pageblock_order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) for (i = 0; i < nr_pages; i++, page++, pfn++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) if ((pfn & (pageblock_nr_pages - 1)) == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) set_pageblock_migratetype(page, MIGRATE_MOVABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) __free_pages_core(page, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) }
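/*
 * An illustrative example, assuming pageblock_nr_pages == 512 and
 * pageblock_order == 9 (hypothetical pfns): deferred_free_range(0x20000, 512)
 * takes the fast path above and frees one naturally aligned order-9 block,
 * while deferred_free_range(0x20001, 511) is not aligned and falls back to
 * freeing 511 individual order-0 pages.
 */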
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) /* Completion tracking for deferred_init_memmap() threads */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) static atomic_t pgdat_init_n_undone __initdata;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) static inline void __init pgdat_init_report_one_done(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) if (atomic_dec_and_test(&pgdat_init_n_undone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) complete(&pgdat_init_all_done_comp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) * Returns true if the page needs to be initialized or freed to the buddy allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) * First we check if the pfn is valid on architectures where it is possible to have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) * holes within pageblock_nr_pages. On systems where that is not possible, this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) * function is optimized out.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) * Then we check whether the current large page is valid by checking only the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) * validity of its head pfn.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) static inline bool __init deferred_pfn_valid(unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) if (!pfn_valid_within(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) }
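/*
 * Note that pfn_valid() above is only consulted for the first pfn of each
 * pageblock: with a hypothetical pageblock_nr_pages == 512, only pfn 0x20000
 * of the block 0x20000..0x201ff is checked. This relies on validity being
 * uniform within a pageblock on arches where pfn_valid_within() is always
 * true, as the comment above describes.
 */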
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) * Free pages to the buddy allocator. Try to free aligned chunks of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) * pageblock_nr_pages pages at a time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) static void __init deferred_free_pages(unsigned long pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) unsigned long end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) unsigned long nr_pgmask = pageblock_nr_pages - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) unsigned long nr_free = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) for (; pfn < end_pfn; pfn++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) if (!deferred_pfn_valid(pfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) deferred_free_range(pfn - nr_free, nr_free);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) nr_free = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) } else if (!(pfn & nr_pgmask)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) deferred_free_range(pfn - nr_free, nr_free);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) nr_free = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) nr_free++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) /* Free the last block of pages to allocator */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) deferred_free_range(pfn - nr_free, nr_free);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) }
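/*
 * A short trace of the accumulation above (hypothetical pfns, assuming
 * pageblock_nr_pages == 512 and all pfns valid): walking [0x20000, 0x20400),
 * nr_free reaches 512 after pfn 0x201ff; at pfn 0x20200 the boundary branch
 * flushes [0x20000, 0x20200) as one aligned 512-page block and restarts
 * nr_free at 1, and the final deferred_free_range() call after the loop
 * flushes the remaining [0x20200, 0x20400).
 */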
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) * Initialize struct pages. We minimize pfn page lookups and scheduler checks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) * by performing them only once every pageblock_nr_pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) * Returns the number of pages initialized.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) static unsigned long __init deferred_init_pages(struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) unsigned long pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) unsigned long end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) unsigned long nr_pgmask = pageblock_nr_pages - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) int nid = zone_to_nid(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) unsigned long nr_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) int zid = zone_idx(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) for (; pfn < end_pfn; pfn++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) if (!deferred_pfn_valid(pfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) } else if (!page || !(pfn & nr_pgmask)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) page++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) __init_single_page(page, pfn, zid, nid, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) nr_pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) return nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) }
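/*
 * The "page++" arm above is what keeps this cheap: struct pages are
 * contiguous in the memmap within a pageblock, so pfn_to_page() only needs
 * to be recomputed at a pageblock boundary or after an invalid pfn has reset
 * the cursor to NULL.
 */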
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) * This function is meant to pre-load the iterator for the zone init.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) * Specifically, it walks through the ranges until it has caught up to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) * first_init_pfn and exits there. If we never reach that value we return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) * false, indicating there are no valid ranges left.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) static bool __init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) unsigned long *spfn, unsigned long *epfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) unsigned long first_init_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) u64 j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) * Start out by walking through the ranges in this zone that have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) * already been initialized. We don't need to do anything with them,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) * so we simply skip past them to reach the uninitialized ranges.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) if (*epfn <= first_init_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) if (*spfn < first_init_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) *spfn = first_init_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) *i = j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) * Initialize and free pages. We do it in two loops: first we initialize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) * the struct pages, then free them to the buddy allocator, because while
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) * freeing pages we can access pages that are ahead (when computing the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) * buddy page in __free_one_page()).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) * To try to keep some memory in the cache, the loop is broken along
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) * max page order boundaries. This way we will not cause any issues with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930) * the buddy page computation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) static unsigned long __init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) unsigned long *end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) unsigned long spfn = *start_pfn, epfn = *end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) unsigned long nr_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) u64 j = *i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) /* First we loop through and initialize the page values */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) unsigned long t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) if (mo_pfn <= *start_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) t = min(mo_pfn, *end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) nr_pages += deferred_init_pages(zone, *start_pfn, t);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) if (mo_pfn < *end_pfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) *start_pfn = mo_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) /* Reset values and now loop through freeing pages as needed */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) swap(j, *i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) unsigned long t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) if (mo_pfn <= spfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) t = min(mo_pfn, epfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) deferred_free_pages(spfn, t);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) if (mo_pfn <= epfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) return nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) }
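/*
 * A worked example of the chunking above (hypothetical values, assuming
 * MAX_ORDER_NR_PAGES == 1024): for *start_pfn == 0x20350,
 * mo_pfn = ALIGN(0x20350 + 1, 1024) = 0x20400, so this call initializes and
 * then frees pfns only up to 0x20400 before returning, keeping both loops
 * inside a single max-order block as the comment above requires.
 */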
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) static void __init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) void *arg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) unsigned long spfn, epfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) struct zone *zone = arg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) u64 i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) * Initialize and free pages in MAX_ORDER sized increments so that we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) * can avoid introducing any issues with the buddy allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) while (spfn < end_pfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) deferred_init_maxorder(&i, zone, &spfn, &epfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) /* An arch may override for more concurrency. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) __weak int __init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) deferred_page_init_max_threads(const struct cpumask *node_cpumask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) }
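/*
 * A minimal sketch of what an arch override of the weak function above might
 * look like (an assumption, not taken from this tree), allowing one thread
 * per CPU that is local to the node:
 *
 *	int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
 *	{
 *		return max_t(int, cpumask_weight(node_cpumask), 1);
 *	}
 */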
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) /* Initialise remaining memory on a node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) static int __init deferred_init_memmap(void *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) pg_data_t *pgdat = data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) unsigned long spfn = 0, epfn = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) unsigned long first_init_pfn, flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) unsigned long start = jiffies;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) int zid, max_threads;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) u64 i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) /* Bind memory initialisation thread to a local node if possible */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) if (!cpumask_empty(cpumask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) set_cpus_allowed_ptr(current, cpumask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) pgdat_resize_lock(pgdat, &flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) first_init_pfn = pgdat->first_deferred_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) if (first_init_pfn == ULONG_MAX) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) pgdat_resize_unlock(pgdat, &flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) pgdat_init_report_one_done();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) /* Sanity check boundaries */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) pgdat->first_deferred_pfn = ULONG_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) * Once we unlock here, the zone cannot be grown any more; thus if an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) * interrupt thread must allocate this early in boot, the zone must be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) * pre-grown prior to the start of deferred page initialization.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) pgdat_resize_unlock(pgdat, &flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) /* Only the highest zone is deferred so find it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) for (zid = 0; zid < MAX_NR_ZONES; zid++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) zone = pgdat->node_zones + zid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) if (first_init_pfn < zone_end_pfn(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) /* If the zone is empty somebody else may have cleared out the zone */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) first_init_pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) goto zone_empty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) max_threads = deferred_page_init_max_threads(cpumask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) while (spfn < epfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) struct padata_mt_job job = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) .thread_fn = deferred_init_memmap_chunk,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) .fn_arg = zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) .start = spfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) .size = epfn_align - spfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) .align = PAGES_PER_SECTION,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) .min_chunk = PAGES_PER_SECTION,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) .max_threads = max_threads,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) padata_do_multithreaded(&job);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) epfn_align);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) zone_empty:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) /* Sanity check that the next zone really is unpopulated */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) pr_info("node %d deferred pages initialised in %ums\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) pgdat->node_id, jiffies_to_msecs(jiffies - start));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) pgdat_init_report_one_done();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) * If this zone has deferred pages, try to grow it by initializing enough
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) * deferred pages to satisfy the allocation specified by order, rounded up to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) * of SECTION_SIZE bytes by initializing struct pages in increments of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) * PAGES_PER_SECTION * sizeof(struct page) bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) * Return true when the zone was grown, otherwise return false. We return true
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) * even when we grow less than requested, to let the caller decide if there are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089) * enough pages to satisfy the allocation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) * Note: we use noinline because this function is needed only during boot, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) * it is called from the __ref function _deferred_grow_zone. This way we make
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) * sure that it is not inlined into the permanent text section.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) */
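/*
 * A worked example of the rounding described above (assuming a hypothetical
 * PAGES_PER_SECTION == 32768, i.e. 128 MiB sections with 4 KiB pages): an
 * order-3 allocation needs 8 pages, but nr_pages_needed becomes
 * ALIGN(8, 32768) == 32768, so each successful call grows the zone by
 * roughly a full section's worth of initialized struct pages.
 */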
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) static noinline bool __init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) deferred_grow_zone(struct zone *zone, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) pg_data_t *pgdat = zone->zone_pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) unsigned long spfn, epfn, flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) unsigned long nr_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) u64 i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) /* Only the last zone may have deferred pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) pgdat_resize_lock(pgdat, &flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) * If someone grew this zone while we were waiting for the spinlock, return
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) * true, as there might be enough pages already.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) if (first_deferred_pfn != pgdat->first_deferred_pfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) pgdat_resize_unlock(pgdat, &flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) /* If the zone is empty somebody else may have cleared out the zone */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) first_deferred_pfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) pgdat->first_deferred_pfn = ULONG_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) pgdat_resize_unlock(pgdat, &flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) /* Retry only once. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) return first_deferred_pfn != ULONG_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) * Initialize and free pages in MAX_ORDER sized increments so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) * that we can avoid introducing any issues with the buddy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) * allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) while (spfn < epfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) /* update our first deferred PFN for this section */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) first_deferred_pfn = spfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) touch_nmi_watchdog();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) /* We should only stop along section boundaries */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) /* If our quota has been met we can stop here */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) if (nr_pages >= nr_pages_needed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) pgdat->first_deferred_pfn = spfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) pgdat_resize_unlock(pgdat, &flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) return nr_pages > 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) * deferred_grow_zone() is __init, but it is called from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) * get_page_from_freelist() during early boot until deferred_pages permanently
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) * disables this call. This is why we have the __ref wrapper: it avoids a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) * section mismatch warning and still lets the __init function body be unloaded.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) static bool __ref
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) _deferred_grow_zone(struct zone *zone, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) return deferred_grow_zone(zone, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) void __init page_alloc_init_late(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) /* There will be num_node_state(N_MEMORY) threads */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) for_each_node_state(nid, N_MEMORY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) /* Block until all are initialised */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) wait_for_completion(&pgdat_init_all_done_comp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) * The number of managed pages has changed due to the initialisation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) * so the pcpu batch and high limits need to be updated or the limits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) * will be artificially small.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) for_each_populated_zone(zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) zone_pcp_update(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) * We initialized the rest of the deferred pages. Permanently disable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) * on-demand struct page initialization.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) static_branch_disable(&deferred_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) /* Reinit limits that are based on free pages after the kernel is up */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) files_maxfiles_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) /* Discard memblock private memory */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) memblock_discard();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) for_each_node_state(nid, N_MEMORY)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) shuffle_free_memory(NODE_DATA(nid));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) for_each_populated_zone(zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) set_zone_contiguous(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) void __init init_cma_reserved_pageblock(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) unsigned i = pageblock_nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) struct page *p = page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) __ClearPageReserved(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) set_page_count(p, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) } while (++p, --i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) set_pageblock_migratetype(page, MIGRATE_CMA);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227)
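/*
 * When the pageblock is larger than the largest buddy order, it has to be
 * handed back in MAX_ORDER - 1 sized pieces. A hypothetical example: with
 * pageblock_order == 11 and MAX_ORDER == 11 (MAX_ORDER_NR_PAGES == 1024),
 * the 2048-page pageblock is freed below as two order-10 chunks; otherwise
 * the whole pageblock is freed in a single call.
 */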
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) if (pageblock_order >= MAX_ORDER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) i = pageblock_nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) p = page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) set_page_refcounted(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) __free_pages(p, MAX_ORDER - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) p += MAX_ORDER_NR_PAGES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) } while (i -= MAX_ORDER_NR_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) set_page_refcounted(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) __free_pages(page, pageblock_order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) adjust_managed_page_count(page, pageblock_nr_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) page_zone(page)->cma_pages += pageblock_nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) * The order of subdivision here is critical for the IO subsystem.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) * Please do not alter this order without good reasons and regression
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) * testing. Specifically, as large blocks of memory are subdivided,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) * the order in which smaller blocks are delivered depends on the order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) * they're subdivided in this function. This is the primary factor
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) * influencing the order in which pages are delivered to the IO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) * subsystem according to empirical testing, and this is also justified
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) * by considering the behavior of a buddy system containing a single
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) * large block of memory acted on by a series of small allocations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) * This behavior is a critical factor in sglist merging's success.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) * -- nyc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) */
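/*
 * A short trace of expand() below (hypothetical request): splitting an
 * order-5 block to satisfy an order-2 request, i.e.
 * expand(zone, page, 2, 5, migratetype), puts the upper halves back on the
 * free lists as buddies of order 4, 3 and 2 (at page + 16, page + 8 and
 * page + 4 respectively), leaving the first four pages for the caller.
 */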
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) static inline void expand(struct zone *zone, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) int low, int high, int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) unsigned long size = 1 << high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) while (high > low) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) high--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) size >>= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) * Mark as guard pages (or page), which will allow them to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) * merged back into the allocator when the buddy is freed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) * The corresponding page table entries will not be touched;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) * the pages stay not present in the virtual address space.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) if (set_page_guard(zone, &page[size], high, migratetype))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) add_to_free_list(&page[size], zone, high, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) set_buddy_order(&page[size], high);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) static void check_new_page_bad(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) if (unlikely(page->flags & __PG_HWPOISON)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) /* Don't complain about hwpoisoned pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) page_mapcount_reset(page); /* remove PageBuddy */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) bad_page(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) * This page is about to be returned from the page allocator
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) static inline int check_new_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) if (likely(page_expected_state(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) check_new_page_bad(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) #ifdef CONFIG_DEBUG_VM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) * With DEBUG_VM enabled, order-0 pages are checked for expected state when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) * being allocated from pcp lists. With debug_pagealloc also enabled, they are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) * also checked when pcp lists are refilled from the free lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) static inline bool check_pcp_refill(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) if (debug_pagealloc_enabled_static())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) return check_new_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) static inline bool check_new_pcp(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) return check_new_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) * With DEBUG_VM disabled, free order-0 pages are checked for expected state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) * when pcp lists are being refilled from the free lists. With debug_pagealloc
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) * enabled, they are also checked when being allocated from the pcp lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) static inline bool check_pcp_refill(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) return check_new_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) static inline bool check_new_pcp(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) if (debug_pagealloc_enabled_static())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) return check_new_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) #endif /* CONFIG_DEBUG_VM */
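/*
 * Summary of where the order-0 allocation-path checks above actually run:
 *
 *                    refill of pcp lists         alloc from pcp lists
 *   DEBUG_VM=y       only with debug_pagealloc   always
 *   DEBUG_VM=n       always                      only with debug_pagealloc
 *
 * Either way a page is normally checked once on its way from the buddy
 * lists to the caller, and twice when debug_pagealloc is also enabled.
 */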
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) static bool check_new_pages(struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) for (i = 0; i < (1 << order); i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) struct page *p = page + i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) if (unlikely(check_new_page(p)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) inline void post_alloc_hook(struct page *page, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) gfp_t gfp_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) set_page_private(page, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) set_page_refcounted(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) arch_alloc_page(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) debug_pagealloc_map_pages(page, 1 << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) * Page unpoisoning must happen before memory initialization.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) * allocations and the page unpoisoning code will complain.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) kernel_unpoison_pages(page, 1 << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) * As memory initialization might be integrated into KASAN,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) * kasan_alloc_pages and kernel_init_free_pages must be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) * kept together to avoid discrepancies in behavior.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) if (kasan_has_integrated_init()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) kasan_alloc_pages(page, order, gfp_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) kasan_unpoison_pages(page, order, init);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) if (init)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) kernel_init_free_pages(page, 1 << order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) gfp_flags & __GFP_ZEROTAGS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) set_page_owner(page, order, gfp_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) unsigned int alloc_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) post_alloc_hook(page, order, gfp_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399) if (order && (gfp_flags & __GFP_COMP))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400) prep_compound_page(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) * allocate the page. The expectation is that the caller is taking
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405) * steps that will free more memory. The caller should avoid the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) * being used for !PFMEMALLOC purposes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408) if (alloc_flags & ALLOC_NO_WATERMARKS)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) set_page_pfmemalloc(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411) clear_page_pfmemalloc(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) }
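/*
 * Usage sketch (not part of the allocator itself): a compound allocation
 * such as
 *
 *	page = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);
 *
 * reaches prep_new_page() with order == 2, so post_alloc_hook() covers the
 * whole block and prep_compound_page() links the tail pages to the head.
 * Without __GFP_COMP the caller simply gets 1 << order independent pages
 * that happen to be physically contiguous.
 */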
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) * Go through the free lists for the given migratetype and remove
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) * the smallest available page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) static __always_inline
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) unsigned int current_order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) struct free_area *area;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) /* Find a page of the appropriate size in the preferred list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) for (current_order = order; current_order < MAX_ORDER; ++current_order) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) area = &(zone->free_area[current_order]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) page = get_page_from_free_area(area, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432) del_page_from_free_list(page, zone, current_order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) expand(zone, page, order, current_order, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) set_pcppage_migratetype(page, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) }
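/*
 * Example: an order-0 request that only finds an order-3 block free. The
 * order-3 block (8 pages) is deleted from its free list and expand() gives
 * back an order-2, an order-1 and an order-0 buddy, leaving one order-0
 * page to hand to the caller. Larger blocks are only split when every
 * lower-order list for the migratetype is empty.
 */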
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) * This array describes the order in which the free lists are fallen back to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) * when the free lists for the desired migratetype are depleted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) static int fallbacks[MIGRATE_TYPES][3] = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451) [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) #ifdef CONFIG_MEMORY_ISOLATION
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) };
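/*
 * For example, a MIGRATE_UNMOVABLE request whose own lists are empty tries
 * MIGRATE_RECLAIMABLE first and then MIGRATE_MOVABLE; the trailing
 * MIGRATE_TYPES entry is the end-of-list sentinel checked by
 * find_suitable_fallback(). CMA and isolated pageblocks are never used as
 * fallback sources.
 */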
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459) static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462) return __rmqueue_smallest(zone, order, MIGRATE_CMA);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466) unsigned int order) { return NULL; }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470) * Move the free pages in a range to the freelist tail of the requested type.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) * Note that start_page and end_page are not aligned on a pageblock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) * boundary. If alignment is required, use move_freepages_block()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) static int move_freepages(struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) struct page *start_page, struct page *end_page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) int migratetype, int *num_movable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479) unsigned int order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) int pages_moved = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) for (page = start_page; page <= end_page;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) if (!pfn_valid_within(page_to_pfn(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) page++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) if (!PageBuddy(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) * We assume that pages that could be isolated for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) * migration are movable. But we don't actually try
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) * isolating, as that would be expensive.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) if (num_movable &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) (PageLRU(page) || __PageMovable(page)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) (*num_movable)++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498) page++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) /* Make sure we are not inadvertently changing nodes */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503) VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) VM_BUG_ON_PAGE(page_zone(page) != zone, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) order = buddy_order(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) move_to_free_list(page, zone, order, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) page += 1 << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) pages_moved += 1 << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) return pages_moved;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) int move_freepages_block(struct zone *zone, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) int migratetype, int *num_movable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) unsigned long start_pfn, end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) struct page *start_page, *end_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) if (num_movable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) *num_movable = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) start_pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) start_pfn = start_pfn & ~(pageblock_nr_pages-1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526) start_page = pfn_to_page(start_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) end_page = start_page + pageblock_nr_pages - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) end_pfn = start_pfn + pageblock_nr_pages - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) /* Do not cross zone boundaries */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) if (!zone_spans_pfn(zone, start_pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) start_page = page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) if (!zone_spans_pfn(zone, end_pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) return move_freepages(zone, start_page, end_page, migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) num_movable);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) }
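/*
 * The block is found by rounding the pfn down to a pageblock boundary. For
 * example, with pageblock_nr_pages == 512 (a typical value with 4K pages,
 * i.e. 2MB blocks), a page at pfn 1000 gives start_pfn == 512 and
 * end_pfn == 1023. If the block runs past the end of the zone nothing is
 * moved; if it starts before the zone, only the part from @page onwards is
 * considered.
 */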
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) static void change_pageblock_range(struct page *pageblock_page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) int start_order, int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543) int nr_pageblocks = 1 << (start_order - pageblock_order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) while (nr_pageblocks--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) set_pageblock_migratetype(pageblock_page, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) pageblock_page += pageblock_nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) * When we are falling back to another migratetype during allocation, try to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) * steal extra free pages from the same pageblocks to satisfy further
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554) * allocations, instead of polluting multiple pageblocks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) * If we are stealing a relatively large buddy page, it is likely there will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557) * be more free pages in the pageblock, so try to steal them all. For
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) * reclaimable and unmovable allocations, we steal regardless of page size,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) * as fragmentation caused by those allocations polluting movable pageblocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) * is worse than movable allocations stealing from unmovable and reclaimable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561) * pageblocks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) static bool can_steal_fallback(unsigned int order, int start_mt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) * This order check is intentionally kept, even though the next
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) * check uses a more relaxed order threshold. The reason is that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) * we can actually steal the whole pageblock if this condition is met,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) * whereas the check below does not guarantee it and is only a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) * heuristic, so it could be changed at any time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572) if (order >= pageblock_order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) if (order >= pageblock_order / 2 ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576) start_mt == MIGRATE_RECLAIMABLE ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) start_mt == MIGRATE_UNMOVABLE ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) page_group_by_mobility_disabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) }
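/*
 * Worked example, assuming pageblock_order == 9 (2MB pageblocks with 4K
 * pages): a MIGRATE_MOVABLE request may only claim a whole block when the
 * fallback buddy it found has order 4 or higher (pageblock_order / 2),
 * whereas MIGRATE_RECLAIMABLE and MIGRATE_UNMOVABLE requests may always
 * claim it, because letting them fragment a movable pageblock is the more
 * damaging outcome. If page_group_by_mobility_disabled is set, stealing is
 * always allowed.
 */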
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) static inline bool boost_watermark(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) unsigned long max_boost;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) if (!watermark_boost_factor)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591) * Don't bother in zones that are unlikely to produce results.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) * On small machines, including kdump capture kernels running
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) * in a small area, boosting the watermark can cause an out of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594) * memory situation immediately.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2597) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2598)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599) max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600) watermark_boost_factor, 10000);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) * The high watermark may be uninitialised if fragmentation occurs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) * very early in boot, so do not boost. We do not fall
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) * through and boost by pageblock_nr_pages because failing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) * allocations that early means that reclaim is not going
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) * to help, and it may even be impossible to reclaim the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) * boosted watermark, resulting in a hang.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) if (!max_boost)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) max_boost = max(pageblock_nr_pages, max_boost);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616) max_boost);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619) }
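/*
 * Rough numbers, assuming the default watermark_boost_factor of 15000:
 * max_boost works out to 150% of the zone's high watermark, and each
 * fallback event bumps watermark_boost by one pageblock (512 pages on a
 * typical 4K/2MB configuration) until that cap is hit. The boost is
 * expected to be wound back down by kswapd once it has reclaimed above the
 * boosted watermark.
 */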
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) * This function implements actual steal behaviour. If order is large enough,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) * we can steal whole pageblock. If not, we first move freepages in this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) * pageblock to our migratetype and determine how many already-allocated pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) * are there in the pageblock with a compatible migratetype. If at least half
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626) * of pages are free or compatible, we can change migratetype of the pageblock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627) * itself, so pages freed in the future will be put on the correct free list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) static void steal_suitable_fallback(struct zone *zone, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) unsigned int alloc_flags, int start_type, bool whole_block)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) unsigned int current_order = buddy_order(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633) int free_pages, movable_pages, alike_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) int old_block_type;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636) old_block_type = get_pageblock_migratetype(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) * This can happen due to races and we want to prevent broken
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) * highatomic accounting.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642) if (is_migrate_highatomic(old_block_type))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) goto single_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645) /* Take ownership for orders >= pageblock_order */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) if (current_order >= pageblock_order) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) change_pageblock_range(page, current_order, start_type);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648) goto single_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652) * Boost watermarks to increase reclaim pressure to reduce the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) * likelihood of future fallbacks. Wake kswapd now as the node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) * may be balanced overall and kswapd will not wake naturally.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657) set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659) /* We are not allowed to try stealing from the whole block */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) if (!whole_block)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) goto single_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) free_pages = move_freepages_block(zone, page, start_type,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) &movable_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666) * Determine how many pages are compatible with our allocation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) * For movable allocation, it's the number of movable pages which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) * we just obtained. For other types it's a bit more tricky.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670) if (start_type == MIGRATE_MOVABLE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) alike_pages = movable_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) * to MOVABLE pageblock, consider all non-movable pages as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676) * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) * vice versa, be conservative since we can't distinguish the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) * exact migratetype of non-movable pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) if (old_block_type == MIGRATE_MOVABLE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) alike_pages = pageblock_nr_pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) - (free_pages + movable_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) alike_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687) /* moving whole block can fail due to zone boundary conditions */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688) if (!free_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) goto single_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692) * If a sufficient number of pages in the block are either free or of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693) * comparable migratability as our allocation, claim the whole block.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) page_group_by_mobility_disabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) set_pageblock_migratetype(page, start_type);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701) single_page:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) move_to_free_list(page, zone, current_order, start_type);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703) }
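/*
 * Example of the claim threshold above: with pageblock_order == 9 the
 * block holds 512 pages, so its migratetype is changed once
 * free_pages + alike_pages >= 256, i.e. at least half the block is already
 * free or compatibly allocated. Otherwise only the buddy page that
 * triggered the fallback is moved (the single_page path).
 */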
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) * Check whether there is a suitable fallback freepage with requested order.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) * If only_stealable is true, this function returns fallback_mt only if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) * we can steal all the other freepages as well. This helps to reduce
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) * fragmentation due to mixed migratetype pages in one pageblock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) int find_suitable_fallback(struct free_area *area, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) int migratetype, bool only_stealable, bool *can_steal)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) int fallback_mt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) if (area->nr_free == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) *can_steal = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) for (i = 0;; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) fallback_mt = fallbacks[migratetype][i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) if (fallback_mt == MIGRATE_TYPES)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) if (free_area_empty(area, fallback_mt))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) if (can_steal_fallback(order, migratetype))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730) *can_steal = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) if (!only_stealable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) return fallback_mt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) if (*can_steal)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) return fallback_mt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) }
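/*
 * Return value summary: -1 when no populated fallback free list exists for
 * this order, otherwise the fallback migratetype; *can_steal then tells the
 * caller whether claiming the whole pageblock is allowed. With
 * only_stealable == true, candidates that cannot be stolen wholesale are
 * skipped rather than returned.
 */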
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) * Reserve a pageblock for exclusive use of high-order atomic allocations if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744) * there are no empty page blocks that contain a page with a suitable order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) unsigned int alloc_order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) int mt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) unsigned long max_managed, flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) * Limit the number of pages reserved to roughly 1% of the zone plus one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754) * pageblock. The check is race-prone but harmless.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757) if (zone->nr_reserved_highatomic >= max_managed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) /* Recheck the nr_reserved_highatomic limit under the lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) if (zone->nr_reserved_highatomic >= max_managed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) /* Yoink! */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) mt = get_pageblock_migratetype(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768) if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) && !is_migrate_cma(mt)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) zone->nr_reserved_highatomic += pageblock_nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771) set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777) }
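/*
 * Sizing sketch: on a zone with about one million managed pages (~4GB of
 * 4K pages), max_managed is roughly 10000 + 512 pages, i.e. at most about
 * 20 highatomic pageblocks. Even a tiny zone can reserve at least one full
 * pageblock before hitting the limit.
 */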
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780) * Used when an allocation is about to fail under memory pressure. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) * potentially hurts the reliability of high-order allocations when under
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) * intense memory pressure but failed atomic allocations should be easier
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) * to recover from than an OOM.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) * If @force is true, try to unreserve a pageblock even if that exhausts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786) * the highatomic reserve completely.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788) static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789) bool force)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791) struct zonelist *zonelist = ac->zonelist;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796) int order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) bool ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799) for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800) ac->nodemask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802) * Preserve at least one pageblock unless memory pressure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803) * is really high.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) if (!force && zone->nr_reserved_highatomic <=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) pageblock_nr_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810) for (order = 0; order < MAX_ORDER; order++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) struct free_area *area = &(zone->free_area[order]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) * In the page freeing path, the migratetype change is racy, so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819) * we can encounter several free pages in a pageblock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) * in this loop although we changed the pageblock type
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) * from highatomic to ac->migratetype. So we should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) * adjust the count only once.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824) if (is_migrate_highatomic_page(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826) * It should never happen but changes to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) * locking could inadvertently allow a per-cpu
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) * drain to add pages to MIGRATE_HIGHATOMIC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829) * while unreserving so be safe and watch for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) * underflows.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832) zone->nr_reserved_highatomic -= min(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) pageblock_nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) zone->nr_reserved_highatomic);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) * Convert to ac->migratetype and avoid the normal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) * pageblock stealing heuristics. Minimally, the caller
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) * is doing the work and needs the pages. More
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) * importantly, if the block was always converted to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) * MIGRATE_UNMOVABLE or another type then the number
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) * of pageblocks that cannot be completely freed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) * may increase.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) set_pageblock_migratetype(page, ac->migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847) ret = move_freepages_block(zone, page, ac->migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861) * Try finding a free buddy page on the fallback list and put it on the free
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) * list of requested migratetype, possibly along with other pages from the same
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) * block, depending on fragmentation avoidance heuristics. Returns true if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864) * fallback was found so that __rmqueue_smallest() can grab it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866) * The use of signed ints for order and current_order is a deliberate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) * deviation from the rest of this file, to make the for loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) * condition simpler.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) static __always_inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) unsigned int alloc_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) struct free_area *area;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875) int current_order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876) int min_order = order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878) int fallback_mt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879) bool can_steal;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882) * Do not steal pages from freelists belonging to other pageblocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) * i.e. orders < pageblock_order. If there are no local zones free,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884) * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2885) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2886) if (alloc_flags & ALLOC_NOFRAGMENT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2887) min_order = pageblock_order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2888)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) * Find the largest available free page in the other list. This roughly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) * approximates finding the pageblock with the most free pages, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) * would be too costly to do exactly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894) for (current_order = MAX_ORDER - 1; current_order >= min_order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) --current_order) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) area = &(zone->free_area[current_order]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897) fallback_mt = find_suitable_fallback(area, current_order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) start_migratetype, false, &can_steal);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) if (fallback_mt == -1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2903) * If we cannot steal all free pages from the pageblock and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) * requested migratetype is movable, it is better to steal and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905) * split the smallest available page instead of the largest one,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) * because even if the next movable allocation falls back into a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907) * different pageblock than this one, it won't cause permanent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908) * fragmentation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910) if (!can_steal && start_migratetype == MIGRATE_MOVABLE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911) && current_order > order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) goto find_smallest;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914) goto do_steal;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) find_smallest:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920) for (current_order = order; current_order < MAX_ORDER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921) current_order++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) area = &(zone->free_area[current_order]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) fallback_mt = find_suitable_fallback(area, current_order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924) start_migratetype, false, &can_steal);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) if (fallback_mt != -1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) * This should not happen - we already found a suitable fallback
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) * when looking for the largest page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) VM_BUG_ON(current_order == MAX_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) do_steal:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936) page = get_page_from_free_area(area, fallback_mt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938) steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939) can_steal);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941) trace_mm_page_alloc_extfrag(page, order, current_order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) start_migratetype, fallback_mt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949) * Do the hard work of removing an element from the buddy allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) * Call me with the zone->lock already held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952) static __always_inline struct page *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) __rmqueue(struct zone *zone, unsigned int order, int migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) unsigned int alloc_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) page = __rmqueue_smallest(zone, order, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961) if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962) alloc_flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) trace_mm_page_alloc_zone_locked(page, order, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) }
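/*
 * Typical flow: __rmqueue_smallest() fails for the requested migratetype,
 * __rmqueue_fallback() then moves suitable pages from another migratetype
 * onto our free lists (possibly claiming the whole pageblock), and the
 * retried __rmqueue_smallest() picks one of them up. If the fallback finds
 * nothing either, NULL is returned (the tracepoint still fires with a NULL
 * page).
 */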
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970) static struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) int migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) unsigned int alloc_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974) struct page *page = __rmqueue_cma_fallback(zone, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979) static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) int migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981) unsigned int alloc_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) * Obtain a specified number of elements from the buddy allocator, all under
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989) * a single hold of the lock, for efficiency. Add them to the supplied list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990) * Returns the number of new pages which were placed at *list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) static int rmqueue_bulk(struct zone *zone, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) unsigned long count, struct list_head *list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994) int migratetype, unsigned int alloc_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) int i, alloced = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) spin_lock(&zone->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) for (i = 0; i < count; ++i) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) if (is_migrate_cma(migratetype))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003) page = __rmqueue_cma(zone, order, migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006) page = __rmqueue(zone, order, migratetype, alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) if (unlikely(page == NULL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011) if (unlikely(check_pcp_refill(page)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015) * Split buddy pages returned by expand() are received here in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3016) * physical page order. The page is added to the tail of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3017) * caller's list, so from the caller's perspective the linked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3018) * list is, under some conditions, ordered by page number. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3019) * is useful for IO devices that walk the list from the head and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3020) * can therefore merge IO requests when the physical pages are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3021) * ordered properly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3023) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3024) list_add_tail(&page->lru, list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3025) alloced++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3026) if (is_migrate_cma(get_pcppage_migratetype(page)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3027) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3028) -(1 << order));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032) * i pages were removed from the buddy list even if some leaked due
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033) * to check_pcp_refill() failing, so adjust NR_FREE_PAGES based
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034) * on i. Do not confuse this with 'alloced', which is the number of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035) * pages actually added to the pcp list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037) __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038) spin_unlock(&zone->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039) return alloced;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040) }
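/*
 * Example of the accounting above: a refill of count == 8 order-0 pages
 * where one page fails check_pcp_refill() ends with i == 8 but
 * alloced == 7. NR_FREE_PAGES is decreased by all 8 pages because they all
 * left the buddy lists, while only 7 were handed to the pcp list (the bad
 * page is deliberately leaked).
 */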
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043) * Return the pcp list that corresponds to the migrate type if that list isn't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044) * empty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045) * If the list is empty return NULL.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047) static struct list_head *get_populated_pcp_list(struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048) unsigned int order, struct per_cpu_pages *pcp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049) int migratetype, unsigned int alloc_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051) struct list_head *list = &pcp->lists[migratetype];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053) if (list_empty(list)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054) pcp->count += rmqueue_bulk(zone, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055) pcp->batch, list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056) migratetype, alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058) if (list_empty(list))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059) list = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061) return list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) * Called from the vmstat counter updater to drain pagesets of this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3067) * currently executing processor on remote nodes after they have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3068) * expired.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3069) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3070) * Note that this function must be called with the thread pinned to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3071) * a single processor.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3072) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3073) void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3074) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3075) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3076) int to_drain, batch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3077)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3078) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3079) batch = READ_ONCE(pcp->batch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3080) to_drain = min(pcp->count, batch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3081) if (to_drain > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3082) free_pcppages_bulk(zone, to_drain, pcp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3083) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3084) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3085) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3086)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3087) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3088) * Drain pcplists of the indicated processor and zone.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3089) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3090) * The processor must either be the current processor, with the calling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3091) * thread pinned to it, or a processor that is not online.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3093) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3094) static void drain_pages_zone(unsigned int cpu, struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3095) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3096) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3097) struct per_cpu_pageset *pset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3098) struct per_cpu_pages *pcp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3099)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3100) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3101) pset = per_cpu_ptr(zone->pageset, cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3102)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3103) pcp = &pset->pcp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3104) if (pcp->count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3105) free_pcppages_bulk(zone, pcp->count, pcp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3106) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3107) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3108)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3109) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3110) * Drain pcplists of all zones on the indicated processor.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3111) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3112) * The processor must either be the current processor, with the calling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3113) * thread pinned to it, or a processor that is not online.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3115) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3116) static void drain_pages(unsigned int cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3117) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3118) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3119)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3120) for_each_populated_zone(zone) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3121) drain_pages_zone(cpu, zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3122) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3123) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3125) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3126) * Spill all of this CPU's per-cpu pages back into the buddy allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3127) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3128) * The CPU has to be pinned. When the zone parameter is non-NULL, spill just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3129) * that single zone's pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3130) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3131) void drain_local_pages(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3132) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3133) int cpu = smp_processor_id();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3135) if (zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3136) drain_pages_zone(cpu, zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3137) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3138) drain_pages(cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3139) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3140)
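/*
 * Workqueue callback used by drain_all_pages(): drains the local pcplists
 * for the zone recorded in the per-cpu pcpu_drain work item, or for all
 * zones when that zone is NULL.
 */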
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3141) static void drain_local_pages_wq(struct work_struct *work)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3142) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3143) struct pcpu_drain *drain;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3145) drain = container_of(work, struct pcpu_drain, work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3146)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3147) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3148) * drain_all_pages doesn't use proper cpu hotplug protection so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3149) * we can race with cpu offline when the WQ can move this from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3150) * a cpu pinned worker to an unbound one. We can operate on a different
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3151) * cpu, which is alright, but we also have to make sure not to move to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3152) * a different one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3153) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3154) preempt_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3155) drain_local_pages(drain->zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3156) preempt_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3157) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3159) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3160) * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3161) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3162) * When the zone parameter is non-NULL, spill just the single zone's pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3163) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3164) * Note that this can be extremely slow as the draining happens in a workqueue.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3165) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3166) void drain_all_pages(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3167) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3168) int cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3170) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3171) * Allocate in the BSS so we won't require allocation in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3172) * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3173) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3174) static cpumask_t cpus_with_pcps;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3176) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3177) * Make sure nobody triggers this path before mm_percpu_wq is fully
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3178) * initialized.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3179) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3180) if (WARN_ON_ONCE(!mm_percpu_wq))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3181) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3183) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3184) * Do not drain if one is already in progress unless it's specific to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3185) * a zone. Such callers are primarily CMA and memory hotplug and need
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3186) * the drain to be complete when the call returns.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3187) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3188) if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3189) if (!zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3190) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3191) mutex_lock(&pcpu_drain_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3192) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3194) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3195) * We don't care about racing with CPU hotplug events,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3196) * as the offline notification will cause the notified
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3197) * cpu to drain that CPU's pcps, and on_each_cpu_mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3198) * disables preemption as part of its processing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3199) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3200) for_each_online_cpu(cpu) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3201) struct per_cpu_pageset *pcp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3202) struct zone *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3203) bool has_pcps = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3204)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3205) if (zone) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3206) pcp = per_cpu_ptr(zone->pageset, cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3207) if (pcp->pcp.count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3208) has_pcps = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3209) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3210) for_each_populated_zone(z) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3211) pcp = per_cpu_ptr(z->pageset, cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3212) if (pcp->pcp.count) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3213) has_pcps = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3214) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3216) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3217) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3218)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3219) if (has_pcps)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3220) cpumask_set_cpu(cpu, &cpus_with_pcps);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3221) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3222) cpumask_clear_cpu(cpu, &cpus_with_pcps);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3223) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3224)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3225) for_each_cpu(cpu, &cpus_with_pcps) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3226) struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3227)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3228) drain->zone = zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3229) INIT_WORK(&drain->work, drain_local_pages_wq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3230) queue_work_on(cpu, mm_percpu_wq, &drain->work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3231) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3232) for_each_cpu(cpu, &cpus_with_pcps)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3233) flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3234)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3235) mutex_unlock(&pcpu_drain_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3236) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3238) #ifdef CONFIG_HIBERNATION
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3239)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3240) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3241) * Touch the watchdog for every WD_PAGE_COUNT pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3242) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3243) #define WD_PAGE_COUNT (128*1024)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3244)
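/*
 * Used by hibernation: record which pages are free at snapshot time so they
 * can be skipped. First clear the "free" bit for every valid pfn in the
 * zone (unless the page is forbidden), then set it again for each page that
 * is actually sitting on a buddy free list.
 */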
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3245) void mark_free_pages(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3246) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3247) unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3248) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3249) unsigned int order, t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3250) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3251)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3252) if (zone_is_empty(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3253) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3254)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3255) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3257) max_zone_pfn = zone_end_pfn(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3258) for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3259) if (pfn_valid(pfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3260) page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3261)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3262) if (!--page_count) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3263) touch_nmi_watchdog();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3264) page_count = WD_PAGE_COUNT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3265) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3266)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3267) if (page_zone(page) != zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3268) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3269)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3270) if (!swsusp_page_is_forbidden(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3271) swsusp_unset_page_free(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3272) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3274) for_each_migratetype_order(order, t) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3275) list_for_each_entry(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3276) &zone->free_area[order].free_list[t], lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3277) unsigned long i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3279) pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3280) for (i = 0; i < (1UL << order); i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3281) if (!--page_count) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3282) touch_nmi_watchdog();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3283) page_count = WD_PAGE_COUNT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3284) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3285) swsusp_set_page_free(pfn_to_page(pfn + i));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3286) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3287) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3288) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3289) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3290) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3291) #endif /* CONFIG_HIBERNATION */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3292)
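/*
 * Run the free-page sanity checks and cache the pageblock's migratetype in
 * the page for later use by free_unref_page_commit(). Returns false if the
 * page fails the checks and must not be freed.
 */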
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3293) static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3294) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3295) int migratetype;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3296)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3297) if (!free_pcp_prepare(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3298) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3299)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3300) migratetype = get_pfnblock_migratetype(page, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3301) set_pcppage_migratetype(page, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3302) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3303) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3304)
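/*
 * Add a prepared order-0 page to the current CPU's pcplist for its cached
 * migratetype. Must be called with interrupts disabled. Isolated pages (and
 * CMA pages when the vendor hook requests it) bypass the pcplist and go
 * straight back to the buddy allocator; once the pcplist reaches pcp->high,
 * a batch of pages is returned to the buddy allocator as well.
 */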
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3305) static void free_unref_page_commit(struct page *page, unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3306) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3307) struct zone *zone = page_zone(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3308) struct per_cpu_pages *pcp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3309) int migratetype;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3310) bool pcp_skip_cma_pages = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3311)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3312) migratetype = get_pcppage_migratetype(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3313) __count_vm_event(PGFREE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3315) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3316) * We only track unmovable, reclaimable and movable on pcp lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3317) * Free ISOLATE pages back to the allocator because they are being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3318) * offlined, but treat HIGHATOMIC as movable pages so we can get those
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3319) * areas back if necessary. Otherwise, we may have to free
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3320) * excessively into the page allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3321) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3322) if (migratetype >= MIGRATE_PCPTYPES) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3323) trace_android_vh_pcplist_add_cma_pages_bypass(migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3324) &pcp_skip_cma_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3325) if (unlikely(is_migrate_isolate(migratetype)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3326) pcp_skip_cma_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3327) free_one_page(zone, page, pfn, 0, migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3328) FPI_NONE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3329) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3330) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3331) migratetype = MIGRATE_MOVABLE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3332) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3333)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3334) pcp = &this_cpu_ptr(zone->pageset)->pcp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3335) list_add(&page->lru, &pcp->lists[migratetype]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3336) pcp->count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3337) if (pcp->count >= pcp->high) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3338) unsigned long batch = READ_ONCE(pcp->batch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3339) free_pcppages_bulk(zone, batch, pcp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3340) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3341) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3342)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3343) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3344) * Free a 0-order page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3345) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3346) void free_unref_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3347) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3348) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3349) unsigned long pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3350)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3351) if (!free_unref_page_prepare(page, pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3352) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3354) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3355) free_unref_page_commit(page, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3356) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3357) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3359) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3360) * Free a list of 0-order pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3361) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3362) void free_unref_page_list(struct list_head *list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3363) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3364) struct page *page, *next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3365) unsigned long flags, pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3366) int batch_count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3367)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3368) /* Prepare pages for freeing */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3369) list_for_each_entry_safe(page, next, list, lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3370) pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3371) if (!free_unref_page_prepare(page, pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3372) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3373) set_page_private(page, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3374) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3375)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3376) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3377) list_for_each_entry_safe(page, next, list, lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3378) unsigned long pfn = page_private(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3379)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3380) set_page_private(page, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3381) trace_mm_page_free_batched(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3382) free_unref_page_commit(page, pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3384) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3385) * Guard against excessive IRQ disabled times when we get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3386) * a large list of pages to free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3387) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3388) if (++batch_count == SWAP_CLUSTER_MAX) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3389) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3390) batch_count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3391) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3392) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3393) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3394) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3395) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3396)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3397) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3398) * split_page takes a non-compound higher-order page, and splits it into
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3399) * n (1<<order) sub-pages: page[0..n-1].
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3400) * Each sub-page must be freed individually.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3401) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3402) * Note: this is probably too low level an operation for use in drivers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3403) * Please consult with lkml before using this in your driver.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3404) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3405) void split_page(struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3406) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3407) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3409) VM_BUG_ON_PAGE(PageCompound(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3410) VM_BUG_ON_PAGE(!page_count(page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3412) for (i = 1; i < (1 << order); i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3413) set_page_refcounted(page + i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3414) split_page_owner(page, 1 << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3415) split_page_memcg(page, 1 << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3416) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3417) EXPORT_SYMBOL_GPL(split_page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3418)
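/*
 * Remove a free page of the given order from the buddy free lists, subject
 * to a watermark check unless its pageblock is already isolated. Expects
 * zone->lock to be held by the caller. Returns the number of base pages
 * removed (1UL << order) on success, 0 on failure.
 */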
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3419) int __isolate_free_page(struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3420) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3421) unsigned long watermark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3422) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3423) int mt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3424)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3425) BUG_ON(!PageBuddy(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3426)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3427) zone = page_zone(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3428) mt = get_pageblock_migratetype(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3430) if (!is_migrate_isolate(mt)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3431) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3432) * Obey watermarks as if the page was being allocated. We can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3433) * emulate a high-order watermark check with a raised order-0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3434) * watermark, because we already know our high-order page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3435) * exists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3436) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3437) watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3438) if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3439) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3440)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3441) __mod_zone_freepage_state(zone, -(1UL << order), mt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3442) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3443)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3444) /* Remove page from free list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3446) del_page_from_free_list(page, zone, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3448) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3449) * Set the pageblock's migratetype to MIGRATE_MOVABLE if the isolated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3450) * page is at least half of a pageblock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3451) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3452) if (order >= pageblock_order - 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3453) struct page *endpage = page + (1 << order) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3454) for (; page < endpage; page += pageblock_nr_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3455) int mt = get_pageblock_migratetype(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3456) if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3457) && !is_migrate_highatomic(mt))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3458) set_pageblock_migratetype(page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3459) MIGRATE_MOVABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3460) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3461) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3464) return 1UL << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3465) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3466)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3467) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3468) * __putback_isolated_page - Return a now-isolated page back where we got it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3469) * @page: Page that was isolated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3470) * @order: Order of the isolated page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3471) * @mt: The page's pageblock's migratetype
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3472) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3473) * This function is meant to return a page pulled from the free lists via
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3474) * __isolate_free_page back to the free list it was pulled from.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3475) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3476) void __putback_isolated_page(struct page *page, unsigned int order, int mt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3477) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3478) struct zone *zone = page_zone(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3479)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3480) /* zone lock should be held when this function is called */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3481) lockdep_assert_held(&zone->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3482)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3483) /* Return isolated page to tail of freelist. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3484) __free_one_page(page, page_to_pfn(page), zone, order, mt,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3485) FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3486) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3487)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3488) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3489) * Update NUMA hit/miss statistics
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3490) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3491) * Must be called with interrupts disabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3492) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3493) static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3494) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3495) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3496) enum numa_stat_item local_stat = NUMA_LOCAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3497)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3498) /* skip NUMA counter updates if NUMA stats are disabled */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3499) if (!static_branch_likely(&vm_numa_stat_key))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3500) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3501)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3502) if (zone_to_nid(z) != numa_node_id())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3503) local_stat = NUMA_OTHER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3504)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3505) if (zone_to_nid(z) == zone_to_nid(preferred_zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3506) __inc_numa_state(z, NUMA_HIT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3507) else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3508) __inc_numa_state(z, NUMA_MISS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3509) __inc_numa_state(preferred_zone, NUMA_FOREIGN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3510) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3511) __inc_numa_state(z, local_stat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3512) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3513) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3515) /* Remove page from the per-cpu list, caller must protect the list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3516) static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3517) unsigned int alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3518) struct per_cpu_pages *pcp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3519) gfp_t gfp_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3520) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3521) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3522) struct list_head *list = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3523)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3524) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3525) /* First try to get CMA pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3526) if (migratetype == MIGRATE_MOVABLE &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3527) alloc_flags & ALLOC_CMA) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3528) list = get_populated_pcp_list(zone, 0, pcp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3529) get_cma_migrate_type(), alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3530) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3531)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3532) if (list == NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3533) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3534) * Either CMA is not suitable or there are no
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3535) * free CMA pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3536) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3537) list = get_populated_pcp_list(zone, 0, pcp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3538) migratetype, alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3539) if (unlikely(list == NULL) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3540) unlikely(list_empty(list)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3541) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3542) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3544) page = list_first_entry(list, struct page, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3545) list_del(&page->lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3546) pcp->count--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3547) } while (check_new_pcp(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3548)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3549) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3550) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3551)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3552) /* Lock and remove page from the per-cpu list */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3553) static struct page *rmqueue_pcplist(struct zone *preferred_zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3554) struct zone *zone, gfp_t gfp_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3555) int migratetype, unsigned int alloc_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3556) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3557) struct per_cpu_pages *pcp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3558) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3559) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3560)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3561) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3562) pcp = &this_cpu_ptr(zone->pageset)->pcp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3563) page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3564) gfp_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3565) if (page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3566) __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3567) zone_statistics(preferred_zone, zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3568) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3569) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3570) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3571) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3572)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3573) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3574) * Allocate a page from the given zone. Use pcplists for order-0 allocations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3575) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3576) static inline
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3577) struct page *rmqueue(struct zone *preferred_zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3578) struct zone *zone, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3579) gfp_t gfp_flags, unsigned int alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3580) int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3581) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3582) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3583) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3584)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3585) if (likely(order == 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3586) page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3587) migratetype, alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3588) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3589) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3590)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3591) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3592) * We most definitely don't want callers attempting to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3593) * allocate greater than order-1 page units with __GFP_NOFAIL.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3594) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3595) WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3596) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3597)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3598) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3599) page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3600) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3601) * order-0 request can reach here when the pcplist is skipped
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3602) * due to non-CMA allocation context. HIGHATOMIC area is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3603) * reserved for high-order atomic allocation, so order-0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3604) * request should skip it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3605) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3606) if (order > 0 && alloc_flags & ALLOC_HARDER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3607) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3608) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3609) trace_mm_page_alloc_zone_locked(page, order, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3610) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3611) if (!page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3612) if (migratetype == MIGRATE_MOVABLE &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3613) alloc_flags & ALLOC_CMA)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3614) page = __rmqueue_cma(zone, order, migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3615) alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3616) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3617) page = __rmqueue(zone, order, migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3618) alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3619) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3620) } while (page && check_new_pages(page, order));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3621) spin_unlock(&zone->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3622) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3623) goto failed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3624) __mod_zone_freepage_state(zone, -(1 << order),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3625) get_pcppage_migratetype(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3626)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3627) __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3628) zone_statistics(preferred_zone, zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3629) trace_android_vh_rmqueue(preferred_zone, zone, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3630) gfp_flags, alloc_flags, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3631) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3632)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3633) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3634) /* Separate test+clear to avoid unnecessary atomics */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3635) if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3636) clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3637) wakeup_kswapd(zone, 0, 0, zone_idx(zone));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3638) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3639)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3640) VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3641) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3642)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3643) failed:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3644) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3645) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3646) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3647)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3648) #ifdef CONFIG_FAIL_PAGE_ALLOC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3649)
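/*
 * Fault-injection knobs for failing page allocations: a generic fault_attr
 * plus filters to ignore __GFP_HIGHMEM and __GFP_DIRECT_RECLAIM requests
 * and to only fail allocations of at least min_order.
 */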
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3650) static struct {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3651) struct fault_attr attr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3652)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3653) bool ignore_gfp_highmem;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3654) bool ignore_gfp_reclaim;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3655) u32 min_order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3656) } fail_page_alloc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3657) .attr = FAULT_ATTR_INITIALIZER,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3658) .ignore_gfp_reclaim = true,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3659) .ignore_gfp_highmem = true,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3660) .min_order = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3661) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3662)
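/*
 * "fail_page_alloc=" boot parameter; it takes the common fault-injection
 * format, e.g. fail_page_alloc=<interval>,<probability>,<space>,<times>
 * (see the fault-injection documentation for details).
 */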
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3663) static int __init setup_fail_page_alloc(char *str)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3664) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3665) return setup_fault_attr(&fail_page_alloc.attr, str);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3666) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3667) __setup("fail_page_alloc=", setup_fail_page_alloc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3668)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3669) static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3670) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3671) if (order < fail_page_alloc.min_order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3672) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3673) if (gfp_mask & __GFP_NOFAIL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3674) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3675) if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3676) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3677) if (fail_page_alloc.ignore_gfp_reclaim &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3678) (gfp_mask & __GFP_DIRECT_RECLAIM))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3679) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3680)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3681) return should_fail(&fail_page_alloc.attr, 1 << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3682) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3683)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3684) #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3685)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3686) static int __init fail_page_alloc_debugfs(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3687) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3688) umode_t mode = S_IFREG | 0600;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3689) struct dentry *dir;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3690)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3691) dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3692) &fail_page_alloc.attr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3693)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3694) debugfs_create_bool("ignore-gfp-wait", mode, dir,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3695) &fail_page_alloc.ignore_gfp_reclaim);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3696) debugfs_create_bool("ignore-gfp-highmem", mode, dir,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3697) &fail_page_alloc.ignore_gfp_highmem);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3698) debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3699)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3700) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3701) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3702)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3703) late_initcall(fail_page_alloc_debugfs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3704)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3705) #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3706)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3707) #else /* CONFIG_FAIL_PAGE_ALLOC */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3708)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3709) static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3710) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3711) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3712) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3713)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3714) #endif /* CONFIG_FAIL_PAGE_ALLOC */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3715)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3716) noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3717) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3718) return __should_fail_alloc_page(gfp_mask, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3719) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3720) ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3721)
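/*
 * Compute how many of the zone's free pages should be treated as unusable
 * for this watermark check: a slack of (1 << order) - 1 pages for the
 * requested order, the highatomic reserve when the caller is not entitled
 * to ALLOC_HARDER/ALLOC_OOM, and free CMA pages when the allocation may not
 * use CMA areas.
 */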
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3722) static inline long __zone_watermark_unusable_free(struct zone *z,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3723) unsigned int order, unsigned int alloc_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3724) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3725) const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3726) long unusable_free = (1 << order) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3727)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3728) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3729) * If the caller does not have rights to ALLOC_HARDER then subtract
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3730) * the high-atomic reserves. This will over-estimate the size of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3731) * atomic reserve but it avoids a search.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3732) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3733) if (likely(!alloc_harder))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3734) unusable_free += z->nr_reserved_highatomic;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3735)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3736) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3737) /* If allocation can't use CMA areas don't use free CMA pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3738) if (!(alloc_flags & ALLOC_CMA))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3739) unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3740) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3742) return unusable_free;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3743) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3744)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3745) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3746) * Return true if free base pages are above 'mark'. For high-order checks it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3747) * will return true if the order-0 watermark is reached and there is at least
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3748) * one free page of a suitable size. Checking now avoids taking the zone lock
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3749) * to check in the allocation paths if no pages are free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3750) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3751) bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3752) int highest_zoneidx, unsigned int alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3753) long free_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3754) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3755) long min = mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3756) int o;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3757) const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3758)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3759) /* free_pages may go negative - that's OK */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3760) free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3761)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3762) if (alloc_flags & ALLOC_HIGH)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3763) min -= min / 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3764)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3765) if (unlikely(alloc_harder)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3766) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3767) * OOM victims can try even harder than normal ALLOC_HARDER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3768) * users on the grounds that it's definitely going to be in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3769) * the exit path shortly and free memory. Any allocation it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3770) * makes during the free path will be small and short-lived.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3771) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3772) if (alloc_flags & ALLOC_OOM)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3773) min -= min / 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3774) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3775) min -= min / 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3776) }
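/*
 * Illustrative numbers only: with mark == 1024, ALLOC_HIGH halves min to
 * 512 and a non-OOM ALLOC_HARDER caller then drops another quarter,
 * leaving an effective min of 384 (an OOM victim would get 256 instead).
 */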
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3777)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3778) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3779) * Check watermarks for an order-0 allocation request. If these
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3780) * are not met, then a high-order request also cannot go ahead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3781) * even if a suitable page happened to be free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3782) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3783) if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3784) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3785)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3786) /* If this is an order-0 request then the watermark is fine */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3787) if (!order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3788) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3790) /* For a high-order request, check at least one suitable page is free */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3791) for (o = order; o < MAX_ORDER; o++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3792) struct free_area *area = &z->free_area[o];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3793) int mt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3794)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3795) if (!area->nr_free)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3796) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3797)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3798) for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3799) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3800) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3801) * Note that this check is needed only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3802) * when MIGRATE_CMA < MIGRATE_PCPTYPES.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3803) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3804) if (mt == MIGRATE_CMA)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3805) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3806) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3807) if (!free_area_empty(area, mt))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3808) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3809) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3811) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3812) if ((alloc_flags & ALLOC_CMA) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3813) !free_area_empty(area, MIGRATE_CMA)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3814) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3815) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3816) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3817) if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3818) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3819) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3820) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3821) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3823) bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3824) int highest_zoneidx, unsigned int alloc_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3825) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3826) return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3827) zone_page_state(z, NR_FREE_PAGES));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3828) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3829) EXPORT_SYMBOL_GPL(zone_watermark_ok);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3830)
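/*
 * Fast-path variant of zone_watermark_ok(): for order-0 requests, first try
 * a cheap check against NR_FREE_PAGES minus the unusable reserves; fall
 * back to the full __zone_watermark_ok() calculation otherwise, and finally
 * retry GFP_ATOMIC order-0 requests against the unboosted min watermark.
 */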
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3831) static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3832) unsigned long mark, int highest_zoneidx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3833) unsigned int alloc_flags, gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3834) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3835) long free_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3837) free_pages = zone_page_state(z, NR_FREE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3839) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3840) * Fast check for order-0 only. If this fails then the reserves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3841) * need to be calculated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3842) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3843) if (!order) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3844) long fast_free;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3845)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3846) fast_free = free_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3847) fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3848) if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3849) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3850) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3851)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3852) if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3853) free_pages))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3854) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3855) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3856) * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3857) * when checking the min watermark. The min watermark is the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3858) * point where boosting is ignored so that kswapd is woken up
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3859) * when below the low watermark.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3860) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3861) if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3862) && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3863) mark = z->_watermark[WMARK_MIN];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3864) return __zone_watermark_ok(z, order, mark, highest_zoneidx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3865) alloc_flags, free_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3866) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3868) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3869) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3870)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3871) bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3872) unsigned long mark, int highest_zoneidx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3873) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3874) long free_pages = zone_page_state(z, NR_FREE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3876) if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3877) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3878)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3879) return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3880) free_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3881) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3882) EXPORT_SYMBOL_GPL(zone_watermark_ok_safe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3883)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3884) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3885) static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3886) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3887) return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3888) node_reclaim_distance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3889) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3890) #else /* CONFIG_NUMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3891) static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3892) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3893) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3894) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3895) #endif /* CONFIG_NUMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3896)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3897) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3898) * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3899) * fragmentation is subtle. If the preferred zone was HIGHMEM then
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3900) * premature use of a lower zone may cause lowmem pressure problems that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3901) * are worse than fragmentation. If the next zone is ZONE_DMA then it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3902) * probably too small. It only makes sense to spread allocations to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3903) * fragmentation between the Normal and DMA32 zones.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3904) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3905) static inline unsigned int
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3906) alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3907) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3908) unsigned int alloc_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3909)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3910) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3911) * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3912) * to save a branch.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3913) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3914) alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3915)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3916) #ifdef CONFIG_ZONE_DMA32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3917) if (!zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3918) return alloc_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3919)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3920) if (zone_idx(zone) != ZONE_NORMAL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3921) return alloc_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3922)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3923) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3924) * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3925) * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3926) * on UMA that if Normal is populated then so is DMA32.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3927) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3928) BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3929) if (nr_online_nodes > 1 && !populated_zone(--zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3930) return alloc_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3931)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3932) alloc_flags |= ALLOC_NOFRAGMENT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3933) #endif /* CONFIG_ZONE_DMA32 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3934) return alloc_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3935) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3936)
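/*
 * Apply per-task context to the allocation flags: movable __GFP_CMA
 * allocations may use CMA pageblocks (ALLOC_CMA) unless the task is in a
 * PF_MEMALLOC_NOCMA scope.
 */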
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3937) static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3938) unsigned int alloc_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3939) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3940) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3941) unsigned int pflags = current->flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3942)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3943) if (!(pflags & PF_MEMALLOC_NOCMA) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3944) gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3945) gfp_mask & __GFP_CMA)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3946) alloc_flags |= ALLOC_CMA;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3948) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3949) return alloc_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3950) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3951)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3952) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3953) * get_page_from_freelist goes through the zonelist trying to allocate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3954) * a page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3955) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3956) static struct page *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3957) get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3958) const struct alloc_context *ac)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3959) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3960) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3961) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3962) struct pglist_data *last_pgdat_dirty_limit = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3963) bool no_fallback;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3965) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3966) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3967) * Scan zonelist, looking for a zone with enough free.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3968) * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3969) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3970) no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3971) z = ac->preferred_zoneref;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3972) for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3973) ac->nodemask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3974) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3975) unsigned long mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3976)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3977) if (cpusets_enabled() &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3978) (alloc_flags & ALLOC_CPUSET) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3979) !__cpuset_zone_allowed(zone, gfp_mask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3980) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3981) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3982) * When allocating a page cache page for writing, we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3983) * want to get it from a node that is within its dirty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3984) * limit, such that no single node holds more than its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3985) * proportional share of globally allowed dirty pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3986) * The dirty limits take into account the node's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3987) * lowmem reserves and high watermark so that kswapd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3988) * should be able to balance it without having to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3989) * write pages from its LRU list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3990) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3991) * XXX: For now, allow allocations to potentially
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3992) * exceed the per-node dirty limit in the slowpath
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3993) * (spread_dirty_pages unset) before going into reclaim,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3994) * which is important when on a NUMA setup the allowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3995) * nodes are together not big enough to reach the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3996) * global limit. The proper fix for these situations
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3997) * will require awareness of nodes in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3998) * dirty-throttling and the flusher threads.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3999) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4000) if (ac->spread_dirty_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4001) if (last_pgdat_dirty_limit == zone->zone_pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4002) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4004) if (!node_dirty_ok(zone->zone_pgdat)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4005) last_pgdat_dirty_limit = zone->zone_pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4006) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4007) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4008) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4009)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4010) if (no_fallback && nr_online_nodes > 1 &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4011) zone != ac->preferred_zoneref->zone) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4012) int local_nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4013)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4014) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4015) * If moving to a remote node, retry but allow
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4016) * fragmenting fallbacks. Locality is more important
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4017) * than fragmentation avoidance.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4018) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4019) local_nid = zone_to_nid(ac->preferred_zoneref->zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4020) if (zone_to_nid(zone) != local_nid) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4021) alloc_flags &= ~ALLOC_NOFRAGMENT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4022) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4023) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4024) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4025)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4026) mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4027) if (!zone_watermark_fast(zone, order, mark,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4028) ac->highest_zoneidx, alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4029) gfp_mask)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4030) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4031)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4032) #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4033) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4034) * Watermark failed for this zone, but see if we can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4035) * grow this zone if it contains deferred pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4036) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4037) if (static_branch_unlikely(&deferred_pages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4038) if (_deferred_grow_zone(zone, order))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4039) goto try_this_zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4040) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4041) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4042) /* Checked here to keep the fast path fast */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4043) BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4044) if (alloc_flags & ALLOC_NO_WATERMARKS)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4045) goto try_this_zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4046)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4047) if (node_reclaim_mode == 0 ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4048) !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4049) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4050)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4051) ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4052) switch (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4053) case NODE_RECLAIM_NOSCAN:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4054) /* did not scan */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4055) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4056) case NODE_RECLAIM_FULL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4057) /* scanned but unreclaimable */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4058) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4059) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4060) /* did we reclaim enough? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4061) if (zone_watermark_ok(zone, order, mark,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4062) ac->highest_zoneidx, alloc_flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4063) goto try_this_zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4065) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4066) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4067) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4069) try_this_zone:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4070) page = rmqueue(ac->preferred_zoneref->zone, zone, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4071) gfp_mask, alloc_flags, ac->migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4072) if (page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4073) prep_new_page(page, order, gfp_mask, alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4074)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4075) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4076) * If this is a high-order atomic allocation then check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4077) * if the pageblock should be reserved for the future
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4078) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4079) if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4080) reserve_highatomic_pageblock(page, zone, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4081)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4082) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4083) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4084) #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4085) /* Try again if zone has deferred pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4086) if (static_branch_unlikely(&deferred_pages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4087) if (_deferred_grow_zone(zone, order))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4088) goto try_this_zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4089) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4090) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4091) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4092) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4093)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4094) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4095) * It's possible on a UMA machine to get through all zones that are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4096) * fragmented. If avoiding fragmentation, reset and try again.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4097) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4098) if (no_fallback) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4099) alloc_flags &= ~ALLOC_NOFRAGMENT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4100) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4101) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4102)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4103) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4104) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4105)
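/*
 * Show the memory state for a failed allocation. Remote nodes are filtered
 * out of the report unless the allocating context may legitimately allocate
 * outside current's set of allowed nodes.
 */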
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4106) static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4107) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4108) unsigned int filter = SHOW_MEM_FILTER_NODES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4110) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4111) * This documents exceptions given to allocations in certain
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4112) * contexts that are allowed to allocate outside current's set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4113) * of allowed nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4114) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4115) if (!(gfp_mask & __GFP_NOMEMALLOC))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4116) if (tsk_is_oom_victim(current) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4117) (current->flags & (PF_MEMALLOC | PF_EXITING)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4118) filter &= ~SHOW_MEM_FILTER_NODES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4119) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4120) filter &= ~SHOW_MEM_FILTER_NODES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4122) show_mem(filter, nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4123) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4125) void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4126) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4127) struct va_format vaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4128) va_list args;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4129) static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4130)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4131) if ((gfp_mask & __GFP_NOWARN) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4132) !__ratelimit(&nopage_rs) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4133) ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4134) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4135)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4136) va_start(args, fmt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4137) vaf.fmt = fmt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4138) vaf.va = &args;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4139) pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4140) current->comm, &vaf, gfp_mask, &gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4141) nodemask_pr_args(nodemask));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4142) va_end(args);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4144) cpuset_print_current_mems_allowed();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4145) pr_cont("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4146) dump_stack();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4147) warn_alloc_show_mem(gfp_mask, nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4148) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4150) static inline struct page *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4151) __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4152) unsigned int alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4153) const struct alloc_context *ac)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4154) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4155) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4156)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4157) page = get_page_from_freelist(gfp_mask, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4158) alloc_flags|ALLOC_CPUSET, ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4159) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4160) * Fall back to ignoring the cpuset restriction if our
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4161) * nodes are depleted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4162) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4163) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4164) page = get_page_from_freelist(gfp_mask, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4165) alloc_flags, ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4167) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4168) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4169)
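/*
 * OOM slow path: retry the freelists under the oom_lock with a very high
 * watermark to catch a parallel OOM kill, then invoke the OOM killer only
 * when doing so can plausibly help this allocation.
 */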
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4170) static inline struct page *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4171) __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4172) const struct alloc_context *ac, unsigned long *did_some_progress)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4173) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4174) struct oom_control oc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4175) .zonelist = ac->zonelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4176) .nodemask = ac->nodemask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4177) .memcg = NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4178) .gfp_mask = gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4179) .order = order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4180) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4181) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4183) *did_some_progress = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4184)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4185) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4186) * Acquire the oom lock. If that fails, somebody else is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4187) * making progress for us.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4188) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4189) if (!mutex_trylock(&oom_lock)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4190) *did_some_progress = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4191) schedule_timeout_uninterruptible(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4192) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4193) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4195) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4196) * Go through the zonelist yet one more time, keeping a very high
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4197) * watermark here; this is only to catch a parallel OOM killing, and we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4198) * must fail if we're still under heavy pressure. But make sure this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4199) * reclaim attempt does not itself become a __GFP_DIRECT_RECLAIM &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4200) * !__GFP_NORETRY allocation, which would never fail while oom_lock is held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4201) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4202) page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4203) ~__GFP_DIRECT_RECLAIM, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4204) ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4205) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4206) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4207)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4208) /* Coredumps can quickly deplete all memory reserves */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4209) if (current->flags & PF_DUMPCORE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4210) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4211) /* The OOM killer will not help higher order allocs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4212) if (order > PAGE_ALLOC_COSTLY_ORDER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4213) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4214) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4215) * We have already exhausted all our reclaim opportunities without any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4216) * success so it is time to admit defeat. We will skip the OOM killer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4217) * because it is very likely that the caller has a more reasonable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4218) * fallback than shooting a random task.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4219) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4220) * The OOM killer may not free memory on a specific node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4221) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4222) if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4223) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4224) /* The OOM killer does not needlessly kill tasks for lowmem */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4225) if (ac->highest_zoneidx < ZONE_NORMAL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4226) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4227) if (pm_suspended_storage())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4228) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4229) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4230) * XXX: GFP_NOFS allocations should rather fail than rely on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4231) * other requests to make forward progress.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4232) * We are in an unfortunate situation where out_of_memory cannot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4233) * do much for this context but let's try it to at least get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4234) * access to memory reserved if the current task is killed (see
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4235) * out_of_memory). Once filesystems are ready to handle allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4236) * failures more gracefully we should just bail out here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4237) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4238)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4239) /* Exhausted what can be done so it's blame time */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4240) if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4241) *did_some_progress = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4242)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4243) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4244) * Help non-failing allocations by giving them access to memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4245) * reserves
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4246) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4247) if (gfp_mask & __GFP_NOFAIL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4248) page = __alloc_pages_cpuset_fallback(gfp_mask, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4249) ALLOC_NO_WATERMARKS, ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4250) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4251) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4252) mutex_unlock(&oom_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4253) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4254) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4255)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4256) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4257) * Maximum number of compaction retries with progress before the OOM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4258) * killer is considered the only way to move forward.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4259) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4260) #define MAX_COMPACT_RETRIES 16
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4261)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4262) #ifdef CONFIG_COMPACTION
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4263) /* Try memory compaction for high-order allocations before reclaim */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4264) static struct page *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4265) __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4266) unsigned int alloc_flags, const struct alloc_context *ac,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4267) enum compact_priority prio, enum compact_result *compact_result)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4268) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4269) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4270) unsigned long pflags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4271) unsigned int noreclaim_flag;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4272)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4273) if (!order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4274) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4275)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4276) psi_memstall_enter(&pflags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4277) noreclaim_flag = memalloc_noreclaim_save();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4278)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4279) *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4280) prio, &page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4281)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4282) memalloc_noreclaim_restore(noreclaim_flag);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4283) psi_memstall_leave(&pflags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4284)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4285) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4286) * Compaction wasn't deferred or skipped in at least one zone, so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4287) * count a compaction stall.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4288) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4289) count_vm_event(COMPACTSTALL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4290)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4291) /* Prep a captured page if available */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4292) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4293) prep_new_page(page, order, gfp_mask, alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4294)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4295) /* Try get a page from the freelist if available */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4296) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4297) page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4298)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4299) if (page) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4300) struct zone *zone = page_zone(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4301)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4302) zone->compact_blockskip_flush = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4303) compaction_defer_reset(zone, order, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4304) count_vm_event(COMPACTSUCCESS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4305) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4306) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4307)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4308) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4309) * It's bad if a compaction run occurs and fails. The most likely reason
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4310) * is that pages exist, but not enough of them to satisfy the watermarks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4311) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4312) count_vm_event(COMPACTFAIL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4313)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4314) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4315)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4316) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4317) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4318)
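/*
 * Decide whether another direct compaction attempt is worthwhile based on
 * the previous compact_result, and escalate the compaction priority once
 * the retry budget at the current priority is exhausted.
 */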
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4319) static inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4320) should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4321) enum compact_result compact_result,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4322) enum compact_priority *compact_priority,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4323) int *compaction_retries)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4324) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4325) int max_retries = MAX_COMPACT_RETRIES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4326) int min_priority;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4327) bool ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4328) int retries = *compaction_retries;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4329) enum compact_priority priority = *compact_priority;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4330)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4331) if (!order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4332) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4333)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4334) if (compaction_made_progress(compact_result))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4335) (*compaction_retries)++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4336)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4337) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4338) * Compaction considers all the zones as desperately out of memory,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4339) * so it doesn't really make much sense to retry except when the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4340) * failure could be caused by insufficient compaction priority.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4341) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4342) if (compaction_failed(compact_result))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4343) goto check_priority;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4344)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4345) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4346) * compaction was skipped because there are not enough order-0 pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4347) * to work with, so we retry only if it looks like reclaim can help.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4348) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4349) if (compaction_needs_reclaim(compact_result)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4350) ret = compaction_zonelist_suitable(ac, order, alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4351) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4352) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4354) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4355) * Make sure the compaction wasn't deferred and didn't bail out early
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4356) * due to lock contention before we declare that we should give up.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4357) * But the next retry should use a higher priority if allowed, so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4358) * we don't just keep bailing out endlessly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4359) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4360) if (compaction_withdrawn(compact_result))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4361) goto check_priority;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4363)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4364) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4365) * !costly requests are much more important than __GFP_RETRY_MAYFAIL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4366) * costly ones because they are de facto nofail and invoke the OOM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4367) * killer to move on, while costly allocations can fail and their users
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4368) * are prepared to cope with that. 1/4 of the retries is rather
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4369) * arbitrary, but we would need much more detailed feedback from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4370) * compaction to make a better decision.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4371) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4372) if (order > PAGE_ALLOC_COSTLY_ORDER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4373) max_retries /= 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4374) if (*compaction_retries <= max_retries) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4375) ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4376) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4377) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4378)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4379) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4380) * Make sure there are attempts at the highest priority if we exhausted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4381) * all retries or failed at the lower priorities.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4382) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4383) check_priority:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4384) min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4385) MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4386)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4387) if (*compact_priority > min_priority) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4388) (*compact_priority)--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4389) *compaction_retries = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4390) ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4391) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4392) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4393) trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4394) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4395) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4396) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4397) static inline struct page *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4398) __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4399) unsigned int alloc_flags, const struct alloc_context *ac,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4400) enum compact_priority prio, enum compact_result *compact_result)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4401) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4402) *compact_result = COMPACT_SKIPPED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4403) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4404) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4406) static inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4407) should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4408) enum compact_result compact_result,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4409) enum compact_priority *compact_priority,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4410) int *compaction_retries)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4411) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4412) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4413) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4414)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4415) if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4416) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4417)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4418) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4419) * There are setups with compaction disabled that would prefer to loop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4420) * inside the allocator rather than hit the OOM killer prematurely.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4421) * Give them some hope and keep retrying while the order-0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4422) * watermarks are OK.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4423) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4424) for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4425) ac->highest_zoneidx, ac->nodemask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4426) if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4427) ac->highest_zoneidx, alloc_flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4428) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4429) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4430) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4431) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4432) #endif /* CONFIG_COMPACTION */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4433)
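/*
 * fs_reclaim is a lockdep-only pseudo-lock taken around direct reclaim so
 * that lockdep can flag allocations that may recurse into filesystem
 * reclaim while holding locks also taken on the reclaim path.
 */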
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4434) #ifdef CONFIG_LOCKDEP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4435) static struct lockdep_map __fs_reclaim_map =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4436) STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4437)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4438) static bool __need_fs_reclaim(gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4439) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4440) gfp_mask = current_gfp_context(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4442) /* no reclaim without waiting on it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4443) if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4444) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4445)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4446) /* this guy won't enter reclaim */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4447) if (current->flags & PF_MEMALLOC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4448) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4449)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4450) /* We're only interested in __GFP_FS allocations for now */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4451) if (!(gfp_mask & __GFP_FS))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4452) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4453)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4454) if (gfp_mask & __GFP_NOLOCKDEP)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4455) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4456)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4457) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4458) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4460) void __fs_reclaim_acquire(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4461) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4462) lock_map_acquire(&__fs_reclaim_map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4463) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4464)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4465) void __fs_reclaim_release(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4466) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4467) lock_map_release(&__fs_reclaim_map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4468) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4469)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4470) void fs_reclaim_acquire(gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4471) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4472) if (__need_fs_reclaim(gfp_mask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4473) __fs_reclaim_acquire();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4474) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4475) EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4476)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4477) void fs_reclaim_release(gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4478) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4479) if (__need_fs_reclaim(gfp_mask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4480) __fs_reclaim_release();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4481) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4482) EXPORT_SYMBOL_GPL(fs_reclaim_release);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4483) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4484)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4485) /* Perform direct synchronous page reclaim */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4486) static unsigned long
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4487) __perform_reclaim(gfp_t gfp_mask, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4488) const struct alloc_context *ac)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4489) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4490) unsigned int noreclaim_flag;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4491) unsigned long progress;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4492)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4493) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4494)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4495) /* We now go into synchronous reclaim */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4496) cpuset_memory_pressure_bump();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4497) fs_reclaim_acquire(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4498) noreclaim_flag = memalloc_noreclaim_save();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4499)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4500) progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4501) ac->nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4502)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4503) memalloc_noreclaim_restore(noreclaim_flag);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4504) fs_reclaim_release(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4506) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4507)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4508) return progress;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4509) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4510)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4511) /* The really slow allocator path where we enter direct reclaim */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4512) static inline struct page *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4513) __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4514) unsigned int alloc_flags, const struct alloc_context *ac,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4515) unsigned long *did_some_progress)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4516) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4517) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4518) unsigned long pflags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4519) bool drained = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4520) bool skip_pcp_drain = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4521)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4522) psi_memstall_enter(&pflags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4523) *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4524) if (unlikely(!(*did_some_progress)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4525) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4526)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4527) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4528) page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4529)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4530) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4531) * If an allocation failed after direct reclaim, it could be because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4532) * pages are pinned on the per-cpu lists or in high alloc reserves.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4533) * Shrink them and try again.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4534) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4535) if (!page && !drained) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4536) unreserve_highatomic_pageblock(ac, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4537) trace_android_vh_drain_all_pages_bypass(gfp_mask, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4538) alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4539) if (!skip_pcp_drain)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4540) drain_all_pages(NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4541) drained = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4542) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4543) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4544) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4545) psi_memstall_leave(&pflags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4546)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4547) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4548) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4549)
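/*
 * Wake kswapd on every node covered by the zonelist and nodemask, avoiding
 * duplicate wakeups for consecutive zones of the same node.
 */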
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4550) static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4551) const struct alloc_context *ac)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4552) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4553) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4554) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4555) pg_data_t *last_pgdat = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4556) enum zone_type highest_zoneidx = ac->highest_zoneidx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4557)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4558) for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4559) ac->nodemask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4560) if (last_pgdat != zone->zone_pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4561) wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4562) last_pgdat = zone->zone_pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4563) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4564) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4565)
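/*
 * Derive the internal ALLOC_* flags used on the slowpath from the gfp mask:
 * start from the min watermark with cpuset enforcement, then adjust for
 * __GFP_HIGH, __GFP_KSWAPD_RECLAIM, __GFP_ATOMIC and realtime callers.
 */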
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4566) static inline unsigned int
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4567) gfp_to_alloc_flags(gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4568) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4569) unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4570)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4571) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4572) * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4573) * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4574) * to save two branches.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4575) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4576) BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4577) BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4578)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4579) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4580) * The caller may dip into page reserves a bit more if it cannot run
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4581) * direct reclaim, has a realtime scheduling policy, or is asking for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4582) * __GFP_HIGH memory. GFP_ATOMIC requests will set both ALLOC_HARDER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4583) * (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4584) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4585) alloc_flags |= (__force int)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4586) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4587)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4588) if (gfp_mask & __GFP_ATOMIC) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4589) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4590) * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4591) * if it can't schedule.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4592) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4593) if (!(gfp_mask & __GFP_NOMEMALLOC))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4594) alloc_flags |= ALLOC_HARDER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4595) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4596) * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4597) * comment for __cpuset_node_allowed().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4598) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4599) alloc_flags &= ~ALLOC_CPUSET;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4600) } else if (unlikely(rt_task(current)) && !in_interrupt())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4601) alloc_flags |= ALLOC_HARDER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4602)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4603) alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4604)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4605) return alloc_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4606) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4607)
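/* May this task, as an OOM victim, dip into memory reserves? */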
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4608) static bool oom_reserves_allowed(struct task_struct *tsk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4609) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4610) if (!tsk_is_oom_victim(tsk))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4611) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4612)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4613) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4614) * !MMU configurations don't have an oom reaper, so give access to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4615) * memory reserves only to the thread with TIF_MEMDIE set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4616) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4617) if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4618) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4619)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4620) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4621) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4622)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4623) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4624) * Distinguish requests which really need access to full memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4625) * reserves from oom victims which can live with a portion of it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4626) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4627) static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4628) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4629) if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4630) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4631) if (gfp_mask & __GFP_MEMALLOC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4632) return ALLOC_NO_WATERMARKS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4633) if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4634) return ALLOC_NO_WATERMARKS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4635) if (!in_interrupt()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4636) if (current->flags & PF_MEMALLOC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4637) return ALLOC_NO_WATERMARKS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4638) else if (oom_reserves_allowed(current))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4639) return ALLOC_OOM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4640) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4641)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4642) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4643) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4644)
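/* Can this allocation dip into memory reserves, fully or partially? */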
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4645) bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4646) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4647) return !!__gfp_pfmemalloc_flags(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4648) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4649)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4650) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4651) * Checks whether it makes sense to retry the reclaim to make a forward progress
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4652) * for the given allocation request.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4653) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4654) * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4655) * without success, or when we couldn't even meet the watermark if we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4656) * reclaimed all remaining pages on the LRU lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4657) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4658) * Returns true if a retry is viable or false to enter the oom path.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4659) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4660) static inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4661) should_reclaim_retry(gfp_t gfp_mask, unsigned order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4662) struct alloc_context *ac, int alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4663) bool did_some_progress, int *no_progress_loops)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4664) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4665) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4666) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4667) bool ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4668)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4669) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4670) * Costly allocations might have made some progress, but due to high
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4671) * fragmentation this doesn't mean their order will become available, so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4672) * always increment the no-progress counter for them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4673) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4674) if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4675) *no_progress_loops = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4676) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4677) (*no_progress_loops)++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4678)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4679) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4680) * Make sure we converge to OOM if we cannot make any progress
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4681) * several times in a row.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4682) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4683) if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4684) /* Before OOM, exhaust highatomic_reserve */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4685) return unreserve_highatomic_pageblock(ac, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4686) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4687)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4688) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4689) * Keep reclaiming pages while there is a chance this will lead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4690) * somewhere. If none of the target zones can satisfy our allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4691) * request even if all reclaimable pages are considered then we are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4692) * screwed and have to go OOM.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4693) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4694) for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4695) ac->highest_zoneidx, ac->nodemask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4696) unsigned long available;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4697) unsigned long reclaimable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4698) unsigned long min_wmark = min_wmark_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4699) bool wmark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4700)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4701) available = reclaimable = zone_reclaimable_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4702) available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4703)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4704) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4705) * Would the allocation succeed if we reclaimed all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4706) * reclaimable pages?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4707) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4708) wmark = __zone_watermark_ok(zone, order, min_wmark,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4709) ac->highest_zoneidx, alloc_flags, available);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4710) trace_reclaim_retry_zone(z, order, reclaimable,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4711) available, min_wmark, *no_progress_loops, wmark);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4712) if (wmark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4713) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4714) * If we didn't make any progress and have a lot of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4715) * dirty + writeback pages then we should wait for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4716) * IO to complete to slow down the reclaim and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4717) * prevent premature OOM.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4718) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4719) if (!did_some_progress) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4720) unsigned long write_pending;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4721)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4722) write_pending = zone_page_state_snapshot(zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4723) NR_ZONE_WRITE_PENDING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4724)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4725) if (2 * write_pending > reclaimable) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4726) congestion_wait(BLK_RW_ASYNC, HZ/10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4727) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4728) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4729) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4731) ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4732) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4733) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4734) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4735)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4736) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4737) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4738) * Memory allocation/reclaim might be called from a WQ context and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4739) * current implementation of the WQ concurrency control doesn't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4740) * recognize that a particular WQ is congested if the worker thread is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4741) * looping without ever sleeping. Therefore we have to do a short sleep
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4742) * here rather than calling cond_resched().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4743) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4744) if (current->flags & PF_WQ_WORKER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4745) schedule_timeout_uninterruptible(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4746) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4747) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4748) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4749) }
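
/*
 * Worked example of the per-zone retry check above, with hypothetical
 * numbers and lowmem_reserve/highatomic reserves ignored: if min_wmark is
 * 4000 pages, the zone has 1000 free pages and 5000 reclaimable pages,
 * the watermark is tested against available = 1000 + 5000 = 6000 pages
 * and an order-0 retry is considered worthwhile. With only 2000
 * reclaimable pages, available = 3000 < 4000 and this zone would not
 * justify another reclaim round.
 */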
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4750)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4751) static inline bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4752) check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4753) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4754) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4755) * It's possible that cpuset's mems_allowed and the nodemask from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4756) * mempolicy don't intersect. This should normally be dealt with by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4757) * policy_nodemask(), but it's possible to race with a cpuset update in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4758) * such a way that the check therein was true, and then it became false
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4759) * before we got our cpuset_mems_cookie here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4760) * This assumes that for all allocations, ac->nodemask can come only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4761) * from MPOL_BIND mempolicy (whose documented semantics are to be ignored
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4762) * when it does not intersect with the cpuset restrictions) or the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4763) * caller can deal with a violated nodemask.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4764) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4765) if (cpusets_enabled() && ac->nodemask &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4766) !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4767) ac->nodemask = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4768) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4769) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4770)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4771) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4772) * When updating a task's mems_allowed or mempolicy nodemask, it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4773) * possible to race with parallel threads in such a way that our
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4774) * allocation can fail while the mask is being updated. If we are about
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4775) * to fail, check if the cpuset changed during allocation and if so,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4776) * retry.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4777) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4778) if (read_mems_allowed_retry(cpuset_mems_cookie))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4779) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4781) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4782) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4784) static inline struct page *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4785) __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4786) struct alloc_context *ac)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4787) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4788) bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4789) const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4790) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4791) unsigned int alloc_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4792) unsigned long did_some_progress;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4793) enum compact_priority compact_priority;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4794) enum compact_result compact_result;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4795) int compaction_retries;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4796) int no_progress_loops;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4797) unsigned int cpuset_mems_cookie;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4798) int reserve_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4799)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4800) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4801) * We sanity check to catch abuse of atomic reserves being used by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4802) * callers that are not in atomic context.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4803) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4804) if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4805) (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4806) gfp_mask &= ~__GFP_ATOMIC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4808) retry_cpuset:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4809) compaction_retries = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4810) no_progress_loops = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4811) compact_priority = DEF_COMPACT_PRIORITY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4812) cpuset_mems_cookie = read_mems_allowed_begin();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4813)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4814) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4815) * The fast path uses conservative alloc_flags to succeed only until
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4816) * kswapd needs to be woken up, and to avoid the cost of setting up
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4817) * alloc_flags precisely. So we do that now.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4818) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4819) alloc_flags = gfp_to_alloc_flags(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4820)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4821) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4822) * We need to recalculate the starting point for the zonelist iterator
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4823) * because we might have used different nodemask in the fast path, or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4824) * there was a cpuset modification and we are retrying - otherwise we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4825) * could end up iterating over non-eligible zones endlessly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4826) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4827) ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4828) ac->highest_zoneidx, ac->nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4829) if (!ac->preferred_zoneref->zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4830) goto nopage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4831)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4832) if (alloc_flags & ALLOC_KSWAPD)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4833) wake_all_kswapds(order, gfp_mask, ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4834)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4835) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4836) * The adjusted alloc_flags might result in immediate success, so try
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4837) * that first
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4838) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4839) page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4840) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4841) goto got_pg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4842)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4843) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4844) * For costly allocations, try direct compaction first, as it's likely
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4845) * that we have enough base pages and don't need to reclaim. For non-
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4846) * movable high-order allocations, do that as well, as compaction will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4847) * try to prevent permanent fragmentation by migrating from blocks of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4848) * same migratetype.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4849) * Don't try this for allocations that are allowed to ignore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4850) * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4851) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4852) if (can_direct_reclaim &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4853) (costly_order ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4854) (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4855) && !gfp_pfmemalloc_allowed(gfp_mask)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4856) page = __alloc_pages_direct_compact(gfp_mask, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4857) alloc_flags, ac,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4858) INIT_COMPACT_PRIORITY,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4859) &compact_result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4860) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4861) goto got_pg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4862)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4863) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4864) * Checks for costly allocations with __GFP_NORETRY, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4865) * includes some THP page fault allocations
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4866) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4867) if (costly_order && (gfp_mask & __GFP_NORETRY)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4868) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4869) * If allocating entire pageblock(s) and compaction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4870) * failed because all zones are below low watermarks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4871) * or compaction is prohibited because it recently failed at this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4872) * order, fail immediately unless the allocator has
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4873) * requested compaction and reclaim retry.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4874) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4875) * Reclaim is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4876) * - potentially very expensive because zones are far
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4877) * below their low watermarks or this is part of very
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4878) * bursty high order allocations,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4879) * - not guaranteed to help because isolate_freepages()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4880) * may not iterate over freed pages as part of its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4881) * linear scan, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4882) * - unlikely to make entire pageblocks free on its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4883) * own.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4884) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4885) if (compact_result == COMPACT_SKIPPED ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4886) compact_result == COMPACT_DEFERRED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4887) goto nopage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4888)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4889) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4890) * Looks like reclaim/compaction is worth trying, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4891) * sync compaction could be very expensive, so keep
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4892) * using async compaction.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4893) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4894) compact_priority = INIT_COMPACT_PRIORITY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4895) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4896) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4898) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4899) /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4900) if (alloc_flags & ALLOC_KSWAPD)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4901) wake_all_kswapds(order, gfp_mask, ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4903) reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4904) if (reserve_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4905) alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4906)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4907) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4908) * Reset the nodemask and zonelist iterators if memory policies can be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4909) * ignored. These allocations are high priority and system-oriented
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4910) * rather than user-oriented.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4911) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4912) if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4913) ac->nodemask = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4914) ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4915) ac->highest_zoneidx, ac->nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4916) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4917)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4918) /* Attempt with potentially adjusted zonelist and alloc_flags */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4919) page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4920) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4921) goto got_pg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4922)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4923) /* Caller is not willing to reclaim, we can't balance anything */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4924) if (!can_direct_reclaim)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4925) goto nopage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4926)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4927) /* Avoid recursion of direct reclaim */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4928) if (current->flags & PF_MEMALLOC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4929) goto nopage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4931) /* Try direct reclaim and then allocating */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4932) page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4933) &did_some_progress);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4934) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4935) goto got_pg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4936)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4937) /* Try direct compaction and then allocating */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4938) page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4939) compact_priority, &compact_result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4940) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4941) goto got_pg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4942)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4943) /* Do not loop if specifically requested */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4944) if (gfp_mask & __GFP_NORETRY)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4945) goto nopage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4946)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4947) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4948) * Do not retry costly high order allocations unless they are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4949) * __GFP_RETRY_MAYFAIL
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4950) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4951) if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4952) goto nopage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4953)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4954) if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4955) did_some_progress > 0, &no_progress_loops))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4956) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4958) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4959) * It doesn't make any sense to retry compaction if the order-0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4960) * reclaim is not able to make any progress, because the current
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4961) * implementation of compaction depends on a sufficient amount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4962) * of free memory (see __compaction_suitable()).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4963) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4964) if (did_some_progress > 0 &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4965) should_compact_retry(ac, order, alloc_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4966) compact_result, &compact_priority,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4967) &compaction_retries))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4968) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4969)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4971) /* Deal with possible cpuset update races before we start OOM killing */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4972) if (check_retry_cpuset(cpuset_mems_cookie, ac))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4973) goto retry_cpuset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4975) /* Reclaim has failed us, start killing things */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4976) page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4977) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4978) goto got_pg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4979)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4980) /* Avoid allocations with no watermarks from looping endlessly */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4981) if (tsk_is_oom_victim(current) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4982) (alloc_flags & ALLOC_OOM ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4983) (gfp_mask & __GFP_NOMEMALLOC)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4984) goto nopage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4986) /* Retry as long as the OOM killer is making progress */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4987) if (did_some_progress) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4988) no_progress_loops = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4989) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4990) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4992) nopage:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4993) /* Deal with possible cpuset update races before we fail */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4994) if (check_retry_cpuset(cpuset_mems_cookie, ac))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4995) goto retry_cpuset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4996)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4997) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4998) * Make sure that a __GFP_NOFAIL request doesn't leak out and make sure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4999) * we always retry.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5000) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5001) if (gfp_mask & __GFP_NOFAIL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5002) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5003) * All existing users of __GFP_NOFAIL are blockable, so warn
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5004) * about any new users that actually require GFP_NOWAIT.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5005) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5006) if (WARN_ON_ONCE(!can_direct_reclaim))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5007) goto fail;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5008)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5009) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5010) * A PF_MEMALLOC request from this context is rather bizarre
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5011) * because we cannot reclaim anything and can only loop waiting
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5012) * for somebody to do the work for us.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5013) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5014) WARN_ON_ONCE(current->flags & PF_MEMALLOC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5015)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5016) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5017) * Non-failing costly orders are a hard requirement which we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5018) * are not well prepared for, so let's warn about these users
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5019) * so that we can identify them and convert them to something
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5020) * else.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5021) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5022) WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5023)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5024) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5025) * Help non-failing allocations by giving them access to memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5026) * reserves, but do not use ALLOC_NO_WATERMARKS because this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5027) * could deplete the whole memory reserves, which would just make
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5028) * the situation worse.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5029) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5030) page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5031) if (page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5032) goto got_pg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5033)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5034) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5035) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5036) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5037) fail:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5038) warn_alloc(gfp_mask, ac->nodemask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5039) "page allocation failure: order:%u", order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5040) got_pg:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5041) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5042) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5043)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5044) static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5045) int preferred_nid, nodemask_t *nodemask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5046) struct alloc_context *ac, gfp_t *alloc_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5047) unsigned int *alloc_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5048) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5049) ac->highest_zoneidx = gfp_zone(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5050) ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5051) ac->nodemask = nodemask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5052) ac->migratetype = gfp_migratetype(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5053)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5054) if (cpusets_enabled()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5055) *alloc_mask |= __GFP_HARDWALL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5056) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5057) * When we are in interrupt context, the cpuset of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5058) * current task is irrelevant, so any node is OK.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5059) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5060) if (!in_interrupt() && !ac->nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5061) ac->nodemask = &cpuset_current_mems_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5062) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5063) *alloc_flags |= ALLOC_CPUSET;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5064) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5065)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5066) fs_reclaim_acquire(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5067) fs_reclaim_release(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5069) might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5070)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5071) if (should_fail_alloc_page(gfp_mask, order))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5072) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5073)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5074) *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5076) /* Dirty zone balancing only done in the fast path */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5077) ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5078)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5079) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5080) * The preferred zone is used for statistics but crucially it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5081) * also used as the starting point for the zonelist iterator. It
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5082) * may get reset for allocations that ignore memory policies.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5083) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5084) ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5085) ac->highest_zoneidx, ac->nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5086)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5087) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5088) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5090) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5091) * This is the 'heart' of the zoned buddy allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5092) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5093) struct page *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5094) __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5095) nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5096) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5097) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5098) unsigned int alloc_flags = ALLOC_WMARK_LOW;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5099) gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5100) struct alloc_context ac = { };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5101)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5102) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5103) * There are several places where we assume that the order value is sane
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5104) * so bail out early if the request is out of bounds.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5105) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5106) if (unlikely(order >= MAX_ORDER)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5107) WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5108) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5109) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5110)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5111) gfp_mask &= gfp_allowed_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5112) alloc_mask = gfp_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5113) if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5114) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5116) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5117) * Forbid the first pass from falling back to types that fragment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5118) * memory until all local zones are considered.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5119) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5120) alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5122) /* First allocation attempt */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5123) page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5124) if (likely(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5125) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5126)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5127) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5128) * Apply scoped allocation constraints. This is mainly about GFP_NOFS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5129) * and GFP_NOIO, which have to be inherited for all allocation requests
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5130) * from a particular context which has been marked by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5131) * memalloc_no{fs,io}_{save,restore}.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5132) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5133) alloc_mask = current_gfp_context(gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5134) ac.spread_dirty_pages = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5135)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5136) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5137) * Restore the original nodemask if it was potentially replaced with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5138) * &cpuset_current_mems_allowed to optimize the fast-path attempt.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5139) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5140) ac.nodemask = nodemask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5142) page = __alloc_pages_slowpath(alloc_mask, order, &ac);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5143)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5144) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5145) if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5146) unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5147) __free_pages(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5148) page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5149) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5151) trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5152)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5153) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5154) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5155) EXPORT_SYMBOL(__alloc_pages_nodemask);
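
/*
 * Illustrative usage sketch (not part of the allocator itself): callers
 * normally reach this entry point through wrappers such as alloc_pages(),
 * e.g. to grab and later release an order-2 (4 page) block:
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *
 *	if (!page)
 *		return -ENOMEM;
 *	...
 *	__free_pages(page, 2);
 */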
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5156)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5157) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5158) * Common helper functions. Never use with __GFP_HIGHMEM because the returned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5159) * address cannot represent highmem pages. Use alloc_pages and then kmap if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5160) * you need to access highmem.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5161) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5162) unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5163) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5164) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5166) page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5167) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5168) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5169) return (unsigned long) page_address(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5170) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5171) EXPORT_SYMBOL(__get_free_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5172)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5173) unsigned long get_zeroed_page(gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5174) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5175) return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5176) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5177) EXPORT_SYMBOL(get_zeroed_page);
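
/*
 * Illustrative usage sketch: these helpers hand back a kernel virtual
 * address rather than a struct page, so the matching release helpers are
 * free_page()/free_pages(), e.g. for a single zeroed scratch page:
 *
 *	unsigned long addr = get_zeroed_page(GFP_KERNEL);
 *
 *	if (!addr)
 *		return -ENOMEM;
 *	...
 *	free_page(addr);
 */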
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5178)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5179) static inline void free_the_page(struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5180) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5181) if (order == 0) /* Via pcp? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5182) free_unref_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5183) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5184) __free_pages_ok(page, order, FPI_NONE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5185) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5186)
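/*
 * If put_page_testzero() below drops the last reference, the whole
 * allocation is returned to the buddy allocator. If it does not, some
 * other user still holds a reference; for a non-compound page that
 * reference can only pin the first 0-order page, so the tail of the
 * higher-order block is split into smaller power-of-two chunks and
 * freed to avoid leaking it.
 */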
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5187) void __free_pages(struct page *page, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5188) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5189) trace_android_vh_free_pages(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5190) if (put_page_testzero(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5191) free_the_page(page, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5192) else if (!PageHead(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5193) while (order-- > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5194) free_the_page(page + (1 << order), order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5195) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5196) EXPORT_SYMBOL(__free_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5198) void free_pages(unsigned long addr, unsigned int order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5199) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5200) if (addr != 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5201) VM_BUG_ON(!virt_addr_valid((void *)addr));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5202) __free_pages(virt_to_page((void *)addr), order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5203) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5204) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5206) EXPORT_SYMBOL(free_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5207)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5208) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5209) * Page Fragment:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5210) * An arbitrary-length arbitrary-offset area of memory which resides
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5211) * within a 0 or higher order page. Multiple fragments within that page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5212) * are individually refcounted using the page's reference counter.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5213) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5214) * The page_frag functions below provide a simple allocation framework for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5215) * page fragments. This is used by the network stack and network device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5216) * drivers to provide a backing region of memory for use as either an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5217) * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5218) */
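
/*
 * Illustrative usage sketch (hypothetical caller): a user keeps a
 * page_frag_cache, zero-initialized so that ->va starts out NULL, and
 * carves small buffers out of it. Real users must serialize access to
 * the cache themselves (e.g. per-cpu caches used with interrupts or
 * softirqs disabled):
 *
 *	static struct page_frag_cache frag_cache;
 *
 *	void *buf = page_frag_alloc(&frag_cache, 256, GFP_ATOMIC);
 *	if (!buf)
 *		return NULL;
 *	...
 *	page_frag_free(buf);
 */
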
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5219) static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5220) gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5221) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5222) struct page *page = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5223) gfp_t gfp = gfp_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5224)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5225) #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5226) gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5227) __GFP_NOMEMALLOC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5228) page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5229) PAGE_FRAG_CACHE_MAX_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5230) nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5231) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5232) if (unlikely(!page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5233) page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5234)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5235) nc->va = page ? page_address(page) : NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5236)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5237) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5238) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5239)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5240) void __page_frag_cache_drain(struct page *page, unsigned int count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5241) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5242) VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5244) if (page_ref_sub_and_test(page, count))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5245) free_the_page(page, compound_order(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5246) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5247) EXPORT_SYMBOL(__page_frag_cache_drain);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5248)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5249) void *page_frag_alloc(struct page_frag_cache *nc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5250) unsigned int fragsz, gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5251) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5252) unsigned int size = PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5253) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5254) int offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5255)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5256) if (unlikely(!nc->va)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5257) refill:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5258) page = __page_frag_cache_refill(nc, gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5259) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5260) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5261)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5262) #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5263) /* if size can vary use size else just use PAGE_SIZE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5264) size = nc->size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5265) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5266) /* Even if we own the page, we do not use atomic_set().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5267) * This would break get_page_unless_zero() users.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5268) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5269) page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5270)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5271) /* reset page count bias and offset to start of new frag */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5272) nc->pfmemalloc = page_is_pfmemalloc(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5273) nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5274) nc->offset = size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5275) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5276)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5277) offset = nc->offset - fragsz;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5278) if (unlikely(offset < 0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5279) page = virt_to_page(nc->va);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5280)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5281) if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5282) goto refill;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5284) if (unlikely(nc->pfmemalloc)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5285) free_the_page(page, compound_order(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5286) goto refill;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5287) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5288)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5289) #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5290) /* if size can vary use size else just use PAGE_SIZE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5291) size = nc->size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5292) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5293) /* OK, page count is 0, we can safely set it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5294) set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5295)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5296) /* reset page count bias and offset to start of new frag */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5297) nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5298) offset = size - fragsz;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5299) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5301) nc->pagecnt_bias--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5302) nc->offset = offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5303)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5304) return nc->va + offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5305) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5306) EXPORT_SYMBOL(page_frag_alloc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5307)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5308) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5309) * Frees a page fragment allocated out of either a compound or order 0 page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5310) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5311) void page_frag_free(void *addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5312) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5313) struct page *page = virt_to_head_page(addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5315) if (unlikely(put_page_testzero(page)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5316) free_the_page(page, compound_order(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5317) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5318) EXPORT_SYMBOL(page_frag_free);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5319)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5320) static void *make_alloc_exact(unsigned long addr, unsigned int order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5321) size_t size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5322) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5323) if (addr) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5324) unsigned long alloc_end = addr + (PAGE_SIZE << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5325) unsigned long used = addr + PAGE_ALIGN(size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5326)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5327) split_page(virt_to_page((void *)addr), order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5328) while (used < alloc_end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5329) free_page(used);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5330) used += PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5331) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5332) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5333) return (void *)addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5334) }
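
/*
 * Worked example: a request for 5 pages worth of memory is rounded up by
 * the callers below to an order-3 (8 page) block. make_alloc_exact()
 * then splits that block into individual pages and frees the trailing
 * 8 - 5 = 3 pages, so only the PAGE_ALIGN()ed part of the request stays
 * allocated.
 */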
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5335)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5336) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5337) * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5338) * @size: the number of bytes to allocate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5339) * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5340) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5341) * This function is similar to alloc_pages(), except that it allocates the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5342) * minimum number of pages to satisfy the request. alloc_pages() can only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5343) * allocate memory in power-of-two pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5344) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5345) * This function is also limited by MAX_ORDER.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5346) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5347) * Memory allocated by this function must be released by free_pages_exact().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5348) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5349) * Return: pointer to the allocated area or %NULL in case of error.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5350) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5351) void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5352) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5353) unsigned int order = get_order(size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5354) unsigned long addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5355)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5356) if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5357) gfp_mask &= ~__GFP_COMP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5359) addr = __get_free_pages(gfp_mask, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5360) return make_alloc_exact(addr, order, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5361) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5362) EXPORT_SYMBOL(alloc_pages_exact);
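
/*
 * Illustrative usage sketch (sizes are hypothetical): a caller that needs
 * a 20 KB physically contiguous buffer, which with 4 KB pages would
 * otherwise cost a full 32 KB order-3 block, could use:
 *
 *	void *buf = alloc_pages_exact(20 * 1024, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	free_pages_exact(buf, 20 * 1024);
 */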
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5363)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5364) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5365) * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5366) * pages on a node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5367) * @nid: the preferred node ID where memory should be allocated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5368) * @size: the number of bytes to allocate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5369) * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5370) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5371) * Like alloc_pages_exact(), but try to allocate on node nid first before falling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5372) * back.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5373) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5374) * Return: pointer to the allocated area or %NULL in case of error.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5375) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5376) void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5377) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5378) unsigned int order = get_order(size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5379) struct page *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5380)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5381) if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5382) gfp_mask &= ~__GFP_COMP;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5384) p = alloc_pages_node(nid, gfp_mask, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5385) if (!p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5386) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5387) return make_alloc_exact((unsigned long)page_address(p), order, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5388) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5389)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5390) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5391) * free_pages_exact - release memory allocated via alloc_pages_exact()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5392) * @virt: the value returned by alloc_pages_exact.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5393) * @size: size of allocation, same value as passed to alloc_pages_exact().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5394) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5395) * Release the memory allocated by a previous call to alloc_pages_exact.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5396) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5397) void free_pages_exact(void *virt, size_t size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5398) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5399) unsigned long addr = (unsigned long)virt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5400) unsigned long end = addr + PAGE_ALIGN(size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5401)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5402) while (addr < end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5403) free_page(addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5404) addr += PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5405) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5406) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5407) EXPORT_SYMBOL(free_pages_exact);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5408)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5409) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5410) * nr_free_zone_pages - count number of pages beyond high watermark
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5411) * @offset: The zone index of the highest zone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5412) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5413) * nr_free_zone_pages() counts the number of pages which are beyond the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5414) * high watermark within all zones at or below a given zone index. For each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5415) * zone, the number of pages is calculated as:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5416) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5417) * nr_free_zone_pages = managed_pages - high_pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5418) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5419) * Return: number of pages beyond high watermark.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5420) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5421) static unsigned long nr_free_zone_pages(int offset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5422) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5423) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5424) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5426) /* Just pick one node, since fallback list is circular */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5427) unsigned long sum = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5428)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5429) struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5430)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5431) for_each_zone_zonelist(zone, z, zonelist, offset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5432) unsigned long size = zone_managed_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5433) unsigned long high = high_wmark_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5434) if (size > high)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5435) sum += size - high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5436) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5437)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5438) return sum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5439) }
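
/*
 * Worked example (illustrative numbers): a zone with 1,048,576 managed pages
 * and a high watermark of 16,384 pages contributes
 * 1,048,576 - 16,384 = 1,032,192 pages to the sum, while a zone whose managed
 * page count is at or below its high watermark contributes nothing.
 */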
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5440)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5441) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5442) * nr_free_buffer_pages - count number of pages beyond high watermark
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5443) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5444) * nr_free_buffer_pages() counts the number of pages which are beyond the high
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5445) * watermark within ZONE_DMA and ZONE_NORMAL.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5446) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5447) * Return: number of pages beyond high watermark within ZONE_DMA and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5448) * ZONE_NORMAL.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5449) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5450) unsigned long nr_free_buffer_pages(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5451) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5452) return nr_free_zone_pages(gfp_zone(GFP_USER));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5453) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5454) EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5455)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5456) static inline void show_node(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5457) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5458) if (IS_ENABLED(CONFIG_NUMA))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5459) printk("Node %d ", zone_to_nid(zone));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5460) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5461)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5462) long si_mem_available(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5463) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5464) long available;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5465) unsigned long pagecache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5466) unsigned long wmark_low = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5467) unsigned long pages[NR_LRU_LISTS];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5468) unsigned long reclaimable;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5469) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5470) int lru;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5471)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5472) for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5473) pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5474)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5475) for_each_zone(zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5476) wmark_low += low_wmark_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5478) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5479) * Estimate the amount of memory available for userspace allocations,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5480) * without causing swapping.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5481) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5482) available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5483)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5484) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5485) * Not all the page cache can be freed, otherwise the system will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5486) * start swapping. Assume at least half of the page cache, or the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5487) * low watermark worth of cache, needs to stay.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5488) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5489) pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5490) pagecache -= min(pagecache / 2, wmark_low);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5491) available += pagecache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5492)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5493) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5494) * Part of the reclaimable slab and other kernel memory consists of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5495) * items that are in use, and cannot be freed. Cap this estimate at the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5496) * low watermark.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5497) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5498) reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5499) global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5500) available += reclaimable - min(reclaimable / 2, wmark_low);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5501)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5502) if (available < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5503) available = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5504) return available;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5505) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5506) EXPORT_SYMBOL_GPL(si_mem_available);
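
/*
 * Worked example of the estimate above (illustrative numbers, all in pages):
 * with NR_FREE_PAGES = 100,000, totalreserve_pages = 10,000, file LRU pages
 * = 200,000, reclaimable kernel memory = 40,000 and a system-wide low
 * watermark of 20,000:
 *
 *	available  = 100,000 - 10,000                   =  90,000
 *	available += 200,000 - min(200,000 / 2, 20,000) = 270,000
 *	available +=  40,000 - min( 40,000 / 2, 20,000) = 290,000
 *
 * so roughly 290,000 pages would be reported as available.
 */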
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5507)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5508) void si_meminfo(struct sysinfo *val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5509) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5510) val->totalram = totalram_pages();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5511) val->sharedram = global_node_page_state(NR_SHMEM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5512) val->freeram = global_zone_page_state(NR_FREE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5513) val->bufferram = nr_blockdev_pages();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5514) val->totalhigh = totalhigh_pages();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5515) val->freehigh = nr_free_highpages();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5516) val->mem_unit = PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5517) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5518)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5519) EXPORT_SYMBOL(si_meminfo);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5520)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5521) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5522) void si_meminfo_node(struct sysinfo *val, int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5523) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5524) int zone_type; /* needs to be signed */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5525) unsigned long managed_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5526) unsigned long managed_highpages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5527) unsigned long free_highpages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5528) pg_data_t *pgdat = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5529)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5530) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5531) managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5532) val->totalram = managed_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5533) val->sharedram = node_page_state(pgdat, NR_SHMEM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5534) val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5535) #ifdef CONFIG_HIGHMEM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5536) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5537) struct zone *zone = &pgdat->node_zones[zone_type];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5538)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5539) if (is_highmem(zone)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5540) managed_highpages += zone_managed_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5541) free_highpages += zone_page_state(zone, NR_FREE_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5542) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5543) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5544) val->totalhigh = managed_highpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5545) val->freehigh = free_highpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5546) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5547) val->totalhigh = managed_highpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5548) val->freehigh = free_highpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5549) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5550) val->mem_unit = PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5551) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5552) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5554) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5555) * Determine whether the node should be displayed or not, depending on whether
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5556) * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5557) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5558) static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5559) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5560) if (!(flags & SHOW_MEM_FILTER_NODES))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5561) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5562)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5563) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5564) * No nodemask was given, i.e. the implicit memory NUMA policy applies.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5565) * Do not bother with synchronization (read_mems_allowed_begin()) because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5566) * we do not have to be precise here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5567) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5568) if (!nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5569) nodemask = &cpuset_current_mems_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5570)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5571) return !node_isset(nid, *nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5572) }
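
/*
 * For example (hypothetical setup): with SHOW_MEM_FILTER_NODES set and no
 * explicit nodemask, a task whose cpuset allows only node 0 gets node 1
 * filtered out of the show_free_areas() output below.
 */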
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5573)
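/* Convert a page count to KiB, e.g. K(3) == 12 with 4K pages (PAGE_SHIFT == 12). */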
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5574) #define K(x) ((x) << (PAGE_SHIFT-10))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5575)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5576) static void show_migration_types(unsigned char type)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5577) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5578) static const char types[MIGRATE_TYPES] = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5579) [MIGRATE_UNMOVABLE] = 'U',
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5580) [MIGRATE_MOVABLE] = 'M',
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5581) [MIGRATE_RECLAIMABLE] = 'E',
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5582) [MIGRATE_HIGHATOMIC] = 'H',
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5583) #ifdef CONFIG_CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5584) [MIGRATE_CMA] = 'C',
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5585) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5586) #ifdef CONFIG_MEMORY_ISOLATION
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5587) [MIGRATE_ISOLATE] = 'I',
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5588) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5589) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5590) char tmp[MIGRATE_TYPES + 1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5591) char *p = tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5592) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5593)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5594) for (i = 0; i < MIGRATE_TYPES; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5595) if (type & (1 << i))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5596) *p++ = types[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5597) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5598)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5599) *p = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5600) printk(KERN_CONT "(%s) ", tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5601) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5602)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5603) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5604) * Show the free area list (used by the show-memory keyboard/SysRq handlers).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5605) * For each populated zone we also print a per-order breakdown of the free
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5606) * lists, which gives a rough picture of external fragmentation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5607) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5608) * Bits in @filter:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5609) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5610) * cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5611) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5612) void show_free_areas(unsigned int filter, nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5613) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5614) unsigned long free_pcp = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5615) int cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5616) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5617) pg_data_t *pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5618)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5619) for_each_populated_zone(zone) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5620) if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5621) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5622)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5623) for_each_online_cpu(cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5624) free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5625) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5626)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5627) printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5628) " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5629) " unevictable:%lu dirty:%lu writeback:%lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5630) " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5631) " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5632) " free:%lu free_pcp:%lu free_cma:%lu\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5633) global_node_page_state(NR_ACTIVE_ANON),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5634) global_node_page_state(NR_INACTIVE_ANON),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5635) global_node_page_state(NR_ISOLATED_ANON),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5636) global_node_page_state(NR_ACTIVE_FILE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5637) global_node_page_state(NR_INACTIVE_FILE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5638) global_node_page_state(NR_ISOLATED_FILE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5639) global_node_page_state(NR_UNEVICTABLE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5640) global_node_page_state(NR_FILE_DIRTY),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5641) global_node_page_state(NR_WRITEBACK),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5642) global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5643) global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5644) global_node_page_state(NR_FILE_MAPPED),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5645) global_node_page_state(NR_SHMEM),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5646) global_zone_page_state(NR_PAGETABLE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5647) global_zone_page_state(NR_BOUNCE),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5648) global_zone_page_state(NR_FREE_PAGES),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5649) free_pcp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5650) global_zone_page_state(NR_FREE_CMA_PAGES));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5651)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5652) for_each_online_pgdat(pgdat) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5653) if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5654) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5656) printk("Node %d"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5657) " active_anon:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5658) " inactive_anon:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5659) " active_file:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5660) " inactive_file:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5661) " unevictable:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5662) " isolated(anon):%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5663) " isolated(file):%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5664) " mapped:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5665) " dirty:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5666) " writeback:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5667) " shmem:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5668) #ifdef CONFIG_TRANSPARENT_HUGEPAGE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5669) " shmem_thp: %lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5670) " shmem_pmdmapped: %lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5671) " anon_thp: %lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5672) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5673) " writeback_tmp:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5674) " kernel_stack:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5675) #ifdef CONFIG_SHADOW_CALL_STACK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5676) " shadow_call_stack:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5677) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5678) " all_unreclaimable? %s"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5679) "\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5680) pgdat->node_id,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5681) K(node_page_state(pgdat, NR_ACTIVE_ANON)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5682) K(node_page_state(pgdat, NR_INACTIVE_ANON)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5683) K(node_page_state(pgdat, NR_ACTIVE_FILE)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5684) K(node_page_state(pgdat, NR_INACTIVE_FILE)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5685) K(node_page_state(pgdat, NR_UNEVICTABLE)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5686) K(node_page_state(pgdat, NR_ISOLATED_ANON)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5687) K(node_page_state(pgdat, NR_ISOLATED_FILE)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5688) K(node_page_state(pgdat, NR_FILE_MAPPED)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5689) K(node_page_state(pgdat, NR_FILE_DIRTY)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5690) K(node_page_state(pgdat, NR_WRITEBACK)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5691) K(node_page_state(pgdat, NR_SHMEM)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5692) #ifdef CONFIG_TRANSPARENT_HUGEPAGE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5693) K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5694) K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5695) * HPAGE_PMD_NR),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5696) K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5697) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5698) K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5699) node_page_state(pgdat, NR_KERNEL_STACK_KB),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5700) #ifdef CONFIG_SHADOW_CALL_STACK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5701) node_page_state(pgdat, NR_KERNEL_SCS_KB),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5702) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5703) pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5704) "yes" : "no");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5705) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5706)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5707) for_each_populated_zone(zone) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5708) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5709)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5710) if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5711) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5713) free_pcp = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5714) for_each_online_cpu(cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5715) free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5716)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5717) show_node(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5718) printk(KERN_CONT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5719) "%s"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5720) " free:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5721) " min:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5722) " low:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5723) " high:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5724) " reserved_highatomic:%luKB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5725) " active_anon:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5726) " inactive_anon:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5727) " active_file:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5728) " inactive_file:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5729) " unevictable:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5730) " writepending:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5731) " present:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5732) " managed:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5733) " mlocked:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5734) " pagetables:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5735) " bounce:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5736) " free_pcp:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5737) " local_pcp:%ukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5738) " free_cma:%lukB"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5739) "\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5740) zone->name,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5741) K(zone_page_state(zone, NR_FREE_PAGES)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5742) K(min_wmark_pages(zone)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5743) K(low_wmark_pages(zone)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5744) K(high_wmark_pages(zone)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5745) K(zone->nr_reserved_highatomic),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5746) K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5747) K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5748) K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5749) K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5750) K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5751) K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5752) K(zone->present_pages),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5753) K(zone_managed_pages(zone)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5754) K(zone_page_state(zone, NR_MLOCK)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5755) K(zone_page_state(zone, NR_PAGETABLE)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5756) K(zone_page_state(zone, NR_BOUNCE)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5757) K(free_pcp),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5758) K(this_cpu_read(zone->pageset->pcp.count)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5759) K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5760) printk("lowmem_reserve[]:");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5761) for (i = 0; i < MAX_NR_ZONES; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5762) printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5763) printk(KERN_CONT "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5764) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5765)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5766) for_each_populated_zone(zone) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5767) unsigned int order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5768) unsigned long nr[MAX_ORDER], flags, total = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5769) unsigned char types[MAX_ORDER];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5770)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5771) if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5772) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5773) show_node(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5774) printk(KERN_CONT "%s: ", zone->name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5775)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5776) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5777) for (order = 0; order < MAX_ORDER; order++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5778) struct free_area *area = &zone->free_area[order];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5779) int type;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5781) nr[order] = area->nr_free;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5782) total += nr[order] << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5784) types[order] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5785) for (type = 0; type < MIGRATE_TYPES; type++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5786) if (!free_area_empty(area, type))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5787) types[order] |= 1 << type;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5788) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5789) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5790) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5791) for (order = 0; order < MAX_ORDER; order++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5792) printk(KERN_CONT "%lu*%lukB ",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5793) nr[order], K(1UL) << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5794) if (nr[order])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5795) show_migration_types(types[order]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5796) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5797) printk(KERN_CONT "= %lukB\n", K(total));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5798) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5799)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5800) hugetlb_show_meminfo();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5801)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5802) printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5804) show_swap_cache_info();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5805) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5806)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5807) static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5808) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5809) zoneref->zone = zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5810) zoneref->zone_idx = zone_idx(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5811) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5812)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5813) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5814) * Builds allocation fallback zone lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5815) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5816) * Add all populated zones of a node to the zonelist.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5817) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5818) static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5819) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5820) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5821) enum zone_type zone_type = MAX_NR_ZONES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5822) int nr_zones = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5823)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5824) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5825) zone_type--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5826) zone = pgdat->node_zones + zone_type;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5827) if (managed_zone(zone)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5828) zoneref_set_zone(zone, &zonerefs[nr_zones++]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5829) check_highest_zone(zone_type);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5830) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5831) } while (zone_type);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5833) return nr_zones;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5834) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5835)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5836) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5837)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5838) static int __parse_numa_zonelist_order(char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5839) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5840) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5841) * We used to support different zonelist modes, but they turned
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5842) * out not to be useful. Keep the warning in place in case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5843) * somebody still uses the command-line parameter, so that we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5844) * do not fail silently.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5845) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5846) if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5847) pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5848) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5849) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5850) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5851) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5853) char numa_zonelist_order[] = "Node";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5854)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5855) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5856) * sysctl handler for numa_zonelist_order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5857) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5858) int numa_zonelist_order_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5859) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5860) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5861) if (write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5862) return __parse_numa_zonelist_order(buffer);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5863) return proc_dostring(table, write, buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5864) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5866)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5867) #define MAX_NODE_LOAD (nr_online_nodes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5868) static int node_load[MAX_NUMNODES];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5870) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5871) * find_next_best_node - find the next node that should appear in a given node's fallback list
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5872) * @node: node whose fallback list we're appending
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5873) * @used_node_mask: nodemask_t of already used nodes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5874) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5875) * We use a number of factors to determine which is the next node that should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5876) * appear on a given node's fallback list. The node should not have appeared
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5877) * already in @node's fallback list, and it should be the next closest node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5878) * according to the distance array (which contains arbitrary distance values
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5879) * from each node to each node in the system), and should also prefer nodes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5880) * with no CPUs, since presumably they'll have very little allocation pressure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5881) * on them otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5882) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5883) * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5884) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5885) static int find_next_best_node(int node, nodemask_t *used_node_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5886) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5887) int n, val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5888) int min_val = INT_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5889) int best_node = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5890)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5891) /* Use the local node if we haven't already */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5892) if (!node_isset(node, *used_node_mask)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5893) node_set(node, *used_node_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5894) return node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5895) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5896)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5897) for_each_node_state(n, N_MEMORY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5898)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5899) /* Don't want a node to appear more than once */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5900) if (node_isset(n, *used_node_mask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5901) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5903) /* Use the distance array to find the distance */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5904) val = node_distance(node, n);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5905)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5906) /* Penalize nodes under us ("prefer the next node") */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5907) val += (n < node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5908)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5909) /* Give preference to headless and unused nodes */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5910) if (!cpumask_empty(cpumask_of_node(n)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5911) val += PENALTY_FOR_NODE_WITH_CPUS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5912)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5913) /* Slight preference for less loaded node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5914) val *= (MAX_NODE_LOAD*MAX_NUMNODES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5915) val += node_load[n];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5916)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5917) if (val < min_val) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5918) min_val = val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5919) best_node = n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5920) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5921) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5922)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5923) if (best_node >= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5924) node_set(best_node, *used_node_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5925)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5926) return best_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5927) }
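
/*
 * Worked example of the scoring above (illustrative values, assuming
 * PENALTY_FOR_NODE_WITH_CPUS is 1): while filling node 0's fallback list,
 * a candidate node 2 with node_distance(0, 2) = 20, CPUs attached and
 * node_load[2] = 0 scores
 *
 *	val = (20 + 0 + 1) * MAX_NODE_LOAD * MAX_NUMNODES + 0
 *
 * so distance and the CPU penalty dominate, and node_load[] only breaks
 * ties between otherwise equally attractive nodes.
 */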
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5928)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5929)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5930) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5931) * Build zonelists ordered by node and zones within node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5932) * This results in maximum locality--normal zone overflows into local
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5933) * DMA zone, if any--but risks exhausting DMA zone.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5934) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5935) static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5936) unsigned nr_nodes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5937) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5938) struct zoneref *zonerefs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5939) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5940)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5941) zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5942)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5943) for (i = 0; i < nr_nodes; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5944) int nr_zones;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5945)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5946) pg_data_t *node = NODE_DATA(node_order[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5948) nr_zones = build_zonerefs_node(node, zonerefs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5949) zonerefs += nr_zones;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5950) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5951) zonerefs->zone = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5952) zonerefs->zone_idx = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5953) }
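
/*
 * Illustrative result (hypothetical two-node machine where each node has a
 * populated ZONE_NORMAL and ZONE_DMA32): with node_order = {0, 1}, the
 * ZONELIST_FALLBACK zonerefs come out as
 *
 *	node0/Normal, node0/DMA32, node1/Normal, node1/DMA32, <NULL terminator>
 *
 * i.e. every zone of the closest node is tried before any zone of the next
 * node, with zones inside a node ordered from highest to lowest.
 */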
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5954)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5955) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5956) * Build gfp_thisnode zonelists
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5957) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5958) static void build_thisnode_zonelists(pg_data_t *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5959) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5960) struct zoneref *zonerefs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5961) int nr_zones;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5963) zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5964) nr_zones = build_zonerefs_node(pgdat, zonerefs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5965) zonerefs += nr_zones;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5966) zonerefs->zone = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5967) zonerefs->zone_idx = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5968) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5969)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5970) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5971) * Build the complete set of zonelists for a node: the ZONELIST_FALLBACK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5972) * list, ordered by node as computed below (all zones of the closest node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5973) * before any zone of the next node), and the ZONELIST_NOFALLBACK list
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5974) * containing only the node's own zones for __GFP_THISNODE allocations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5975) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5976)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5977) static void build_zonelists(pg_data_t *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5978) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5979) static int node_order[MAX_NUMNODES];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5980) int node, load, nr_nodes = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5981) nodemask_t used_mask = NODE_MASK_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5982) int local_node, prev_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5983)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5984) /* NUMA-aware ordering of nodes */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5985) local_node = pgdat->node_id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5986) load = nr_online_nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5987) prev_node = local_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5988)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5989) memset(node_order, 0, sizeof(node_order));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5990) while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5991) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5992) * We don't want to put allocation pressure on one particular node,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5993) * so add a penalty to the first node in each distance group in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5994) * order to round-robin among nodes at the same distance.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5995) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5996) if (node_distance(local_node, node) !=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5997) node_distance(local_node, prev_node))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5998) node_load[node] = load;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5999)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6000) node_order[nr_nodes++] = node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6001) prev_node = node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6002) load--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6003) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6005) build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6006) build_thisnode_zonelists(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6007) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6008)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6009) #ifdef CONFIG_HAVE_MEMORYLESS_NODES
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6010) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6011) * Return the node id of the node used for "local" allocations,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6012) * i.e. the node of the first zone in the argument node's generic zonelist.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6013) * Used for initializing percpu 'numa_mem', which is used primarily
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6014) * for kernel allocations, so use GFP_KERNEL flags to locate the zonelist.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6015) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6016) int local_memory_node(int node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6017) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6018) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6020) z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6021) gfp_zone(GFP_KERNEL),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6022) NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6023) return zone_to_nid(z->zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6024) }
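
/*
 * For example (hypothetical topology): on a memoryless node 2 whose fallback
 * zonelist starts with a zone of node 1, local_memory_node(2) returns 1; for
 * a node that has its own memory it simply returns that node's id.
 */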
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6025) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6027) static void setup_min_unmapped_ratio(void);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6028) static void setup_min_slab_ratio(void);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6029) #else /* CONFIG_NUMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6030)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6031) static void build_zonelists(pg_data_t *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6032) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6033) int node, local_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6034) struct zoneref *zonerefs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6035) int nr_zones;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6036)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6037) local_node = pgdat->node_id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6038)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6039) zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6040) nr_zones = build_zonerefs_node(pgdat, zonerefs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6041) zonerefs += nr_zones;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6042)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6043) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6044) * Now we build the zonelist so that it contains the zones
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6045) * of all the other nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6046) * We don't want to pressure a particular node, so when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6047) * building the zones for node N, we make sure that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6048) * zones coming right after the local ones are those from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6049) * node N+1 and so on, wrapping around to node 0 after the last node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6050) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6051) for (node = local_node + 1; node < MAX_NUMNODES; node++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6052) if (!node_online(node))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6053) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6054) nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6055) zonerefs += nr_zones;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6056) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6057) for (node = 0; node < local_node; node++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6058) if (!node_online(node))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6059) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6060) nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6061) zonerefs += nr_zones;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6062) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6063)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6064) zonerefs->zone = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6065) zonerefs->zone_idx = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6066) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6067)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6068) #endif /* CONFIG_NUMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6069)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6070) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6071) * Boot pageset table. One per cpu which is going to be used for all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6072) * zones and all nodes. The parameters will be set in such a way
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6073) * that an item put on a list will immediately be handed over to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6074) * the buddy list. This is safe since pageset manipulation is done
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6075) * with interrupts disabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6076) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6077) * The boot_pagesets must be kept even after bootup is complete for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6078) * unused processors and/or zones. They do play a role for bootstrapping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6079) * hotplugged processors.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6080) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6081) * zoneinfo_show() and maybe other functions do
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6082) * not check if the processor is online before following the pageset pointer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6083) * Other parts of the kernel may not check if the zone is available.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6084) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6085) static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6086) static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6087) static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6088)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6089) static void __build_all_zonelists(void *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6090) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6091) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6092) int __maybe_unused cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6093) pg_data_t *self = data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6094) static DEFINE_SPINLOCK(lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6095)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6096) spin_lock(&lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6097)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6098) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6099) memset(node_load, 0, sizeof(node_load));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6100) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6101)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6102) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6103) * This node is hotadded and no memory is yet present. So just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6104) * building zonelists is fine - no need to touch other nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6105) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6106) if (self && !node_online(self->node_id)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6107) build_zonelists(self);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6108) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6109) for_each_online_node(nid) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6110) pg_data_t *pgdat = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6111)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6112) build_zonelists(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6113) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6115) #ifdef CONFIG_HAVE_MEMORYLESS_NODES
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6116) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6117) * We now know the "local memory node" for each node--
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6118) * i.e., the node of the first zone in the generic zonelist.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6119) * Set up numa_mem percpu variable for on-line cpus. During
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6120) * boot, only the boot cpu should be on-line; we'll init the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6121) * secondary cpus' numa_mem as they come on-line. During
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6122) * node/memory hotplug, we'll fixup all on-line cpus.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6123) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6124) for_each_online_cpu(cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6125) set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6126) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6127) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6129) spin_unlock(&lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6130) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6131)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6132) static noinline void __init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6133) build_all_zonelists_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6134) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6135) int cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6136)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6137) __build_all_zonelists(NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6138)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6139) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6140) * Initialize the boot_pagesets that are going to be used
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6141) * for bootstrapping processors. The real pagesets for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6142) * each zone will be allocated later when the per cpu
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6143) * allocator is available.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6144) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6145) * boot_pagesets are used also for bootstrapping offline
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6146) * cpus if the system is already booted because the pagesets
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6147) * are needed to initialize allocators on a specific cpu too.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6148) * E.g. the percpu allocator needs the page allocator, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6149) * needs the percpu allocator in order to allocate its pagesets
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6150) * (a chicken-egg dilemma).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6151) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6152) for_each_possible_cpu(cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6153) setup_pageset(&per_cpu(boot_pageset, cpu), 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6154)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6155) mminit_verify_zonelist();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6156) cpuset_init_current_mems_allowed();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6157) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6159) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6160) * Rebuild the zonelists. This runs at boot and again on memory/node hotplug;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6161) * the hotplug path is taken unless system_state == SYSTEM_BOOTING.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6162) * __ref due to call of __init annotated helper build_all_zonelists_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6163) * [protected by SYSTEM_BOOTING].
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6164) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6165) void __ref build_all_zonelists(pg_data_t *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6166) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6167) unsigned long vm_total_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6168)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6169) if (system_state == SYSTEM_BOOTING) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6170) build_all_zonelists_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6171) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6172) __build_all_zonelists(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6173) /* cpuset refresh routine should be here */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6174) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6175) /* Get the number of free pages beyond high watermark in all zones. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6176) vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6177) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6178) * Disable grouping by mobility if the number of pages in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6179) * system is too low to allow the mechanism to work. It would be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6180) * more accurate, but expensive to check per-zone. This check is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6181) * made on memory-hotadd so a system can start with mobility
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6182) * disabled and enable it later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6183) */
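	/*
	 * Worked threshold example (illustrative, config-dependent values):
	 * with 4K pages, pageblock_order = 9 (pageblock_nr_pages = 512) and
	 * MIGRATE_TYPES = 6, grouping is disabled below 512 * 6 = 3072 pages,
	 * i.e. roughly 12 MiB of memory.
	 */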
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6184) if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6185) page_group_by_mobility_disabled = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6186) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6187) page_group_by_mobility_disabled = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6189) pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6190) nr_online_nodes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6191) page_group_by_mobility_disabled ? "off" : "on",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6192) vm_total_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6193) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6194) pr_info("Policy zone: %s\n", zone_names[policy_zone]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6195) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6196) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6198) /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6199) static bool __meminit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6200) overlap_memmap_init(unsigned long zone, unsigned long *pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6201) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6202) static struct memblock_region *r;
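/*
 * Note: the region cursor is static so that, for consecutive pfns that
 * fall inside the same memblock region, the lookup below can be skipped
 * instead of rescanning memblock on every call.
 */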
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6203)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6204) if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6205) if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6206) for_each_mem_region(r) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6207) if (*pfn < memblock_region_memory_end_pfn(r))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6208) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6209) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6210) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6211) if (*pfn >= memblock_region_memory_base_pfn(r) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6212) memblock_is_mirror(r)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6213) *pfn = memblock_region_memory_end_pfn(r);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6214) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6216) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6217) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6218) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6219)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6220) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6221) * Initially all pages are reserved - free ones are freed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6222) * up by memblock_free_all() once the early boot process is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6223) * done. Non-atomic initialization, single-pass.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6224) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6225) * All aligned pageblocks are initialized to the specified migratetype
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6226) * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6227) * zone stats (e.g., nr_isolate_pageblock) are touched.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6228) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6229) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6230) unsigned long start_pfn, unsigned long zone_end_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6231) enum meminit_context context,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6232) struct vmem_altmap *altmap, int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6233) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6234) unsigned long pfn, end_pfn = start_pfn + size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6235) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6236)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6237) if (highest_memmap_pfn < end_pfn - 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6238) highest_memmap_pfn = end_pfn - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6239)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6240) #ifdef CONFIG_ZONE_DEVICE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6241) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6242) * Honor reservation requested by the driver for this ZONE_DEVICE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6243) * memory. We limit the total number of pages to initialize to just
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6244) * those that might contain the memory mapping. We will defer the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6245) * ZONE_DEVICE page initialization until after we have released
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6246) * the hotplug lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6247) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6248) if (zone == ZONE_DEVICE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6249) if (!altmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6250) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6251)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6252) if (start_pfn == altmap->base_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6253) start_pfn += altmap->reserve;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6254) end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6255) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6256) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6257)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6258) #ifdef CONFIG_ROCKCHIP_THUNDER_BOOT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6259) /* Zero all struct pages in advance */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6260) memset(pfn_to_page(start_pfn), 0, sizeof(struct page) * size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6261) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6262)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6263) for (pfn = start_pfn; pfn < end_pfn; ) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6264) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6265) * There can be holes in boot-time mem_map[]s handed to this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6266) * function. They do not exist on hotplugged memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6267) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6268) if (context == MEMINIT_EARLY) {
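/*
 * For early init, mirrored memory overlapping ZONE_MOVABLE is skipped
 * (see overlap_memmap_init()) and, when deferred struct page init is
 * enabled, defer_init() may cut this loop short so the remaining pages
 * are initialized later.
 */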
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6269) if (overlap_memmap_init(zone, &pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6270) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6271) if (defer_init(nid, pfn, zone_end_pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6272) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6273) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6274)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6275) page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6276) __init_single_page(page, pfn, zone, nid, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6277) if (context == MEMINIT_HOTPLUG)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6278) __SetPageReserved(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6279)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6280) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6281) * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6282) * such that unmovable allocations won't be scattered all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6283) * over the place during system boot.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6284) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6285) if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6286) set_pageblock_migratetype(page, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6287) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6288) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6289) pfn++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6290) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6291) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6292)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6293) #ifdef CONFIG_ZONE_DEVICE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6294) void __ref memmap_init_zone_device(struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6295) unsigned long start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6296) unsigned long nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6297) struct dev_pagemap *pgmap)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6298) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6299) unsigned long pfn, end_pfn = start_pfn + nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6300) struct pglist_data *pgdat = zone->zone_pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6301) struct vmem_altmap *altmap = pgmap_altmap(pgmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6302) unsigned long zone_idx = zone_idx(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6303) unsigned long start = jiffies;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6304) int nid = pgdat->node_id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6305)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6306) if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6307) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6309) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6310) * The call to memmap_init should have already taken care
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6311) * of the pages reserved for the memmap, so we can just jump to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6312) * the end of that region and start processing the device pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6313) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6314) if (altmap) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6315) start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6316) nr_pages = end_pfn - start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6317) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6319) for (pfn = start_pfn; pfn < end_pfn; pfn++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6320) struct page *page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6321)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6322) __init_single_page(page, pfn, zone_idx, nid, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6323)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6324) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6325) * Mark the page reserved as it will need to wait for the onlining
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6326) * phase before it is fully associated with a zone.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6327) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6328) * We can use the non-atomic __set_bit operation for setting
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6329) * the flag as we are still initializing the pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6330) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6331) __SetPageReserved(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6332)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6333) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6334) * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6335) * and zone_device_data. It is a bug if a ZONE_DEVICE page is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6336) * ever freed or placed on a driver-private list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6337) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6338) page->pgmap = pgmap;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6339) page->zone_device_data = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6341) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6342) * Mark the block movable so that blocks are reserved for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6343) * movable at startup. This will force kernel allocations
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6344) * to reserve their blocks rather than leaking throughout
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6345) * the address space during boot when many long-lived
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6346) * kernel allocations are made.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6347) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6348) * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6349) * because this is done early in section_activate()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6350) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6351) if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6352) set_pageblock_migratetype(page, MIGRATE_MOVABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6353) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6354) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6355) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6357) pr_info("%s initialised %lu pages in %ums\n", __func__,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6358) nr_pages, jiffies_to_msecs(jiffies - start));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6359) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6361) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6361)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6362) static void __meminit zone_init_free_lists(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6363) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6364) unsigned int order, t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6365) for_each_migratetype_order(order, t) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6366) INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6367) zone->free_area[order].nr_free = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6368) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6369) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6370)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6371) #if !defined(CONFIG_FLAT_NODE_MEM_MAP)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6372) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6373) * Only struct pages that correspond to ranges defined by memblock.memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6374) * are zeroed and initialized by going through __init_single_page() during
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6375) * memmap_init_zone_range().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6376) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6377) * But, there could be struct pages that correspond to holes in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6378) * memblock.memory. This can happen because of the following reasons:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6379) * - physical memory bank size is not necessarily the exact multiple of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6380) * arbitrary section size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6381) * - early reserved memory may not be listed in memblock.memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6382) * - memory layouts defined with memmap= kernel parameter may not align
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6383) * nicely with memmap sections
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6384) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6385) * Explicitly initialize those struct pages so that:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6386) * - PG_Reserved is set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6387) * - zone and node links point to zone and node that span the page if the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6388) * hole is in the middle of a zone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6389) * - zone and node links point to adjacent zone/node if the hole falls on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6390) * the zone boundary; the pages in such holes will be prepended to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6391) * zone/node above the hole except for the trailing pages in the last
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6392) * section that will be appended to the zone/node below.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6393) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6394) static void __init init_unavailable_range(unsigned long spfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6395) unsigned long epfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6396) int zone, int node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6397) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6398) unsigned long pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6399) u64 pgcnt = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6400)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6401) for (pfn = spfn; pfn < epfn; pfn++) {
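/*
 * Check pfn_valid() once per pageblock: if the block-aligned start has
 * no memmap, jump to the block's last pfn and let the loop increment
 * move on to the next pageblock.
 */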
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6402) if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6403) pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6404) + pageblock_nr_pages - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6405) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6406) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6407) __init_single_page(pfn_to_page(pfn), pfn, zone, node, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6408) __SetPageReserved(pfn_to_page(pfn));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6409) pgcnt++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6410) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6412) if (pgcnt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6413) pr_info("On node %d, zone %s: %lld pages in unavailable ranges\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6414) node, zone_names[zone], pgcnt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6415) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6416) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6417) static inline void init_unavailable_range(unsigned long spfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6418) unsigned long epfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6419) int zone, int node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6420) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6421) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6422) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6423)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6424) static void __init memmap_init_zone_range(struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6425) unsigned long start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6426) unsigned long end_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6427) unsigned long *hole_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6428) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6429) unsigned long zone_start_pfn = zone->zone_start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6430) unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6431) int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6432)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6433) start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6434) end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6435)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6436) if (start_pfn >= end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6437) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6438)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6439) memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6440) zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6442) if (*hole_pfn < start_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6443) init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6444)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6445) *hole_pfn = end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6446) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6448) void __init __weak memmap_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6449) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6450) unsigned long start_pfn, end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6451) unsigned long hole_pfn = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6452) int i, j, zone_id, nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6453)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6454) for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6455) struct pglist_data *node = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6456)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6457) for (j = 0; j < MAX_NR_ZONES; j++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6458) struct zone *zone = node->node_zones + j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6460) if (!populated_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6461) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6463) memmap_init_zone_range(zone, start_pfn, end_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6464) &hole_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6465) zone_id = j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6466) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6467) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6469) #ifdef CONFIG_SPARSEMEM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6470) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6471) * Initialize the memory map for the hole in the range [memory_end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6472) * section_end].
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6473) * Append the pages in this hole to the highest zone in the last
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6474) * node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6475) * The call to init_unavailable_range() is outside the ifdef to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6476) * silence the compiler warning about zone_id set but not used;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6477) * for FLATMEM it is a nop anyway.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6478) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6479) end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6480) if (hole_pfn < end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6481) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6482) init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6483) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6484)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6485) /* A stub for backwards compatibility with a custom implementation on IA-64 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6486) void __meminit __weak arch_memmap_init(unsigned long size, int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6487) unsigned long zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6488) unsigned long range_start_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6489) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6490) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6492) static int zone_batchsize(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6493) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6494) #ifdef CONFIG_MMU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6495) int batch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6497) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6498) * The per-cpu-pages pools are set to around 1000th of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6499) * size of the zone.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6500) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6501) batch = zone_managed_pages(zone) / 1024;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6502) /* But no more than a meg. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6503) if (batch * PAGE_SIZE > 1024 * 1024)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6504) batch = (1024 * 1024) / PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6505) batch /= 4; /* We effectively *= 4 below */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6506) if (batch < 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6507) batch = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6509) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6510) * Clamp the batch to a 2^n - 1 value. Having a power
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6511) * of 2 value was found to be more likely to have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6512) * suboptimal cache aliasing properties in some cases.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6513) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6514) * For example if 2 tasks are alternately allocating
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6515) * batches of pages, one task can end up with a lot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6516) * of pages of one half of the possible page colors
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6517) * and the other with pages of the other colors.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6518) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6519) batch = rounddown_pow_of_two(batch + batch/2) - 1;
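/*
 * Rough worked example (assuming 4 KiB pages): a zone with ~4 GiB of
 * managed memory gives an initial batch of 1024, the 1 MiB cap reduces
 * it to 256, the division by 4 to 64, and the final rounding to 2^n - 1
 * yields rounddown_pow_of_two(64 + 32) - 1 = 63.
 */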
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6520)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6521) return batch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6522)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6523) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6524) /* The deferral and batching of frees should be suppressed under NOMMU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6525) * conditions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6526) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6527) * The problem is that NOMMU needs to be able to allocate large chunks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6528) * of contiguous memory as there's no hardware page translation to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6529) * assemble apparent contiguous memory from discontiguous pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6530) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6531) * Queueing large contiguous runs of pages for batching, however,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6532) * causes the pages to actually be freed in smaller chunks. As there
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6533) * can be a significant delay between the individual batches being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6534) * recycled, this leads to the once large chunks of space being
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6535) * fragmented and becoming unavailable for high-order allocations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6536) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6537) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6538) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6539) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6540)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6541) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6542) * pcp->high and pcp->batch values are related and dependent on one another:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6543) * ->batch must never be higher than ->high.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6544) * The following function updates them in a safe manner without read side
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6545) * locking.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6546) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6547) * Any new users of pcp->batch and pcp->high should ensure they can cope with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6548) * those fields changing asynchronously (according to the above rule).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6549) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6550) * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6551) * outside of boot time (or some other assurance that no concurrent updaters
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6552) * exist).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6553) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6554) static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6555) unsigned long batch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6556) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6557) /* start with a fail safe value for batch */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6558) pcp->batch = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6559) smp_wmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6560)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6561) /* Update high, then batch, in order */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6562) pcp->high = high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6563) smp_wmb();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6564)
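/*
 * Ordered this way, the stored pair never passes through a state where
 * ->batch exceeds ->high: it moves from (old high, old batch) to
 * (old high, 1), then (new high, 1), and finally (new high, new batch).
 */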
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6565) pcp->batch = batch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6566) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6567)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6568) /* a companion to pageset_set_high() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6569) static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6570) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6571) pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6572) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6573)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6574) static void pageset_init(struct per_cpu_pageset *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6575) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6576) struct per_cpu_pages *pcp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6577) int migratetype;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6578)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6579) memset(p, 0, sizeof(*p));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6580)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6581) pcp = &p->pcp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6582) for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6583) INIT_LIST_HEAD(&pcp->lists[migratetype]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6584) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6585)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6586) static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6587) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6588) pageset_init(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6589) pageset_set_batch(p, batch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6590) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6591)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6592) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6593) * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6594) * to the value high for the pageset p.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6595) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6596) static void pageset_set_high(struct per_cpu_pageset *p,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6597) unsigned long high)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6598) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6599) unsigned long batch = max(1UL, high / 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6600) if ((high / 4) > (PAGE_SHIFT * 8))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6601) batch = PAGE_SHIFT * 8;
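/*
 * E.g. with 4 KiB pages (PAGE_SHIFT == 12, an assumption) the cap is
 * 96 pages, so high = 1024 yields batch = 96 rather than 256.
 */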
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6602)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6603) pageset_update(&p->pcp, high, batch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6604) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6605)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6606) static void pageset_set_high_and_batch(struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6607) struct per_cpu_pageset *pcp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6608) {
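/*
 * When the percpu_pagelist_fraction sysctl is set to e.g. 8, each CPU's
 * pcp high mark becomes 1/8th of the zone's managed pages; otherwise the
 * defaults derived from zone_batchsize() (high = 6 * batch) are used.
 */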
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6609) if (percpu_pagelist_fraction)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6610) pageset_set_high(pcp,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6611) (zone_managed_pages(zone) /
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6612) percpu_pagelist_fraction));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6613) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6614) pageset_set_batch(pcp, zone_batchsize(zone));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6615) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6616)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6617) static void __meminit zone_pageset_init(struct zone *zone, int cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6618) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6619) struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6620)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6621) pageset_init(pcp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6622) pageset_set_high_and_batch(zone, pcp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6623) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6625) void __meminit setup_zone_pageset(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6626) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6627) int cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6628) zone->pageset = alloc_percpu(struct per_cpu_pageset);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6629) for_each_possible_cpu(cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6630) zone_pageset_init(zone, cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6631) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6632)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6633) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6634) * Allocate per cpu pagesets and initialize them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6635) * Before this call only boot pagesets were available.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6636) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6637) void __init setup_per_cpu_pageset(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6638) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6639) struct pglist_data *pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6640) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6641) int __maybe_unused cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6642)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6643) for_each_populated_zone(zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6644) setup_zone_pageset(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6645)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6646) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6647) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6648) * Unpopulated zones continue using the boot pagesets.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6649) * The numa stats for these pagesets need to be reset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6650) * Otherwise, they will end up skewing the stats of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6651) * the nodes these zones are associated with.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6652) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6653) for_each_possible_cpu(cpu) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6654) struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6655) memset(pcp->vm_numa_stat_diff, 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6656) sizeof(pcp->vm_numa_stat_diff));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6657) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6658) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6660) for_each_online_pgdat(pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6661) pgdat->per_cpu_nodestats =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6662) alloc_percpu(struct per_cpu_nodestat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6663) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6664)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6665) static __meminit void zone_pcp_init(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6666) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6667) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6668) * per cpu subsystem is not up at this point. The following code
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6669) * relies on the ability of the linker to provide the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6670) * offset of a (static) per cpu variable into the per cpu area.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6671) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6672) zone->pageset = &boot_pageset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6673)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6674) if (populated_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6675) printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6676) zone->name, zone->present_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6677) zone_batchsize(zone));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6678) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6679)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6680) void __meminit init_currently_empty_zone(struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6681) unsigned long zone_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6682) unsigned long size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6683) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6684) struct pglist_data *pgdat = zone->zone_pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6685) int zone_idx = zone_idx(zone) + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6687) if (zone_idx > pgdat->nr_zones)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6688) pgdat->nr_zones = zone_idx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6689)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6690) zone->zone_start_pfn = zone_start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6692) mminit_dprintk(MMINIT_TRACE, "memmap_init",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6693) "Initialising map node %d zone %lu pfns %lu -> %lu\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6694) pgdat->node_id,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6695) (unsigned long)zone_idx(zone),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6696) zone_start_pfn, (zone_start_pfn + size));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6697)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6698) zone_init_free_lists(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6699) zone->initialized = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6700) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6701)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6702) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6703) * get_pfn_range_for_nid - Return the start and end page frames for a node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6704) * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6705) * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6706) * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6707) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6708) * It returns the start and end page frame of a node based on information
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6709) * provided by memblock_set_node(). If called for a node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6710) * with no available memory, a warning is printed and the start and end
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6711) * PFNs will be 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6712) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6713) void __init get_pfn_range_for_nid(unsigned int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6714) unsigned long *start_pfn, unsigned long *end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6715) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6716) unsigned long this_start_pfn, this_end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6717) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6718)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6719) *start_pfn = -1UL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6720) *end_pfn = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6721)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6722) for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6723) *start_pfn = min(*start_pfn, this_start_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6724) *end_pfn = max(*end_pfn, this_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6725) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6726)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6727) if (*start_pfn == -1UL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6728) *start_pfn = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6729) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6731) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6732) * This finds a zone that can be used for ZONE_MOVABLE pages. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6733) * assumption is made that zones within a node are ordered in monotonically
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6734) * increasing memory addresses so that the "highest" populated zone is used.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6735) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6736) static void __init find_usable_zone_for_movable(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6737) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6738) int zone_index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6739) for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6740) if (zone_index == ZONE_MOVABLE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6741) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6742)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6743) if (arch_zone_highest_possible_pfn[zone_index] >
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6744) arch_zone_lowest_possible_pfn[zone_index])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6745) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6746) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6747)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6748) VM_BUG_ON(zone_index == -1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6749) movable_zone = zone_index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6750) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6751)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6752) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6753) * The zone ranges provided by the architecture do not include ZONE_MOVABLE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6754) * because it is sized independently of the architecture. Unlike the other zones,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6755) * the starting point for ZONE_MOVABLE is not fixed. It may be different
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6756) * in each node depending on the size of each node and how evenly kernelcore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6757) * is distributed. This helper function adjusts the zone ranges
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6758) * provided by the architecture for a given node by using the end of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6759) * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6760) * zones within a node are in monotonically increasing order of memory addresses.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6761) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6762) static void __init adjust_zone_range_for_zone_movable(int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6763) unsigned long zone_type,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6764) unsigned long node_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6765) unsigned long node_end_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6766) unsigned long *zone_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6767) unsigned long *zone_end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6768) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6769) /* Only adjust if ZONE_MOVABLE is on this node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6770) if (zone_movable_pfn[nid]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6771) /* Size ZONE_MOVABLE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6772) if (zone_type == ZONE_MOVABLE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6773) *zone_start_pfn = zone_movable_pfn[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6774) *zone_end_pfn = min(node_end_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6775) arch_zone_highest_possible_pfn[movable_zone]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6776)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6777) /* Adjust for ZONE_MOVABLE starting within this range */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6778) } else if (!mirrored_kernelcore &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6779) *zone_start_pfn < zone_movable_pfn[nid] &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6780) *zone_end_pfn > zone_movable_pfn[nid]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6781) *zone_end_pfn = zone_movable_pfn[nid];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6782)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6783) /* Check if this whole range is within ZONE_MOVABLE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6784) } else if (*zone_start_pfn >= zone_movable_pfn[nid])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6785) *zone_start_pfn = *zone_end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6786) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6787) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6788)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6789) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6790) * Return the number of pages a zone spans in a node, including holes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6791) * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6792) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6793) static unsigned long __init zone_spanned_pages_in_node(int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6794) unsigned long zone_type,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6795) unsigned long node_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6796) unsigned long node_end_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6797) unsigned long *zone_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6798) unsigned long *zone_end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6799) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6800) unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6801) unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6802) /* When hotadding a new node from cpu_up(), the node should be empty */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6803) if (!node_start_pfn && !node_end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6804) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6805)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6806) /* Get the start and end of the zone */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6807) *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6808) *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6809) adjust_zone_range_for_zone_movable(nid, zone_type,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6810) node_start_pfn, node_end_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6811) zone_start_pfn, zone_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6812)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6813) /* Check that this node has pages within the zone's required range */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6814) if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6815) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6816)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6817) /* Move the zone boundaries inside the node if necessary */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6818) *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6819) *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6820)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6821) /* Return the spanned pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6822) return *zone_end_pfn - *zone_start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6823) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6825) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6826) * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6827) * then all holes in the requested range will be accounted for.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6828) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6829) unsigned long __init __absent_pages_in_range(int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6830) unsigned long range_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6831) unsigned long range_end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6832) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6833) unsigned long nr_absent = range_end_pfn - range_start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6834) unsigned long start_pfn, end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6835) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6836)
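/*
 * Start by treating the whole range as a hole, then subtract every
 * memblock.memory range (clamped to the request) that intersects it.
 */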
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6837) for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6838) start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6839) end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6840) nr_absent -= end_pfn - start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6841) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6842) return nr_absent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6843) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6844)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6845) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6846) * absent_pages_in_range - Return number of page frames in holes within a range
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6847) * @start_pfn: The start PFN to start searching for holes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6848) * @end_pfn: The end PFN to stop searching for holes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6849) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6850) * Return: the number of page frames in memory holes within a range.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6851) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6852) unsigned long __init absent_pages_in_range(unsigned long start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6853) unsigned long end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6854) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6855) return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6856) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6857)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6858) /* Return the number of page frames in holes in a zone on a node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6859) static unsigned long __init zone_absent_pages_in_node(int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6860) unsigned long zone_type,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6861) unsigned long node_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6862) unsigned long node_end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6863) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6864) unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6865) unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6866) unsigned long zone_start_pfn, zone_end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6867) unsigned long nr_absent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6868)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6869) /* When hotadding a new node from cpu_up(), the node should be empty */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6870) if (!node_start_pfn && !node_end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6871) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6872)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6873) zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6874) zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6876) adjust_zone_range_for_zone_movable(nid, zone_type,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6877) node_start_pfn, node_end_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6878) &zone_start_pfn, &zone_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6879) nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6881) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6882) * ZONE_MOVABLE handling.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6883) * Treat pages that will end up in ZONE_MOVABLE but fall within
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6884) * ZONE_NORMAL's range as absent there, and vice versa.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6885) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6886) if (mirrored_kernelcore && zone_movable_pfn[nid]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6887) unsigned long start_pfn, end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6888) struct memblock_region *r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6889)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6890) for_each_mem_region(r) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6891) start_pfn = clamp(memblock_region_memory_base_pfn(r),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6892) zone_start_pfn, zone_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6893) end_pfn = clamp(memblock_region_memory_end_pfn(r),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6894) zone_start_pfn, zone_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6896) if (zone_type == ZONE_MOVABLE &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6897) memblock_is_mirror(r))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6898) nr_absent += end_pfn - start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6900) if (zone_type == ZONE_NORMAL &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6901) !memblock_is_mirror(r))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6902) nr_absent += end_pfn - start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6903) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6904) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6905)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6906) return nr_absent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6907) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6908)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6909) static void __init calculate_node_totalpages(struct pglist_data *pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6910) unsigned long node_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6911) unsigned long node_end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6912) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6913) unsigned long realtotalpages = 0, totalpages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6914) enum zone_type i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6915)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6916) for (i = 0; i < MAX_NR_ZONES; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6917) struct zone *zone = pgdat->node_zones + i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6918) unsigned long zone_start_pfn, zone_end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6919) unsigned long spanned, absent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6920) unsigned long size, real_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6922) spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6923) node_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6924) node_end_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6925) &zone_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6926) &zone_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6927) absent = zone_absent_pages_in_node(pgdat->node_id, i,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6928) node_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6929) node_end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6931) size = spanned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6932) real_size = size - absent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6934) if (size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6935) zone->zone_start_pfn = zone_start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6936) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6937) zone->zone_start_pfn = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6938) zone->spanned_pages = size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6939) zone->present_pages = real_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6940)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6941) totalpages += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6942) realtotalpages += real_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6943) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6945) pgdat->node_spanned_pages = totalpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6946) pgdat->node_present_pages = realtotalpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6947) printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6948) realtotalpages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6949) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6950)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6951) #ifndef CONFIG_SPARSEMEM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6952) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6953) * Calculate the size of the zone->blockflags rounded to an unsigned long.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6954) * Start by making sure zonesize is a multiple of pageblock_nr_pages by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6955) * rounding up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6956) * round the result in bits up to the nearest unsigned long, and finally
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6957) * return it in bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6958) */
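/*
 * Hypothetical example, assuming 4KiB pages, pageblock_order == 9
 * (pageblock_nr_pages == 512) and NR_PAGEBLOCK_BITS == 4: a 4GiB zone
 * starting on a pageblock boundary spans 1048576 pages, i.e. 2048
 * pageblocks.  2048 * 4 = 8192 bits, already a multiple of
 * 8 * sizeof(unsigned long) on 64-bit, so usemap_size() returns
 * 8192 / 8 = 1024 bytes.
 */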
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6959) static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6960) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6961) unsigned long usemapsize;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6963) zonesize += zone_start_pfn & (pageblock_nr_pages-1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6964) usemapsize = roundup(zonesize, pageblock_nr_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6965) usemapsize = usemapsize >> pageblock_order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6966) usemapsize *= NR_PAGEBLOCK_BITS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6967) usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6968)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6969) return usemapsize / 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6970) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6971)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6972) static void __ref setup_usemap(struct pglist_data *pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6973) struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6974) unsigned long zone_start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6975) unsigned long zonesize)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6976) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6977) unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6978) zone->pageblock_flags = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6979) if (usemapsize) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6980) zone->pageblock_flags =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6981) memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6982) pgdat->node_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6983) if (!zone->pageblock_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6984) panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6985) usemapsize, zone->name, pgdat->node_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6986) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6987) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6988) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6989) static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6990) unsigned long zone_start_pfn, unsigned long zonesize) {}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6991) #endif /* CONFIG_SPARSEMEM */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6992)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6993) #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6994)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6995) /* Initialise pageblock_order, which sets the number of pages represented by one group of NR_PAGEBLOCK_BITS */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6996) void __init set_pageblock_order(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6997) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6998) unsigned int order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6999)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7000) /* Check that pageblock_order has not already been set up */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7001) if (pageblock_order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7002) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7004) if (HPAGE_SHIFT > PAGE_SHIFT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7005) order = HUGETLB_PAGE_ORDER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7006) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7007) order = MAX_ORDER - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7008)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7009) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7010) * Assume the largest contiguous order of interest is a huge page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7011) * This value may be variable depending on boot parameters on IA64 and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7012) * powerpc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7013) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7014) pageblock_order = order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7015) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7016) #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7017)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7018) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7019) * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7020) * is unused as pageblock_order is set at compile-time. See
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7021) * include/linux/pageblock-flags.h for the values of pageblock_order based on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7022) * the kernel config
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7023) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7024) void __init set_pageblock_order(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7025) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7026) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7027)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7028) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7029)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7030) static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7031) unsigned long present_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7032) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7033) unsigned long pages = spanned_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7034)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7035) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7036) * Provide a more accurate estimation if there are holes within
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7037) * the zone and SPARSEMEM is in use. If there are holes within the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7038) * zone, each populated memory region may cost us one or two extra
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7039) * memmap pages due to alignment because memmap pages for each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7040) * populated region may not be naturally aligned on a page boundary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7041) * So the (present_pages >> 4) heuristic is a tradeoff for that.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7042) */
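/*
 * For instance (hypothetical numbers): spanned_pages = 1048576 and
 * present_pages = 917504 gives 917504 + (917504 >> 4) = 974848, which
 * is below spanned_pages, so the estimate is based on present_pages.
 */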
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7043) if (spanned_pages > present_pages + (present_pages >> 4) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7044) IS_ENABLED(CONFIG_SPARSEMEM))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7045) pages = present_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7046)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7047) return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7048) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7049)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7050) #ifdef CONFIG_TRANSPARENT_HUGEPAGE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7051) static void pgdat_init_split_queue(struct pglist_data *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7052) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7053) struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7054)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7055) spin_lock_init(&ds_queue->split_queue_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7056) INIT_LIST_HEAD(&ds_queue->split_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7057) ds_queue->split_queue_len = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7058) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7059) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7060) static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7061) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7062)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7063) #ifdef CONFIG_COMPACTION
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7064) static void pgdat_init_kcompactd(struct pglist_data *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7065) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7066) init_waitqueue_head(&pgdat->kcompactd_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7067) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7068) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7069) static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7070) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7071)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7072) static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7073) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7074) pgdat_resize_init(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7076) pgdat_init_split_queue(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7077) pgdat_init_kcompactd(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7078)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7079) init_waitqueue_head(&pgdat->kswapd_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7080) init_waitqueue_head(&pgdat->pfmemalloc_wait);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7081)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7082) pgdat_page_ext_init(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7083) spin_lock_init(&pgdat->lru_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7084) lruvec_init(&pgdat->__lruvec);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7085) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7086)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7087) static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7088) unsigned long remaining_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7089) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7090) atomic_long_set(&zone->managed_pages, remaining_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7091) zone_set_nid(zone, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7092) zone->name = zone_names[idx];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7093) zone->zone_pgdat = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7094) spin_lock_init(&zone->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7095) zone_seqlock_init(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7096) zone_pcp_init(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7097) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7098)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7099) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7100) * Set up the zone data structures
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7101) * - init pgdat internals
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7102) * - init all zones belonging to this node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7103) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7104) * NOTE: this function is only called during memory hotplug
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7105) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7106) #ifdef CONFIG_MEMORY_HOTPLUG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7107) void __ref free_area_init_core_hotplug(int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7108) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7109) enum zone_type z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7110) pg_data_t *pgdat = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7111)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7112) pgdat_init_internals(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7113) for (z = 0; z < MAX_NR_ZONES; z++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7114) zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7115) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7116) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7117)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7118) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7119) * Set up the zone data structures:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7120) * - mark all pages reserved
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7121) * - mark all memory queues empty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7122) * - clear the memory bitmaps
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7123) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7124) * NOTE: pgdat should get zeroed by caller.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7125) * NOTE: this function is only called during early init.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7126) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7127) static void __init free_area_init_core(struct pglist_data *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7128) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7129) enum zone_type j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7130) int nid = pgdat->node_id;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7131)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7132) pgdat_init_internals(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7133) pgdat->per_cpu_nodestats = &boot_nodestats;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7135) for (j = 0; j < MAX_NR_ZONES; j++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7136) struct zone *zone = pgdat->node_zones + j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7137) unsigned long size, freesize, memmap_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7138) unsigned long zone_start_pfn = zone->zone_start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7139)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7140) size = zone->spanned_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7141) freesize = zone->present_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7142)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7143) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7144) * Adjust freesize so that it accounts for how much memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7145) * is used by this zone for memmap. This affects the watermark
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7146) * and per-cpu initialisations.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7147) */
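/*
 * As a rough, hypothetical figure: with 4KiB pages and a 64-byte
 * struct page, a 1GiB zone (262144 pages) needs 262144 * 64 bytes
 * = 16MiB of memmap, i.e. 4096 pages subtracted from freesize here.
 */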
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7148) memmap_pages = calc_memmap_size(size, freesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7149) if (!is_highmem_idx(j)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7150) if (freesize >= memmap_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7151) freesize -= memmap_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7152) if (memmap_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7153) printk(KERN_DEBUG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7154) " %s zone: %lu pages used for memmap\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7155) zone_names[j], memmap_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7156) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7157) pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7158) zone_names[j], memmap_pages, freesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7159) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7160)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7161) /* Account for reserved pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7162) if (j == 0 && freesize > dma_reserve) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7163) freesize -= dma_reserve;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7164) printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7165) zone_names[0], dma_reserve);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7166) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7167)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7168) if (!is_highmem_idx(j))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7169) nr_kernel_pages += freesize;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7170) /* Charge for highmem memmap if there are enough kernel pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7171) else if (nr_kernel_pages > memmap_pages * 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7172) nr_kernel_pages -= memmap_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7173) nr_all_pages += freesize;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7175) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7176) * Set an approximate value for lowmem here; it will be adjusted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7177) * when the bootmem allocator frees pages into the buddy system.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7178) * And all highmem pages will be managed by the buddy system.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7179) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7180) zone_init_internals(zone, j, nid, freesize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7181)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7182) if (!size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7183) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7184)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7185) set_pageblock_order();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7186) setup_usemap(pgdat, zone, zone_start_pfn, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7187) init_currently_empty_zone(zone, zone_start_pfn, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7188) arch_memmap_init(size, nid, j, zone_start_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7189) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7190) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7191)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7192) #ifdef CONFIG_FLAT_NODE_MEM_MAP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7193) static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7194) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7195) unsigned long __maybe_unused start = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7196) unsigned long __maybe_unused offset = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7198) /* Skip empty nodes */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7199) if (!pgdat->node_spanned_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7200) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7201)
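/*
 * Hypothetical example: with MAX_ORDER_NR_PAGES == 1024 and
 * node_start_pfn == 0x1234, start becomes 0x1000 and offset 0x234;
 * the map allocated below covers pages from 'start' onwards and
 * node_mem_map is pointed 'offset' entries into it.
 */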
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7202) start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7203) offset = pgdat->node_start_pfn - start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7204) /* ia64 gets its own node_mem_map, before this, without bootmem */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7205) if (!pgdat->node_mem_map) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7206) unsigned long size, end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7207) struct page *map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7209) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7210) * The zone's endpoints aren't required to be MAX_ORDER
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7211) * aligned, but the node_mem_map endpoints must be, in order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7212) * for the buddy allocator to function correctly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7213) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7214) end = pgdat_end_pfn(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7215) end = ALIGN(end, MAX_ORDER_NR_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7216) size = (end - start) * sizeof(struct page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7217) map = memblock_alloc_node(size, SMP_CACHE_BYTES,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7218) pgdat->node_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7219) if (!map)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7220) panic("Failed to allocate %ld bytes for node %d memory map\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7221) size, pgdat->node_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7222) pgdat->node_mem_map = map + offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7223) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7224) pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7225) __func__, pgdat->node_id, (unsigned long)pgdat,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7226) (unsigned long)pgdat->node_mem_map);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7227) #ifndef CONFIG_NEED_MULTIPLE_NODES
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7228) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7229) * With no DISCONTIG, the global mem_map is just set as node 0's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7230) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7231) if (pgdat == NODE_DATA(0)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7232) mem_map = NODE_DATA(0)->node_mem_map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7233) if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7234) mem_map -= offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7235) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7236) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7237) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7238) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7239) static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7240) #endif /* CONFIG_FLAT_NODE_MEM_MAP */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7242) #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7243) static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7244) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7245) pgdat->first_deferred_pfn = ULONG_MAX;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7246) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7247) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7248) static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7249) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7250)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7251) static void __init free_area_init_node(int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7252) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7253) pg_data_t *pgdat = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7254) unsigned long start_pfn = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7255) unsigned long end_pfn = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7257) /* pg_data_t should be reset to zero when it's allocated */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7258) WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7260) get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7261)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7262) pgdat->node_id = nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7263) pgdat->node_start_pfn = start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7264) pgdat->per_cpu_nodestats = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7266) pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7267) (u64)start_pfn << PAGE_SHIFT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7268) end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7269) calculate_node_totalpages(pgdat, start_pfn, end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7270)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7271) alloc_node_mem_map(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7272) pgdat_set_deferred_range(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7274) free_area_init_core(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7275) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7276)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7277) void __init free_area_init_memoryless_node(int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7278) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7279) free_area_init_node(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7280) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7281)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7282) #if MAX_NUMNODES > 1
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7283) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7284) * Figure out the number of possible node ids.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7285) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7286) void __init setup_nr_node_ids(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7287) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7288) unsigned int highest;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7289)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7290) highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7291) nr_node_ids = highest + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7292) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7293) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7294)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7295) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7296) * node_map_pfn_alignment - determine the maximum internode alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7297) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7298) * This function should be called after node map is populated and sorted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7299) * It calculates the maximum power of two alignment which can distinguish
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7300) * all the nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7301) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7302) * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7303) * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7304) * nodes are shifted by 256MiB, the result is 256MiB. Note that if only the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7305) * last node is shifted, 1GiB is enough and this function will indicate so.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7306) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7307) * This is used to test whether pfn -> nid mapping of the chosen memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7308) * model has fine enough granularity to avoid incorrect mapping for the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7309) * populated node map.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7310) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7311) * Return: the determined alignment in pfn's. 0 if there is no alignment
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7312) * requirement (single node).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7313) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7314) unsigned long __init node_map_pfn_alignment(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7315) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7316) unsigned long accl_mask = 0, last_end = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7317) unsigned long start, end, mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7318) int last_nid = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7319) int i, nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7320)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7321) for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7322) if (!start || last_nid < 0 || last_nid == nid) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7323) last_nid = nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7324) last_end = end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7325) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7326) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7327)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7328) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7329) * Start with a mask granular enough to pin-point to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7330) * start pfn and tick off bits one-by-one until it becomes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7331) * too coarse to separate the current node from the last.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7332) */
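/*
 * Concretely: (start & (mask << 1)) is start rounded down to the next
 * coarser power-of-two boundary; as long as the previous node's end
 * does not reach past that boundary, the coarser granularity still
 * separates the two nodes and the mask can keep growing.
 */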
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7333) mask = ~((1 << __ffs(start)) - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7334) while (mask && last_end <= (start & (mask << 1)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7335) mask <<= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7336)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7337) /* accumulate all internode masks */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7338) accl_mask |= mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7339) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7341) /* convert mask to number of pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7342) return ~accl_mask + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7343) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7344)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7345) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7346) * find_min_pfn_with_active_regions - Find the minimum PFN registered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7347) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7348) * Return: the minimum PFN based on information provided via
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7349) * memblock_set_node().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7350) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7351) unsigned long __init find_min_pfn_with_active_regions(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7352) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7353) return PHYS_PFN(memblock_start_of_DRAM());
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7354) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7355)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7356) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7357) * early_calculate_totalpages()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7358) * Sum pages in active regions for movable zone.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7359) * Populate N_MEMORY for calculating usable_nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7360) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7361) static unsigned long __init early_calculate_totalpages(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7362) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7363) unsigned long totalpages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7364) unsigned long start_pfn, end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7365) int i, nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7366)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7367) for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7368) unsigned long pages = end_pfn - start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7369)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7370) totalpages += pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7371) if (pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7372) node_set_state(nid, N_MEMORY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7373) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7374) return totalpages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7375) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7376)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7377) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7378) * Find the PFN at which ZONE_MOVABLE begins in each node. Kernel memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7379) * is spread evenly between nodes as long as the nodes have enough
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7380) * memory. When they don't, some nodes will have more kernelcore than
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7381) * others.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7382) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7383) static void __init find_zone_movable_pfns_for_nodes(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7384) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7385) int i, nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7386) unsigned long usable_startpfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7387) unsigned long kernelcore_node, kernelcore_remaining;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7388) /* save the state before borrowing the nodemask */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7389) nodemask_t saved_node_state = node_states[N_MEMORY];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7390) unsigned long totalpages = early_calculate_totalpages();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7391) int usable_nodes = nodes_weight(node_states[N_MEMORY]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7392) struct memblock_region *r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7393)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7394) /* Need to find movable_zone earlier when movable_node is specified. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7395) find_usable_zone_for_movable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7396)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7397) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7398) * If movable_node is specified, ignore kernelcore and movablecore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7399) * options.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7400) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7401) if (movable_node_is_enabled()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7402) for_each_mem_region(r) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7403) if (!memblock_is_hotpluggable(r))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7404) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7406) nid = memblock_get_region_node(r);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7407)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7408) usable_startpfn = PFN_DOWN(r->base);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7409) zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7410) min(usable_startpfn, zone_movable_pfn[nid]) :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7411) usable_startpfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7412) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7413)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7414) goto out2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7415) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7416)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7417) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7418) * If kernelcore=mirror is specified, ignore the movablecore option.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7419) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7420) if (mirrored_kernelcore) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7421) bool mem_below_4gb_not_mirrored = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7422)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7423) for_each_mem_region(r) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7424) if (memblock_is_mirror(r))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7425) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7426)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7427) nid = memblock_get_region_node(r);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7428)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7429) usable_startpfn = memblock_region_memory_base_pfn(r);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7430)
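/*
 * 0x100000 PFNs equals 4GiB with 4KiB pages: non-mirrored regions
 * below that mark are not used for ZONE_MOVABLE placement here; they
 * only trigger the "unmirrored kernel memory" warning below.
 */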
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7431) if (usable_startpfn < 0x100000) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7432) mem_below_4gb_not_mirrored = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7433) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7434) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7435)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7436) zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7437) min(usable_startpfn, zone_movable_pfn[nid]) :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7438) usable_startpfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7439) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7440)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7441) if (mem_below_4gb_not_mirrored)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7442) pr_warn("This configuration results in unmirrored kernel memory.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7443)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7444) goto out2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7445) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7446)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7447) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7448) * If kernelcore=nn% or movablecore=nn% was specified, calculate the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7449) * amount of necessary memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7450) */
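/*
 * Hypothetical example: kernelcore=30% on a machine with 4194304
 * total pages yields (4194304 * 100 * 30) / 10000 = 1258291 pages of
 * required_kernelcore.
 */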
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7451) if (required_kernelcore_percent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7452) required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7453) 10000UL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7454) if (required_movablecore_percent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7455) required_movablecore = (totalpages * 100 * required_movablecore_percent) /
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7456) 10000UL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7457)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7458) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7459) * If movablecore= was specified, calculate the size of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7460) * kernelcore it corresponds to so that memory usable for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7461) * any allocation type is evenly spread. If both kernelcore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7462) * and movablecore are specified, then the value of kernelcore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7463) * will be used for required_kernelcore if it's greater than
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7464) * what movablecore would have allowed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7465) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7466) if (required_movablecore) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7467) unsigned long corepages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7469) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7470) * Round-up so that ZONE_MOVABLE is at least as large as what
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7471) * was requested by the user
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7472) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7473) required_movablecore =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7474) roundup(required_movablecore, MAX_ORDER_NR_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7475) required_movablecore = min(totalpages, required_movablecore);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7476) corepages = totalpages - required_movablecore;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7478) required_kernelcore = max(required_kernelcore, corepages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7479) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7480)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7481) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7482) * If kernelcore was not specified or kernelcore size is larger
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7483) * than totalpages, there is no ZONE_MOVABLE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7484) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7485) if (!required_kernelcore || required_kernelcore >= totalpages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7486) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7487)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7488) /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7489) usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7490)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7491) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7492) /* Spread kernelcore memory as evenly as possible throughout nodes */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7493) kernelcore_node = required_kernelcore / usable_nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7494) for_each_node_state(nid, N_MEMORY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7495) unsigned long start_pfn, end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7497) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7498) * Recalculate kernelcore_node if the division per node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7499) * now exceeds what is necessary to satisfy the requested
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7500) * amount of memory for the kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7501) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7502) if (required_kernelcore < kernelcore_node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7503) kernelcore_node = required_kernelcore / usable_nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7504)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7505) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7506) * As the map is walked, we track how much memory is usable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7507) * by the kernel using kernelcore_remaining. When it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7508) * 0, the rest of the node is usable by ZONE_MOVABLE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7509) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7510) kernelcore_remaining = kernelcore_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7512) /* Go through each range of PFNs within this node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7513) for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7514) unsigned long size_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7515)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7516) start_pfn = max(start_pfn, zone_movable_pfn[nid]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7517) if (start_pfn >= end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7518) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7519)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7520) /* Account for what is only usable for kernelcore */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7521) if (start_pfn < usable_startpfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7522) unsigned long kernel_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7523) kernel_pages = min(end_pfn, usable_startpfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7524) - start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7525)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7526) kernelcore_remaining -= min(kernel_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7527) kernelcore_remaining);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7528) required_kernelcore -= min(kernel_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7529) required_kernelcore);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7530)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7531) /* Continue if range is now fully accounted */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7532) if (end_pfn <= usable_startpfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7533)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7534) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7535) * Push zone_movable_pfn to the end so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7536) * that if we have to rebalance
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7537) * kernelcore across nodes, we will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7538) * not double account here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7539) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7540) zone_movable_pfn[nid] = end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7541) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7542) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7543) start_pfn = usable_startpfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7544) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7545)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7546) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7547) * The usable PFN range for ZONE_MOVABLE is from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7548) * start_pfn->end_pfn. Calculate size_pages as the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7549) * number of pages used as kernelcore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7550) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7551) size_pages = end_pfn - start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7552) if (size_pages > kernelcore_remaining)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7553) size_pages = kernelcore_remaining;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7554) zone_movable_pfn[nid] = start_pfn + size_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7555)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7556) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7557) * Some kernelcore has been met, update counts and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7558) * break if the kernelcore for this node has been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7559) * satisfied
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7560) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7561) required_kernelcore -= min(required_kernelcore,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7562) size_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7563) kernelcore_remaining -= size_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7564) if (!kernelcore_remaining)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7565) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7566) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7567) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7568)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7569) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7570) * If there is still required_kernelcore, we do another pass with one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7571) * less node in the count. This will push zone_movable_pfn[nid] further
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7572) * along on the nodes that still have memory until kernelcore is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7573) * satisfied
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7574) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7575) usable_nodes--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7576) if (usable_nodes && required_kernelcore > usable_nodes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7577) goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7578)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7579) out2:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7580) /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7581) for (nid = 0; nid < MAX_NUMNODES; nid++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7582) unsigned long start_pfn, end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7583)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7584) zone_movable_pfn[nid] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7585) roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7587) get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7588) if (zone_movable_pfn[nid] >= end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7589) zone_movable_pfn[nid] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7590) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7591)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7592) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7593) /* restore the node_state */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7594) node_states[N_MEMORY] = saved_node_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7595) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7597) /* Any regular or high memory on that node? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7598) static void check_for_memory(pg_data_t *pgdat, int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7599) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7600) enum zone_type zone_type;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7601)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7602) for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7603) struct zone *zone = &pgdat->node_zones[zone_type];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7604) if (populated_zone(zone)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7605) if (IS_ENABLED(CONFIG_HIGHMEM))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7606) node_set_state(nid, N_HIGH_MEMORY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7607) if (zone_type <= ZONE_NORMAL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7608) node_set_state(nid, N_NORMAL_MEMORY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7609) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7610) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7611) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7612) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7613)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7614) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7615) * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7616) * such cases we allow max_zone_pfn to be sorted in descending order.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7617) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7618) bool __weak arch_has_descending_max_zone_pfns(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7619) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7620) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7621) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7622)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7623) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7624) * free_area_init - Initialise all pg_data_t and zone data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7625) * @max_zone_pfn: an array of max PFNs for each zone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7626) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7627) * This will call free_area_init_node() for each active node in the system.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7628) * Using the page ranges provided by memblock_set_node(), the size of each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7629) * zone in each node and their holes is calculated. If the maximum PFNs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7630) * of two adjacent zones match, the higher zone is assumed to be empty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7631) * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7632) * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7633) * starts where the previous one ended. For example, ZONE_DMA32 starts
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7634) * at arch_max_dma_pfn.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7635) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7636) void __init free_area_init(unsigned long *max_zone_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7637) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7638) unsigned long start_pfn, end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7639) int i, nid, zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7640) bool descending;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7641)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7642) /* Record where the zone boundaries are */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7643) memset(arch_zone_lowest_possible_pfn, 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7644) sizeof(arch_zone_lowest_possible_pfn));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7645) memset(arch_zone_highest_possible_pfn, 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7646) sizeof(arch_zone_highest_possible_pfn));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7647)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7648) start_pfn = find_min_pfn_with_active_regions();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7649) descending = arch_has_descending_max_zone_pfns();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7650)
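/*
 * Walk the zones low to high (or high to low on descending layouts),
 * handing each zone the PFN range [start_pfn, max_zone_pfn[zone]) and
 * starting the next zone where this one ended.  Purely illustrative
 * layout: max_zone_pfn = { 4096, 1048576, 4194304 } would give
 * ZONE_DMA [min_pfn, 4096), ZONE_DMA32 [4096, 1048576) and
 * ZONE_NORMAL [1048576, 4194304).
 */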
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7651) for (i = 0; i < MAX_NR_ZONES; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7652) if (descending)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7653) zone = MAX_NR_ZONES - i - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7654) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7655) zone = i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7656)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7657) if (zone == ZONE_MOVABLE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7658) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7660) end_pfn = max(max_zone_pfn[zone], start_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7661) arch_zone_lowest_possible_pfn[zone] = start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7662) arch_zone_highest_possible_pfn[zone] = end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7663)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7664) start_pfn = end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7665) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7666)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7667) /* Find the PFNs that ZONE_MOVABLE begins at in each node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7668) memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7669) find_zone_movable_pfns_for_nodes();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7670)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7671) /* Print out the zone ranges */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7672) pr_info("Zone ranges:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7673) for (i = 0; i < MAX_NR_ZONES; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7674) if (i == ZONE_MOVABLE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7675) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7676) pr_info(" %-8s ", zone_names[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7677) if (arch_zone_lowest_possible_pfn[i] ==
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7678) arch_zone_highest_possible_pfn[i])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7679) pr_cont("empty\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7680) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7681) pr_cont("[mem %#018Lx-%#018Lx]\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7682) (u64)arch_zone_lowest_possible_pfn[i]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7683) << PAGE_SHIFT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7684) ((u64)arch_zone_highest_possible_pfn[i]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7685) << PAGE_SHIFT) - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7686) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7687)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7688) /* Print out the PFNs ZONE_MOVABLE begins at in each node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7689) pr_info("Movable zone start for each node\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7690) for (i = 0; i < MAX_NUMNODES; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7691) if (zone_movable_pfn[i])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7692) pr_info(" Node %d: %#018Lx\n", i,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7693) (u64)zone_movable_pfn[i] << PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7694) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7695)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7696) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7697) * Print out the early node map, and initialize the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7698) * subsection-map relative to active online memory ranges to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7699) * enable future "sub-section" extensions of the memory map.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7700) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7701) pr_info("Early memory node ranges\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7702) for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7703) pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7704) (u64)start_pfn << PAGE_SHIFT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7705) ((u64)end_pfn << PAGE_SHIFT) - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7706) subsection_map_init(start_pfn, end_pfn - start_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7707) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7708)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7709) /* Initialise every node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7710) mminit_verify_pageflags_layout();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7711) setup_nr_node_ids();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7712) for_each_online_node(nid) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7713) pg_data_t *pgdat = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7714) free_area_init_node(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7715)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7716) /* Any memory on that node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7717) if (pgdat->node_present_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7718) node_set_state(nid, N_MEMORY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7719) check_for_memory(pgdat, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7720) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7721)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7722) memmap_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7723) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7724)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7725) static int __init cmdline_parse_core(char *p, unsigned long *core,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7726) unsigned long *percent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7727) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7728) unsigned long long coremem;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7729) char *endptr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7731) if (!p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7732) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7733)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7734) /* Value may be a percentage of total memory, otherwise bytes */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7735) coremem = simple_strtoull(p, &endptr, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7736) if (*endptr == '%') {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7737) /* Paranoid check for percent values greater than 100 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7738) WARN_ON(coremem > 100);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7739)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7740) *percent = coremem;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7741) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7742) coremem = memparse(p, &p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7743) /* Paranoid check that UL is enough for the coremem value */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7744) WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7746) *core = coremem >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7747) *percent = 0UL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7748) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7749) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7750) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7751)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7752) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7753) * kernelcore=size sets the amount of memory for use by allocations that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7754) * cannot be reclaimed or migrated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7755) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7756) static int __init cmdline_parse_kernelcore(char *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7757) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7758) /* parse kernelcore=mirror */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7759) if (parse_option_str(p, "mirror")) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7760) mirrored_kernelcore = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7761) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7762) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7763)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7764) return cmdline_parse_core(p, &required_kernelcore,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7765) &required_kernelcore_percent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7766) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7767)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7768) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7769) * movablecore=size sets the amount of memory for use by allocations that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7770) * can be reclaimed or migrated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7771) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7772) static int __init cmdline_parse_movablecore(char *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7773) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7774) return cmdline_parse_core(p, &required_movablecore,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7775) &required_movablecore_percent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7776) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7777)
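/*
 * Both options accept either an absolute size, with the usual memparse()
 * suffixes, or a percentage of total memory, e.g. "kernelcore=512M" or
 * "movablecore=10%" (the values here are purely illustrative). The special
 * form "kernelcore=mirror" instead asks for kernel allocations to be kept
 * in mirrored memory regions.
 */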
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7778) early_param("kernelcore", cmdline_parse_kernelcore);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7779) early_param("movablecore", cmdline_parse_movablecore);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7781) void adjust_managed_page_count(struct page *page, long count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7782) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7783) atomic_long_add(count, &page_zone(page)->managed_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7784) totalram_pages_add(count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7785) #ifdef CONFIG_HIGHMEM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7786) if (PageHighMem(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7787) totalhigh_pages_add(count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7788) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7789) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7790) EXPORT_SYMBOL(adjust_managed_page_count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7791)
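/*
 * Free the reserved range [start, end) back to the page allocator. Each page
 * is optionally filled with @poison first (values outside 0..0xFF, e.g. a
 * negative poison, skip the memset), and @s, when non-NULL, names the region
 * in the "Freeing ... memory" log line. Returns the number of pages released.
 */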
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7792) unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7793) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7794) void *pos;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7795) unsigned long pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7796)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7797) start = (void *)PAGE_ALIGN((unsigned long)start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7798) end = (void *)((unsigned long)end & PAGE_MASK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7799) for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7800) struct page *page = virt_to_page(pos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7801) void *direct_map_addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7803) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7804) * 'direct_map_addr' might be different from 'pos'
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7805) * because some architectures' virt_to_page()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7806) * work with aliases. Getting the direct map
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7807) * address ensures that we get a _writeable_
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7808) * alias for the memset().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7809) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7810) direct_map_addr = page_address(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7811) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7812) * Perform a kasan-unchecked memset() since this memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7813) * has not been initialized.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7814) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7815) direct_map_addr = kasan_reset_tag(direct_map_addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7816) if ((unsigned int)poison <= 0xFF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7817) memset(direct_map_addr, poison, PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7818)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7819) free_reserved_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7820) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7822) if (pages && s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7823) pr_info("Freeing %s memory: %ldK\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7824) s, pages << (PAGE_SHIFT - 10));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7825)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7826) return pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7827) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7828)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7829) #ifdef CONFIG_HIGHMEM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7830) void free_highmem_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7831) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7832) __free_reserved_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7833) totalram_pages_inc();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7834) atomic_long_inc(&page_zone(page)->managed_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7835) totalhigh_pages_inc();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7836) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7837) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7839)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7840) void __init mem_init_print_info(const char *str)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7841) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7842) unsigned long physpages, codesize, datasize, rosize, bss_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7843) unsigned long init_code_size, init_data_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7844)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7845) physpages = get_num_physpages();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7846) codesize = _etext - _stext;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7847) datasize = _edata - _sdata;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7848) rosize = __end_rodata - __start_rodata;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7849) bss_size = __bss_stop - __bss_start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7850) init_data_size = __init_end - __init_begin;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7851) init_code_size = _einittext - _sinittext;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7853) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7854) * Detect special cases and adjust section sizes accordingly:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7855) * 1) .init.* may be embedded into .data sections
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7856) * 2) .init.text.* may be out of [__init_begin, __init_end],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7857) * please refer to arch/tile/kernel/vmlinux.lds.S.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7858) * 3) .rodata.* may be embedded into .text or .data sections.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7859) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7860) #define adj_init_size(start, end, size, pos, adj) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7861) do { \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7862) if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7863) size -= adj; \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7864) } while (0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7866) adj_init_size(__init_begin, __init_end, init_data_size,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7867) _sinittext, init_code_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7868) adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7869) adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7870) adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7871) adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7872)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7873) #undef adj_init_size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7874)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7875) pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7876) #ifdef CONFIG_HIGHMEM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7877) ", %luK highmem"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7878) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7879) "%s%s)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7880) nr_free_pages() << (PAGE_SHIFT - 10),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7881) physpages << (PAGE_SHIFT - 10),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7882) codesize >> 10, datasize >> 10, rosize >> 10,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7883) (init_data_size + init_code_size) >> 10, bss_size >> 10,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7884) (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7885) totalcma_pages << (PAGE_SHIFT - 10),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7886) #ifdef CONFIG_HIGHMEM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7887) totalhigh_pages() << (PAGE_SHIFT - 10),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7888) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7889) str ? ", " : "", str ? str : "");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7890) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7891)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7892) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7893) * set_dma_reserve - set the specified number of pages reserved in the first zone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7894) * @new_dma_reserve: The number of pages to mark reserved
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7895) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7896) * The per-cpu batchsize and zone watermarks are determined by managed_pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7897) * In the DMA zone, a significant percentage may be consumed by kernel image
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7898) * and other unfreeable allocations which can skew the watermarks badly. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7899) * function may optionally be used to account for unfreeable pages in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7900) * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7901) * smaller per-cpu batchsize.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7902) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7903) void __init set_dma_reserve(unsigned long new_dma_reserve)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7904) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7905) dma_reserve = new_dma_reserve;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7906) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7907)
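/*
 * CPU hotplug "dead" callback: once a CPU has gone offline, drain its LRU
 * caches and per-cpu page lists, and fold its vm event and vmstat deltas
 * into the remaining CPUs so nothing stays stranded on the dead CPU.
 */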
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7908) static int page_alloc_cpu_dead(unsigned int cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7909) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7911) lru_add_drain_cpu(cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7912) drain_pages(cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7913)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7914) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7915) * Spill the event counters of the dead processor
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7916) * into the current processor's event counters.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7917) * This artificially elevates the count of the current
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7918) * processor.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7919) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7920) vm_events_fold_cpu(cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7922) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7923) * Zero the differential counters of the dead processor
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7924) * so that the vm statistics are consistent.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7925) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7926) * This is only okay since the processor is dead and cannot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7927) * race with what we are doing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7928) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7929) cpu_vm_stats_fold(cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7930) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7931) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7933) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7934) int hashdist = HASHDIST_DEFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7935)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7936) static int __init set_hashdist(char *str)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7937) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7938) if (!str)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7939) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7940) hashdist = simple_strtoul(str, &str, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7941) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7942) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7943) __setup("hashdist=", set_hashdist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7944) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7945)
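/*
 * Early page-allocator init: disable hashdist when only a single node has
 * memory, and register the CPU hotplug "dead" callback above.
 */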
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7946) void __init page_alloc_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7947) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7948) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7950) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7951) if (num_node_state(N_MEMORY) == 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7952) hashdist = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7953) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7954)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7955) ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7956) "mm/page_alloc:dead", NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7957) page_alloc_cpu_dead);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7958) WARN_ON(ret < 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7959) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7960)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7961) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7962) * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7963) * or min_free_kbytes changes.
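*
* For each node this sums, over all zones, the largest lowmem_reserve[]
* entry plus the high watermark (capped at the zone's managed pages) and
* caches the result in pgdat->totalreserve_pages and the global
* totalreserve_pages.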
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7964) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7965) static void calculate_totalreserve_pages(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7966) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7967) struct pglist_data *pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7968) unsigned long reserve_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7969) enum zone_type i, j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7971) for_each_online_pgdat(pgdat) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7972)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7973) pgdat->totalreserve_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7975) for (i = 0; i < MAX_NR_ZONES; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7976) struct zone *zone = pgdat->node_zones + i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7977) long max = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7978) unsigned long managed_pages = zone_managed_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7979)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7980) /* Find valid and maximum lowmem_reserve in the zone */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7981) for (j = i; j < MAX_NR_ZONES; j++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7982) if (zone->lowmem_reserve[j] > max)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7983) max = zone->lowmem_reserve[j];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7984) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7986) /* we treat the high watermark as reserved pages. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7987) max += high_wmark_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7988)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7989) if (max > managed_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7990) max = managed_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7992) pgdat->totalreserve_pages += max;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7993)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7994) reserve_pages += max;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7995) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7996) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7997) totalreserve_pages = reserve_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7998) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7999)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8000) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8001) * setup_per_zone_lowmem_reserve - called whenever
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8002) * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8003) * has correct lowmem_reserve[] values, so an adequate number of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8004) * pages are left in the zone after a successful __alloc_pages().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8005) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8006) static void setup_per_zone_lowmem_reserve(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8007) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8008) struct pglist_data *pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8009) enum zone_type i, j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8010)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8011) for_each_online_pgdat(pgdat) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8012) for (i = 0; i < MAX_NR_ZONES - 1; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8013) struct zone *zone = &pgdat->node_zones[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8014) int ratio = sysctl_lowmem_reserve_ratio[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8015) bool clear = !ratio || !zone_managed_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8016) unsigned long managed_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8017)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8018) for (j = i + 1; j < MAX_NR_ZONES; j++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8019) struct zone *upper_zone = &pgdat->node_zones[j];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8020)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8021) managed_pages += zone_managed_pages(upper_zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8022)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8023) if (clear)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8024) zone->lowmem_reserve[j] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8025) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8026) zone->lowmem_reserve[j] = managed_pages / ratio;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8027) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8028) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8029) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8030)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8031) /* update totalreserve_pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8032) calculate_totalreserve_pages();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8033) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8034)
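/*
 * Distribute min_free_kbytes (and, via pages_low, extra_free_kbytes) across
 * all zones in proportion to each zone's managed pages, then derive the
 * min/low/high watermarks from that share plus the watermark_scale_factor
 * term. Highmem zones only get a small, capped WMARK_MIN since __GFP_HIGH
 * and PF_MEMALLOC allocations usually do not need highmem.
 */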
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8035) static void __setup_per_zone_wmarks(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8036) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8037) unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8038) unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8039) unsigned long lowmem_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8040) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8041) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8042)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8043) /* Calculate total number of !ZONE_HIGHMEM pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8044) for_each_zone(zone) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8045) if (!is_highmem(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8046) lowmem_pages += zone_managed_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8047) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8049) for_each_zone(zone) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8050) u64 tmp, low;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8051)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8052) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8053) tmp = (u64)pages_min * zone_managed_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8054) do_div(tmp, lowmem_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8055) low = (u64)pages_low * zone_managed_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8056) do_div(low, nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8057) if (is_highmem(zone)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8058) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8059) * __GFP_HIGH and PF_MEMALLOC allocations usually don't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8060) * need highmem pages, so cap pages_min to a small
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8061) * value here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8062) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8063) * The (WMARK_HIGH - WMARK_LOW) and (WMARK_LOW - WMARK_MIN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8064) * deltas control async page reclaim, and so should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8065) * not be capped for highmem.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8066) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8067) unsigned long min_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8069) min_pages = zone_managed_pages(zone) / 1024;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8070) min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8071) zone->_watermark[WMARK_MIN] = min_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8072) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8073) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8074) * If it's a lowmem zone, reserve a number of pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8075) * proportionate to the zone's size.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8076) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8077) zone->_watermark[WMARK_MIN] = tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8078) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8079)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8080) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8081) * Set the kswapd watermarks distance according to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8082) * scale factor in proportion to available memory, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8083) * ensure a minimum size on small systems.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8084) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8085) tmp = max_t(u64, tmp >> 2,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8086) mult_frac(zone_managed_pages(zone),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8087) watermark_scale_factor, 10000));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8088)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8089) zone->watermark_boost = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8090) zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8091) zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + tmp * 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8092)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8093) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8094) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8095)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8096) /* update totalreserve_pages */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8097) calculate_totalreserve_pages();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8098) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8099)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8100) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8101) * setup_per_zone_wmarks - called when min_free_kbytes changes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8102) * or when memory is hot-{added|removed}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8103) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8104) * Ensures that the watermark[min,low,high] values for each zone are set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8105) * correctly with respect to min_free_kbytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8106) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8107) void setup_per_zone_wmarks(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8108) {
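/* Serialize concurrent updates (sysctl writes, memory hotplug, init). */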
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8109) static DEFINE_SPINLOCK(lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8110)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8111) spin_lock(&lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8112) __setup_per_zone_wmarks();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8113) spin_unlock(&lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8114) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8116) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8117) * Initialise min_free_kbytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8118) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8119) * For small machines we want it small (128k min). For large machines
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8120) * we want it large (256MB max). But it is not linear, because network
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8121) * bandwidth does not increase linearly with machine size. We use
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8122) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8123) * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8124) * min_free_kbytes = sqrt(lowmem_kbytes * 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8125) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8126) * which yields
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8127) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8128) * 16MB: 512k
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8129) * 32MB: 724k
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8130) * 64MB: 1024k
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8131) * 128MB: 1448k
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8132) * 256MB: 2048k
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8133) * 512MB: 2896k
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8134) * 1024MB: 4096k
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8135) * 2048MB: 5792k
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8136) * 4096MB: 8192k
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8137) * 8192MB: 11584k
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8138) * 16384MB: 16384k
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8139) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8140) int __meminit init_per_zone_wmark_min(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8141) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8142) unsigned long lowmem_kbytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8143) int new_min_free_kbytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8145) lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8146) new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8148) if (new_min_free_kbytes > user_min_free_kbytes) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8149) min_free_kbytes = new_min_free_kbytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8150) if (min_free_kbytes < 128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8151) min_free_kbytes = 128;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8152) if (min_free_kbytes > 262144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8153) min_free_kbytes = 262144;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8154) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8155) pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8156) new_min_free_kbytes, user_min_free_kbytes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8157) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8158) setup_per_zone_wmarks();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8159) refresh_zone_stat_thresholds();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8160) setup_per_zone_lowmem_reserve();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8162) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8163) setup_min_unmapped_ratio();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8164) setup_min_slab_ratio();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8165) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8167) khugepaged_min_free_kbytes_update();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8168)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8169) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8170) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8171) postcore_initcall(init_per_zone_wmark_min)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8172)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8173) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8174) * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec_minmax()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8175) * so that we can recompute the per-zone watermarks whenever min_free_kbytes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8176) * or extra_free_kbytes changes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8177) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8178) int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8179) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8180) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8181) int rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8182)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8183) rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8184) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8185) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8186)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8187) if (write) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8188) user_min_free_kbytes = min_free_kbytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8189) setup_per_zone_wmarks();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8190) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8191) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8192) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8194) int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8195) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8196) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8197) int rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8198)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8199) rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8200) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8201) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8203) if (write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8204) setup_per_zone_wmarks();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8206) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8207) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8209) #ifdef CONFIG_NUMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8210) static void setup_min_unmapped_ratio(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8211) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8212) pg_data_t *pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8213) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8215) for_each_online_pgdat(pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8216) pgdat->min_unmapped_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8218) for_each_zone(zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8219) zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8220) sysctl_min_unmapped_ratio) / 100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8221) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8222)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8224) int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8225) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8226) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8227) int rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8228)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8229) rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8230) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8231) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8232)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8233) setup_min_unmapped_ratio();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8234)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8235) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8236) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8238) static void setup_min_slab_ratio(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8239) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8240) pg_data_t *pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8241) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8242)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8243) for_each_online_pgdat(pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8244) pgdat->min_slab_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8245)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8246) for_each_zone(zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8247) zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8248) sysctl_min_slab_ratio) / 100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8249) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8250)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8251) int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8252) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8253) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8254) int rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8255)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8256) rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8257) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8258) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8260) setup_min_slab_ratio();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8261)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8262) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8263) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8264) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8266) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8267) * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8268) * proc_dointvec_minmax() so that we can call setup_per_zone_lowmem_reserve()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8269) * whenever sysctl_lowmem_reserve_ratio changes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8270) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8271) * The reserve ratio has no relation to the minimum watermarks; it is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8272) * only meaningful in relation to the zone sizes that were established
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8273) * at boot time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8274) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8275) int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8276) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8277) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8278) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8279)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8280) proc_dointvec_minmax(table, write, buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8281)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8282) for (i = 0; i < MAX_NR_ZONES; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8283) if (sysctl_lowmem_reserve_ratio[i] < 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8284) sysctl_lowmem_reserve_ratio[i] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8285) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8286)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8287) setup_per_zone_lowmem_reserve();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8288) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8289) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8290)
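/* Recalculate pcp->high and pcp->batch for every per-cpu pageset of @zone. */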
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8291) static void __zone_pcp_update(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8292) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8293) unsigned int cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8294)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8295) for_each_possible_cpu(cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8296) pageset_set_high_and_batch(zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8297) per_cpu_ptr(zone->pageset, cpu));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8298) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8299)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8300) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8301) * percpu_pagelist_fraction - changes the pcp->high for each zone on each
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8302) * cpu. It is the fraction of total pages in each zone that a hot per-cpu
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8303) * pagelist can hold before it gets flushed back to the buddy allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8304) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8305) int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8306) void *buffer, size_t *length, loff_t *ppos)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8307) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8308) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8309) int old_percpu_pagelist_fraction;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8310) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8311)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8312) mutex_lock(&pcp_batch_high_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8313) old_percpu_pagelist_fraction = percpu_pagelist_fraction;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8314)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8315) ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8316) if (!write || ret < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8317) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8319) /* Sanity checking to avoid pcp imbalance */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8320) if (percpu_pagelist_fraction &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8321) percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8322) percpu_pagelist_fraction = old_percpu_pagelist_fraction;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8323) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8324) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8325) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8326)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8327) /* No change? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8328) if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8329) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8330)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8331) for_each_populated_zone(zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8332) __zone_pcp_update(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8333) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8334) mutex_unlock(&pcp_batch_high_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8335) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8336) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8337)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8338) #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8339) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8340) * Returns the number of pages that the architecture has reserved but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8341) * that are not known to alloc_large_system_hash().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8342) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8343) static unsigned long __init arch_reserved_kernel_pages(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8344) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8345) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8346) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8347) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8348)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8349) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8350) * Adaptive scale is meant to reduce the sizes of hash tables on large-memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8351) * machines. As memory size is increased, the scale is also increased, but at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8352) * a slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8353) * quadruples the scale is increased by one, which means the size of the hash table
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8354) * only doubles, instead of quadrupling as well.
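* (For example, going from just over 64G to just over 256G of memory
* quadruples the number of pages but raises the scale by one, so the hash
* table ends up only about twice as large.)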
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8355) * Because 32-bit systems cannot have large physical memory, where this scaling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8356) * makes sense, it is disabled on such platforms.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8357) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8358) #if __BITS_PER_LONG > 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8359) #define ADAPT_SCALE_BASE (64ul << 30)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8360) #define ADAPT_SCALE_SHIFT 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8361) #define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8362) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8363)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8364) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8365) * allocate a large system hash table from bootmem
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8366) * - it is assumed that the hash table must contain an exact power-of-2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8367) * quantity of entries
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8368) * - limit is the number of hash buckets, not the total allocation size
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8369) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8370) void *__init alloc_large_system_hash(const char *tablename,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8371) unsigned long bucketsize,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8372) unsigned long numentries,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8373) int scale,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8374) int flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8375) unsigned int *_hash_shift,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8376) unsigned int *_hash_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8377) unsigned long low_limit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8378) unsigned long high_limit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8379) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8380) unsigned long long max = high_limit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8381) unsigned long log2qty, size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8382) void *table = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8383) gfp_t gfp_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8384) bool virt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8385)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8386) /* allow the kernel cmdline to have a say */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8387) if (!numentries) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8388) /* round applicable memory size up to nearest megabyte */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8389) numentries = nr_kernel_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8390) numentries -= arch_reserved_kernel_pages();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8391)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8392) /* It isn't necessary when PAGE_SIZE >= 1MB */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8393) if (PAGE_SHIFT < 20)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8394) numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8396) #if __BITS_PER_LONG > 32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8397) if (!high_limit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8398) unsigned long adapt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8399)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8400) for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8401) adapt <<= ADAPT_SCALE_SHIFT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8402) scale++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8403) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8404) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8406) /* limit to 1 bucket per 2^scale bytes of low memory */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8407) if (scale > PAGE_SHIFT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8408) numentries >>= (scale - PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8409) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8410) numentries <<= (PAGE_SHIFT - scale);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8411)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8412) /* Make sure we've got at least a 0-order allocation.. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8413) if (unlikely(flags & HASH_SMALL)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8414) /* Makes no sense without HASH_EARLY */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8415) WARN_ON(!(flags & HASH_EARLY));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8416) if (!(numentries >> *_hash_shift)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8417) numentries = 1UL << *_hash_shift;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8418) BUG_ON(!numentries);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8419) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8420) } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8421) numentries = PAGE_SIZE / bucketsize;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8422) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8423) numentries = roundup_pow_of_two(numentries);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8424)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8425) /* limit allocation size to 1/16 total memory by default */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8426) if (max == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8427) max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8428) do_div(max, bucketsize);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8429) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8430) max = min(max, 0x80000000ULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8431)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8432) if (numentries < low_limit)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8433) numentries = low_limit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8434) if (numentries > max)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8435) numentries = max;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8436)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8437) log2qty = ilog2(numentries);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8438)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8439) gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
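/*
 * Try progressively smaller tables until an allocation succeeds: early
 * (HASH_EARLY) tables come from memblock, very large or hashdist tables
 * from vmalloc, and everything else from the buddy allocator via
 * alloc_pages_exact(). Each retry halves the number of entries.
 */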
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8440) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8441) virt = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8442) size = bucketsize << log2qty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8443) if (flags & HASH_EARLY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8444) if (flags & HASH_ZERO)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8445) table = memblock_alloc(size, SMP_CACHE_BYTES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8446) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8447) table = memblock_alloc_raw(size,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8448) SMP_CACHE_BYTES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8449) } else if (get_order(size) >= MAX_ORDER || hashdist) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8450) table = __vmalloc(size, gfp_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8451) virt = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8452) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8453) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8454) * If bucketsize is not a power-of-two, we may free
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8455) * some pages at the end of hash table which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8456) * alloc_pages_exact() automatically does
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8457) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8458) table = alloc_pages_exact(size, gfp_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8459) kmemleak_alloc(table, size, 1, gfp_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8460) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8461) } while (!table && size > PAGE_SIZE && --log2qty);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8462)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8463) if (!table)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8464) panic("Failed to allocate %s hash table\n", tablename);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8465)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8466) pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8467) tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8468) virt ? "vmalloc" : "linear");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8469)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8470) if (_hash_shift)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8471) *_hash_shift = log2qty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8472) if (_hash_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8473) *_hash_mask = (1 << log2qty) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8474)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8475) return table;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8476) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8478) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8479) * This function checks whether the pageblock includes unmovable pages or not.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8480) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8481) * A PageLRU check without isolation or the lru_lock can race, so a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8482) * MIGRATE_MOVABLE block might include unmovable pages, and a __PageMovable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8483) * check without lock_page may miss some movable non-LRU pages when racing,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8484) * so this function cannot be expected to be exact.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8485) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8486) * Returns NULL if only movable pages were found, otherwise the first page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8487) * that looks unmovable, without holding a reference. If the caller wants to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8488) * dereference that page (e.g., for dumping), it has to make sure that it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8489) * cannot get removed (e.g., via memory unplug) concurrently.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8490) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8491) struct page *has_unmovable_pages(struct zone *zone, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8492) int migratetype, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8493) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8494) unsigned long iter = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8495) unsigned long pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8496) unsigned long offset = pfn % pageblock_nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8497)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8498) if (is_migrate_cma_page(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8499) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8500) * CMA allocations (alloc_contig_range) really need to mark CMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8501) * pageblocks as isolated even when they are in fact not movable,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8502) * so consider them movable here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8503) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8504) if (is_migrate_cma(migratetype))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8505) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8506)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8507) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8508) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8509)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8510) for (; iter < pageblock_nr_pages - offset; iter++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8511) if (!pfn_valid_within(pfn + iter))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8512) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8514) page = pfn_to_page(pfn + iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8515)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8516) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8517) * Both bootmem allocations and memory holes are marked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8518) * PG_reserved and are unmovable. We can even have unmovable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8519) * allocations inside ZONE_MOVABLE, for example when
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8520) * specifying "movablecore".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8521) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8522) if (PageReserved(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8523) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8525) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8526) * If the zone is movable and we have ruled out all reserved
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8527) * pages then it should be reasonably safe to assume the rest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8528) * is movable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8529) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8530) if (zone_idx(zone) == ZONE_MOVABLE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8531) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8532)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8533) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8534) * Hugepages are not in LRU lists, but they're movable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8535) * THPs are on the LRU, but need to be counted as the number of small pages they span.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8536) * We need not scan over tail pages because we don't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8537) * handle each tail page individually in migration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8538) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8539) if (PageHuge(page) || PageTransCompound(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8540) struct page *head = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8541) unsigned int skip_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8542)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8543) if (PageHuge(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8544) if (!hugepage_migration_supported(page_hstate(head)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8545) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8546) } else if (!PageLRU(head) && !__PageMovable(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8547) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8548) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8549)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8550) skip_pages = compound_nr(head) - (page - head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8551) iter += skip_pages - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8552) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8553) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8554)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8555) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8556) * We can't use page_count without pinning the page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8557) * because another CPU could free the compound page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8558) * This check already skips compound tails of THPs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8559) * because their page->_refcount is zero at all times.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8560) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8561) if (!page_ref_count(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8562) if (PageBuddy(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8563) iter += (1 << buddy_order(page)) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8564) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8565) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8566)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8567) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8568) * The HWPoisoned page may not be in the buddy system, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8569) * its page_count() is not 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8570) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8571) if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8572) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8573)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8574) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8575) * We treat all PageOffline() pages as movable when offlining
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8576) * to give drivers a chance to decrement their reference count
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8577) * in MEM_GOING_OFFLINE in order to indicate that these pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8578) * can be offlined as there are no direct references anymore.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8579) * For actually unmovable PageOffline() where the driver does
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8580) * not support this, we will fail later when trying to actually
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8581) * move these pages that still have a reference count > 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8582) * (false negatives in this function only)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8583) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8584) if ((flags & MEMORY_OFFLINE) && PageOffline(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8585) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8587) if (__PageMovable(page) || PageLRU(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8588) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8590) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8591) * If there are RECLAIMABLE pages, we need to check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8592) * them. But for now, memory offlining itself doesn't call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8593) * shrink_node_slabs(), and that still needs to be fixed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8594) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8595) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8596) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8597) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8598) }
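
/*
 * Illustrative sketch of a hypothetical caller (not lifted from
 * page_isolation.c): how the result is typically consumed. As noted
 * above, the returned page is not pinned, so it is only dumped while the
 * caller otherwise keeps it from vanishing (e.g. while isolating the
 * pageblock under zone->lock).
 *
 *	struct page *unmovable;
 *
 *	unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags);
 *	if (unmovable) {
 *		if (isol_flags & REPORT_FAILURE)
 *			dump_page(unmovable, "unmovable page");
 *		return -EBUSY;
 *	}
 */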
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8599)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8600) #ifdef CONFIG_CONTIG_ALLOC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8601) static unsigned long pfn_max_align_down(unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8602) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8603) return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8604) pageblock_nr_pages) - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8605) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8606)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8607) unsigned long pfn_max_align_up(unsigned long pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8608) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8609) return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8610) pageblock_nr_pages));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8611) }
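
/*
 * Worked example with illustrative numbers: on a common x86_64
 * configuration MAX_ORDER_NR_PAGES is 1024 and pageblock_nr_pages is 512,
 * so the alignment unit above is 1024 pfns (0x400):
 *
 *	pfn_max_align_down(0x12345) == 0x12000	(0x12345 & ~(0x400 - 1))
 *	pfn_max_align_up(0x12345)   == 0x12400	(ALIGN(0x12345, 0x400))
 */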
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8612)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8613) #if defined(CONFIG_DYNAMIC_DEBUG) || \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8614) (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8615) /* Usage: See admin-guide/dynamic-debug-howto.rst */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8616) static void alloc_contig_dump_pages(struct list_head *page_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8617) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8618) DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8619)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8620) if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8621) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8622) unsigned long nr_skip = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8623) unsigned long nr_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8625) dump_stack();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8626) list_for_each_entry(page, page_list, lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8627) nr_pages++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8628) /* The page will be freed by putback_movable_pages soon */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8629) if (page_count(page) == 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8630) nr_skip++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8631) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8632) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8633) dump_page(page, "migration failure");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8634) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8635) pr_warn("total dump_pages %lu skipping %lu\n", nr_pages, nr_skip);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8636) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8637) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8638) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8639) static inline void alloc_contig_dump_pages(struct list_head *page_list)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8640) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8641) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8642) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8643)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8644) /* [start, end) must belong to a single zone. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8645) static int __alloc_contig_migrate_range(struct compact_control *cc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8646) unsigned long start, unsigned long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8647) struct acr_info *info)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8648) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8649) /* This function is based on compact_zone() from compaction.c. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8650) unsigned int nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8651) unsigned long pfn = start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8652) unsigned int tries = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8653) unsigned int max_tries = 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8654) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8655) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8656) struct migration_target_control mtc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8657) .nid = zone_to_nid(cc->zone),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8658) .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8659) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8660)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8661) if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8662) max_tries = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8663)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8664) lru_cache_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8665)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8666) while (pfn < end || !list_empty(&cc->migratepages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8667) if (fatal_signal_pending(current)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8668) ret = -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8669) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8670) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8671)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8672) if (list_empty(&cc->migratepages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8673) cc->nr_migratepages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8674) pfn = isolate_migratepages_range(cc, pfn, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8675) if (!pfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8676) ret = -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8677) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8678) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8679) tries = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8680) } else if (++tries == max_tries) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8681) ret = ret < 0 ? ret : -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8682) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8683) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8684)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8685) nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8686) &cc->migratepages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8687) info->nr_reclaimed += nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8688) cc->nr_migratepages -= nr_reclaimed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8689)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8690) list_for_each_entry(page, &cc->migratepages, lru)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8691) info->nr_mapped += page_mapcount(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8693) ret = migrate_pages(&cc->migratepages, alloc_migration_target,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8694) NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8695) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8696) info->nr_migrated += cc->nr_migratepages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8697) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8698)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8699) lru_cache_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8700) if (ret < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8701) if (ret == -EBUSY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8702) alloc_contig_dump_pages(&cc->migratepages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8703) page_pinner_mark_migration_failed_pages(&cc->migratepages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8704) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8705)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8706) if (!list_empty(&cc->migratepages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8707) page = list_first_entry(&cc->migratepages, struct page, lru);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8708) info->failed_pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8709) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8710)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8711) putback_movable_pages(&cc->migratepages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8712) info->err |= ACR_ERR_MIGRATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8713) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8714) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8715) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8716) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8717)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8718) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8719) * alloc_contig_range() -- tries to allocate given range of pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8720) * @start: start PFN to allocate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8721) * @end: one-past-the-last PFN to allocate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8722) * @migratetype: migratetype of the underlying pageblocks (either
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8723) * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8724) * in range must have the same migratetype and it must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8725) * be either of the two.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8726) * @gfp_mask: GFP mask to use during compaction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8727) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8728) * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8729) * aligned. The PFN range must belong to a single zone.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8730) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8731) * The first thing this routine does is attempt to MIGRATE_ISOLATE all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8732) * pageblocks in the range. Once isolated, the pageblocks should not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8733) * be modified by others.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8734) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8735) * Return: zero on success or negative error code. On success all
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8736) * pages whose PFN is in [start, end) are allocated for the caller and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8737) * need to be freed with free_contig_range().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8738) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8739) int alloc_contig_range(unsigned long start, unsigned long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8740) unsigned migratetype, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8741) struct acr_info *info)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8742) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8743) unsigned long outer_start, outer_end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8744) unsigned int order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8745) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8746) bool skip_drain_all_pages = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8747)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8748) struct compact_control cc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8749) .nr_migratepages = 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8750) .order = -1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8751) .zone = page_zone(pfn_to_page(start)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8752) .mode = gfp_mask & __GFP_NORETRY ? MIGRATE_ASYNC : MIGRATE_SYNC,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8753) .ignore_skip_hint = true,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8754) .no_set_skip_hint = true,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8755) .gfp_mask = current_gfp_context(gfp_mask),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8756) .alloc_contig = true,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8757) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8758) INIT_LIST_HEAD(&cc.migratepages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8759)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8760) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8761) * What we do here is mark all pageblocks in the range as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8762) * MIGRATE_ISOLATE. Because pageblocks and max-order pages may
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8763) * have different sizes, and due to the way the page allocator
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8764) * works, we align the range to the bigger of the two so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8765) * that the page allocator won't try to merge buddies from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8766) * different pageblocks and change MIGRATE_ISOLATE to some
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8767) * other migration type.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8768) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8769) * Once the pageblocks are marked as MIGRATE_ISOLATE, we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8770) * migrate the pages from the unaligned range (i.e. the pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8771) * we are interested in). This puts all the pages in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8772) * range back into the page allocator as MIGRATE_ISOLATE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8773) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8774) * When this is done, we take the pages in the range from the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8775) * page allocator, removing them from the buddy system. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8776) * way the page allocator will never consider using them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8777) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8778) * This lets us mark the pageblocks back as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8779) * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8780) * aligned range but not in the unaligned, original range are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8781) * put back into the page allocator so that buddy can use them.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8782) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8783)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8784) ret = start_isolate_page_range(pfn_max_align_down(start),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8785) pfn_max_align_up(end), migratetype, 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8786) &info->failed_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8787) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8788) info->err |= ACR_ERR_ISOLATE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8789) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8790) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8792) trace_android_vh_cma_drain_all_pages_bypass(migratetype,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8793) &skip_drain_all_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8794) if (!skip_drain_all_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8795) drain_all_pages(cc.zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8796)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8797) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8798) * In case of -EBUSY, we'd like to know which page causes the problem.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8799) * So, just fall through. test_pages_isolated() has a tracepoint
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8800) * which will report the busy page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8801) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8802) * It is possible that busy pages could become available before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8803) * the call to test_pages_isolated, and the range will actually be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8804) * allocated. So, if we fall through, be sure to clear ret so that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8805) * -EBUSY is not accidentally used or returned to the caller.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8806) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8807) ret = __alloc_contig_migrate_range(&cc, start, end, info);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8808) if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8809) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8810) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8811)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8812) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8813) * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8814) * aligned blocks that are marked as MIGRATE_ISOLATE. What's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8815) * more, all pages in [start, end) are free in page allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8816) * What we are going to do is to allocate all pages from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8817) * [start, end) (that is remove them from page allocator).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8818) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8819) * The only problem is that pages at the beginning and at the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8820) * end of the interesting range may not be aligned with pages that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8821) * the page allocator holds, i.e. they can be part of higher order
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8822) * pages. Because of this, we reserve the bigger range and,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8823) * once this is done, free the pages we are not interested in.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8824) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8825) * We don't have to hold zone->lock here because the pages are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8826) * isolated and thus won't get removed from the buddy allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8827) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8828)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8829) order = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8830) outer_start = start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8831) while (!PageBuddy(pfn_to_page(outer_start))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8832) if (++order >= MAX_ORDER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8833) outer_start = start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8834) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8835) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8836) outer_start &= ~0UL << order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8837) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8839) if (outer_start != start) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8840) order = buddy_order(pfn_to_page(outer_start));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8842) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8843) * The outer_start page could be a small-order buddy page that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8844) * doesn't include the start page. Adjust outer_start
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8845) * in this case so the failed page is reported properly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8846) * by the tracepoint in test_pages_isolated().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8847) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8848) if (outer_start + (1UL << order) <= start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8849) outer_start = start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8850) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8851)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8852) /* Make sure the range is really isolated. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8853) if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8854) pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8855) __func__, outer_start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8856) ret = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8857) info->err |= ACR_ERR_TEST;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8858) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8859) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8860)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8861) /* Grab isolated pages from freelists. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8862) outer_end = isolate_freepages_range(&cc, outer_start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8863) if (!outer_end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8864) ret = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8865) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8866) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8868) /* Free head and tail (if any) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8869) if (start != outer_start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8870) free_contig_range(outer_start, start - outer_start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8871) if (end != outer_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8872) free_contig_range(end, outer_end - end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8874) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8875) undo_isolate_page_range(pfn_max_align_down(start),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8876) pfn_max_align_up(end), migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8877) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8878) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8879) EXPORT_SYMBOL(alloc_contig_range);
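
/*
 * Illustrative sketch of a caller (local names are made up): this variant
 * of alloc_contig_range() also takes a struct acr_info, so the failing
 * stage (ACR_ERR_ISOLATE/ACR_ERR_MIGRATE/ACR_ERR_TEST) and the offending
 * pfn can be inspected on error. The pr_debug() format assumes err is an
 * unsigned int and failed_pfn an unsigned long.
 *
 *	struct acr_info info = {0};
 *	int ret;
 *
 *	ret = alloc_contig_range(start_pfn, start_pfn + nr_pages,
 *				 MIGRATE_MOVABLE, GFP_KERNEL, &info);
 *	if (ret) {
 *		pr_debug("acr: err=%#x failed_pfn=%#lx\n",
 *			 info.err, info.failed_pfn);
 *		return ret;
 *	}
 *	...use the nr_pages pages starting at pfn_to_page(start_pfn)...
 *	free_contig_range(start_pfn, nr_pages);
 */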
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8881) static int __alloc_contig_pages(unsigned long start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8882) unsigned long nr_pages, gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8883) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8884) struct acr_info dummy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8885) unsigned long end_pfn = start_pfn + nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8886)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8887) return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8888) gfp_mask, &dummy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8889) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8890)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8891) static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8892) unsigned long nr_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8893) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8894) unsigned long i, end_pfn = start_pfn + nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8895) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8896)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8897) for (i = start_pfn; i < end_pfn; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8898) page = pfn_to_online_page(i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8899) if (!page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8900) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8901)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8902) if (page_zone(page) != z)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8903) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8904)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8905) if (PageReserved(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8906) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8907)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8908) if (page_count(page) > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8909) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8910)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8911) if (PageHuge(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8912) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8913) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8914) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8915) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8916)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8917) static bool zone_spans_last_pfn(const struct zone *zone,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8918) unsigned long start_pfn, unsigned long nr_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8919) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8920) unsigned long last_pfn = start_pfn + nr_pages - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8922) return zone_spans_pfn(zone, last_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8923) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8924)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8925) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8926) * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8927) * @nr_pages: Number of contiguous pages to allocate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8928) * @gfp_mask: GFP mask to limit search and used during compaction
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8929) * @nid: Target node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8930) * @nodemask: Mask for other possible nodes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8931) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8932) * This routine is a wrapper around alloc_contig_range(). It scans over zones
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8933) * on an applicable zonelist to find a contiguous pfn range which can then be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8934) * tried for allocation with alloc_contig_range(). This routine is intended
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8935) * for allocation requests which cannot be fulfilled by the buddy allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8936) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8937) * The allocated memory is always aligned to a page boundary. If nr_pages is a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8938) * power of two then the alignment is guaranteed to be to the given nr_pages
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8939) * (e.g. 1GB request would be aligned to 1GB).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8940) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8941) * Allocated pages can be freed with free_contig_range() or by manually calling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8942) * __free_page() on each allocated page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8943) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8944) * Return: pointer to contiguous pages on success, or NULL if not successful.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8945) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8946) struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8947) int nid, nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8948) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8949) unsigned long ret, pfn, flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8950) struct zonelist *zonelist;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8951) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8952) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8953)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8954) zonelist = node_zonelist(nid, gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8955) for_each_zone_zonelist_nodemask(zone, z, zonelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8956) gfp_zone(gfp_mask), nodemask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8957) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8958)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8959) pfn = ALIGN(zone->zone_start_pfn, nr_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8960) while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8961) if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8962) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8963) * We release the zone lock here because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8964) * alloc_contig_range() will also lock the zone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8965) * at some point. If there's an allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8966) * spinning on this lock, it may win the race
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8967) * and cause alloc_contig_range() to fail...
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8968) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8969) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8970) ret = __alloc_contig_pages(pfn, nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8971) gfp_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8972) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8973) return pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8974) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8975) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8976) pfn += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8977) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8978) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8979) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8980) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8981) }
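
/*
 * Illustrative sketch of a caller (names are made up): grab nr_pages
 * physically contiguous pages from node nid and release them again.
 *
 *	struct page *pages;
 *
 *	pages = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE,
 *				   nid, NULL);
 *	if (!pages)
 *		return -ENOMEM;
 *	...
 *	free_contig_range(page_to_pfn(pages), nr_pages);
 */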
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8982) #endif /* CONFIG_CONTIG_ALLOC */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8983)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8984) void free_contig_range(unsigned long pfn, unsigned int nr_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8985) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8986) unsigned int count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8987)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8988) for (; nr_pages--; pfn++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8989) struct page *page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8990)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8991) count += page_count(page) != 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8992) __free_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8993) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8994) WARN(count != 0, "%d pages are still in use!\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8995) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8996) EXPORT_SYMBOL(free_contig_range);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8997)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8998) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8999) * The zone indicated has a new number of managed_pages; batch sizes and percpu
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9000) * page high values need to be recalculated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9001) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9002) void __meminit zone_pcp_update(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9003) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9004) mutex_lock(&pcp_batch_high_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9005) __zone_pcp_update(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9006) mutex_unlock(&pcp_batch_high_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9007) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9008)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9009) void zone_pcp_reset(struct zone *zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9010) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9011) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9012) int cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9013) struct per_cpu_pageset *pset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9015) /* avoid races with drain_pages() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9016) local_irq_save(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9017) if (zone->pageset != &boot_pageset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9018) for_each_online_cpu(cpu) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9019) pset = per_cpu_ptr(zone->pageset, cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9020) drain_zonestat(zone, pset);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9021) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9022) free_percpu(zone->pageset);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9023) zone->pageset = &boot_pageset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9024) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9025) local_irq_restore(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9026) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9027)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9028) #ifdef CONFIG_MEMORY_HOTREMOVE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9029) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9030) * All pages in the range must be in a single zone, must not contain holes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9031) * must span full sections, and must be isolated before calling this function.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9032) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9033) void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9034) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9035) unsigned long pfn = start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9036) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9037) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9038) unsigned int order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9039) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9040)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9041) offline_mem_sections(pfn, end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9042) zone = page_zone(pfn_to_page(pfn));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9043) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9044) while (pfn < end_pfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9045) page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9046) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9047) * The HWPoisoned page may not be in the buddy system, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9048) * its page_count() is not 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9049) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9050) if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9051) pfn++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9052) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9053) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9054) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9055) * At this point all remaining PageOffline() pages have a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9056) * reference count of 0 and can simply be skipped.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9057) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9058) if (PageOffline(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9059) BUG_ON(page_count(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9060) BUG_ON(PageBuddy(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9061) pfn++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9062) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9063) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9065) BUG_ON(page_count(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9066) BUG_ON(!PageBuddy(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9067) order = buddy_order(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9068) del_page_from_free_list(page, zone, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9069) pfn += (1 << order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9070) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9071) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9072) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9073) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9074)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9075) bool is_free_buddy_page(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9076) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9077) struct zone *zone = page_zone(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9078) unsigned long pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9079) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9080) unsigned int order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9081)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9082) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9083) for (order = 0; order < MAX_ORDER; order++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9084) struct page *page_head = page - (pfn & ((1 << order) - 1));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9085)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9086) if (PageBuddy(page_head) && buddy_order(page_head) >= order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9087) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9088) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9089) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9090)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9091) return order < MAX_ORDER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9092) }
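
/*
 * Worked example for the head lookup above (numbers are illustrative):
 * for pfn 0x1234 at order 4, pfn & ((1 << 4) - 1) == 0x4, so page_head
 * points 4 pages back, at the potential order-4 buddy starting at pfn
 * 0x1230. The page is reported free iff one such head is PageBuddy()
 * with buddy_order() >= order.
 */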
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9093)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9094) #ifdef CONFIG_MEMORY_FAILURE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9095) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9096) * Break down a higher-order page into sub-pages, and keep our target out of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9097) * the buddy allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9098) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9099) static void break_down_buddy_pages(struct zone *zone, struct page *page,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9100) struct page *target, int low, int high,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9101) int migratetype)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9102) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9103) unsigned long size = 1 << high;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9104) struct page *current_buddy, *next_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9106) while (high > low) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9107) high--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9108) size >>= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9110) if (target >= &page[size]) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9111) next_page = page + size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9112) current_buddy = page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9113) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9114) next_page = page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9115) current_buddy = page + size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9116) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9117)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9118) if (set_page_guard(zone, current_buddy, high, migratetype))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9119) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9120)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9121) if (current_buddy != target) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9122) add_to_free_list(current_buddy, zone, high, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9123) set_buddy_order(current_buddy, high);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9124) page = next_page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9125) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9126) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9127) }
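
/*
 * Worked example (illustrative numbers): splitting an order-3 buddy at
 * pfn "base" while keeping target == base + 5 out of the free lists.
 * Each step halves the block and returns the half that does not contain
 * the target:
 *
 *	high 3 -> 2: [base + 0, base + 4) freed as an order-2 block,
 *		     the walk continues in [base + 4, base + 8)
 *	high 2 -> 1: [base + 6, base + 8) freed as an order-1 block
 *	high 1 -> 0: base + 4 freed as an order-0 page
 *
 * so only base + 5 (the target) is left out of the buddy lists.
 */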
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9129) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9130) * Take a page that will be marked as poisoned off the buddy allocator.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9131) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9132) bool take_page_off_buddy(struct page *page)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9133) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9134) struct zone *zone = page_zone(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9135) unsigned long pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9136) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9137) unsigned int order;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9138) bool ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9139)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9140) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9141) for (order = 0; order < MAX_ORDER; order++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9142) struct page *page_head = page - (pfn & ((1 << order) - 1));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9143) int page_order = buddy_order(page_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9145) if (PageBuddy(page_head) && page_order >= order) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9146) unsigned long pfn_head = page_to_pfn(page_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9147) int migratetype = get_pfnblock_migratetype(page_head,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9148) pfn_head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9150) del_page_from_free_list(page_head, zone, page_order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9151) break_down_buddy_pages(zone, page_head, page, 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9152) page_order, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9153) if (!is_migrate_isolate(migratetype))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9154) __mod_zone_freepage_state(zone, -1, migratetype);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9155) ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9156) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9157) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9158) if (page_count(page_head) > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9159) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9160) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9161) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9162) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9163) }
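
/*
 * Illustrative sketch (simplified, not copied from memory-failure.c) of
 * the expected shape of a memory-poisoning caller: pull the free page out
 * of the buddy lists before marking it, so it can never be allocated
 * again.
 *
 *	if (take_page_off_buddy(page)) {
 *		SetPageHWPoison(page);
 *		...account the poisoned page...
 *	}
 */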
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9164) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9166) #ifdef CONFIG_ZONE_DMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9167) bool has_managed_dma(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9168) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9169) struct pglist_data *pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9170)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9171) for_each_online_pgdat(pgdat) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9172) struct zone *zone = &pgdat->node_zones[ZONE_DMA];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9173)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9174) if (managed_zone(zone))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9175) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9176) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9177) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9178) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9179) #endif /* CONFIG_ZONE_DMA */