// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/mm/memory_hotplug.c
 *
 * Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/compaction.h>
#include <linux/rmap.h>

#include <asm/tlbflush.h>

#include "internal.h"
#include "shuffle.h"

/*
 * online_page_callback holds a pointer to the current page onlining function.
 * Initially it is generic_online_page(). If required, it can be changed by
 * calling set_online_page_callback() to register a custom callback and
 * restore_online_page_callback() to restore the generic one.
 */

static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);

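/*
 * mem_hotplug_lock serializes memory hotplug: hotplug paths take it for
 * writing via mem_hotplug_begin()/mem_hotplug_done(), while code that only
 * needs a stable memory layout takes it for reading. A typical reader
 * section looks like:
 *
 *	get_online_mems();
 *	... walk memory that must not be hot(un)plugged underneath us ...
 *	put_online_mems();
 */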
DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);

void get_online_mems(void)
{
	percpu_down_read(&mem_hotplug_lock);
}

void put_online_mems(void)
{
	percpu_up_read(&mem_hotplug_lock);
}

bool movable_node_enabled = false;

#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
int memhp_default_online_type = MMOP_OFFLINE;
#else
int memhp_default_online_type = MMOP_ONLINE;
#endif

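/*
 * Parse the "memhp_default_state=" boot parameter to select the default
 * onlining policy applied to newly added memory blocks.
 */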
static int __init setup_memhp_default_state(char *str)
{
	const int online_type = memhp_online_type_from_str(str);

	if (online_type >= 0)
		memhp_default_online_type = online_type;

	return 1;
}
__setup("memhp_default_state=", setup_memhp_default_state);

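/*
 * Hold the CPU hotplug read lock and mem_hotplug_lock for writing while a
 * memory hotplug operation is in progress.
 */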
void mem_hotplug_begin(void)
{
	cpus_read_lock();
	percpu_down_write(&mem_hotplug_lock);
}

void mem_hotplug_done(void)
{
	percpu_up_write(&mem_hotplug_lock);
	cpus_read_unlock();
}

u64 max_mem_size = U64_MAX;

/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size,
						 const char *resource_name)
{
	struct resource *res;
	unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;

	if (strcmp(resource_name, "System RAM"))
		flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;

	/*
	 * Make sure the value parsed from 'mem=' only restricts memory adding
	 * while booting, so that memory hotplug won't be affected. Please
	 * refer to the documentation of 'mem=' in kernel-parameters.txt for
	 * more details.
	 */
	if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
		return ERR_PTR(-E2BIG);

	/*
	 * Request ownership of the new memory range. This might be
	 * a child of an existing resource that was present but
	 * not marked as busy.
	 */
	res = __request_region(&iomem_resource, start, size,
			       resource_name, flags);

	if (!res) {
		pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
				start, start + size);
		return ERR_PTR(-EEXIST);
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
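/*
 * Mark @page as carrying bootmem metadata of the given @type: the type is
 * stashed in page->freelist, @info in page_private(), and a reference is
 * taken so the page is not freed while the information is in use.
 * put_page_bootmem() drops that reference and releases the page once the
 * last user is gone.
 */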
void get_page_bootmem(unsigned long info, struct page *page,
		      unsigned long type)
{
	page->freelist = (void *)type;
	SetPagePrivate(page);
	set_page_private(page, info);
	page_ref_inc(page);
}

void put_page_bootmem(struct page *page)
{
	unsigned long type;

	type = (unsigned long) page->freelist;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (page_ref_dec_return(page) == 1) {
		page->freelist = NULL;
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
		free_reserved_page(page);
	}
}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;
	struct mem_section_usage *usage;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usage = ms->usage;
	page = virt_to_page(usage);

	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;
	struct mem_section_usage *usage;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usage = ms->usage;
	page = virt_to_page(usage);

	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

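/*
 * Register bootmem info for a whole node: the pages backing the pgdat, as
 * well as the memmap and mem_section_usage of every section spanned by the
 * node, are marked via get_page_bootmem().
 */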
void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes -
		 * on node0 as well as nodeN. To avoid registering a pfn
		 * against multiple nodes, check that this pfn does not
		 * already reside in some other node.
		 */
		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
			  const char *reason)
{
	/*
	 * Disallow all operations smaller than a sub-section and only
	 * allow operations smaller than a section for
	 * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
	 * enforces a larger memory_block_size_bytes() granularity for
	 * memory that will be marked online, so this check should only
	 * fire for direct arch_{add,remove}_memory() users outside of
	 * add_memory_resource().
	 */
	unsigned long min_align;

	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
		min_align = PAGES_PER_SUBSECTION;
	else
		min_align = PAGES_PER_SECTION;
	if (!IS_ALIGNED(pfn, min_align)
			|| !IS_ALIGNED(nr_pages, min_align)) {
		WARN(1, "Misaligned __%s_pages start: %#lx end: %#lx\n",
		     reason, pfn, pfn + nr_pages - 1);
		return -EINVAL;
	}
	return 0;
}

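/*
 * Reject ranges whose highest physical address cannot be represented in
 * MAX_PHYSMEM_BITS bits.
 */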
static int check_hotplug_memory_addressable(unsigned long pfn,
					    unsigned long nr_pages)
{
	const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;

	if (max_addr >> MAX_PHYSMEM_BITS) {
		const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
		WARN(1,
		     "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
		     (u64)PFN_PHYS(pfn), max_addr, max_allowed);
		return -E2BIG;
	}

	return 0;
}

/*
 * Reasonably generic function for adding memory. It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
		      struct mhp_params *params)
{
	const unsigned long end_pfn = pfn + nr_pages;
	unsigned long cur_nr_pages;
	int err;
	struct vmem_altmap *altmap = params->altmap;

	if (WARN_ON_ONCE(!params->pgprot.pgprot))
		return -EINVAL;

	err = check_hotplug_memory_addressable(pfn, nr_pages);
	if (err)
		return err;

	if (altmap) {
		/*
		 * Validate altmap is within bounds of the total request
		 */
		if (altmap->base_pfn != pfn
				|| vmem_altmap_offset(altmap) > nr_pages) {
			pr_warn_once("memory add fail, invalid altmap\n");
			return -EINVAL;
		}
		altmap->alloc = 0;
	}

	err = check_pfn_span(pfn, nr_pages, "add");
	if (err)
		return err;

	for (; pfn < end_pfn; pfn += cur_nr_pages) {
		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages = min(end_pfn - pfn,
				   SECTION_ALIGN_UP(pfn + 1) - pfn);
		err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
		if (err)
			break;
		cond_resched();
	}
	vmemmap_populate_print_last();
	return err;
}

/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
					       unsigned long start_pfn,
					       unsigned long end_pfn)
{
	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
		if (unlikely(!pfn_to_online_page(start_pfn)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
					      unsigned long start_pfn,
					      unsigned long end_pfn)
{
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
		if (unlikely(!pfn_to_online_page(pfn)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

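/*
 * Shrink the zone span when [start_pfn, end_pfn) is removed from its
 * beginning or its end, by searching for the next remaining online pfn that
 * still belongs to this zone and node.
 */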
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long pfn;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone->zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, we need
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, find the second smallest valid mem_section
		 * and shrink the zone accordingly.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn(zone));
		if (pfn) {
			zone->spanned_pages = zone_end_pfn(zone) - pfn;
			zone->zone_start_pfn = pfn;
		} else {
			zone->zone_start_pfn = 0;
			zone->spanned_pages = 0;
		}
	} else if (zone_end_pfn(zone) == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, we need
		 * to shrink zone->spanned_pages.
		 * In this case, find the second biggest valid mem_section
		 * and shrink the zone accordingly.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
		else {
			zone->zone_start_pfn = 0;
			zone->spanned_pages = 0;
		}
	}
	zone_span_writeunlock(zone);
}

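/* Recompute the node span from the spans of all non-empty zones. */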
static void update_pgdat_span(struct pglist_data *pgdat)
{
	unsigned long node_start_pfn = 0, node_end_pfn = 0;
	struct zone *zone;

	for (zone = pgdat->node_zones;
	     zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
		unsigned long zone_end_pfn = zone->zone_start_pfn +
					     zone->spanned_pages;

		/* No need to lock the zones, they can't change. */
		if (!zone->spanned_pages)
			continue;
		if (!node_end_pfn) {
			node_start_pfn = zone->zone_start_pfn;
			node_end_pfn = zone_end_pfn;
			continue;
		}

		if (zone_end_pfn > node_end_pfn)
			node_end_pfn = zone_end_pfn;
		if (zone->zone_start_pfn < node_start_pfn)
			node_start_pfn = zone->zone_start_pfn;
	}

	pgdat->node_start_pfn = node_start_pfn;
	pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}

void __ref remove_pfn_range_from_zone(struct zone *zone,
				      unsigned long start_pfn,
				      unsigned long nr_pages)
{
	const unsigned long end_pfn = start_pfn + nr_pages;
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long pfn, cur_nr_pages, flags;

	/* Poison struct pages because they are now uninitialized again. */
	for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
		cond_resched();

		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages =
			min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
		page_init_poison(pfn_to_page(pfn),
				 sizeof(struct page) * cur_nr_pages);
	}

#ifdef CONFIG_ZONE_DEVICE
	/*
	 * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
	 * we will not try to shrink the zones - which is okay as
	 * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
	 */
	if (zone_idx(zone) == ZONE_DEVICE)
		return;
#endif

	clear_zone_contiguous(zone);

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	update_pgdat_span(pgdat);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	set_zone_contiguous(zone);
}

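/* Tear down the memmap of a single (sub)section worth of pages. */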
static void __remove_section(unsigned long pfn, unsigned long nr_pages,
			     unsigned long map_offset,
			     struct vmem_altmap *altmap)
{
	struct mem_section *ms = __pfn_to_section(pfn);

	if (WARN_ON_ONCE(!valid_section(ms)))
		return;

	sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
}

/**
 * __remove_pages() - remove sections of pages
 * @pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 * @altmap: alternative device page map or %NULL if default memmap is used
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
		    struct vmem_altmap *altmap)
{
	const unsigned long end_pfn = pfn + nr_pages;
	unsigned long cur_nr_pages;
	unsigned long map_offset = 0;

	map_offset = vmem_altmap_offset(altmap);

	if (check_pfn_span(pfn, nr_pages, "remove"))
		return;

	for (; pfn < end_pfn; pfn += cur_nr_pages) {
		cond_resched();
		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages = min(end_pfn - pfn,
				   SECTION_ALIGN_UP(pfn + 1) - pfn);
		__remove_section(pfn, cur_nr_pages, map_offset, altmap);
		map_offset = 0;
	}
}

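/*
 * Allow a driver (e.g., a memory ballooning driver) to override how newly
 * onlined pages are handed to the system. Only a single callback can be
 * registered at a time; registration fails unless the generic callback is
 * currently in place. A hypothetical user would do roughly:
 *
 *	err = set_online_page_callback(&my_online_page);
 *	...
 *	restore_online_page_callback(&my_online_page);
 *
 * where my_online_page() (a made-up name for illustration) eventually calls
 * generic_online_page() for the pages it decides to expose.
 */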
int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

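/* Default online_page_callback: release the pages to the buddy allocator. */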
void generic_online_page(struct page *page, unsigned int order)
{
	/*
	 * Freeing the page with debug_pagealloc enabled will try to unmap it,
	 * so we should map it first. This is better than introducing a special
	 * case in the page freeing fast path.
	 */
	debug_pagealloc_map_pages(page, 1 << order);
	__free_pages_core(page, order);
	totalram_pages_add(1UL << order);
#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages_add(1UL << order);
#endif
}
EXPORT_SYMBOL_GPL(generic_online_page);

static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
{
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn;

	/*
	 * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
	 * decide to not expose all pages to the buddy (e.g., expose them
	 * later). We account all pages as being online and belonging to this
	 * zone ("present").
	 */
	for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
		(*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);

	/* mark all involved sections as online */
	online_mem_sections(start_pfn, end_pfn);
}

/* check which states of node_states will be changed when onlining memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);

	arg->status_change_nid = NUMA_NO_NODE;
	arg->status_change_nid_normal = NUMA_NO_NODE;
	arg->status_change_nid_high = NUMA_NO_NODE;

	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
#ifdef CONFIG_HIGHMEM
	if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
#endif
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	if (arg->status_change_nid >= 0)
		node_set_state(node, N_MEMORY);
}

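/*
 * Grow the zone/node span so that it covers [start_pfn, start_pfn + nr_pages).
 * Callers are expected to hold the zone span write lock and the pgdat resize
 * lock, as done in move_pfn_range_to_zone() below.
 */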
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
					unsigned long nr_pages)
{
	unsigned long old_end_pfn = zone_end_pfn(zone);

	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
}

static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
					 unsigned long nr_pages)
{
	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
}

/*
 * Associate the pfn range with the given zone, initializing the memmaps
 * and resizing the pgdat/zone data to span the added pages. After this
 * call, all affected pages are PG_reserved.
 *
 * All aligned pageblocks are initialized to the specified migratetype
 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
 * zone stats (e.g., nr_isolate_pageblock) are touched.
 */
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
				  unsigned long nr_pages,
				  struct vmem_altmap *altmap, int migratetype)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nid = pgdat->node_id;
	unsigned long flags;

	clear_zone_contiguous(zone);

	/* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
	pgdat_resize_lock(pgdat, &flags);
	zone_span_writelock(zone);
	if (zone_is_empty(zone))
		init_currently_empty_zone(zone, start_pfn, nr_pages);
	resize_zone_range(zone, start_pfn, nr_pages);
	zone_span_writeunlock(zone);
	resize_pgdat_range(pgdat, start_pfn, nr_pages);
	pgdat_resize_unlock(pgdat, &flags);

	/*
	 * TODO now we have a visible range of pages which are not associated
	 * with their zone properly. Not nice but set_pfnblock_flags_mask
	 * expects that the zone spans the pfn range. All the pages in the
	 * range are reserved so nobody should be touching them, so we should
	 * be safe.
	 */
	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0,
			 MEMINIT_HOTPLUG, altmap, migratetype);

	set_zone_contiguous(zone);
}

/*
 * Returns a default kernel memory zone for the given pfn range.
 * If no kernel zone covers this pfn range it will automatically go
 * to the ZONE_NORMAL.
 */
static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
		unsigned long nr_pages)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	int zid;

	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (zone_intersects(zone, start_pfn, nr_pages))
			return zone;
	}

	return &pgdat->node_zones[ZONE_NORMAL];
}

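/*
 * Pick between the default kernel zone and ZONE_MOVABLE when no explicit
 * online type was requested.
 */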
static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
		unsigned long nr_pages)
{
	struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
							       nr_pages);
	struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
	bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
	bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);

	/*
	 * We inherit the existing zone in a simple case where zones do not
	 * overlap in the given range
	 */
	if (in_kernel ^ in_movable)
		return (in_kernel) ? kernel_zone : movable_zone;

	/*
	 * If the range doesn't belong to any zone or two zones overlap in the
	 * given range then we use movable zone only if movable_node is
	 * enabled because we always online to a kernel zone by default.
	 */
	return movable_node_enabled ? movable_zone : kernel_zone;
}

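/* Resolve the zone that a to-be-onlined pfn range should be associated with. */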
struct zone *zone_for_pfn_range(int online_type, int nid,
		unsigned long start_pfn, unsigned long nr_pages)
{
	if (online_type == MMOP_ONLINE_KERNEL)
		return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);

	if (online_type == MMOP_ONLINE_MOVABLE)
		return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];

	return default_zone_for_pfn(nid, start_pfn, nr_pages);
}

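/*
 * Online a section-aligned pfn range: associate it with a zone, notify
 * MEM_GOING_ONLINE listeners, expose the pages to the buddy allocator and
 * update the present-page and node-state accounting.
 */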
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) int online_type, int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) int need_zonelists_rebuild = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) struct memory_notify arg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) if (WARN_ON_ONCE(!nr_pages ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) mem_hotplug_begin();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) /* associate pfn range with the zone */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
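	/*
	 * The memmap for the range has been initialized and its pageblocks
	 * start out as MIGRATE_ISOLATE, so the buddy allocator cannot hand
	 * out any of these pages until undo_isolate_page_range() below
	 * converts them to MIGRATE_MOVABLE.
	 */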
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) arg.start_pfn = pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) arg.nr_pages = nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) node_states_check_changes_online(nr_pages, zone, &arg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) ret = memory_notify(MEM_GOING_ONLINE, &arg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) ret = notifier_to_errno(ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) goto failed_addition;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807)
	/*
	 * Fixup the number of isolated pageblocks before marking the sections
	 * as being onlined, such that undo_isolate_page_range() works
	 * correctly.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815)
	/*
	 * If this zone is not populated, then it is not in the zonelist.
	 * This means the page allocator ignores this zone, so the zonelists
	 * must be rebuilt after onlining.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) if (!populated_zone(zone)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) need_zonelists_rebuild = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) setup_zone_pageset(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) online_pages_range(pfn, nr_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) zone->present_pages += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) pgdat_resize_lock(zone->zone_pgdat, &flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) zone->zone_pgdat->node_present_pages += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) pgdat_resize_unlock(zone->zone_pgdat, &flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) node_states_set_node(nid, &arg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) if (need_zonelists_rebuild)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) build_all_zonelists(NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) zone_pcp_update(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) /* Basic onlining is complete, allow allocation of onlined pages. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) * the tail of the freelist when undoing isolation). Shuffle the whole
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) * zone to make sure the just onlined pages are properly distributed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) * across the whole freelist - to create an initial shuffle.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) shuffle_zone(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) init_per_zone_wmark_min();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) kswapd_run(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) kcompactd_run(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) writeback_set_ratelimit();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) memory_notify(MEM_ONLINE, &arg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) mem_hotplug_done();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) failed_addition:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) (unsigned long long) pfn << PAGE_SHIFT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) memory_notify(MEM_CANCEL_ONLINE, &arg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) remove_pfn_range_from_zone(zone, pfn, nr_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) mem_hotplug_done();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) static void reset_node_present_pages(pg_data_t *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) struct zone *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) z->present_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) pgdat->node_present_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) static pg_data_t __ref *hotadd_new_pgdat(int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) struct pglist_data *pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) pgdat = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) if (!pgdat) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) pgdat = arch_alloc_nodedata(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) if (!pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) pgdat->per_cpu_nodestats =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) alloc_percpu(struct per_cpu_nodestat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) arch_refresh_nodedata(nid, pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) int cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) * Reset the nr_zones, order and highest_zoneidx before reuse.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) * Note that kswapd will init kswapd_highest_zoneidx properly
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) * when it starts in the near future.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) pgdat->nr_zones = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) pgdat->kswapd_order = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) pgdat->kswapd_highest_zoneidx = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) for_each_online_cpu(cpu) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) struct per_cpu_nodestat *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) memset(p, 0, sizeof(*p));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) /* we can use NODE_DATA(nid) from here */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) pgdat->node_id = nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) pgdat->node_start_pfn = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) /* init node's zones as empty zones, we don't have any present pages.*/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) free_area_init_core_hotplug(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919)
	/*
	 * The node we allocated has no zone fallback lists. To avoid
	 * accessing an uninitialized zonelist, build the zonelists here.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) build_all_zonelists(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) * When memory is hot-added, all the memory is in offline state. So
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) * clear all zones' present_pages because they will be updated in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) * online_pages() and offline_pages().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) reset_node_managed_pages(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) reset_node_present_pages(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) return pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) static void rollback_node_hotadd(int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) pg_data_t *pgdat = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) arch_refresh_nodedata(nid, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) free_percpu(pgdat->per_cpu_nodestats);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) arch_free_nodedata(pgdat);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946)
/**
 * __try_online_node - online a node if offlined
 * @nid: the node ID
 * @set_node_online: Whether we want to set the node online
 *
 * Called by cpu_up() to online a node without onlined memory.
 *
 * Returns:
 * 1 -> a new node has been allocated
 * 0 -> the node is already online
 * -ENOMEM -> the node could not be allocated
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) static int __try_online_node(int nid, bool set_node_online)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) pg_data_t *pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) int ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) if (node_online(nid))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) pgdat = hotadd_new_pgdat(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) if (!pgdat) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) pr_err("Cannot online node %d due to NULL pgdat\n", nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) if (set_node_online) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) node_set_online(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) ret = register_one_node(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) BUG_ON(ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) * Users of this function always want to online/register the node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) int try_online_node(int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) mem_hotplug_begin();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) ret = __try_online_node(nid, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) mem_hotplug_done();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) static int check_hotplug_memory_range(u64 start, u64 size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) /* memory range must be block size aligned */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) !IS_ALIGNED(size, memory_block_size_bytes())) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) memory_block_size_bytes(), start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) }
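
/*
 * Note: memory_block_size_bytes() is typically the memory section size
 * (for example 128 MiB on x86-64, possibly larger on big machines), so
 * hot(un)plug effectively operates on whole memory blocks of that size.
 */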
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) static int online_memory_block(struct memory_block *mem, void *arg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) mem->online_type = memhp_default_online_type;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) return device_online(&mem->dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) * and online/offline operations (triggered e.g. by sysfs).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) u64 start, size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) bool new_node = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) start = res->start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) size = resource_size(res);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) ret = check_hotplug_memory_range(start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) if (!node_possible(nid)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) WARN(1, "node %d was absent from the node_possible_map\n", nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) mem_hotplug_begin();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) memblock_add_node(start, size, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) ret = __try_online_node(nid, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) if (ret < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) goto error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) new_node = ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) /* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size, &params);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) if (ret < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) goto error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) /* create memory block devices after memory was added */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) ret = create_memory_block_devices(start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) arch_remove_memory(nid, start, size, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) goto error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) if (new_node) {
		/*
		 * If the sysfs file of the new node can't be created, CPUs on
		 * that node can't be hot-added either. There is no way to roll
		 * back at this point, so catch the failure with BUG_ON(),
		 * reluctantly. We set the node online here; from this point on
		 * we cannot roll back.
		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) node_set_online(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) ret = __register_one_node(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) BUG_ON(ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) /* link memory sections under this node.*/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) MEMINIT_HOTPLUG);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) /* create new memmap entry */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) if (!strcmp(res->name, "System RAM"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) firmware_map_add_hotplug(start, start + size, "System RAM");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) /* device_online() will take the lock when calling online_pages() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) mem_hotplug_done();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) * In case we're allowed to merge the resource, flag it and trigger
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) * merging now that adding succeeded.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) if (mhp_flags & MEMHP_MERGE_RESOURCE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) merge_system_ram_resource(res);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) /* online pages if requested */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) if (memhp_default_online_type != MMOP_OFFLINE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) walk_memory_blocks(start, size, NULL, online_memory_block);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) error:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) /* rollback pgdat allocation and others */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) if (new_node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) rollback_node_hotadd(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) memblock_remove(start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) mem_hotplug_done();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) /* requires device_hotplug_lock, see add_memory_resource() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) struct resource *res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) res = register_memory_resource(start, size, "System RAM");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) if (IS_ERR(res))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) return PTR_ERR(res);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) ret = add_memory_resource(nid, res, mhp_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) if (ret < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) release_memory_resource(res);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) int rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) lock_device_hotplug();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) rc = __add_memory(nid, start, size, mhp_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) unlock_device_hotplug();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) EXPORT_SYMBOL_GPL(add_memory);
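
/*
 * Example (sketch, not taken from an in-tree caller): a hotplug driver
 * that discovered a new memory device spanning [start, start + size) on
 * node nid could add it with
 *
 *	rc = add_memory(nid, start, size, MHP_NONE);
 *
 * and then rely on memhp_default_online_type (or udev rules) to online
 * the resulting memory blocks. MHP_NONE is assumed here to be the empty
 * mhp_t flag set in this tree.
 */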
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) int add_memory_subsection(int nid, u64 start, u64 size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) struct mhp_params params = { .pgprot = PAGE_KERNEL };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) struct resource *res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) if (!IS_ALIGNED(start, SUBSECTION_SIZE) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) !IS_ALIGNED(size, SUBSECTION_SIZE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) pr_err("%s: start 0x%llx size 0x%llx not aligned to subsection size\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) __func__, start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) res = register_memory_resource(start, size, "System RAM");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) if (IS_ERR(res))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) return PTR_ERR(res);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) mem_hotplug_begin();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) nid = memory_add_physaddr_to_nid(start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) memblock_add_node(start, size, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156)
	ret = arch_add_memory(nid, start, size, &params);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) memblock_remove(start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) pr_err("%s failed to add subsection start 0x%llx size 0x%llx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) __func__, start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) mem_hotplug_done();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) EXPORT_SYMBOL_GPL(add_memory_subsection);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) * Add special, driver-managed memory to the system as system RAM. Such
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) * memory is not exposed via the raw firmware-provided memmap as system
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) * RAM, instead, it is detected and added by a driver - during cold boot,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) * after a reboot, and after kexec.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) * Reasons why this memory should not be used for the initial memmap of a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) * kexec kernel or for placing kexec images:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) * - The booting kernel is in charge of determining how this memory will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) * used (e.g., use persistent memory as system RAM)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) * - Coordination with a hypervisor is required before this memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) * can be used (e.g., inaccessible parts).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) * memory map") are created. Also, the created memory resource is flagged
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) * this memory as well (esp., not place kexec images onto it).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) * The resource_name (visible via /proc/iomem) has to have the format
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) * "System RAM ($DRIVER)".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) int add_memory_driver_managed(int nid, u64 start, u64 size,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) const char *resource_name, mhp_t mhp_flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) struct resource *res;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) int rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) if (!resource_name ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) strstr(resource_name, "System RAM (") != resource_name ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) resource_name[strlen(resource_name) - 1] != ')')
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) lock_device_hotplug();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) res = register_memory_resource(start, size, resource_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) if (IS_ERR(res)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) rc = PTR_ERR(res);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) rc = add_memory_resource(nid, res, mhp_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) if (rc < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) release_memory_resource(res);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) unlock_device_hotplug();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) EXPORT_SYMBOL_GPL(add_memory_driver_managed);
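
/*
 * Example (sketch): a driver exposing device memory as system RAM could
 * call, for instance,
 *
 *	rc = add_memory_driver_managed(nid, start, size,
 *				       "System RAM (kmem)", MHP_NONE);
 *
 * The "System RAM ($DRIVER)" name is what makes the range identifiable
 * in /proc/iomem; the exact driver string above is only illustrative.
 */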
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) #ifdef CONFIG_MEMORY_HOTREMOVE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) * Confirm all pages in a range [start, end) belong to the same zone (skipping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) * memory holes). When true, return the zone.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) struct zone *test_pages_in_a_zone(unsigned long start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) unsigned long end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) unsigned long pfn, sec_end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) struct zone *zone = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) pfn < end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) /* Make sure the memory section is present first */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) if (!present_section_nr(pfn_to_section_nr(pfn)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) for (; pfn < sec_end_pfn && pfn < end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) pfn += MAX_ORDER_NR_PAGES) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) /* This is just a CONFIG_HOLES_IN_ZONE check.*/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) while ((i < MAX_ORDER_NR_PAGES) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) !pfn_valid_within(pfn + i))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) i++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) /* Check if we got outside of the zone */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) if (zone && !zone_spans_pfn(zone, pfn + i))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) page = pfn_to_page(pfn + i);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) if (zone && page_zone(page) != zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) zone = page_zone(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) return zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) * non-lru movable pages and hugepages). Will skip over most unmovable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) * pages (esp., pages that can be skipped when offlining), but bail out on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) * definitely unmovable pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) * Returns:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) * 0 in case a movable page is found and movable_pfn was updated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) * -ENOENT in case no movable page was found.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) * -EBUSY in case a definitely unmovable page was found.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) static int scan_movable_pages(unsigned long start, unsigned long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) unsigned long *movable_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) unsigned long pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) for (pfn = start; pfn < end; pfn++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) struct page *page, *head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) unsigned long skip;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) if (!pfn_valid(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) if (PageLRU(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) goto found;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) if (__PageMovable(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) goto found;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) * PageOffline() pages that are not marked __PageMovable() and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) * have a reference count > 0 (after MEM_GOING_OFFLINE) are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) * definitely unmovable. If their reference count would be 0,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) * they could at least be skipped when offlining memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) if (PageOffline(page) && page_count(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) return -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) if (!PageHuge(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) head = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) if (page_huge_active(head))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) goto found;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) skip = compound_nr(head) - (page - head);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) pfn += skip - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) return -ENOENT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) found:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) *movable_pfn = pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) static int
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) unsigned long pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) struct page *page, *head;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) LIST_HEAD(source);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) DEFAULT_RATELIMIT_BURST);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) for (pfn = start_pfn; pfn < end_pfn; pfn++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) if (!pfn_valid(pfn))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) page = pfn_to_page(pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) head = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) if (PageHuge(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) pfn = page_to_pfn(head) + compound_nr(head) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) isolate_huge_page(head, &source);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) } else if (PageTransHuge(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333)
		/*
		 * HWPoison pages have elevated reference counts so the
		 * migration would fail on them. It also doesn't make any sense
		 * to migrate them in the first place. Still try to unmap such a
		 * page in case it is still mapped (e.g. the current hwpoison
		 * implementation doesn't unmap KSM pages, but keep the unmap as
		 * the catch-all safety net).
		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) if (PageHWPoison(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) if (WARN_ON(PageLRU(page)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) isolate_lru_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) if (page_mapped(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) try_to_unmap(page, TTU_IGNORE_MLOCK);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) if (!get_page_unless_zero(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) continue;
		/*
		 * We can skip free pages. And we can handle pages that are on
		 * the LRU as well as non-lru movable pages.
		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) if (PageLRU(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) ret = isolate_lru_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) if (!ret) { /* Success */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) list_add_tail(&page->lru, &source);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) if (!__PageMovable(page))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) inc_node_page_state(page, NR_ISOLATED_ANON +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) page_is_file_lru(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) if (__ratelimit(&migrate_rs)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) pr_warn("failed to isolate pfn %lx\n", pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) dump_page(page, "isolation failed");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) put_page(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) if (!list_empty(&source)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) nodemask_t nmask = node_states[N_MEMORY];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) struct migration_target_control mtc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) .nmask = &nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379)
		/*
		 * We have checked that the migration range is within a single
		 * zone, so we can use the nid of the first page for all the
		 * others.
		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385)
		/*
		 * Try to allocate from a different node, but reuse this node
		 * if there are no other online nodes to be used (e.g. we are
		 * offlining part of the only existing node).
		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) node_clear(mtc.nid, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) if (nodes_empty(nmask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) node_set(mtc.nid, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) ret = migrate_pages(&source, alloc_migration_target, NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) list_for_each_entry(page, &source, lru) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) if (__ratelimit(&migrate_rs)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) pr_warn("migrating pfn %lx failed ret:%d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) page_to_pfn(page), ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) dump_page(page, "migration failure");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) putback_movable_pages(&source);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) static int __init cmdline_parse_movable_node(char *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) movable_node_enabled = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) early_param("movable_node", cmdline_parse_movable_node);
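
/*
 * With "movable_node" on the kernel command line, ranges that cannot be
 * unambiguously matched to an existing kernel zone are onlined to
 * ZONE_MOVABLE by default (see default_zone_for_pfn()), which keeps such
 * hotplugged memory hot-removable.
 */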
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) /* check which state of node_states will be changed when offline memory */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) static void node_states_check_changes_offline(unsigned long nr_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) struct zone *zone, struct memory_notify *arg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) struct pglist_data *pgdat = zone->zone_pgdat;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) unsigned long present_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) enum zone_type zt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) arg->status_change_nid = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) arg->status_change_nid_normal = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) arg->status_change_nid_high = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429)
	/*
	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offlined is within the range
	 * [0..ZONE_NORMAL], and it is the last present memory there,
	 * the zones in that range will become empty after the offlining,
	 * thus we can determine that we need to clear the node from
	 * node_states[N_NORMAL_MEMORY].
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) for (zt = 0; zt <= ZONE_NORMAL; zt++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) present_pages += pgdat->node_zones[zt].present_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) arg->status_change_nid_normal = zone_to_nid(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) #ifdef CONFIG_HIGHMEM
	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which
	 * have normal memory or high memory.
	 * Here we add the present_pages belonging to ZONE_HIGHMEM.
	 * If the zone is within the range of [0..ZONE_HIGHMEM], and
	 * we determine that the zones in that range become empty,
	 * we need to clear the node for N_HIGH_MEMORY.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) arg->status_change_nid_high = zone_to_nid(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456)
	/*
	 * We have accounted the pages from [0..ZONE_NORMAL), and
	 * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
	 * as well.
	 * Here we count the possible pages from ZONE_MOVABLE.
	 * If, after having accounted all the pages, we see that the nr_pages
	 * to be offlined is greater than or equal to the accounted pages,
	 * we know that the node will become empty, and so, we can clear
	 * it for N_MEMORY as well.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) if (nr_pages >= present_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) arg->status_change_nid = zone_to_nid(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) static void node_states_clear_node(int node, struct memory_notify *arg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) if (arg->status_change_nid_normal >= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) node_clear_state(node, N_NORMAL_MEMORY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) if (arg->status_change_nid_high >= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) node_clear_state(node, N_HIGH_MEMORY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) if (arg->status_change_nid >= 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) node_clear_state(node, N_MEMORY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) static int count_system_ram_pages_cb(unsigned long start_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) unsigned long nr_pages, void *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) unsigned long *nr_system_ram_pages = data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) *nr_system_ram_pages += nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) const unsigned long end_pfn = start_pfn + nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) unsigned long pfn, system_ram_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) struct memory_notify arg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) int ret, node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) char *reason;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) if (WARN_ON_ONCE(!nr_pages ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) mem_hotplug_begin();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510)
	/*
	 * Don't allow offlining memory blocks that contain holes.
	 * Consequently, memory blocks with holes can never get onlined
	 * via the hotplug path - online_pages() - as hotplugged memory has
	 * no holes. This way, we e.g., don't have to worry about marking
	 * memory holes PG_reserved, don't need pfn_valid() checks, and can
	 * avoid using walk_system_ram_range() later.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) count_system_ram_pages_cb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) if (system_ram_pages != nr_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) reason = "memory holes";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) goto failed_removal;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526)
	/*
	 * All pages to be offlined must belong to a single zone; this makes
	 * hotplug much easier (and more readable). We assume this for now.
	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) zone = test_pages_in_a_zone(start_pfn, end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) if (!zone) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) reason = "multizone range";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) goto failed_removal;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) node = zone_to_nid(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536)
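	/*
	 * Disable (and drain) the per-cpu LRU page caches so pages in the
	 * range to be offlined do not linger in pagevecs and can be isolated
	 * for migration below.
	 */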
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) lru_cache_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) /* set above range as isolated */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) ret = start_isolate_page_range(start_pfn, end_pfn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) MIGRATE_MOVABLE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) MEMORY_OFFLINE | REPORT_FAILURE, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) reason = "failure to isolate range";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) goto failed_removal_lru_cache_disabled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546)
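	/*
	 * Flush the per-cpu page lists so that free pages cached there are
	 * returned to the buddy allocator, where the isolation set up above
	 * applies to them.
	 */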
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) drain_all_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) arg.start_pfn = start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) arg.nr_pages = nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) node_states_check_changes_offline(nr_pages, zone, &arg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) ret = memory_notify(MEM_GOING_OFFLINE, &arg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) ret = notifier_to_errno(ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) reason = "notifier failure";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) goto failed_removal_isolated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559)
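	/*
	 * Migrate away all movable pages in the range: repeatedly scan for
	 * movable pages and migrate them until test_pages_isolated() below
	 * confirms the whole range is free, or until we hit a hard failure
	 * (pending signal, definitely unmovable page, ...).
	 */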
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) pfn = start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) if (signal_pending(current)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) ret = -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) reason = "signal backoff";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) goto failed_removal_isolated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) cond_resched();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) ret = scan_movable_pages(pfn, end_pfn, &pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) if (!ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) * TODO: fatal migration failures should bail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) * out
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) do_migrate_range(pfn, end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) } while (!ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) if (ret != -ENOENT) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) reason = "unmovable page";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) goto failed_removal_isolated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) 		 * Dissolve free hugepages in the memory block before actually
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) 		 * offlining it, in order to keep hugetlbfs's object counting
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) 		 * consistent.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) ret = dissolve_free_huge_pages(start_pfn, end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) reason = "failure to dissolve huge pages";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) goto failed_removal_isolated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) 		 * Per-cpu pages are drained after start_isolate_page_range, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) 		 * if there are still pages that are not free, make sure that we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) 		 * drain again, because when we isolated the range we might have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) 		 * raced with another thread that was adding pages to the pcp list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) 		 *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) 		 * Forward progress is still guaranteed because pages on the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) 		 * pcp list can only belong to ZONE_MOVABLE, since
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) 		 * has_unmovable_pages explicitly checks for PageBuddy on freed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) 		 * pages in other zones.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) drain_all_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) } while (ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) /* Mark all sections offline and remove free pages from the buddy. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) __offline_isolated_pages(start_pfn, end_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) pr_info("Offlined Pages %ld\n", nr_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) 	 * The memory sections are marked offline, and the pageblock flags are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) 	 * effectively stale; nobody should be touching them. Fix up the number
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) 	 * of isolated pageblocks; memory onlining will properly revert this.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) spin_lock_irqsave(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) spin_unlock_irqrestore(&zone->lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) lru_cache_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) /* removal success */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) zone->present_pages -= nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) pgdat_resize_lock(zone->zone_pgdat, &flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) zone->zone_pgdat->node_present_pages -= nr_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) pgdat_resize_unlock(zone->zone_pgdat, &flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) init_per_zone_wmark_min();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) if (!populated_zone(zone)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) zone_pcp_reset(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) build_all_zonelists(NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) zone_pcp_update(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) node_states_clear_node(node, &arg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) if (arg.status_change_nid >= 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) kswapd_stop(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) kcompactd_stop(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) writeback_set_ratelimit();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) memory_notify(MEM_OFFLINE, &arg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) mem_hotplug_done();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) failed_removal_isolated:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) memory_notify(MEM_CANCEL_OFFLINE, &arg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) failed_removal_lru_cache_disabled:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) lru_cache_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) failed_removal:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) (unsigned long long) start_pfn << PAGE_SHIFT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) reason);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) /* pushback to free area */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) mem_hotplug_done();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) int ret = !is_memblock_offlined(mem);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) if (unlikely(ret)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) phys_addr_t beginpa, endpa;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) endpa = beginpa + memory_block_size_bytes() - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) 		pr_warn("removing memory fails, because memory [%pa-%pa] is still online\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) &beginpa, &endpa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) return -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) static int check_cpu_on_node(pg_data_t *pgdat)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) int cpu;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) for_each_present_cpu(cpu) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) if (cpu_to_node(cpu) == pgdat->node_id)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) * the cpu on this node isn't removed, and we can't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) * offline this node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) return -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) int nid = *(int *)arg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) * If a memory block belongs to multiple nodes, the stored nid is not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) * reliable. However, such blocks are always online (e.g., cannot get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) * offlined) and, therefore, are still spanned by the node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) return mem->nid == nid ? -EEXIST : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717)  * try_offline_node - try to offline a node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) * @nid: the node ID
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) * Offline a node if all memory sections and cpus of the node are removed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) * and online/offline operations before this call.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) void try_offline_node(int nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) pg_data_t *pgdat = NODE_DATA(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) int rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) * If the node still spans pages (especially ZONE_DEVICE), don't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) * offline it. A node spans memory after move_pfn_range_to_zone(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) * e.g., after the memory block was onlined.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) if (pgdat->node_spanned_pages)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) 	 * In particular, offline memory blocks might not be spanned by the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) 	 * node; they only get spanned by the node once they are onlined.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) 	 * However, they link to the node in sysfs and can get onlined later.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) if (check_cpu_on_node(pgdat))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) 	 * All memory and CPUs of this node have been removed; we can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) 	 * offline this node now.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) node_set_offline(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) unregister_one_node(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) EXPORT_SYMBOL(try_offline_node);
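
/*
 * Illustrative sketch (editorial example, not part of this file): per the
 * NOTE above, callers of try_offline_node() must hold the device hotplug
 * lock. A hypothetical removal path that has already offlined and removed
 * all of the node's memory and CPUs might therefore end with:
 *
 *	lock_device_hotplug();
 *	try_offline_node(nid);
 *	unlock_device_hotplug();
 *
 * In this file, try_remove_memory() calls try_offline_node() and relies on
 * its callers (e.g. remove_memory()) to take the lock on its behalf.
 */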
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) static int __ref try_remove_memory(int nid, u64 start, u64 size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) int rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) BUG_ON(check_hotplug_memory_range(start, size));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) * All memory blocks must be offlined before removing memory. Check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) * whether all memory blocks in question are offline and return error
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) * if this is not the case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) /* remove memmap entry */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) firmware_map_remove(start, start + size, "System RAM");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) * Memory block device removal under the device_hotplug_lock is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) * a barrier against racing online attempts.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) remove_memory_block_devices(start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) mem_hotplug_begin();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) arch_remove_memory(nid, start, size, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) memblock_free(start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) memblock_remove(start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) release_mem_region_adjustable(start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) try_offline_node(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) mem_hotplug_done();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801)  * __remove_memory - remove memory that has already been offlined
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) * @nid: the node ID
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) * @start: physical address of the region to remove
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) * @size: size of the region to remove
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) * and online/offline operations before this call, as required by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) * try_offline_node().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) void __remove_memory(int nid, u64 start, u64 size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) * trigger BUG() if some memory is not offlined prior to calling this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) * function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) if (try_remove_memory(nid, start, size))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) }
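
/*
 * Illustrative sketch (editorial example): __remove_memory() expects every
 * memory block in the range to be offline already and BUG()s otherwise, so
 * a hypothetical caller, after offlining every block in the range, would do
 * roughly:
 *
 *	lock_device_hotplug();
 *	__remove_memory(nid, start, size);
 *	unlock_device_hotplug();
 *
 * Callers that cannot guarantee the blocks are offline should use
 * remove_memory() below, which returns -EBUSY instead of crashing.
 */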
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822)  * Remove memory if every memory block is offline, otherwise return -EBUSY if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823)  * some memory is not offline.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) int remove_memory(int nid, u64 start, u64 size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) int rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) lock_device_hotplug();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) rc = try_remove_memory(nid, start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) unlock_device_hotplug();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) EXPORT_SYMBOL_GPL(remove_memory);
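
/*
 * Illustrative sketch (editorial example, hypothetical caller): unlike
 * __remove_memory(), remove_memory() takes the device hotplug lock itself
 * and reports -EBUSY when a block in the range is still online, so a driver
 * can handle the failure instead of crashing:
 *
 *	int rc = remove_memory(nid, start, size);
 *
 *	if (rc == -EBUSY)
 *		pr_warn("memory range is still online, not removed\n");
 *	else if (rc)
 *		pr_warn("removing memory failed: %d\n", rc);
 */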
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) int remove_memory_subsection(int nid, u64 start, u64 size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) if (!IS_ALIGNED(start, SUBSECTION_SIZE) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) !IS_ALIGNED(size, SUBSECTION_SIZE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) pr_err("%s: start 0x%llx size 0x%llx not aligned to subsection size\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) __func__, start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) mem_hotplug_begin();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) arch_remove_memory(nid, start, size, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) memblock_remove(start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) release_mem_region_adjustable(start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) mem_hotplug_done();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) EXPORT_SYMBOL_GPL(remove_memory_subsection);
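
/*
 * Illustrative sketch (editorial example): remove_memory_subsection() only
 * accepts ranges aligned to SUBSECTION_SIZE and rejects everything else with
 * -EINVAL before any teardown happens, so a hypothetical caller removing a
 * subsection it added earlier would pass back the exact same aligned range
 * (return value handling omitted for brevity):
 *
 *	if (IS_ALIGNED(start, SUBSECTION_SIZE) &&
 *	    IS_ALIGNED(size, SUBSECTION_SIZE))
 *		remove_memory_subsection(nid, start, size);
 */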
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) * Try to offline and remove a memory block. Might take a long time to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) * finish in case memory is still in use. Primarily useful for memory devices
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) * that logically unplugged all memory (so it's no longer in use) and want to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) * offline + remove the memory block.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) int offline_and_remove_memory(int nid, u64 start, u64 size)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) struct memory_block *mem;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) int rc = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) size != memory_block_size_bytes())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) lock_device_hotplug();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) mem = find_memory_block(__pfn_to_section(PFN_DOWN(start)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) if (mem)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) rc = device_offline(&mem->dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) /* Ignore if the device is already offline. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) if (rc > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) 	 * If we succeeded in offlining the memory block, remove it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) 	 * This cannot fail, as the block cannot get onlined in the meantime.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) if (!rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) rc = try_remove_memory(nid, start, size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) WARN_ON_ONCE(rc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) unlock_device_hotplug();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) EXPORT_SYMBOL_GPL(offline_and_remove_memory);
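
/*
 * Illustrative sketch (editorial example): offline_and_remove_memory() works
 * on exactly one memory block, so a hypothetical memory device driver that
 * has logically unplugged a larger, block-aligned range would walk it block
 * by block; each call may take a long time or fail if pages in the block are
 * still in use, as noted above:
 *
 *	u64 block_size = memory_block_size_bytes();
 *	u64 addr;
 *	int rc = 0;
 *
 *	for (addr = start; !rc && addr < start + size; addr += block_size)
 *		rc = offline_and_remove_memory(nid, addr, block_size);
 */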
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) #endif /* CONFIG_MEMORY_HOTREMOVE */