// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/page_ext.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_idle.h>
#ifdef CONFIG_PAGE_PINNER
#include <linux/page_pinner.h>	/* declares page_pinner_ops used below */
#endif

/*
 * struct page extension
 *
 * This is the feature to manage memory for extended data per page.
 *
 * Until now, we had to modify struct page itself to store extra data per
 * page. This requires rebuilding the kernel, which is a really time
 * consuming process. And, sometimes, a rebuild is impossible due to
 * third party module dependencies. At last, enlarging struct page could
 * cause unwanted system behaviour changes.
 *
 * This feature is intended to overcome the above-mentioned problems. It
 * allocates memory for extended data per page in a certain place rather
 * than in struct page itself. This memory can be accessed by the accessor
 * functions provided by this code. During the boot process, it checks
 * whether allocation of a huge chunk of memory is needed or not. If not,
 * it avoids allocating memory at all. With this advantage, we can include
 * this feature in the kernel by default and can avoid rebuilds and solve
 * the related problems.
 *
 * To help these things to work well, there are two callbacks for clients.
 * One is the need callback, which is mandatory if the user wants to avoid
 * useless memory allocation at boot-time. The other is the optional init
 * callback, which is used to do proper initialization after memory is
 * allocated.
 *
 * The need callback is used to decide whether extended memory allocation
 * is needed or not. Sometimes users want to deactivate some features in
 * this boot and the extra memory would be unnecessary. In this case, to
 * avoid allocating a huge chunk of memory, each client represents its
 * need for extra memory through the need callback. If one of the need
 * callbacks returns true, it means that someone needs extra memory, so
 * the page extension core should allocate memory for page extension. If
 * none of the need callbacks return true, memory isn't needed at all in
 * this boot and the page extension core can skip allocating memory. As a
 * result, no memory is wasted.
 *
 * When the need callback returns true, page_ext checks if there is a
 * request for extra memory through size in struct page_ext_operations.
 * If it is non-zero, extra space is allocated for each page_ext entry
 * and the offset is returned to the user through offset in struct
 * page_ext_operations.
 *
 * The init callback is used to do proper initialization after page
 * extension is completely initialized. In a sparse memory system, the
 * extra memory is allocated some time later than the memmap is
 * allocated. In other words, the lifetime of the memory for page
 * extension isn't the same as that of the memmap for struct page.
 * Therefore, clients can't store extra data until page extension is
 * initialized, even if pages are allocated and used freely. This could
 * cause an inadequate state of extra data per page, so, to prevent it,
 * clients can utilize this callback to initialize its state correctly.
 */
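
/*
 * A minimal sketch of a hypothetical client, for illustration only (the
 * "foo_*" names are made up and not part of this file). The client asks
 * for "size" extra bytes per page, finds them again via the "offset" the
 * core fills in, and would register its ops in page_ext_ops[] below:
 */
#if 0
struct foo_data {
	unsigned long handle;
};

static bool need_foo(void)
{
	/* e.g. gated on a boot parameter; returning false skips allocation */
	return true;
}

static void init_foo(void)
{
	/* called once page_ext is completely initialized */
}

struct page_ext_operations foo_ops = {
	.size = sizeof(struct foo_data),
	.need = need_foo,
	.init = init_foo,
};

static struct foo_data *get_foo(struct page *page)
{
	struct page_ext *page_ext = lookup_page_ext(page);

	if (!page_ext)
		return NULL;
	return (void *)page_ext + foo_ops.offset;
}
#endif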

#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
static bool need_page_idle(void)
{
	return true;
}
struct page_ext_operations page_idle_ops = {
	.need = need_page_idle,
};
#endif

static struct page_ext_operations *page_ext_ops[] = {
#ifdef CONFIG_PAGE_OWNER
	&page_owner_ops,
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
	&page_idle_ops,
#endif
#ifdef CONFIG_PAGE_PINNER
	&page_pinner_ops,
#endif
};

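/*
 * Byte stride between consecutive page_ext entries: sizeof(struct
 * page_ext) plus the extra "size" requested by each client whose need
 * callback returned true (see invoke_need_callbacks()).
 */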
unsigned long page_ext_size = sizeof(struct page_ext);

static unsigned long total_usage;

static bool __init invoke_need_callbacks(void)
{
	int i;
	int entries = ARRAY_SIZE(page_ext_ops);
	bool need = false;

	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
			page_ext_ops[i]->offset = page_ext_size;
			page_ext_size += page_ext_ops[i]->size;
			need = true;
		}
	}

	return need;
}

static void __init invoke_init_callbacks(void)
{
	int i;
	int entries = ARRAY_SIZE(page_ext_ops);

	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->init)
			page_ext_ops[i]->init();
	}
}

#ifndef CONFIG_SPARSEMEM
void __init page_ext_init_flatmem_late(void)
{
	invoke_init_callbacks();
}
#endif

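/*
 * Entries are page_ext_size bytes apart, not sizeof(struct page_ext)
 * apart, because clients may have appended extra space; hence the
 * explicit byte arithmetic instead of array indexing.
 */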
static inline struct page_ext *get_entry(void *base, unsigned long index)
{
	return base + page_ext_size * index;
}

#ifndef CONFIG_SPARSEMEM

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
	pgdat->node_page_ext = NULL;
}

struct page_ext *lookup_page_ext(const struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long index;
	struct page_ext *base;

	base = NODE_DATA(page_to_nid(page))->node_page_ext;
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_ext arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (unlikely(!base))
		return NULL;
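	/*
	 * The table for this node was sized from node_start_pfn rounded
	 * down to MAX_ORDER_NR_PAGES (see alloc_node_page_ext()), so the
	 * index is computed against that same rounded-down base.
	 */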
	index = pfn - round_down(node_start_pfn(page_to_nid(page)),
					MAX_ORDER_NR_PAGES);
	return get_entry(base, index);
}
EXPORT_SYMBOL_GPL(lookup_page_ext);

static int __init alloc_node_page_ext(int nid)
{
	struct page_ext *base;
	unsigned long table_size;
	unsigned long nr_pages;

	nr_pages = NODE_DATA(nid)->node_spanned_pages;
	if (!nr_pages)
		return 0;

	/*
	 * Need extra space if node range is not aligned with
	 * MAX_ORDER_NR_PAGES. When the page allocator's buddy algorithm
	 * checks the buddy's status, the range could be out of the exact
	 * node range.
	 */
	if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
		!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
		nr_pages += MAX_ORDER_NR_PAGES;

	table_size = page_ext_size * nr_pages;

	base = memblock_alloc_try_nid(
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
			MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!base)
		return -ENOMEM;
	NODE_DATA(nid)->node_page_ext = base;
	total_usage += table_size;
	return 0;
}

void __init page_ext_init_flatmem(void)
{
	int nid, fail;

	if (!invoke_need_callbacks())
		return;

	for_each_online_node(nid) {
		fail = alloc_node_page_ext(nid);
		if (fail)
			goto fail;
	}
	pr_info("allocated %ld bytes of page_ext\n", total_usage);
	return;

fail:
	pr_crit("allocation of page_ext failed.\n");
	panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

struct page_ext *lookup_page_ext(const struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_ext arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (!section->page_ext)
		return NULL;
	return get_entry(section->page_ext, pfn);
}
EXPORT_SYMBOL_GPL(lookup_page_ext);

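/*
 * Try a physically contiguous, node-local allocation first and fall
 * back to vmalloc when that fails (e.g. due to fragmentation).
 * free_page_ext() distinguishes the two cases via is_vmalloc_addr().
 */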
static void *__meminit alloc_page_ext(size_t size, int nid)
{
	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
	void *addr = NULL;

	addr = alloc_pages_exact_nid(nid, size, flags);
	if (addr) {
		kmemleak_alloc(addr, size, 1, flags);
		return addr;
	}

	addr = vzalloc_node(size, nid);

	return addr;
}

static int __meminit init_section_page_ext(unsigned long pfn, int nid)
{
	struct mem_section *section;
	struct page_ext *base;
	unsigned long table_size;

	section = __pfn_to_section(pfn);

	if (section->page_ext)
		return 0;

	table_size = page_ext_size * PAGES_PER_SECTION;
	base = alloc_page_ext(table_size, nid);

	/*
	 * The value stored in section->page_ext is (base - pfn * page_ext_size)
	 * and it does not point to the memory block allocated above,
	 * causing kmemleak false positives.
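	 *
	 * For example, for a section starting at pfn S the stored value is
	 * base - S * page_ext_size, so get_entry(section->page_ext, pfn)
	 * yields base + (pfn - S) * page_ext_size for any pfn in the
	 * section.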
	 */
	kmemleak_not_leak(base);

	if (!base) {
		pr_err("page ext allocation failure\n");
		return -ENOMEM;
	}

	/*
	 * The passed "pfn" may not be aligned to SECTION. For the calculation
	 * we need to apply a mask.
	 */
	pfn &= PAGE_SECTION_MASK;
	section->page_ext = (void *)base - page_ext_size * pfn;
	total_usage += table_size;
	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_page_ext(void *addr)
{
	if (is_vmalloc_addr(addr)) {
		vfree(addr);
	} else {
		struct page *page = virt_to_page(addr);
		size_t table_size;

		table_size = page_ext_size * PAGES_PER_SECTION;

		BUG_ON(PageReserved(page));
		kmemleak_free(addr);
		free_pages_exact(addr, table_size);
	}
}

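/*
 * Callers pass a section-aligned pfn here, so get_entry() maps it back
 * exactly to the allocation base that was encoded into ms->page_ext by
 * init_section_page_ext().
 */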
static void __free_page_ext(unsigned long pfn)
{
	struct mem_section *ms;
	struct page_ext *base;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_ext)
		return;
	base = get_entry(ms->page_ext, pfn);
	free_page_ext(base);
	ms->page_ext = NULL;
}

static int __meminit online_page_ext(unsigned long start_pfn,
				unsigned long nr_pages,
				int nid)
{
	unsigned long start, end, pfn;
	int fail = 0;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	if (nid == NUMA_NO_NODE) {
		/*
		 * In this case, "nid" already exists and contains valid memory.
		 * "start_pfn" passed to us is a pfn which is an arg for
		 * online_pages(), and start_pfn should exist.
		 */
		nid = pfn_to_nid(start_pfn);
		VM_BUG_ON(!node_state(nid, N_ONLINE));
	}

	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
		fail = init_section_page_ext(pfn, nid);
	if (!fail)
		return 0;

	/* rollback */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);

	return -ENOMEM;
}

static int __meminit offline_page_ext(unsigned long start_pfn,
				unsigned long nr_pages, int nid)
{
	unsigned long start, end, pfn;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);
	return 0;
}

static int __meminit page_ext_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = online_page_ext(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
		offline_page_ext(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_GOING_OFFLINE:
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}

	return notifier_from_errno(ret);
}

#endif	/* CONFIG_MEMORY_HOTPLUG */

void __init page_ext_init(void)
{
	unsigned long pfn;
	int nid;

	if (!invoke_need_callbacks())
		return;

	for_each_node_state(nid, N_MEMORY) {
		unsigned long start_pfn, end_pfn;

		start_pfn = node_start_pfn(nid);
		end_pfn = node_end_pfn(nid);
		/*
		 * start_pfn and end_pfn may not be aligned to SECTION and the
		 * page->flags of out-of-node pages are not initialized. So we
		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
		 */
		for (pfn = start_pfn; pfn < end_pfn;
			pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

			if (!pfn_valid(pfn))
				continue;
			/*
			 * Nodes' pfns can be overlapping.
			 * We know some archs can have a nodes layout such as
			 * -------------pfn-------------->
			 * N0 | N1 | N2 | N0 | N1 | N2|....
			 */
			if (pfn_to_nid(pfn) != nid)
				continue;
			if (init_section_page_ext(pfn, nid))
				goto oom;
			cond_resched();
		}
	}
	hotplug_memory_notifier(page_ext_callback, 0);
	pr_info("allocated %ld bytes of page_ext\n", total_usage);
	invoke_init_callbacks();
	return;

oom:
	panic("Out of memory");
}

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
}

#endif	/* CONFIG_SPARSEMEM */