#include <linux/gfp.h>
#include <linux/initrd.h>
#include <linux/ioport.h>
#include <linux/swap.h>
#include <linux/memblock.h>
#include <linux/swapfile.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <linux/sched/task.h>

#include <asm/set_memory.h>
#include <asm/e820/api.h>
#include <asm/init.h>
#include <asm/page.h>
#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/proto.h>
#include <asm/dma.h>		/* for MAX_DMA_PFN */
#include <asm/microcode.h>
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
#include <asm/pti.h>
#include <asm/text-patching.h>
#include <asm/memtype.h>

/*
 * We need to define the tracepoints somewhere, and tlb.c
 * is only compiled when SMP=y.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>

#include "mm_internal.h"

/*
 * Tables translating between page_cache_type_t and pte encoding.
 *
 * The default values are defined statically as minimal supported mode;
 * WC and WT fall back to UC-. pat_init() updates these values to support
 * more cache modes, WC and WT, when it is safe to do so. See pat_init()
 * for the details. Note, __early_ioremap() used during early boot-time
 * takes pgprot_t (pte encoding) and does not use these tables.
 *
 *   Index into __cachemode2pte_tbl[] is the cachemode.
 *
 *   Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
 *   (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
 */
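/*
 * For example, __pte2cm_idx() packs _PAGE_PWT/_PAGE_PCD/_PAGE_PAT into
 * bits 0/1/2 of the table index, so __pte2cm_idx(_PAGE_PWT | _PAGE_PCD)
 * is 3 and, with the static defaults below, __pte2cachemode_tbl[3] is
 * _PAGE_CACHE_MODE_UC.
 */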
static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
        [_PAGE_CACHE_MODE_WB      ]	= 0         | 0        ,
        [_PAGE_CACHE_MODE_WC      ]	= 0         | _PAGE_PCD,
        [_PAGE_CACHE_MODE_UC_MINUS]	= 0         | _PAGE_PCD,
        [_PAGE_CACHE_MODE_UC      ]	= _PAGE_PWT | _PAGE_PCD,
        [_PAGE_CACHE_MODE_WT      ]	= 0         | _PAGE_PCD,
        [_PAGE_CACHE_MODE_WP      ]	= 0         | _PAGE_PCD,
};

unsigned long cachemode2protval(enum page_cache_mode pcm)
{
        if (likely(pcm == 0))
                return 0;
        return __cachemode2pte_tbl[pcm];
}
EXPORT_SYMBOL(cachemode2protval);

static uint8_t __pte2cachemode_tbl[8] = {
        [__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
        [__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
        [__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
        [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC,
        [__pte2cm_idx( 0        | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
        [__pte2cm_idx(_PAGE_PWT | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
        [__pte2cm_idx( 0        | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
        [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};

/* Check that the write-protect PAT entry is set for write-protect */
bool x86_has_pat_wp(void)
{
        uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP];

        return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP;
}

enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
{
        unsigned long masked;

        masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
        if (likely(masked == 0))
                return 0;
        return __pte2cachemode_tbl[__pte2cm_idx(masked)];
}

static unsigned long __initdata pgt_buf_start;
static unsigned long __initdata pgt_buf_end;
static unsigned long __initdata pgt_buf_top;

static unsigned long min_pfn_mapped;

static bool __initdata can_use_brk_pgt = true;

/*
 * Provide a run-time means of disabling ZONE_DMA32 if it is enabled via
 * CONFIG_ZONE_DMA32.
 */
static bool disable_dma32 __ro_after_init;

/*
 * Pages returned are already directly mapped.
 *
 * Changing that is likely to break Xen, see commit:
 *
 *    279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
 *
 * for detailed information.
 */
__ref void *alloc_low_pages(unsigned int num)
{
        unsigned long pfn;
        int i;

        if (after_bootmem) {
                unsigned int order;

                order = get_order((unsigned long)num << PAGE_SHIFT);
                return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
        }

        if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
                unsigned long ret = 0;

                if (min_pfn_mapped < max_pfn_mapped) {
                        ret = memblock_find_in_range(
                                        min_pfn_mapped << PAGE_SHIFT,
                                        max_pfn_mapped << PAGE_SHIFT,
                                        PAGE_SIZE * num, PAGE_SIZE);
                }
                if (ret)
                        memblock_reserve(ret, PAGE_SIZE * num);
                else if (can_use_brk_pgt)
                        ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));

                if (!ret)
                        panic("alloc_low_pages: can not alloc memory");

                pfn = ret >> PAGE_SHIFT;
        } else {
                pfn = pgt_buf_end;
                pgt_buf_end += num;
        }

        for (i = 0; i < num; i++) {
                void *adr;

                adr = __va((pfn + i) << PAGE_SHIFT);
                clear_page(adr);
        }

        return __va(pfn << PAGE_SHIFT);
}
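
/*
 * Note: callers that only need a single page table page typically go
 * through the alloc_low_page() wrapper in mm_internal.h, which is just
 * alloc_low_pages(1).
 */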

/*
 * By default we need 3 4k pages for the initial PMD_SIZE mapping and 3 4k
 * pages for the 0-ISA_END_ADDRESS range. With KASLR memory randomization,
 * depending on the machine e820 memory map and the PUD alignment, we may
 * need twice as many pages.
 */
#ifndef CONFIG_RANDOMIZE_MEMORY
#define INIT_PGD_PAGE_COUNT      6
#else
#define INIT_PGD_PAGE_COUNT      12
#endif
#define INIT_PGT_BUF_SIZE	(INIT_PGD_PAGE_COUNT * PAGE_SIZE)
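/*
 * For example, with 4k pages this reserves 6 pages (24k) of brk space for
 * early page tables, or 12 pages (48k) when CONFIG_RANDOMIZE_MEMORY doubles
 * the count.
 */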
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
void __init early_alloc_pgt_buf(void)
{
        unsigned long tables = INIT_PGT_BUF_SIZE;
        phys_addr_t base;

        base = __pa(extend_brk(tables, PAGE_SIZE));

        pgt_buf_start = base >> PAGE_SHIFT;
        pgt_buf_end = pgt_buf_start;
        pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
}
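
/*
 * After this, alloc_low_pages() hands out pages from the brk window
 * [pgt_buf_start, pgt_buf_top) by advancing pgt_buf_end, and falls back
 * to memblock once the window is exhausted or can_use_brk_pgt is false.
 */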

int after_bootmem;

early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);

struct map_range {
        unsigned long start;
        unsigned long end;
        unsigned page_size_mask;
};

static int page_size_mask;

/*
 * Save some of the cr4 feature set we're using (e.g. Pentium 4MB
 * enable and PPro Global page enable), so that any CPUs that boot
 * up after us can get the correct flags. Invoked on the boot CPU.
 */
static inline void cr4_set_bits_and_update_boot(unsigned long mask)
{
        mmu_cr4_features |= mask;
        if (trampoline_cr4_features)
                *trampoline_cr4_features = mmu_cr4_features;
        cr4_set_bits(mask);
}

static void __init probe_page_size_mask(void)
{
        /*
         * For pagealloc debugging, identity mapping will use small pages.
         * This will simplify cpa(), which otherwise needs to support splitting
         * large pages into small in interrupt context, etc.
         */
        if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
                page_size_mask |= 1 << PG_LEVEL_2M;
        else
                direct_gbpages = 0;

        /* Enable PSE if available */
        if (boot_cpu_has(X86_FEATURE_PSE))
                cr4_set_bits_and_update_boot(X86_CR4_PSE);

        /* Enable PGE if available */
        __supported_pte_mask &= ~_PAGE_GLOBAL;
        if (boot_cpu_has(X86_FEATURE_PGE)) {
                cr4_set_bits_and_update_boot(X86_CR4_PGE);
                __supported_pte_mask |= _PAGE_GLOBAL;
        }

        /* By default, everything is supported: */
        __default_kernel_pte_mask = __supported_pte_mask;
        /* Except when with PTI where the kernel is mostly non-Global: */
        if (cpu_feature_enabled(X86_FEATURE_PTI))
                __default_kernel_pte_mask &= ~_PAGE_GLOBAL;

        /* Enable 1 GB linear kernel mappings if available: */
        if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
                printk(KERN_INFO "Using GB pages for direct mapping\n");
                page_size_mask |= 1 << PG_LEVEL_1G;
        } else {
                direct_gbpages = 0;
        }
}
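
/*
 * For example, on a CPU with PSE and GB pages and without debug_pagealloc,
 * page_size_mask ends up with both (1 << PG_LEVEL_2M) and (1 << PG_LEVEL_1G)
 * set, so the direct map built below can use 2M and 1G mappings.
 */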

static void setup_pcid(void)
{
        if (!IS_ENABLED(CONFIG_X86_64))
                return;

        if (!boot_cpu_has(X86_FEATURE_PCID))
                return;

        if (boot_cpu_has(X86_FEATURE_PGE)) {
                /*
                 * This can't be cr4_set_bits_and_update_boot() -- the
                 * trampoline code can't handle CR4.PCIDE and it wouldn't
                 * do any good anyway. Despite the name,
                 * cr4_set_bits_and_update_boot() doesn't actually cause
                 * the bits in question to remain set all the way through
                 * the secondary boot asm.
                 *
                 * Instead, we brute-force it and set CR4.PCIDE manually in
                 * start_secondary().
                 */
                cr4_set_bits(X86_CR4_PCIDE);

                /*
                 * INVPCID's single-context modes (2/3) only work if we set
                 * X86_CR4_PCIDE, *and* INVPCID is supported. It's unusable
                 * on systems that have X86_CR4_PCIDE clear, or that have
                 * no INVPCID support at all.
                 */
                if (boot_cpu_has(X86_FEATURE_INVPCID))
                        setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
        } else {
                /*
                 * flush_tlb_all(), as currently implemented, won't work if
                 * PCID is on but PGE is not. Since that combination
                 * doesn't exist on real hardware, there's no reason to try
                 * to fully support it, but it's polite to avoid corrupting
                 * data if we're on an improperly configured VM.
                 */
                setup_clear_cpu_cap(X86_FEATURE_PCID);
        }
}

#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
#define NR_RANGE_MR 5
#endif

static int __meminit save_mr(struct map_range *mr, int nr_range,
                             unsigned long start_pfn, unsigned long end_pfn,
                             unsigned long page_size_mask)
{
        if (start_pfn < end_pfn) {
                if (nr_range >= NR_RANGE_MR)
                        panic("run out of range for init_memory_mapping\n");
                mr[nr_range].start = start_pfn<<PAGE_SHIFT;
                mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
                mr[nr_range].page_size_mask = page_size_mask;
                nr_range++;
        }

        return nr_range;
}

/*
 * Adjust the page_size_mask for small ranges to use a big page size
 * instead of small ones if the surrounding area is RAM too.
 */
static void __ref adjust_range_page_size_mask(struct map_range *mr,
                                              int nr_range)
{
        int i;

        for (i = 0; i < nr_range; i++) {
                if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
                    !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
                        unsigned long start = round_down(mr[i].start, PMD_SIZE);
                        unsigned long end = round_up(mr[i].end, PMD_SIZE);

#ifdef CONFIG_X86_32
                        if ((end >> PAGE_SHIFT) > max_low_pfn)
                                continue;
#endif

                        if (memblock_is_region_memory(start, end - start))
                                mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
                }
                if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
                    !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
                        unsigned long start = round_down(mr[i].start, PUD_SIZE);
                        unsigned long end = round_up(mr[i].end, PUD_SIZE);

                        if (memblock_is_region_memory(start, end - start))
                                mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
                }
        }
}

static const char *page_size_string(struct map_range *mr)
{
        static const char str_1g[] = "1G";
        static const char str_2m[] = "2M";
        static const char str_4m[] = "4M";
        static const char str_4k[] = "4k";

        if (mr->page_size_mask & (1<<PG_LEVEL_1G))
                return str_1g;
        /*
         * 32-bit without PAE has a 4M large page size.
         * PG_LEVEL_2M is misnamed, but we can at least
         * print out the right size in the string.
         */
        if (IS_ENABLED(CONFIG_X86_32) &&
            !IS_ENABLED(CONFIG_X86_PAE) &&
            mr->page_size_mask & (1<<PG_LEVEL_2M))
                return str_4m;

        if (mr->page_size_mask & (1<<PG_LEVEL_2M))
                return str_2m;

        return str_4k;
}

static int __meminit split_mem_range(struct map_range *mr, int nr_range,
                                     unsigned long start,
                                     unsigned long end)
{
        unsigned long start_pfn, end_pfn, limit_pfn;
        unsigned long pfn;
        int i;

        limit_pfn = PFN_DOWN(end);

        /* head if not big page alignment ? */
        pfn = start_pfn = PFN_DOWN(start);
#ifdef CONFIG_X86_32
        /*
         * Don't use a large page for the first 2/4MB of memory
         * because there are often fixed size MTRRs in there
         * and overlapping MTRRs into large pages can cause
         * slowdowns.
         */
        if (pfn == 0)
                end_pfn = PFN_DOWN(PMD_SIZE);
        else
                end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
        end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#endif
        if (end_pfn > limit_pfn)
                end_pfn = limit_pfn;
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
                pfn = end_pfn;
        }

        /* big page (2M) range */
        start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
        end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
        end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
        if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
                end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif

        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                   page_size_mask & (1<<PG_LEVEL_2M));
                pfn = end_pfn;
        }

#ifdef CONFIG_X86_64
        /* big page (1G) range */
        start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
        end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                   page_size_mask &
                                   ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
                pfn = end_pfn;
        }

        /* tail is not big page (1G) alignment */
        start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
        end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                   page_size_mask & (1<<PG_LEVEL_2M));
                pfn = end_pfn;
        }
#endif

        /* tail is not big page (2M) alignment */
        start_pfn = pfn;
        end_pfn = limit_pfn;
        nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

        if (!after_bootmem)
                adjust_range_page_size_mask(mr, nr_range);

        /* try to merge ranges with the same page size that are contiguous */
        for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
                unsigned long old_start;
                if (mr[i].end != mr[i+1].start ||
                    mr[i].page_size_mask != mr[i+1].page_size_mask)
                        continue;
                /* move it */
                old_start = mr[i].start;
                memmove(&mr[i], &mr[i+1],
                        (nr_range - 1 - i) * sizeof(struct map_range));
                mr[i--].start = old_start;
                nr_range--;
        }

        for (i = 0; i < nr_range; i++)
                pr_debug(" [mem %#010lx-%#010lx] page %s\n",
                         mr[i].start, mr[i].end - 1,
                         page_size_string(&mr[i]));

        return nr_range;
}
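
/*
 * For example, on x86_64 with 2M and 1G pages enabled in page_size_mask,
 * a request to map [3MB+4k, 2GB) is split into a 4k-mapped head
 * [3MB+4k, 4MB), a 2M-mapped range [4MB, 1GB) and a 1G-mapped range
 * [1GB, 2GB).
 */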

struct range pfn_mapped[E820_MAX_ENTRIES];
int nr_pfn_mapped;

static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
        nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES,
                                             nr_pfn_mapped, start_pfn, end_pfn);
        nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES);

        max_pfn_mapped = max(max_pfn_mapped, end_pfn);

        if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
                max_low_pfn_mapped = max(max_low_pfn_mapped,
                                         min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
}

bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;

        for (i = 0; i < nr_pfn_mapped; i++)
                if ((start_pfn >= pfn_mapped[i].start) &&
                    (end_pfn <= pfn_mapped[i].end))
                        return true;

        return false;
}

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __ref init_memory_mapping(unsigned long start,
                                        unsigned long end, pgprot_t prot)
{
        struct map_range mr[NR_RANGE_MR];
        unsigned long ret = 0;
        int nr_range, i;

        pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n",
                 start, end - 1);

        memset(mr, 0, sizeof(mr));
        nr_range = split_mem_range(mr, 0, start, end);

        for (i = 0; i < nr_range; i++)
                ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
                                                   mr[i].page_size_mask,
                                                   prot);

        add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);

        return ret >> PAGE_SHIFT;
}

/*
 * We need to iterate through the E820 memory map and create direct mappings
 * for only E820_TYPE_RAM and E820_KERN_RESERVED regions. We cannot simply
 * create direct mappings for all pfns from [0 to max_low_pfn) and
 * [4GB to max_pfn) because of possible memory holes in high addresses
 * that cannot be marked as UC by fixed/variable range MTRRs.
 * Depending on the alignment of E820 ranges, this may possibly result
 * in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
 *
 * init_mem_mapping() calls init_range_memory_mapping() with a big range.
 * That range would have holes in the middle or at the ends, and only the
 * RAM parts will be mapped in init_range_memory_mapping().
 */
static unsigned long __init init_range_memory_mapping(
                                           unsigned long r_start,
                                           unsigned long r_end)
{
        unsigned long start_pfn, end_pfn;
        unsigned long mapped_ram_size = 0;
        int i;

        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
                u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
                u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
                if (start >= end)
                        continue;

                /*
                 * If it overlaps the brk pgt buffer, we need to
                 * alloc the pgt buf from memblock instead.
                 */
                can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
                                    min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
                init_memory_mapping(start, end, PAGE_KERNEL);
                mapped_ram_size += end - start;
                can_use_brk_pgt = true;
        }

        return mapped_ram_size;
}

static unsigned long __init get_new_step_size(unsigned long step_size)
{
        /*
         * Initial mapped size is PMD_SIZE (2M).
         * We can not set step_size to be PUD_SIZE (1G) yet.
         * In the worst case, when we cross the 1G boundary and
         * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
         * to map a 1G range with PTEs. Hence we use one less than the
         * difference of page table level shifts.
         *
         * Don't need to worry about overflow in the top-down case, on 32bit,
         * when step_size is 0, round_down() returns 0 for start, and that
         * turns it into 0x100000000ULL.
         * In the bottom-up case, round_up(x, 0) returns 0 though too, which
         * needs to be taken into consideration by the code below.
         */
        return step_size << (PMD_SHIFT - PAGE_SHIFT - 1);
}
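
/*
 * For example, with 4k pages and 2M PMDs the shift above is 8, so each
 * call multiplies step_size by 256: 2M -> 512M -> 128G -> ..., i.e. a
 * small freshly mapped chunk provides enough page table space to map a
 * much larger chunk in the next iteration.
 */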

/**
 * memory_map_top_down - Map [map_start, map_end) top down
 * @map_start: start address of the target memory range
 * @map_end: end address of the target memory range
 *
 * This function will setup direct mapping for memory range
 * [map_start, map_end) in top-down. That said, the page tables
 * will be allocated at the end of the memory, and we map the
 * memory in top-down.
 */
static void __init memory_map_top_down(unsigned long map_start,
                                       unsigned long map_end)
{
        unsigned long real_end, start, last_start;
        unsigned long step_size;
        unsigned long addr;
        unsigned long mapped_ram_size = 0;

        /* Xen has a big reserved range near the end of RAM; skip it at first. */
        addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
        real_end = addr + PMD_SIZE;

        /* step_size needs to be small so the pgt_buf from BRK can cover it */
        step_size = PMD_SIZE;
        max_pfn_mapped = 0; /* will get exact value next */
        min_pfn_mapped = real_end >> PAGE_SHIFT;
        last_start = start = real_end;

        /*
         * We start from the top (end of memory) and go to the bottom.
         * The memblock_find_in_range() gets us a block of RAM from the
         * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
         * for page table.
         */
        while (last_start > map_start) {
                if (last_start > step_size) {
                        start = round_down(last_start - 1, step_size);
                        if (start < map_start)
                                start = map_start;
                } else
                        start = map_start;
                mapped_ram_size += init_range_memory_mapping(start,
                                                             last_start);
                last_start = start;
                min_pfn_mapped = last_start >> PAGE_SHIFT;
                if (mapped_ram_size >= step_size)
                        step_size = get_new_step_size(step_size);
        }

        if (real_end < map_end)
                init_range_memory_mapping(real_end, map_end);
}

/**
 * memory_map_bottom_up - Map [map_start, map_end) bottom up
 * @map_start: start address of the target memory range
 * @map_end: end address of the target memory range
 *
 * This function will setup direct mapping for memory range
 * [map_start, map_end) in bottom-up. Since we have limited the
 * bottom-up allocation above the kernel, the page tables will
 * be allocated just above the kernel and we map the memory
 * in [map_start, map_end) in bottom-up.
 */
static void __init memory_map_bottom_up(unsigned long map_start,
                                        unsigned long map_end)
{
        unsigned long next, start;
        unsigned long mapped_ram_size = 0;
        /* step_size needs to be small so the pgt_buf from BRK can cover it */
        unsigned long step_size = PMD_SIZE;

        start = map_start;
        min_pfn_mapped = start >> PAGE_SHIFT;

        /*
         * We start from the bottom (@map_start) and go to the top (@map_end).
         * The memblock_find_in_range() gets us a block of RAM from the
         * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
         * for page table.
         */
        while (start < map_end) {
                if (step_size && map_end - start > step_size) {
                        next = round_up(start + 1, step_size);
                        if (next > map_end)
                                next = map_end;
                } else {
                        next = map_end;
                }

                mapped_ram_size += init_range_memory_mapping(start, next);
                start = next;

                if (mapped_ram_size >= step_size)
                        step_size = get_new_step_size(step_size);
        }
}

/*
 * The real mode trampoline, which is required for bootstrapping CPUs,
 * occupies only a small area under the low 1MB. See reserve_real_mode()
 * for details.
 *
 * If KASLR is disabled the first PGD entry of the direct mapping is copied
 * to map the real mode trampoline.
 *
 * If KASLR is enabled, copy only the PUD which covers the low 1MB
 * area. This limits the randomization granularity to 1GB for both 4-level
 * and 5-level paging.
 */
static void __init init_trampoline(void)
{
#ifdef CONFIG_X86_64
        if (!kaslr_memory_enabled())
                trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
        else
                init_trampoline_kaslr();
#endif
}

void __init init_mem_mapping(void)
{
        unsigned long end;

        pti_check_boottime_disable();
        probe_page_size_mask();
        setup_pcid();

#ifdef CONFIG_X86_64
        end = max_pfn << PAGE_SHIFT;
#else
        end = max_low_pfn << PAGE_SHIFT;
#endif

        /* the ISA range is always mapped regardless of memory holes */
        init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL);

        /* Init the trampoline, possibly with KASLR memory offset */
        init_trampoline();

        /*
         * If the allocation is in bottom-up direction, we setup direct mapping
         * in bottom-up, otherwise we setup direct mapping in top-down.
         */
        if (memblock_bottom_up()) {
                unsigned long kernel_end = __pa_symbol(_end);

                /*
                 * we need two separate calls here. This is because we want to
                 * allocate page tables above the kernel. So we first map
                 * [kernel_end, end) to make memory above the kernel be mapped
                 * as soon as possible. And then use page tables allocated above
                 * the kernel to map [ISA_END_ADDRESS, kernel_end).
                 */
                memory_map_bottom_up(kernel_end, end);
                memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
        } else {
                memory_map_top_down(ISA_END_ADDRESS, end);
        }

#ifdef CONFIG_X86_64
        if (max_pfn > max_low_pfn) {
                /* can we preserve max_low_pfn ? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) max_low_pfn = max_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) early_ioremap_page_table_range_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) load_cr3(swapper_pg_dir);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) __flush_tlb_all();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) x86_init.hyper.init_mem_mapping();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) * Initialize an mm_struct to be used during poking and a pointer to be used
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) * during patching.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) void __init poking_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) spinlock_t *ptl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) pte_t *ptep;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) poking_mm = copy_init_mm();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) BUG_ON(!poking_mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) * Randomize the poking address, but make sure that the following page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) * will be mapped at the same PMD. We need 2 pages, so find space for 3,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) * and adjust the address if the PMD ends after the first one.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) poking_addr = TASK_UNMAPPED_BASE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790)
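	/*
	 * Example: if poking_addr + PAGE_SIZE is exactly PMD aligned, the two
	 * poking pages would straddle a PMD boundary; bumping the address by
	 * one page keeps both pages within a single PMD.
	 */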
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) poking_addr += PAGE_SIZE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) * We need to trigger the allocation of the page-tables that will be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) * needed for poking now. Later, poking may be performed in an atomic
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) * section, which might cause allocation to fail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) BUG_ON(!ptep);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) pte_unmap_unlock(ptep, ptl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) * devmem_is_allowed() checks to see if /dev/mem access to a certain address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) * is valid. The argument is a physical page number.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) * On x86, access has to be given to the first megabyte of RAM because that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) * area traditionally contains BIOS code and data regions used by X, dosemu,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) * and similar apps. Since they map the entire memory range, the whole range
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) * must be allowed (for mapping), but any areas that would otherwise be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) * disallowed are flagged as being "zero filled" instead of rejected.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) * Access has to be given to non-kernel-ram areas as well, these contain the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) * PCI mmio resources as well as potential bios/acpi data regions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) int devmem_is_allowed(unsigned long pagenr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) != REGION_DISJOINT) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) * For disallowed memory regions in the low 1MB range,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) * request that the page be shown as all zeros.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) if (pagenr < 256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) return 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) * This must follow RAM test, since System RAM is considered a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) * restricted resource under CONFIG_STRICT_IOMEM.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) /* Low 1MB bypasses iomem restrictions. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) if (pagenr < 256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) void free_init_pages(const char *what, unsigned long begin, unsigned long end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) unsigned long begin_aligned, end_aligned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) /* Make sure boundaries are page aligned */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) begin_aligned = PAGE_ALIGN(begin);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) end_aligned = end & PAGE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) begin = begin_aligned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) end = end_aligned;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) if (begin >= end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) * If debugging page accesses then do not free this memory but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) * mark them not present - any buggy init-section access will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) * create a kernel page fault:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) if (debug_pagealloc_enabled()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) begin, end - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) * Inform kmemleak about the hole in the memory since the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) * corresponding pages will be unmapped.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) kmemleak_free_part((void *)begin, end - begin);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) * We just marked the kernel text read only above, now that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) * we are going to free part of that, we need to make that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) * writeable and non-executable first.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) free_reserved_area((void *)begin, (void *)end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) POISON_FREE_INITMEM, what);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) * begin/end can be in the direct map or the "high kernel mapping"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) * used for the kernel image only. free_init_pages() will do the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) * right thing for either kind of address.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) void free_kernel_image_pages(const char *what, void *begin, void *end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) unsigned long begin_ul = (unsigned long)begin;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) unsigned long end_ul = (unsigned long)end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) free_init_pages(what, begin_ul, end_ul);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) * PTI maps some of the kernel into userspace. For performance,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) * this includes some kernel areas that do not contain secrets.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) * Those areas might be adjacent to the parts of the kernel image
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) * being freed, which may contain secrets. Remove the "high kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) * image mapping" for these freed areas, ensuring they are not even
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) * potentially vulnerable to Meltdown regardless of the specific
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) * optimizations PTI is currently using.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) * The "noalias" prevents unmapping the direct map alias which is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) * needed to access the freed pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) * This is only valid for 64bit kernels. 32bit has only one mapping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) * which can't be treated in this way for obvious reasons.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) set_memory_np_noalias(begin_ul, len_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) void __ref free_initmem(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) e820__reallocate_tables();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) mem_encrypt_free_decrypted_mem();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) free_kernel_image_pages("unused kernel image (initmem)",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) &__init_begin, &__init_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) #ifdef CONFIG_BLK_DEV_INITRD
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) void __init free_initrd_mem(unsigned long start, unsigned long end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) * end could be not aligned, and We can not align that,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) * decompresser could be confused by aligned initrd_end
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) * We already reserve the end partial page before in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) * - i386_start_kernel()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) * - x86_64_start_kernel()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) * - relocate_initrd()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) * So here We can do PAGE_ALIGN() safely to get partial page to be freed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) free_init_pages("initrd", start, PAGE_ALIGN(end));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) * Calculate the precise size of the DMA zone (first 16 MB of RAM),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) * and pass it to the MM layer - to help it set zone watermarks more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) * accurately.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) * Done on 64-bit systems only for the time being, although 32-bit systems
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) * might benefit from this as well.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) void __init memblock_find_dma_reserve(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) #ifdef CONFIG_X86_64
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) u64 nr_pages = 0, nr_free_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) unsigned long start_pfn, end_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) phys_addr_t start_addr, end_addr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) u64 u;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) * Iterate over all memory ranges (free and reserved ones alike),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) * to calculate the total number of pages in the first 16 MB of RAM:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) nr_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) start_pfn = min(start_pfn, MAX_DMA_PFN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) end_pfn = min(end_pfn, MAX_DMA_PFN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) nr_pages += end_pfn - start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) * Iterate over free memory ranges to calculate the number of free
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) * pages in the DMA zone, while not counting potential partial
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) * pages at the beginning or the end of the range:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) nr_free_pages = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) if (start_pfn < end_pfn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) nr_free_pages += end_pfn - start_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990)
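	/* The reserved count is everything in the first 16 MB that is not free. */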
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) set_dma_reserve(nr_pages - nr_free_pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) void __init zone_sizes_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) unsigned long max_zone_pfns[MAX_NR_ZONES];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) #ifdef CONFIG_ZONE_DMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) #ifdef CONFIG_ZONE_DMA32
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) #ifdef CONFIG_HIGHMEM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) free_area_init(max_zone_pfns);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014)
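/*
 * "disable_dma32=on" on the kernel command line clears the ZONE_DMA32 limit
 * in zone_sizes_init() above, leaving that zone empty.
 */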
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) static int __init early_disable_dma32(char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) if (!buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) if (!strcmp(buf, "on"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) disable_dma32 = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) early_param("disable_dma32", early_disable_dma32);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) .loaded_mm = &init_mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) .next_asid = 1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) /* entry 0 MUST be WB (hardwired to speed up translations) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) __cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) __pte2cachemode_tbl[entry] = cache;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) #ifdef CONFIG_SWAP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) unsigned long max_swapfile_size(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) unsigned long pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) pages = generic_max_swapfile_size();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) unsigned long long l1tf_limit = l1tf_pfn_limit();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) * We encode swap offsets also with 3 bits below those for pfn
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) * which makes the usable limit higher.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) #if CONFIG_PGTABLE_LEVELS > 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) pages = min_t(unsigned long long, l1tf_limit, pages);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) return pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) #endif