// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/page_ext.h>
#include <linux/page_idle.h>

#define BITMAP_CHUNK_SIZE	sizeof(u64)
#define BITMAP_CHUNK_BITS	(BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
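
/*
 * The bitmap is exposed to user space at /sys/kernel/mm/page_idle/bitmap as
 * an array of u64 chunks: bit i of chunk k corresponds to the page frame
 * with pfn k * BITMAP_CHUNK_BITS + i. Reads and writes must cover whole
 * chunks. A minimal user-space sketch (illustrative only; headers and error
 * handling omitted):
 *
 *	int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
 *	uint64_t chunk = ~0ULL;
 *	pwrite(fd, &chunk, sizeof(chunk), 0);	// mark pfns 0..63 idle
 *	pread(fd, &chunk, sizeof(chunk), 0);	// set bits = still idle
 */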

/*
 * Idle page tracking only considers user memory pages; for other types of
 * pages the idle flag is always unset and an attempt to set it is silently
 * ignored.
 *
 * We treat a page as a user memory page if it is on an LRU list, because it
 * is always safe to pass such a page to rmap_walk(), which is essential for
 * idle page tracking. Using PageLRU as the indicator means we skip isolated
 * pages, but since there are usually not many of them, this hardly affects
 * the overall result.
 *
 * This function tries to get a reference to the user memory page at @pfn,
 * as described above.
 */
static struct page *page_idle_get_page(unsigned long pfn)
{
	struct page *page = pfn_to_online_page(pfn);
	pg_data_t *pgdat;

	if (!page || !PageLRU(page) || !get_page_unless_zero(page))
		return NULL;

	pgdat = page_pgdat(page);
	spin_lock_irq(&pgdat->lru_lock);
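	/*
	 * Recheck with the LRU lock held: the page may have been
	 * isolated from the LRU since the unlocked check above.
	 */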
	if (unlikely(!PageLRU(page))) {
		put_page(page);
		page = NULL;
	}
	spin_unlock_irq(&pgdat->lru_lock);
	return page;
}

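/*
 * rmap_one callback for rmap_walk(): test and clear the accessed bit (and
 * notify secondary MMUs) for every PTE or PMD mapping of the page in @vma.
 * If any mapping was referenced, the page is made young and non-idle again.
 */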
static bool page_idle_clear_pte_refs_one(struct page *page,
					 struct vm_area_struct *vma,
					 unsigned long addr, void *arg)
{
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = addr,
	};
	bool referenced = false;

	while (page_vma_mapped_walk(&pvmw)) {
		addr = pvmw.address;
		if (pvmw.pte) {
			/*
			 * For a PTE-mapped THP, if one subpage is
			 * referenced, the whole THP is considered
			 * referenced.
			 */
			if (ptep_clear_young_notify(vma, addr, pvmw.pte))
				referenced = true;
		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
			if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
				referenced = true;
		} else {
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
		}
	}

	if (referenced) {
		clear_page_idle(page);
		/*
		 * We cleared the referenced bit in a mapping to this page. To
		 * avoid interference with page reclaim, mark it young so that
		 * page_referenced() will return > 0.
		 */
		set_page_young(page);
	}
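	/* Returning true tells rmap_walk() to continue with the other VMAs. */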
	return true;
}

static void page_idle_clear_pte_refs(struct page *page)
{
	/*
	 * Since rwc.arg is unused, rwc is effectively immutable, so we
	 * can make it static const to save some cycles and stack.
	 */
	static const struct rmap_walk_control rwc = {
		.rmap_one = page_idle_clear_pte_refs_one,
		.anon_lock = page_lock_anon_vma_read,
	};
	bool need_lock;

	if (!page_mapped(page) || !page_rmapping(page))
		return;

	need_lock = !PageAnon(page) || PageKsm(page);
	if (need_lock && !trylock_page(page))
		return;

	rmap_walk(page, (struct rmap_walk_control *)&rwc);

	if (need_lock)
		unlock_page(page);
}

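/*
 * Read a chunk-aligned slice of the bitmap: the bit for a pfn is set iff
 * the corresponding page is a user memory page that is still idle, i.e. it
 * has not been accessed since it was last marked idle. pos and count must
 * be multiples of BITMAP_CHUNK_SIZE; reading beyond max_pfn returns EOF.
 */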
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
				     struct bin_attribute *attr, char *buf,
				     loff_t pos, size_t count)
{
	u64 *out = (u64 *)buf;
	struct page *page;
	unsigned long pfn, end_pfn;
	int bit;

	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
		return -EINVAL;

	pfn = pos * BITS_PER_BYTE;
	if (pfn >= max_pfn)
		return 0;

	end_pfn = pfn + count * BITS_PER_BYTE;
	if (end_pfn > max_pfn)
		end_pfn = max_pfn;

	for (; pfn < end_pfn; pfn++) {
		bit = pfn % BITMAP_CHUNK_BITS;
		if (!bit)
			*out = 0ULL;
		page = page_idle_get_page(pfn);
		if (page) {
			if (page_is_idle(page)) {
				/*
				 * The page might have been referenced via a
				 * pte, in which case it is not idle. Clear
				 * refs and recheck.
				 */
				page_idle_clear_pte_refs(page);
				if (page_is_idle(page))
					*out |= 1ULL << bit;
			}
			put_page(page);
		}
		if (bit == BITMAP_CHUNK_BITS - 1)
			out++;
		cond_resched();
	}
	return (char *)out - buf;
}

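/*
 * Write a chunk-aligned slice of the bitmap: for every set bit, the
 * referenced bits of the corresponding user memory page are cleared and
 * the page is marked idle. Clear bits are ignored. pos and count must be
 * multiples of BITMAP_CHUNK_SIZE.
 */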
static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
				      struct bin_attribute *attr, char *buf,
				      loff_t pos, size_t count)
{
	const u64 *in = (u64 *)buf;
	struct page *page;
	unsigned long pfn, end_pfn;
	int bit;

	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
		return -EINVAL;

	pfn = pos * BITS_PER_BYTE;
	if (pfn >= max_pfn)
		return -ENXIO;

	end_pfn = pfn + count * BITS_PER_BYTE;
	if (end_pfn > max_pfn)
		end_pfn = max_pfn;

	for (; pfn < end_pfn; pfn++) {
		bit = pfn % BITMAP_CHUNK_BITS;
		if ((*in >> bit) & 1) {
			page = page_idle_get_page(pfn);
			if (page) {
				page_idle_clear_pte_refs(page);
				set_page_idle(page);
				put_page(page);
			}
		}
		if (bit == BITMAP_CHUNK_BITS - 1)
			in++;
		cond_resched();
	}
	return (char *)in - buf;
}

static struct bin_attribute page_idle_bitmap_attr =
		__BIN_ATTR(bitmap, 0600,
			   page_idle_bitmap_read, page_idle_bitmap_write, 0);

static struct bin_attribute *page_idle_bin_attrs[] = {
	&page_idle_bitmap_attr,
	NULL,
};

static const struct attribute_group page_idle_attr_group = {
	.bin_attrs = page_idle_bin_attrs,
	.name = "page_idle",
};

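/*
 * Create /sys/kernel/mm/page_idle/ and its bitmap file at boot; mm_kobj is
 * the kobject backing /sys/kernel/mm.
 */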
static int __init page_idle_init(void)
{
	int err;

	err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
	if (err) {
		pr_err("page_idle: failed to register sysfs group\n");
		return err;
	}
	return 0;
}
subsys_initcall(page_idle_init);