// SPDX-License-Identifier: GPL-2.0
/*
 * linux/mm/mlock.c
 *
 * (C) Copyright 1995 Linus Torvalds
 * (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/page_pinner.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>

#include "internal.h"

bool can_do_mlock(void)
{
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return true;
	if (capable(CAP_IPC_LOCK))
		return true;
	return false;
}
EXPORT_SYMBOL(can_do_mlock);

/*
 * Mlocked pages are marked with PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 *
 * When lazy mlocking via vmscan, it is important to ensure that the
 * vma's VM_LOCKED status is not concurrently being modified, otherwise we
 * may have mlocked a page that is being munlocked. So lazy mlock must take
 * the mmap_lock for read, and verify that the vma really is locked
 * (see mm/rmap.c).
 */

/*
 * LRU accounting for clear_page_mlock()
 */
void clear_page_mlock(struct page *page)
{
	int nr_pages;

	if (!TestClearPageMlocked(page))
		return;

	nr_pages = thp_nr_pages(page);
	mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
	count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
	/*
	 * The previous TestClearPageMlocked() corresponds to the smp_mb()
	 * in __pagevec_lru_add_fn().
	 *
	 * See __pagevec_lru_add_fn for more explanation.
	 */
	if (!isolate_lru_page(page)) {
		putback_lru_page(page);
	} else {
		/*
		 * We lost the race: the page has already been moved to the
		 * evictable list.
		 */
		if (PageUnevictable(page))
			count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
	}
}

/*
 * Mark page as mlocked if not already.
 * If page on LRU, isolate and putback to move to unevictable list.
 */
void mlock_vma_page(struct page *page)
{
	/* Serialize with page migration */
	BUG_ON(!PageLocked(page));

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

	if (!TestSetPageMlocked(page)) {
		int nr_pages = thp_nr_pages(page);

		mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
		if (!isolate_lru_page(page))
			putback_lru_page(page);
	}
}

/*
 * Isolate a page from LRU with optional get_page() pin.
 * Assumes lru_lock already held and page already pinned.
 */
static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
{
	if (PageLRU(page)) {
		struct lruvec *lruvec;

		lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
		if (getpage)
			get_page(page);
		ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_lru(page));
		return true;
	}

	return false;
}

/*
 * Finish munlock after successful page isolation
 *
 * Page must be locked. This is a wrapper for try_to_munlock()
 * and putback_lru_page() with munlock accounting.
 */
static void __munlock_isolated_page(struct page *page)
{
	/*
	 * Optimization: if the page was mapped just once, that's our mapping
	 * and we don't need to check all the other vmas.
	 */
	if (page_mapcount(page) > 1)
		try_to_munlock(page);

	/* Did try_to_munlock() succeed or punt? */
	if (!PageMlocked(page))
		count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page));

	putback_lru_page(page);
}

/*
 * Accounting for page isolation fail during munlock
 *
 * Performs accounting when page isolation fails in munlock. There is nothing
 * else to do because it means some other task has already removed the page
 * from the LRU. putback_lru_page() will take care of removing the page from
 * the unevictable list, if necessary. vmscan [page_referenced()] will move
 * the page back to the unevictable list if some other vma has it mlocked.
 */
static void __munlock_isolation_failed(struct page *page)
{
	int nr_pages = thp_nr_pages(page);

	if (PageUnevictable(page))
		__count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
	else
		__count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
}

/**
 * munlock_vma_page - munlock a vma page
 * @page: page to be unlocked, either a normal page or a THP page head
 *
 * Returns the size of the page as a page mask (0 for a normal page,
 * HPAGE_PMD_NR - 1 for a THP head page).
 *
 * Called from the munlock()/munmap() path with the page supposedly on the LRU.
 * When we munlock a page, because the vma where we found the page is being
 * munlock()ed or munmap()ed, we want to check whether other vmas hold the
 * page locked so that we can leave it on the unevictable lru list and not
 * bother vmscan with it. However, to walk the page's rmap list in
 * try_to_munlock() we must isolate the page from the LRU. If some other
 * task has removed the page from the LRU, we won't be able to do that.
 * So we clear the PageMlocked as we might not get another chance. If we
 * can't isolate the page, we leave it for putback_lru_page() and vmscan
 * [page_referenced()/try_to_unmap()] to deal with.
 */
unsigned int munlock_vma_page(struct page *page)
{
	int nr_pages;
	pg_data_t *pgdat = page_pgdat(page);

	/* For try_to_munlock() and to serialize with page migration */
	BUG_ON(!PageLocked(page));

	VM_BUG_ON_PAGE(PageTail(page), page);

	/*
	 * Serialize with any parallel __split_huge_page_refcount() which
	 * might otherwise copy PageMlocked to part of the tail pages before
	 * we clear it in the head page. It also stabilizes thp_nr_pages().
	 */
	spin_lock_irq(&pgdat->lru_lock);

	if (!TestClearPageMlocked(page)) {
		/* Potentially, a PTE-mapped THP: do not skip the rest of the PTEs */
		nr_pages = 1;
		goto unlock_out;
	}

	nr_pages = thp_nr_pages(page);
	__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);

	if (__munlock_isolate_lru_page(page, true)) {
		spin_unlock_irq(&pgdat->lru_lock);
		__munlock_isolated_page(page);
		goto out;
	}
	__munlock_isolation_failed(page);

unlock_out:
	spin_unlock_irq(&pgdat->lru_lock);

out:
	return nr_pages - 1;
}

/*
 * Convert a get_user_pages() return value to a POSIX mlock() error:
 * -EFAULT (unmapped address) becomes -ENOMEM, and -ENOMEM becomes
 * -EAGAIN, as mlock(2) requires.
 */
static int __mlock_posix_error_return(long retval)
{
	if (retval == -EFAULT)
		retval = -ENOMEM;
	else if (retval == -ENOMEM)
		retval = -EAGAIN;
	return retval;
}

/*
 * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec()
 *
 * The fast path is available only for evictable pages with a single mapping.
 * Then we can bypass the per-cpu pvec and get better performance.
 * When mapcount > 1 we need try_to_munlock(), which can fail.
 * When !page_evictable(), we need the full redo logic of putback_lru_page to
 * avoid leaving an evictable page on the unevictable list.
 *
 * In case of success, @page is added to @pvec and @pgrescued is incremented
 * if the page was previously unevictable. @page is also unlocked.
 */
static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
		int *pgrescued)
{
	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (page_mapcount(page) <= 1 && page_evictable(page)) {
		pagevec_add(pvec, page);
		if (TestClearPageUnevictable(page))
			(*pgrescued)++;
		unlock_page(page);
		return true;
	}

	return false;
}

/*
 * Putback multiple evictable pages to the LRU
 *
 * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
 * the pages might have meanwhile become unevictable but that is OK.
 */
static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
{
	count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
	/*
	 * __pagevec_lru_add() calls release_pages() so we don't call
	 * put_page() explicitly
	 */
	__pagevec_lru_add(pvec);
	count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
}

/*
 * Munlock a batch of pages from the same zone
 *
 * The work is split into two main phases. The first phase clears the Mlocked
 * flag and attempts to isolate the pages, all under a single zone lru lock.
 * The second phase finishes the munlock only for pages where isolation
 * succeeded.
 *
 * Note that the pagevec may be modified during the process.
 */
static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
{
	int i;
	int nr = pagevec_count(pvec);
	int delta_munlocked = -nr;
	struct pagevec pvec_putback;
	int pgrescued = 0;

	pagevec_init(&pvec_putback);

	/* Phase 1: page isolation */
	spin_lock_irq(&zone->zone_pgdat->lru_lock);
	for (i = 0; i < nr; i++) {
		struct page *page = pvec->pages[i];

		if (TestClearPageMlocked(page)) {
			/*
			 * We already have a pin from follow_page_mask()
			 * so we can spare the get_page() here.
			 */
			if (__munlock_isolate_lru_page(page, false))
				continue;
			else
				__munlock_isolation_failed(page);
		} else {
			delta_munlocked++;
		}

		/*
		 * We won't be munlocking this page in the next phase
		 * but we still need to release the follow_page_mask()
		 * pin. We cannot do it under lru_lock however. If it's
		 * the last pin, __page_cache_release() would deadlock.
		 */
		pagevec_add(&pvec_putback, pvec->pages[i]);
		pvec->pages[i] = NULL;
	}
	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
	spin_unlock_irq(&zone->zone_pgdat->lru_lock);

	/* Now we can release pins of pages that we are not munlocking */
	pagevec_release(&pvec_putback);

	/* Phase 2: page munlock */
	for (i = 0; i < nr; i++) {
		struct page *page = pvec->pages[i];

		if (page) {
			lock_page(page);
			if (!__putback_lru_fast_prepare(page, &pvec_putback,
					&pgrescued)) {
				/*
				 * Slow path. We don't want to lose the last
				 * pin before unlock_page().
				 */
				get_page(page); /* for putback_lru_page() */
				__munlock_isolated_page(page);
				unlock_page(page);
				put_page(page); /* from follow_page_mask() */
			}
		}
	}

	/*
	 * Phase 3: page putback for pages that qualified for the fast path
	 * This will also call put_page() to return the pin from follow_page_mask()
	 */
	if (pagevec_count(&pvec_putback))
		__putback_lru_fast(&pvec_putback, pgrescued);
}

/*
 * Fill up pagevec for __munlock_pagevec using a pte walk
 *
 * The function expects that the struct page corresponding to @start address is
 * a non-THP page already pinned and in the @pvec, and that it belongs to @zone.
 *
 * The rest of @pvec is filled by subsequent pages within the same pmd and same
 * zone, as long as the ptes are present and vm_normal_page() succeeds. These
 * pages also get pinned.
 *
 * Returns the address of the next page that should be scanned. This equals
 * @start + PAGE_SIZE when no page could be added by the pte walk.
 */
static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
		struct vm_area_struct *vma, struct zone *zone,
		unsigned long start, unsigned long end)
{
	pte_t *pte;
	spinlock_t *ptl;

	/*
	 * Initialize the pte walk starting at the already pinned page where we
	 * are sure that there is a pte, as it was pinned under the same
	 * mmap_lock write op.
	 */
	pte = get_locked_pte(vma->vm_mm, start, &ptl);
	/* Make sure we do not cross the page table boundary */
	end = pgd_addr_end(start, end);
	end = p4d_addr_end(start, end);
	end = pud_addr_end(start, end);
	end = pmd_addr_end(start, end);

	/* The page next to the pinned page is the first we will try to get */
	start += PAGE_SIZE;
	while (start < end) {
		struct page *page = NULL;
		pte++;
		if (pte_present(*pte))
			page = vm_normal_page(vma, start, *pte);
		/*
		 * Break if the page could not be obtained or the page's
		 * node+zone does not match.
		 */
		if (!page || page_zone(page) != zone)
			break;

		/*
		 * Do not use the pagevec for PTE-mapped THP,
		 * munlock_vma_pages_range() will handle them.
		 */
		if (PageTransCompound(page))
			break;

		get_page(page);
		/*
		 * Increase the address that will be returned *before* the
		 * eventual break due to the pvec becoming full by adding the page.
		 */
		start += PAGE_SIZE;
		if (pagevec_add(pvec, page) == 0)
			break;
	}
	pte_unmap_unlock(pte, ptl);
	return start;
}

/*
 * munlock_vma_pages_range() - munlock all pages in the vma range.
 * @vma - vma containing range to be munlock()ed.
 * @start - start address in @vma of the range
 * @end - end of range in @vma.
 *
 * For mremap(), munmap() and exit().
 *
 * Called with @vma VM_LOCKED.
 *
 * Returns with VM_LOCKED cleared. Callers must be prepared to
 * deal with this.
 *
 * We don't save and restore VM_LOCKED here because pages are
 * still on the lru. If VM_LOCKED were left set, pages in the unmap path
 * might be scanned by reclaim and re-mlocked by try_to_{munlock|unmap}
 * before we unmap and free them, which would result in freeing mlocked
 * pages.
 */
void munlock_vma_pages_range(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	vm_write_begin(vma);
	WRITE_ONCE(vma->vm_flags, vma->vm_flags & VM_LOCKED_CLEAR_MASK);
	vm_write_end(vma);

	while (start < end) {
		struct page *page;
		unsigned int page_mask = 0;
		unsigned long page_increm;
		struct pagevec pvec;
		struct zone *zone;

		pagevec_init(&pvec);
		/*
		 * Although FOLL_DUMP is intended for get_dump_page(),
		 * it just so happens that its special treatment of the
		 * ZERO_PAGE (returning an error instead of doing get_page)
		 * suits munlock very well (and if somehow an abnormal page
		 * has sneaked into the range, we won't oops here: great).
		 */
		page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
		if (page && !IS_ERR(page)) {
			/*
			 * munlock_vma_pages_range() uses follow_page(FOLL_GET),
			 * so it should use put_user_page(), but the munlock
			 * path is quite complicated to handle each put site
			 * correctly, so just unattribute the pages to avoid
			 * false positives for now.
			 */
			reset_page_pinner(page, compound_order(page));
			if (PageTransTail(page)) {
				VM_BUG_ON_PAGE(PageMlocked(page), page);
				put_page(page); /* follow_page_mask() */
			} else if (PageTransHuge(page)) {
				lock_page(page);
				/*
				 * Any THP page found by follow_page_mask() may
				 * have gotten split before reaching
				 * munlock_vma_page(), so we need to compute
				 * the page_mask here instead.
				 */
				page_mask = munlock_vma_page(page);
				unlock_page(page);
				put_page(page); /* follow_page_mask() */
			} else {
				/*
				 * Non-huge pages are handled in batches via
				 * pagevec. The pin from follow_page_mask()
				 * prevents them from being collapsed into a THP.
				 */
				pagevec_add(&pvec, page);
				zone = page_zone(page);

				/*
				 * Try to fill the rest of the pagevec using a
				 * fast pte walk. This will also update start to
				 * the next page to process. Then munlock the
				 * pagevec.
				 */
				start = __munlock_pagevec_fill(&pvec, vma,
						zone, start, end);
				__munlock_pagevec(&pvec, zone);
				goto next;
			}
		}
		page_increm = 1 + page_mask;
		start += page_increm * PAGE_SIZE;
next:
		cond_resched();
	}
}

/*
 * mlock_fixup - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op. However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
	unsigned long start, unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int nr_pages;
	int ret = 0;
	int lock = !!(newflags & VM_LOCKED);
	vm_flags_t old_flags = vma->vm_flags;

	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
	    vma_is_dax(vma))
		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
		goto out;

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(mm, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(mm, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!lock)
		nr_pages = -nr_pages;
	else if (old_flags & VM_LOCKED)
		nr_pages = 0;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, populate_vma_page_range will bring it back.
	 */
	if (lock) {
		vm_write_begin(vma);
		WRITE_ONCE(vma->vm_flags, newflags);
		vm_write_end(vma);
	} else
		munlock_vma_pages_range(vma, start, end);

out:
	*prev = vma;
	return ret;
}

static int apply_vma_lock_flags(unsigned long start, size_t len,
				vm_flags_t flags)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct *vma, *prev;
	int error;

	VM_BUG_ON(offset_in_page(start));
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = find_vma(current->mm, start);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;

		newflags |= flags;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			break;
		nstart = tmp;
		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
	}
	return error;
}

/*
 * Go through the vma areas and sum the size of the mlocked vma pages.
 *
 * Note that deferred memory locking (mlock2() with MLOCK_ONFAULT) is
 * also counted.
 *
 * Return value: the number of previously mlocked pages in the range.
 */
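/*
 * Worked example (illustrative, assuming 4K pages): for a VM_LOCKED vma
 * covering [0x1000, 0x5000) and a request with start = 0x2000, len = 0x2000,
 * the loop below subtracts the 0x1000 that lies before @start and then adds
 * start + len - vm_start = 0x3000, leaving 0x2000 bytes, i.e. a return
 * value of 2 pages.
 */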
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
		unsigned long start, size_t len)
{
	struct vm_area_struct *vma;
	unsigned long count = 0;

	if (mm == NULL)
		mm = current->mm;

	vma = find_vma(mm, start);
	if (vma == NULL)
		vma = mm->mmap;

	for (; vma ; vma = vma->vm_next) {
		if (start >= vma->vm_end)
			continue;
		if (start + len <= vma->vm_start)
			break;
		if (vma->vm_flags & VM_LOCKED) {
			if (start > vma->vm_start)
				count -= (start - vma->vm_start);
			if (start + len < vma->vm_end) {
				count += start + len - vma->vm_start;
				break;
			}
			count += vma->vm_end - vma->vm_start;
		}
	}

	return count >> PAGE_SHIFT;
}

static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	start = untagged_addr(start);

	if (!can_do_mlock())
		return -EPERM;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;
	locked = len >> PAGE_SHIFT;
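	/*
	 * Illustrative example (assuming 4K pages): a 100-byte request that
	 * fits within a single page is rounded out above to that whole page,
	 * so locked == 1 and the comparison against lock_limit below is done
	 * in units of pages.
	 */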
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) if (mmap_write_lock_killable(current->mm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) return -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) locked += current->mm->locked_vm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) * It is possible that the regions requested intersect with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) * previously mlocked areas, that part area in "mm->locked_vm"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) * should not be counted to new mlock increment count. So check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) * and adjust locked count if necessary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) locked -= count_mm_mlocked_page_nr(current->mm,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) start, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) /* check against resource limits */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) error = apply_vma_lock_flags(start, len, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) mmap_write_unlock(current->mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) error = __mm_populate(start, len, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) return __mlock_posix_error_return(error);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) return do_mlock(start, len, VM_LOCKED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) vm_flags_t vm_flags = VM_LOCKED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) if (flags & ~MLOCK_ONFAULT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) if (flags & MLOCK_ONFAULT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) vm_flags |= VM_LOCKONFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) return do_mlock(start, len, vm_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) }
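
/*
 * Userspace usage sketch (illustrative only; the wrapper names are the ones
 * provided by glibc, not part of this file):
 *
 *	if (mlock2(buf, size, MLOCK_ONFAULT))
 *		perror("mlock2");
 *
 * locks the pages backing buf as they are faulted in, rather than
 * populating and locking them up front as plain mlock() does.
 */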

SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	start = untagged_addr(start);

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_vma_lock_flags(start, len, 0);
	mmap_write_unlock(current->mm);

	return ret;
}

/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate them into the appropriate modifications to mm->def_flags
 * and/or the flags for all current VMAs.
 *
 * There are a couple of subtleties with this. If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack. If
 * mlockall() is called once including the MCL_FUTURE flag and then a second
 * time without it, VM_LOCKED and VM_LOCKONFAULT will be cleared from
 * mm->def_flags.
 */
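/*
 * For example (illustrative): mlockall(MCL_CURRENT | MCL_FUTURE) followed
 * later by mlockall(MCL_CURRENT) leaves existing mappings locked but clears
 * VM_LOCKED from mm->def_flags, so mappings created after the second call
 * are no longer locked automatically.
 */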
static int apply_mlockall_flags(int flags)
{
	struct vm_area_struct *vma, *prev = NULL;
	vm_flags_t to_add = 0;

	current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
	if (flags & MCL_FUTURE) {
		current->mm->def_flags |= VM_LOCKED;

		if (flags & MCL_ONFAULT)
			current->mm->def_flags |= VM_LOCKONFAULT;

		if (!(flags & MCL_CURRENT))
			goto out;
	}

	if (flags & MCL_CURRENT) {
		to_add |= VM_LOCKED;
		if (flags & MCL_ONFAULT)
			to_add |= VM_LOCKONFAULT;
	}

	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
		vm_flags_t newflags;

		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
		newflags |= to_add;

		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
		cond_resched();
	}
out:
	return 0;
}

SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
	    flags == MCL_ONFAULT)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	mmap_write_unlock(current->mm);
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}

SYSCALL_DEFINE0(munlockall)
{
	int ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_mlockall_flags(0);
	mmap_write_unlock(current->mm);
	return ret;
}

/*
 * Objects with a different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user_struct instead.
 */
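/*
 * Note (added for context, an assumption about the callers rather than part
 * of this file): user_shm_lock() is reached from the SysV shared memory and
 * SHM_HUGETLB setup paths with the segment size in bytes; it returns 1 if the
 * locking fits under RLIMIT_MEMLOCK (or the caller has CAP_IPC_LOCK) and 0
 * otherwise.
 */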
static DEFINE_SPINLOCK(shmlock_user_lock);

int user_shm_lock(size_t size, struct user_struct *user)
{
	unsigned long lock_limit, locked;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = rlimit(RLIMIT_MEMLOCK);
	if (lock_limit == RLIM_INFINITY)
		allowed = 1;
	lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	if (!allowed &&
	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
		goto out;
	get_uid(user);
	user->locked_shm += locked;
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}

void user_shm_unlock(size_t size, struct user_struct *user)
{
	spin_lock(&shmlock_user_lock);
	user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	spin_unlock(&shmlock_user_lock);
	free_uid(user);
}