// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999 Linus Torvalds
 * Copyright (C) 2002 Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
#include <trace/hooks/mm.h>

#include <asm/tlb.h>

#include "internal.h"

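/*
 * Private state passed to the MADV_COLD/MADV_PAGEOUT page table walk:
 * @tlb batches TLB flushes for the walked range, and @pageout selects
 * whether pages are isolated for immediate reclaim (MADV_PAGEOUT) or
 * merely deactivated (MADV_COLD).
 */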
struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED) {
			error = -EINVAL;
			goto out;
		}
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out_convert_errno;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out_convert_errno;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, start, 1);
		if (error)
			goto out_convert_errno;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count)) {
			error = -ENOMEM;
			goto out;
		}
		error = __split_vma(mm, vma, end, 0);
		if (error)
			goto out_convert_errno;
	}

success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vm_write_begin(vma);
	WRITE_ONCE(vma->vm_flags, new_flags);
	vm_write_end(vma);

out_convert_errno:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
out:
	return error;
}

#ifdef CONFIG_SWAP
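/*
 * MADV_WILLNEED on anonymous memory: walk the PTEs and start asynchronous
 * reads from swap for every swapped-out page in the range, so the pages
 * are likely to be resident by the time the application touches them.
 */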
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
};

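/*
 * MADV_WILLNEED on shmem/tmpfs: scan the mapping's page cache for swap
 * entries and kick off asynchronous swap-in for each of them.
 */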
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		swp_entry_t swap;

		if (!xa_is_value(page))
			continue;
		xas_pause(&xas);
		rcu_read_unlock();

		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
					     NULL, 0, false);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif	/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations. Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain();	/* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
					   file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks. We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

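/*
 * Page table walker shared by MADV_COLD and MADV_PAGEOUT: age the mapped
 * pages by clearing their young/referenced state (splitting THPs where
 * needed, e.g. when only part of a huge page falls inside the range) and
 * then either deactivate them or isolate them and hand them to
 * reclaim_pages(), depending on walk_private->pageout.
 */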
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	spinlock_t *ptl;
	struct page *page = NULL;
	LIST_HEAD(page_list);

	if (fatal_signal_pending(current))
		return -EINTR;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		page = pmd_page(orig_pmd);

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			err = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (!err)
				goto regular_page;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&page_list);
		return 0;
	}

regular_page:
	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * Creating a THP page is expensive, so split it only if we
		 * are sure it's worth it: split the page if we are its only
		 * owner.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				break;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				break;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				break;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a page to accelerate its reclaim.
		 * The VM can't reclaim the page unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the recent reference history.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else {
					list_add(&page->lru, &page_list);
					trace_android_vh_page_isolated_for_reclaim(mm, page);
				}
			}
		} else
			deactivate_page(page);
	}

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	if (pageout)
		reclaim_pages(&page_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

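/*
 * Run the cold walk over a single vma. The mmu_gather is started and ended
 * per vma so the TLB flush covers exactly the range that was aged; the
 * vm_write_begin()/vm_write_end() pair appears to come from this tree's
 * speculative page-fault support and brackets the vma while its page
 * tables are being modified.
 */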
static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	vm_write_begin(vma);
	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
	vm_write_end(vma);
}

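/*
 * MADV_COLD: the application hints that it won't touch this range in the
 * near future, so age its pages to make them preferred reclaim candidates
 * without reclaiming them right away.
 */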
static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb, start_addr, end_addr);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	vm_write_begin(vma);
	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
	vm_write_end(vma);
}

static inline bool can_do_pageout(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return true;
	if (!vma->vm_file)
		return false;
	/*
	 * Page out the page cache only for non-anonymous mappings of files
	 * that the calling process could (if it tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
}

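/*
 * MADV_PAGEOUT: the application hints that the range won't be needed soon,
 * so reclaim its pages right away, writing dirty anonymous pages to swap.
 */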
static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!can_do_pageout(vma))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb, start_addr, end_addr);

	return 0;
}

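/*
 * Page table walker for MADV_FREE: drop swap entries outright and mark
 * clean anonymous pages lazily freeable, so reclaim can discard them
 * without writeback unless they are dirtied again first.
 */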
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)

{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry: swapping the page back in would be more expensive
		 * than a fresh page allocation plus zeroing.
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * If the pmd isn't transhuge but the page is a THP owned
		 * only by this process, split it and deactivate all of its
		 * pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we can't clear
			 * its PG_dirty bit.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * on set_pte_at() and tlb_remove_tlb_entry(), so for
			 * portability clear the pte first and then re-install
			 * it as old and clean.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
};

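/*
 * Apply MADV_FREE to a single anonymous vma: clamp the range to the vma,
 * notify MMU notifiers, and run madvise_free_walk_ops under a local
 * mmu_gather so the TLB is flushed once for the whole range.
 */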
static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE only works on anonymous vmas at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, range.start, range.end);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	vm_write_begin(vma);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	vm_write_end(vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb, range.start, range.end);

	return 0;
}

/*
 * Application no longer needs these pages. If the pages are dirty,
 * it's OK to just throw them away. The app will be more careful about
 * data it wants to keep. Be sure to free swap resources too. The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do. This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them. There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range(vma, start, end - start);
	return 0;
}

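/*
 * Common entry for MADV_DONTNEED and MADV_FREE. userfaultfd_remove() may
 * drop mmap_lock, in which case the vma must be looked up and revalidated
 * before the range is zapped or marked lazily free.
 */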
static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = find_vma(mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_lru_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released, the concurrent operation doesn't
			 * make the result of madvise() undefined:
			 * there may be an adjacent next vma that
			 * we'll walk next. userfaultfd_remove() will
			 * generate a UFFD_EVENT_REMOVE repetition on
			 * the end-vma->vm_end range, but the manager
			 * can handle a repetition fine.
			 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) end = vma->vm_end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) VM_WARN_ON(start >= end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) if (behavior == MADV_DONTNEED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) return madvise_dontneed_single_vma(vma, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) else if (behavior == MADV_FREE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) return madvise_free_single_vma(vma, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) * Application wants to free up the pages and associated backing store.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) * This is effectively punching a hole into the middle of a file.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) static long madvise_remove(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) struct vm_area_struct **prev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) unsigned long start, unsigned long end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) loff_t offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) int error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) struct file *f;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) struct mm_struct *mm = vma->vm_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) *prev = NULL; /* tell sys_madvise we drop mmap_lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) if (vma->vm_flags & VM_LOCKED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) f = vma->vm_file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) if (!f || !f->f_mapping || !f->f_mapping->host)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858)
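/*
 * Hole punching destroys data in the underlying file, so only allow it
 * on mappings that are both shared and writable.
 */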
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) return -EACCES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) offset = (loff_t)(start - vma->vm_start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) * Filesystem's fallocate may need to take i_mutex. We need to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) * explicitly grab a reference because the vma (and hence the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) * vma's reference to the file) can go away as soon as we drop
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) * mmap_lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) get_file(f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) if (userfaultfd_remove(vma, start, end)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) /* mmap_lock was not released by userfaultfd_remove() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) error = vfs_fallocate(f,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) offset, end - start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) fput(f);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) mmap_read_lock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) }
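/*
 * For illustration only (userspace, not part of this file; error handling
 * omitted, and "addr", "len", "off" and "fd" stand for the caller's values):
 * the path above is what services a call such as
 *
 *	madvise(addr, len, MADV_REMOVE);
 *
 * on a shared, writable file mapping, and has much the same effect on the
 * backing file as
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len);
 */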
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) #ifdef CONFIG_MEMORY_FAILURE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) * Error injection support for memory error handling.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) static int madvise_inject_error(int behavior,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) unsigned long start, unsigned long end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) struct zone *zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) unsigned long size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) if (!capable(CAP_SYS_ADMIN))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) return -EPERM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) for (; start < end; start += size) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) unsigned long pfn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902)
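/*
 * get_user_pages_fast() takes a reference on the page; that reference
 * is handed to memory_failure()/soft_offline_page() below, which is
 * why MF_COUNT_INCREASED is passed.
 */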
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) ret = get_user_pages_fast(start, 1, 0, &page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) if (ret != 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) pfn = page_to_pfn(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) * When soft offlining hugepages, after migrating the page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) * we dissolve it, therefore on the next loop iteration
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) * "page" will no longer be a compound page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) size = page_size(compound_head(page));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) if (behavior == MADV_SOFT_OFFLINE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) pfn, start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) pfn, start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) ret = memory_failure(pfn, MF_COUNT_INCREASED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) /* Ensure that all poisoned pages are removed from per-cpu lists */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) for_each_populated_zone(zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) drain_all_pages(zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) #endif
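/*
 * For illustration only (userspace test code, not compiled here; requires
 * CAP_SYS_ADMIN and a kernel built with CONFIG_MEMORY_FAILURE): the
 * handlers above can be exercised on a single anonymous page roughly as
 * follows:
 *
 *	long sz = sysconf(_SC_PAGESIZE);
 *	char *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	p[0] = 1;			(touch the page so it is populated)
 *	madvise(p, sz, MADV_HWPOISON);	(or MADV_SOFT_OFFLINE)
 */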
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) static long
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) unsigned long start, unsigned long end, int behavior)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) switch (behavior) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) case MADV_REMOVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) return madvise_remove(vma, prev, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) case MADV_WILLNEED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) return madvise_willneed(vma, prev, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) case MADV_COLD:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) return madvise_cold(vma, prev, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) case MADV_PAGEOUT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) return madvise_pageout(vma, prev, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) case MADV_FREE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) case MADV_DONTNEED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) return madvise_dontneed_free(vma, prev, start, end, behavior);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) return madvise_behavior(vma, prev, start, end, behavior);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) static bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) madvise_behavior_valid(int behavior)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) switch (behavior) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) case MADV_DOFORK:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) case MADV_DONTFORK:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) case MADV_NORMAL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) case MADV_SEQUENTIAL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) case MADV_RANDOM:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) case MADV_REMOVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) case MADV_WILLNEED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) case MADV_DONTNEED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) case MADV_FREE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) case MADV_COLD:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) case MADV_PAGEOUT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) #ifdef CONFIG_KSM
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) case MADV_MERGEABLE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) case MADV_UNMERGEABLE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) #ifdef CONFIG_TRANSPARENT_HUGEPAGE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) case MADV_HUGEPAGE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) case MADV_NOHUGEPAGE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) case MADV_DONTDUMP:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) case MADV_DODUMP:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) case MADV_WIPEONFORK:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) case MADV_KEEPONFORK:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) #ifdef CONFIG_MEMORY_FAILURE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) case MADV_SOFT_OFFLINE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) case MADV_HWPOISON:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) static bool
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) process_madvise_behavior_valid(int behavior)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) switch (behavior) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) case MADV_COLD:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) case MADV_PAGEOUT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) case MADV_WILLNEED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) * The madvise(2) system call.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) * Applications can use madvise() to advise the kernel how it should
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) * handle paging I/O in this VM area. The idea is to help the kernel
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) * use appropriate read-ahead and caching techniques. The information
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) * provided is advisory only, and can be safely disregarded by the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) * kernel without affecting the correct operation of the application.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) * behavior values:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) * MADV_NORMAL - the default behavior is to read clusters. This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) * results in some read-ahead and read-behind.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) * MADV_RANDOM - the system should read the minimum amount of data
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) * on any access, since it is unlikely that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) * application will need more than what it asks for.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) * MADV_SEQUENTIAL - pages in the given range will probably be accessed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) * once, so they can be aggressively read ahead, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) * can be freed soon after they are accessed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) * MADV_WILLNEED - the application is notifying the system to read
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) * some pages ahead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) * MADV_DONTNEED - the application is finished with the given range,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) * so the kernel can free resources associated with it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) * MADV_FREE - the application marks pages in the given range as lazy free,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) * where actual purges are postponed until memory pressure happens.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) * MADV_REMOVE - the application wants to free up the given range of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) * pages and associated backing store.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) * MADV_DONTFORK - omit this area from child's address space when forking:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) * typically, to avoid COWing pages pinned by get_user_pages().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) * MADV_WIPEONFORK - present the child process with zero-filled memory in this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) * range after a fork.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) * MADV_HWPOISON - trigger memory error handler as if the given memory range
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) * were corrupted by unrecoverable hardware memory failure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) * this area with pages of identical content from other such areas.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) * MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) * MADV_HUGEPAGE - the application wants to back the given range by transparent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) * huge pages in the future. Existing pages might be coalesced and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) * new pages might be allocated as THP.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) * MADV_NOHUGEPAGE - mark the given range as not worth being backed by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) * transparent huge pages so the existing pages will not be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) * coalesced into THP and new pages will not be allocated as THP.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) * MADV_DONTDUMP - the application wants to prevent pages in the given range
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) * from being included in its core dump.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) * MADV_COLD - the application is not expected to use this memory soon,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) * deactivate pages in this range so that they can be reclaimed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) * easily if memory pressure happens.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) * MADV_PAGEOUT - the application is not expected to use this memory soon,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) * page out the pages in this range immediately.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) * return values:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) * zero - success
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) * -EINVAL - start + len < 0, start is not page-aligned,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) * "behavior" is not a valid value, or application
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) * is attempting to release locked or shared pages,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) * or the specified address range includes file, Huge TLB,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) * MAP_SHARED or VM_PFNMAP range.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) * -ENOMEM - addresses in the specified range are not currently
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) * mapped, or are outside the AS of the process.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) * -EIO - an I/O error occurred while paging in data.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) * -EBADF - map exists, but area maps something that isn't a file.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) * -EAGAIN - a kernel resource was temporarily unavailable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) */
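/*
 * A minimal userspace sketch of the above (illustrative only; error
 * handling omitted, <sys/mman.h> assumed): reserve an anonymous buffer,
 * use it, then tell the kernel its contents are no longer needed:
 *
 *	size_t len = 1 << 20;
 *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	... use buf ...
 *	madvise(buf, len, MADV_DONTNEED);
 *
 * After MADV_DONTNEED, later accesses to this private anonymous mapping
 * see zero-filled pages.
 */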
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) unsigned long end, tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) struct vm_area_struct *vma, *prev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) int unmapped_error = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) int error = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) int write;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) size_t len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) struct blk_plug plug;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084)
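/*
 * Architectures that support tagged user pointers (e.g. arm64 top-byte
 * ignore) may pass tag bits in "start"; strip them before treating it
 * as an address.
 */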
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) start = untagged_addr(start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) if (!madvise_behavior_valid(behavior))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) if (!PAGE_ALIGNED(start))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) len = PAGE_ALIGN(len_in);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) /* Check to see whether len was rounded up from a small negative value to zero */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) if (len_in && !len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) end = start + len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) if (end < start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) error = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) if (end == start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) #ifdef CONFIG_MEMORY_FAILURE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) return madvise_inject_error(behavior, start, start + len_in);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) write = madvise_need_mmap_write(behavior);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) if (write) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) if (mmap_write_lock_killable(mm))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) return -EINTR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) mmap_read_lock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) * If the interval [start,end) covers some unmapped address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) * ranges, just ignore them, but return -ENOMEM at the end.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) * (this differs from how mlock etc. handle it).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) vma = find_vma_prev(mm, start, &prev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) if (vma && start > vma->vm_start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) prev = vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127)
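/*
 * Plug the block layer so any I/O issued while walking the range (e.g.
 * MADV_WILLNEED readahead or MADV_PAGEOUT writeback) can be batched.
 */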
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) blk_start_plug(&plug);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) /* Still start < end. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) error = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) if (!vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) /* Here start < (end|vma->vm_end). */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) if (start < vma->vm_start) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) unmapped_error = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) start = vma->vm_start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) if (start >= end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) /* Here vma->vm_start <= start < (end|vma->vm_end) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) tmp = vma->vm_end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) if (end < tmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) tmp = end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) error = madvise_vma(vma, &prev, start, tmp, behavior);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) if (error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) start = tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) if (prev && start < prev->vm_end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) start = prev->vm_end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) error = unmapped_error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) if (start >= end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) if (prev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) vma = prev->vm_next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) else /* madvise_remove dropped mmap_lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) vma = find_vma(mm, start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) blk_finish_plug(&plug);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) if (write)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) mmap_write_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) return error;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) return do_madvise(current->mm, start, len_in, behavior);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) size_t, vlen, int, behavior, unsigned int, flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) ssize_t ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) struct iovec iovstack[UIO_FASTIOV], iovec;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) struct iovec *iov = iovstack;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) struct iov_iter iter;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) struct pid *pid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) struct task_struct *task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) struct mm_struct *mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) size_t total_len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) unsigned int f_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) if (flags != 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) if (ret < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) pid = pidfd_get_pid(pidfd, &f_flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) if (IS_ERR(pid)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) ret = PTR_ERR(pid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) goto free_iov;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) task = get_pid_task(pid, PIDTYPE_PID);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) if (!task) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) ret = -ESRCH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) goto put_pid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) if (!process_madvise_behavior_valid(behavior)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) goto release_task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) if (IS_ERR_OR_NULL(mm)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) goto release_task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) * Require CAP_SYS_NICE for influencing process performance. Note that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) * only non-destructive hints are currently supported.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) if (!capable(CAP_SYS_NICE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) ret = -EPERM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) goto release_mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) total_len = iov_iter_count(&iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) while (iov_iter_count(&iter)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) iovec = iov_iter_iovec(&iter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) ret = do_madvise(mm, (unsigned long)iovec.iov_base,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) iovec.iov_len, behavior);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) if (ret < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) iov_iter_advance(&iter, iovec.iov_len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243)
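/*
 * If any bytes were successfully advised, return that count; otherwise
 * propagate the error (or zero) already in "ret".
 */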
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) ret = (total_len - iov_iter_count(&iter)) ? : ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) release_mm:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) mmput(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) release_task:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) put_task_struct(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) put_pid:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) put_pid(pid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) free_iov:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) kfree(iov);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) }
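/*
 * For illustration only (userspace, not compiled here; error handling
 * omitted, "target_pid", "addr" and "len" stand for the caller's values):
 * a caller with CAP_SYS_NICE and ptrace read access to the target can hint
 * another process's memory, e.g.:
 *
 *	int pidfd = syscall(__NR_pidfd_open, target_pid, 0);
 *	struct iovec iov = { .iov_base = addr, .iov_len = len };
 *	syscall(__NR_process_madvise, pidfd, &iov, 1, MADV_COLD, 0);
 *
 * The final flags argument must currently be zero.
 */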