// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

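/*
 * Illustrative userspace view (not part of this file): a minimal sketch of
 * how a task might request the policies described above through the
 * set_mempolicy(2) and mbind(2) system calls, assuming the libnuma
 * <numaif.h> wrappers are available. The node numbers and the addr/length
 * values are placeholders, and error handling is omitted.
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// Interleave all further allocations of this task over nodes 0-1.
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// Restrict an existing mapping to the same nodes, no fallback.
 *	mbind(addr, length, MPOL_BIND, &mask, sizeof(mask) * 8, 0);
 */
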
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always graceful about that.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_map_to_online_node - Find closest online node
 * @node: Node id to start the search
 *
 * Look up the closest online node by distance if @node is not online.
 */
int numa_map_to_online_node(int node)
{
	int min_dist = INT_MAX, dist, n, min_node;

	if (node == NUMA_NO_NODE || node_online(node))
		return node;

	min_node = node;
	for_each_online_node(n) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);

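/*
 * Return the mempolicy that governs the current allocation context: the
 * task's own policy if one is installed, otherwise the boot-time
 * preferred-node policy for the local node, falling back to the system
 * default policy very early in boot before those are initialised.
 */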
struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

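/*
 * Nonzero when the policy carries MPOL_F_STATIC_NODES or
 * MPOL_F_RELATIVE_NODES, i.e. when w.user_nodemask preserves the nodemask
 * the user passed in so it can be re-applied on cpuset rebinds.
 */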
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

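/*
 * Map the user's relative nodemask @orig onto the currently allowed set
 * @rel: fold @orig down to the weight of @rel, then lay the result out
 * over the bits that are actually set in @rel.
 */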
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

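/*
 * Per-mode .create callbacks (see the mpol_ops table below): record the
 * cpuset-filtered nodemask in the policy. An empty mask is rejected, and
 * a NULL mask means explicit local allocation for MPOL_PREFERRED.
 */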
static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy. mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags. But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy. May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (pol == NULL)
		return 0;
	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */
	else {
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
		else
			nodes_and(nsc->mask2, *nodes, nsc->mask1);

		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
						cpuset_current_mems_allowed;
	}

	if (nodes)
		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	else
		ret = mpol_ops[pol->mode].create(pol, NULL);
	return ret;
}

/*
 * This function just creates a new policy, does some basic checks and
 * simple initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
		mode = MPOL_PREFERRED;
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}
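
/*
 * A rough sketch of how callers combine the two helpers above (the real
 * callers, e.g. do_set_mempolicy() and do_mbind() later in this file, add
 * scratch nodemask allocation and more error handling):
 *
 *	new = mpol_new(mode, flags, nmask);
 *	if (IS_ERR(new))
 *		return PTR_ERR(new);
 *	task_lock(current);
 *	ret = mpol_set_nodemask(new, nmask, scratch);
 *	task_unlock(current);
 */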

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

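/* MPOL_DEFAULT carries no nodemask, so a cpuset rebind is a no-op. */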
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

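/*
 * Rebind a bind/interleave policy when the allowed nodes change:
 * intersect with the user's static mask, remap a relative mask, or
 * remap 1:1 from the old cpuset mems to the new one, falling back to the
 * full new mask if the result would be empty.
 */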
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->v.nodes = tmp;
}

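/*
 * Rebind an MPOL_PREFERRED policy: re-derive the preferred node from the
 * user's mask where possible, or fall back to local allocation when a
 * statically requested node is no longer allowed.
 */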
static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes)) {
			pol->v.preferred_node = node;
			pol->flags &= ~MPOL_F_LOCAL;
		} else
			pol->flags |= MPOL_F_LOCAL;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (!(pol->flags & MPOL_F_LOCAL)) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm. Takes mm->mmap_lock during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	mmap_write_lock(mm);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		vm_write_begin(vma);
		mpol_rebind_policy(vma->vm_policy, new);
		vm_write_end(vma);
	}
	mmap_write_unlock(mm);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

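/*
 * Private state shared by the queue_pages_*() page table walk callbacks:
 * the target nodemask and MPOL_MF_* flags, the address range being
 * checked, the first vma seen, and the list on which isolated pages are
 * collected for migration.
 */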
struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	unsigned long start;
	unsigned long end;
	struct vm_area_struct *first;
};

/*
 * Check if the page's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is *not* in
 * qp->nmask, i.e. invert the sense of the test.
 */
static inline bool queue_pages_required(struct page *page,
					struct queue_pages *qp)
{
	int nid = page_to_nid(page);
	unsigned long flags = qp->flags;

	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}

/*
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - the pmd is a migration entry, or only MPOL_MF_STRICT was specified
 *        and an existing page was already on a node that does not follow
 *        the policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
	__releases(ptl)
{
	int ret = 0;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		ret = -EIO;
		goto unlock;
	}
	page = pmd_page(*pmd);
	if (is_huge_zero_page(page)) {
		spin_unlock(ptl);
		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
		ret = 2;
		goto out;
	}
	if (!queue_pages_required(page, qp))
		goto unlock;

	flags = qp->flags;
	/* go to thp migration */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma) ||
		    migrate_page_add(page, qp->pagelist, flags)) {
			ret = 1;
			goto unlock;
		}
	} else
		ret = -EIO;
unlock:
	spin_unlock(ptl);
out:
	return ret;
}

/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	int ret;
	bool has_unmovable = false;
	pte_t *pte, *mapped_pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
		if (ret != 2)
			return ret;
	}
	/* THP was split, fall through to pte walk */

	if (pmd_trans_unstable(pmd))
		return 0;

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		if (!queue_pages_required(page, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/* MPOL_MF_STRICT must be specified if we get here */
			if (!vma_migratable(vma)) {
				has_unmovable = true;
				break;
			}

			/*
			 * Do not abort immediately since there may be
			 * temporarily off-LRU pages in the range. We still
			 * need to migrate the other LRU pages.
			 */
			if (migrate_page_add(page, qp->pagelist, flags))
				has_unmovable = true;
		} else
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();

	if (has_unmovable)
		return 1;

	return addr != end ? -EIO : 0;
}

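/*
 * Hugetlb analogue of queue_pages_pte_range(): isolate a misplaced huge
 * page for migration where allowed. The return values follow the same
 * convention (0, 1, or -EIO) as described above.
 */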
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
	int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = (qp->flags & MPOL_MF_VALID);
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	if (!queue_pages_required(page, qp))
		goto unlock;

	if (flags == MPOL_MF_STRICT) {
		/*
		 * STRICT alone means we only detect misplaced pages; there
		 * is no need to check any further vmas.
		 */
		ret = -EIO;
		goto unlock;
	}

	if (!vma_migratable(walk->vma)) {
		/*
		 * Must be STRICT with MOVE*, otherwise .test_walk() would
		 * have stopped walking the current vma.
		 * A misplaced page was detected, but still allow migrating
		 * pages which have already been queued.
		 */
		ret = 1;
		goto unlock;
	}

	/* With MPOL_MF_MOVE, we migrate only unshared hugepages. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
		if (!isolate_huge_page(page, qp->pagelist) &&
		    (flags & MPOL_MF_STRICT))
			/*
			 * Failed to isolate the page, but allow migrating
			 * pages which have already been queued.
			 */
			ret = 1;
	}
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return ret;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated;

	vm_write_begin(vma);
	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
	vm_write_end(vma);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

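/*
 * Decide whether walk_page_range() should look inside this vma at all:
 * return 0 to scan it with the pmd/hugetlb callbacks above, 1 to skip it,
 * or a negative errno to abort the whole walk (e.g. on a hole in the
 * range when MPOL_MF_DISCONTIG_OK is not set).
 */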
static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long endvma = vma->vm_end;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need to check MPOL_MF_STRICT here: when it is set we must still
	 * walk the vma so that -EIO can be returned for misplaced pages,
	 * regardless of whether the vma is migratable.
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	if (endvma > end)
		endvma = end;

	if (flags & MPOL_MF_LAZY) {
		/* Similar to task_numa_work, skip inaccessible VMAs */
		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
			!(vma->vm_flags & VM_MIXEDMAP))
			change_prot_numa(vma, start, endvma);
		return 1;
	}

	/* queue pages from current vma */
	if (flags & MPOL_MF_VALID)
		return 0;
	return 1;
}

static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry = queue_pages_hugetlb,
	.pmd_entry = queue_pages_pte_range,
	.test_walk = queue_pages_test_walk,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags), they are isolated and queued on the pagelist passed
 * via @pagelist.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - pages queued successfully, or no misplaced page was found.
 * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or
 *         the memory range specified by nodemask and maxnode points outside
 *         your accessible address space (-EFAULT)
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
	};

	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);

	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) * Apply policy to a single VMA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) * This must be called with the mmap_lock held for writing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) static int vma_replace_policy(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) struct mempolicy *pol)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) struct mempolicy *old;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) struct mempolicy *new;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) vma->vm_start, vma->vm_end, vma->vm_pgoff,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) vma->vm_ops, vma->vm_file,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) vma->vm_ops ? vma->vm_ops->set_policy : NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) new = mpol_dup(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) if (IS_ERR(new))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) return PTR_ERR(new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) vm_write_begin(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) if (vma->vm_ops && vma->vm_ops->set_policy) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) err = vma->vm_ops->set_policy(vma, new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) goto err_out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) old = vma->vm_policy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) * The speculative page fault handler accesses this field without
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) * holding the mmap_lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) WRITE_ONCE(vma->vm_policy, new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) vm_write_end(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) mpol_put(old);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) err_out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) vm_write_end(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) mpol_put(new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) /* Step 2: apply policy to a range and do splits. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) static int mbind_range(struct mm_struct *mm, unsigned long start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) unsigned long end, struct mempolicy *new_pol)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) struct vm_area_struct *prev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) struct vm_area_struct *vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) int err = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) pgoff_t pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) unsigned long vmstart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) unsigned long vmend;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) vma = find_vma(mm, start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) VM_BUG_ON(!vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) prev = vma->vm_prev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) if (start > vma->vm_start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) prev = vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) vmstart = max(start, vma->vm_start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) vmend = min(end, vma->vm_end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) if (mpol_equal(vma_policy(vma), new_pol))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) pgoff = vma->vm_pgoff +
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) ((vmstart - vma->vm_start) >> PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) vma->anon_vma, vma->vm_file, pgoff,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) new_pol, vma->vm_userfaultfd_ctx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) vma_get_anon_name(vma));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) if (prev) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) vma = prev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) goto replace;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) if (vma->vm_start != vmstart) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) err = split_vma(vma->vm_mm, vma, vmstart, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) if (vma->vm_end != vmend) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) err = split_vma(vma->vm_mm, vma, vmend, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) replace:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) err = vma_replace_policy(vma, new_pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) }
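/*
 * Worked example (illustrative only, with hypothetical addresses): if a
 * single VMA spans [0x400000, 0x800000) and mbind() applies a new policy
 * to [0x500000, 0x600000) that vma_merge() cannot fold into a neighbour,
 * mbind_range() splits the VMA at 0x500000, splits again at 0x600000,
 * and vma_replace_policy() then installs the policy only on the middle
 * VMA covering [0x500000, 0x600000).
 */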
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) /* Set the process memory policy */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) static long do_set_mempolicy(unsigned short mode, unsigned short flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) nodemask_t *nodes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) struct mempolicy *new, *old;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) NODEMASK_SCRATCH(scratch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) if (!scratch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) new = mpol_new(mode, flags, nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) if (IS_ERR(new)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) ret = PTR_ERR(new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) ret = mpol_set_nodemask(new, nodes, scratch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) mpol_put(new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) task_lock(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) old = current->mempolicy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) current->mempolicy = new;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) if (new && new->mode == MPOL_INTERLEAVE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) current->il_prev = MAX_NUMNODES-1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) task_unlock(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) mpol_put(old);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) NODEMASK_SCRATCH_FREE(scratch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) * Return the nodemask of a policy for a get_mempolicy() query
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) * Called with task's alloc_lock held
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) nodes_clear(*nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) if (p == &default_policy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) switch (p->mode) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) case MPOL_BIND:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) case MPOL_INTERLEAVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) *nodes = p->v.nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) case MPOL_PREFERRED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) if (!(p->flags & MPOL_F_LOCAL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) node_set(p->v.preferred_node, *nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) /* else return empty node mask for local allocation */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) static int lookup_node(struct mm_struct *mm, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) struct page *p = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) int locked = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) if (err > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) err = page_to_nid(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) put_page(p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) if (locked)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) /* Retrieve NUMA policy */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) static long do_get_mempolicy(int *policy, nodemask_t *nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) unsigned long addr, unsigned long flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) struct mm_struct *mm = current->mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) struct vm_area_struct *vma = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) if (flags &
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) if (flags & MPOL_F_MEMS_ALLOWED) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) *policy = 0; /* just so it's initialized */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) task_lock(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) *nmask = cpuset_current_mems_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) task_unlock(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) if (flags & MPOL_F_ADDR) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) * Do NOT fall back to task policy if the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) * vma/shared policy at addr is NULL. We
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) * want to return MPOL_DEFAULT in this case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) mmap_read_lock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) vma = find_vma_intersection(mm, addr, addr+1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) if (!vma) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) if (vma->vm_ops && vma->vm_ops->get_policy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) pol = vma->vm_ops->get_policy(vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) pol = vma->vm_policy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) } else if (addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) if (!pol)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) pol = &default_policy; /* indicates default behavior */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) if (flags & MPOL_F_NODE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) if (flags & MPOL_F_ADDR) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) * Take a refcount on the mpol, lookup_node()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) * will drop the mmap_lock, so after calling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) * lookup_node() only "pol" remains valid, "vma"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) * is stale.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) pol_refcount = pol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) vma = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) mpol_get(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) err = lookup_node(mm, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) if (err < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) *policy = err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) } else if (pol == current->mempolicy &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) pol->mode == MPOL_INTERLEAVE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) *policy = next_node_in(current->il_prev, pol->v.nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) err = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) *policy = pol == &default_policy ? MPOL_DEFAULT :
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) pol->mode;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) * Internal mempolicy flags must be masked off before exposing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) * the policy to userspace.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) *policy |= (pol->flags & MPOL_MODE_FLAGS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) err = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) if (nmask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) if (mpol_store_user_nodemask(pol)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) *nmask = pol->w.user_nodemask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) task_lock(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) get_policy_nodemask(pol, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) task_unlock(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) mpol_cond_put(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) if (vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) if (pol_refcount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) mpol_put(pol_refcount);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) #ifdef CONFIG_MIGRATION
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) * page migration, thp tail pages can be passed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) static int migrate_page_add(struct page *page, struct list_head *pagelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) unsigned long flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) struct page *head = compound_head(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) * Avoid migrating a page that is shared with others.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) if (!isolate_lru_page(head)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) list_add_tail(&head->lru, pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) mod_node_page_state(page_pgdat(head),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) NR_ISOLATED_ANON + page_is_file_lru(head),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) thp_nr_pages(head));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) } else if (flags & MPOL_MF_STRICT) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) * A non-movable page may reach here.  There may also be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) * pages temporarily off the LRU, or non-LRU movable pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) * Treat them as unmovable pages since they can't be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) * isolated, so they can't be moved at the moment, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) * return -EIO for this case too.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) * Migrate pages from one node to a target node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) * Returns error or the number of pages not migrated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) static int migrate_to_node(struct mm_struct *mm, int source, int dest,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) nodemask_t nmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) LIST_HEAD(pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) int err = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) struct migration_target_control mtc = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) .nid = dest,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) nodes_clear(nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) node_set(source, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) * This does not "check" the range but isolates all pages that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) * need migration. Between passing in the full user address
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) flags | MPOL_MF_DISCONTIG_OK, &pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) if (!list_empty(&pagelist)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) err = migrate_pages(&pagelist, alloc_migration_target, NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) putback_movable_pages(&pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) * Move pages between the two nodesets so as to preserve the physical
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) * layout as much as possible.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) * Returns the number of pages that could not be moved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) const nodemask_t *to, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) int busy = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) int err = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) nodemask_t tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) lru_cache_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) mmap_read_lock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) * bit in 'tmp', and return that <source, dest> pair for migration.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) * The pair of nodemasks 'to' and 'from' define the map.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) * If no pair of bits is found that way, fallback to picking some
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) * pair of 'source' and 'dest' bits that are not the same. If the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) * 'source' and 'dest' bits are the same, this represents a node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) * that will be migrating to itself, so no pages need move.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) * If no bits are left in 'tmp', or if all remaining bits left
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) * in 'tmp' correspond to the same bit in 'to', return false
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) * (nothing left to migrate).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) * This lets us pick a pair of nodes to migrate between, such that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) * if possible the dest node is not already occupied by some other
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) * source node, minimizing the risk of overloading the memory on a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) * node that would happen if we migrated incoming memory to a node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) * before migrating outgoing memory sourced from that same node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) * A single scan of tmp is sufficient. As we go, we remember the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) * most recent <s, d> pair that moved (s != d). If we find a pair
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) * that not only moved, but what's better, moved to an empty slot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) * (d is not set in tmp), then we break out then, with that pair.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) * Otherwise, when we finish scanning tmp, we at least have the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) * most recent <s, d> pair that moved. If we get all the way through
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) * the scan of tmp without finding any node that moved, much less
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) * moved to an empty node, then there is nothing left worth migrating.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) tmp = *from;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) while (!nodes_empty(tmp)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) int s, d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) int source = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) int dest = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) for_each_node_mask(s, tmp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) * do_migrate_pages() tries to maintain the relative
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) * node relationship of the pages established between
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) * threads and memory areas.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) * However if the number of source nodes is not equal to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) * the number of destination nodes we can not preserve
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) * this node relative relationship. In that case, skip
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) * copying memory from a node that is in the destination
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) * mask.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) * Example: [2,3,4] -> [3,4,5] moves everything.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) * [0-7] -> [3,4,5] moves only 0,1,2,6,7.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) if ((nodes_weight(*from) != nodes_weight(*to)) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) (node_isset(s, *to)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) d = node_remap(s, *from, *to);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) if (s == d)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) source = s; /* Node moved. Memorize */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) dest = d;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) /* dest not in remaining from nodes? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) if (!node_isset(dest, tmp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) if (source == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) node_clear(source, tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) err = migrate_to_node(mm, source, dest, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) if (err > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) busy += err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) if (err < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) mmap_read_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) lru_cache_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) if (err < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) return busy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) }
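/*
 * Worked example of the pairing logic above (illustrative only): with
 * from = {0-7} and to = {3,4,5}, nodes 3, 4 and 5 are skipped because
 * the weights differ and those nodes already sit in 'to'.  node_remap()
 * then maps 0->3, 1->4, 2->5, 6->3 and 7->4.  No destination in this
 * example drops out of 'tmp' early, so each pass keeps the most recent
 * (i.e. highest numbered) source that moved, and the transfers are
 * issued as 7->4, 6->3, 2->5, 1->4, 0->3 before the scan finds nothing
 * left to move.
 */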
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) * Allocate a new page for page migration based on vma policy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) * Start by assuming the page is mapped by the same vma that contains @start.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) * If not, search forward from there.  N.B., this assumes that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) * list of pages handed to migrate_pages()--which is how we get here--
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) * is in virtual address order.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) static struct page *new_page(struct page *page, unsigned long start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) struct vm_area_struct *vma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) unsigned long address;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) vma = find_vma(current->mm, start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) while (vma) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) address = page_address_in_vma(page, vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) if (address != -EFAULT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) vma = vma->vm_next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) if (PageHuge(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) return alloc_huge_page_vma(page_hstate(compound_head(page)),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) } else if (PageTransHuge(page)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) struct page *thp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) HPAGE_PMD_ORDER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) if (!thp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) prep_transhuge_page(thp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) return thp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) * if !vma, alloc_page_vma() will use task or system default policy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) vma, address);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) static int migrate_page_add(struct page *page, struct list_head *pagelist,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) unsigned long flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) const nodemask_t *to, int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) return -ENOSYS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) static struct page *new_page(struct page *page, unsigned long start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) static long do_mbind(unsigned long start, unsigned long len,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) unsigned short mode, unsigned short mode_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) nodemask_t *nmask, unsigned long flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) struct mm_struct *mm = current->mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) struct mempolicy *new;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) unsigned long end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) LIST_HEAD(pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) if (flags & ~(unsigned long)MPOL_MF_VALID)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) return -EPERM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) if (start & ~PAGE_MASK)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) if (mode == MPOL_DEFAULT)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) flags &= ~MPOL_MF_STRICT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) len = (len + PAGE_SIZE - 1) & PAGE_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) end = start + len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) if (end < start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) if (end == start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) new = mpol_new(mode, mode_flags, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) if (IS_ERR(new))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) return PTR_ERR(new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) if (flags & MPOL_MF_LAZY)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) new->flags |= MPOL_F_MOF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) * If we are using the default policy, then operation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) * on discontinuous address spaces is okay after all.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) if (!new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) flags |= MPOL_MF_DISCONTIG_OK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) start, start + len, mode, mode_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) lru_cache_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) NODEMASK_SCRATCH(scratch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) if (scratch) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) mmap_write_lock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) err = mpol_set_nodemask(new, nmask, scratch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) mmap_write_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) err = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) NODEMASK_SCRATCH_FREE(scratch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) goto mpol_out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) ret = queue_pages_range(mm, start, end, nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) flags | MPOL_MF_INVERT, &pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) if (ret < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) err = ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) goto up_out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) err = mbind_range(mm, start, end, new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) if (!err) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) int nr_failed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) if (!list_empty(&pagelist)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) WARN_ON_ONCE(flags & MPOL_MF_LAZY);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) nr_failed = migrate_pages(&pagelist, new_page, NULL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) if (nr_failed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) putback_movable_pages(&pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) err = -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) up_out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) if (!list_empty(&pagelist))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) putback_movable_pages(&pagelist);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) mmap_write_unlock(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) mpol_out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) mpol_put(new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) lru_cache_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) * User space interface with variable sized bitmaps for nodelists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) /* Copy a node mask from user space. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) unsigned long maxnode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) unsigned long k;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) unsigned long t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) unsigned long nlongs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) unsigned long endmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) --maxnode;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) nodes_clear(*nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) if (maxnode == 0 || !nmask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) nlongs = BITS_TO_LONGS(maxnode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) if ((maxnode % BITS_PER_LONG) == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) endmask = ~0UL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) * When the user specifies more nodes than supported, just check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) * that the unsupported part is all zero.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) * If maxnode covers more longs than MAX_NUMNODES, check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) * the bits in that area first, and then go through the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) * remaining bits that are equal to or bigger than MAX_NUMNODES.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) * Otherwise, just check bits [MAX_NUMNODES, maxnode).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) if (get_user(t, nmask + k))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) if (k == nlongs - 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) if (t & endmask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) } else if (t)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) nlongs = BITS_TO_LONGS(MAX_NUMNODES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) endmask = ~0UL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) unsigned long valid_mask = endmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) if (get_user(t, nmask + nlongs - 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) if (t & valid_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) nodes_addr(*nodes)[nlongs-1] &= endmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) }
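/*
 * Worked example (illustrative only, assuming 64-bit longs and a kernel
 * built with MAX_NUMNODES == 1024): a caller passing maxnode == 131
 * describes 130 significant bits after the decrement above, so
 * nlongs == BITS_TO_LONGS(130) == 3 and endmask == (1UL << 2) - 1 == 0x3.
 * Neither oversized-mask check fires, so three longs are copied from
 * userspace and only the low two bits of the last one are kept.
 */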
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) /* Copy a kernel node mask to user space */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) nodemask_t *nodes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) unsigned long copy = ALIGN(maxnode-1, 64) / 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) if (copy > nbytes) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) if (copy > PAGE_SIZE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) if (clear_user((char __user *)mask + nbytes, copy - nbytes))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) copy = nbytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) static long kernel_mbind(unsigned long start, unsigned long len,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) unsigned long mode, const unsigned long __user *nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) unsigned long maxnode, unsigned int flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) nodemask_t nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) unsigned short mode_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) start = untagged_addr(start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) mode_flags = mode & MPOL_MODE_FLAGS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) mode &= ~MPOL_MODE_FLAGS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) if (mode >= MPOL_MAX)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) if ((mode_flags & MPOL_F_STATIC_NODES) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) (mode_flags & MPOL_F_RELATIVE_NODES))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) err = get_nodes(&nodes, nmask, maxnode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) return do_mbind(start, len, mode, mode_flags, &nodes, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) unsigned long, mode, const unsigned long __user *, nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) unsigned long, maxnode, unsigned int, flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) return kernel_mbind(start, len, mode, nmask, maxnode, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) }
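/*
 * Illustrative sketch, not part of the kernel build: roughly how a
 * userspace program might reach the mbind() entry point above.  It
 * assumes libnuma's <numaif.h> declarations (link with -lnuma) and a
 * machine where node 0 exists; the 4 MiB anonymous mapping is an
 * arbitrary example.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 4UL << 20;
 *		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		unsigned long nodemask = 1UL << 0;	// node 0 only
 *
 *		if (buf == MAP_FAILED)
 *			return 1;
 *		// Bind the range to node 0; MPOL_MF_MOVE migrates pages
 *		// already faulted in, MPOL_MF_STRICT makes failures fatal.
 *		if (mbind(buf, len, MPOL_BIND, &nodemask,
 *			  sizeof(nodemask) * 8, MPOL_MF_MOVE | MPOL_MF_STRICT))
 *			perror("mbind");
 *		return 0;
 *	}
 */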
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) /* Set the process memory policy */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) unsigned long maxnode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) nodemask_t nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) unsigned short flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) flags = mode & MPOL_MODE_FLAGS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) mode &= ~MPOL_MODE_FLAGS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) if ((unsigned int)mode >= MPOL_MAX)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) err = get_nodes(&nodes, nmask, maxnode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) return do_set_mempolicy(mode, flags, &nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) unsigned long, maxnode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) return kernel_set_mempolicy(mode, nmask, maxnode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) }
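/*
 * Illustrative sketch, not part of the kernel build: a minimal userspace
 * caller of set_mempolicy(2), which ends up in do_set_mempolicy() above.
 * Assumes libnuma's <numaif.h> (link with -lnuma) and a machine that
 * actually has nodes 0 and 1.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		// Interleave all future allocations of this task across
 *		// nodes 0 and 1.
 *		unsigned long nodemask = (1UL << 0) | (1UL << 1);
 *
 *		if (set_mempolicy(MPOL_INTERLEAVE, &nodemask,
 *				  sizeof(nodemask) * 8))
 *			perror("set_mempolicy");
 *		return 0;
 *	}
 */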
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) const unsigned long __user *old_nodes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) const unsigned long __user *new_nodes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) struct mm_struct *mm = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) struct task_struct *task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) nodemask_t task_nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) nodemask_t *old;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) nodemask_t *new;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) NODEMASK_SCRATCH(scratch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) if (!scratch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) old = &scratch->mask1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) new = &scratch->mask2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) err = get_nodes(old, old_nodes, maxnode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) err = get_nodes(new, new_nodes, maxnode);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) /* Find the mm_struct */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) task = pid ? find_task_by_vpid(pid) : current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) if (!task) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) err = -ESRCH;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) get_task_struct(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) err = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) * Check if this process has the right to modify the specified process.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) * Use the regular "ptrace_may_access()" checks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) err = -EPERM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) goto out_put;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) task_nodes = cpuset_mems_allowed(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) /* Is the user allowed to access the target nodes? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) err = -EPERM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) goto out_put;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) task_nodes = cpuset_mems_allowed(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) nodes_and(*new, *new, task_nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) if (nodes_empty(*new))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) goto out_put;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) err = security_task_movememory(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) goto out_put;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) mm = get_task_mm(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) put_task_struct(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) if (!mm) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) err = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) err = do_migrate_pages(mm, old, new,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) mmput(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) NODEMASK_SCRATCH_FREE(scratch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) out_put:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) put_task_struct(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) const unsigned long __user *, old_nodes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) const unsigned long __user *, new_nodes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) }
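/*
 * Illustrative sketch, not part of the kernel build: a userspace caller
 * of migrate_pages(2) as handled by kernel_migrate_pages() above.
 * Assumes libnuma's <numaif.h> (link with -lnuma); node numbers 0 and 1
 * are hypothetical.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long old_nodes = 1UL << 0;	// move from node 0
 *		unsigned long new_nodes = 1UL << 1;	// move to node 1
 *		// pid 0 means the calling task; the return value is the
 *		// number of pages that could not be moved, or -1 on error.
 *		long ret = migrate_pages(0, sizeof(old_nodes) * 8,
 *					 &old_nodes, &new_nodes);
 *
 *		if (ret < 0)
 *			perror("migrate_pages");
 *		else
 *			printf("pages not moved: %ld\n", ret);
 *		return 0;
 *	}
 */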
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) /* Retrieve NUMA policy */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) static int kernel_get_mempolicy(int __user *policy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) unsigned long __user *nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) unsigned long maxnode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) unsigned long addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) unsigned long flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) int pval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) nodemask_t nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) if (nmask != NULL && maxnode < nr_node_ids)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) addr = untagged_addr(addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) err = do_get_mempolicy(&pval, &nodes, addr, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) if (policy && put_user(pval, policy))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) if (nmask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) err = copy_nodes_to_user(nmask, maxnode, &nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) unsigned long __user *, nmask, unsigned long, maxnode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) unsigned long, addr, unsigned long, flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) }
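
/*
 * Illustrative sketch (not part of this file): querying the effective
 * policy at a given address from userspace with MPOL_F_ADDR, using the
 * get_mempolicy() wrapper and MPOL_* constants from <numaif.h> (libnuma
 * assumed). A single unsigned long nodemask covers up to 64 nodes here.
 *
 *   #include <numaif.h>
 *   #include <stdio.h>
 *   #include <stdlib.h>
 *
 *   int main(void)
 *   {
 *           int mode;
 *           unsigned long nodes = 0;
 *           void *p = malloc(4096);
 *
 *           // mode receives MPOL_DEFAULT/PREFERRED/BIND/INTERLEAVE,
 *           // nodes receives the policy's nodemask for the VMA of p
 *           if (get_mempolicy(&mode, &nodes, 8 * sizeof(nodes),
 *                             p, MPOL_F_ADDR) == 0)
 *                   printf("mode=%d nodemask=%#lx\n", mode, nodes);
 *           else
 *                   perror("get_mempolicy");
 *           free(p);
 *           return 0;
 *   }
 */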
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) #ifdef CONFIG_COMPAT
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) compat_ulong_t __user *, nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) compat_ulong_t, maxnode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) compat_ulong_t, addr, compat_ulong_t, flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) long err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) unsigned long __user *nm = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) unsigned long nr_bits, alloc_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) DECLARE_BITMAP(bm, MAX_NUMNODES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) if (nmask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) nm = compat_alloc_user_space(alloc_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) if (!err && nmask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) unsigned long copy_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) err = copy_from_user(bm, nm, copy_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) /* ensure entire bitmap is zeroed */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) err |= compat_put_bitmap(nmask, bm, nr_bits);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) compat_ulong_t, maxnode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) unsigned long __user *nm = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) unsigned long nr_bits, alloc_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) DECLARE_BITMAP(bm, MAX_NUMNODES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) if (nmask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) if (compat_get_bitmap(bm, nmask, nr_bits))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) nm = compat_alloc_user_space(alloc_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) if (copy_to_user(nm, bm, alloc_size))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) return kernel_set_mempolicy(mode, nm, nr_bits+1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) compat_ulong_t, mode, compat_ulong_t __user *, nmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) compat_ulong_t, maxnode, compat_ulong_t, flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) unsigned long __user *nm = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) unsigned long nr_bits, alloc_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) nodemask_t bm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) if (nmask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) nm = compat_alloc_user_space(alloc_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) if (copy_to_user(nm, nodes_addr(bm), alloc_size))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) compat_ulong_t, maxnode,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) const compat_ulong_t __user *, old_nodes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) const compat_ulong_t __user *, new_nodes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) unsigned long __user *old = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) unsigned long __user *new = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) nodemask_t tmp_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) unsigned long nr_bits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) unsigned long size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) if (old_nodes) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) old = compat_alloc_user_space(new_nodes ? size * 2 : size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) if (new_nodes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) new = old + size / sizeof(unsigned long);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) if (copy_to_user(old, nodes_addr(tmp_mask), size))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) if (new_nodes) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) if (new == NULL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) new = compat_alloc_user_space(size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) if (copy_to_user(new, nodes_addr(tmp_mask), size))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) return -EFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) return kernel_migrate_pages(pid, nr_bits + 1, old, new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) #endif /* CONFIG_COMPAT */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) bool vma_migratable(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) if (vma->vm_flags & (VM_IO | VM_PFNMAP))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) * DAX device mappings require predictable access latency, so avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) * incurring periodic faults.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) if (vma_is_dax(vma))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) if (is_vm_hugetlb_page(vma) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) !hugepage_migration_supported(hstate_vma(vma)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) * Migration allocates pages in the highest zone. If we cannot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) * do so then migration (at least from node to node) is not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) * possible.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) if (vma->vm_file &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) < policy_zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) struct mempolicy *pol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) if (!vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) if (vma->vm_ops && vma->vm_ops->get_policy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) return vma->vm_ops->get_policy(vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) * This could be called without holding the mmap_lock in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) * speculative page fault handler's path.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) pol = READ_ONCE(vma->vm_policy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) if (pol) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) * shmem_alloc_page() passes MPOL_F_SHARED policy with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) * a pseudo vma whose vma->vm_ops=NULL. Take a reference
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) * count on these policies which will be dropped by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) * mpol_cond_put() later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) if (mpol_needs_cond_ref(pol))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) mpol_get(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) return pol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) * get_vma_policy(@vma, @addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) * @vma: virtual memory area whose policy is sought
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) * @addr: address in @vma for shared policy lookup
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) * Returns effective policy for a VMA at specified address.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) * Falls back to current->mempolicy or system default policy, as necessary.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) * count--added by the get_policy() vm_op, as appropriate--to protect against
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) * freeing by another task. It is the caller's responsibility to free the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) * extra reference for shared policies.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) struct mempolicy *pol = __get_vma_policy(vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) if (!pol)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) pol = get_task_policy(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) return pol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) bool vma_policy_mof(struct vm_area_struct *vma)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) struct mempolicy *pol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) if (vma->vm_ops && vma->vm_ops->get_policy) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) bool ret = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) pol = vma->vm_ops->get_policy(vma, vma->vm_start);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) if (pol && (pol->flags & MPOL_F_MOF))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) mpol_cond_put(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) pol = vma->vm_policy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) if (!pol)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) pol = get_task_policy(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) return pol->flags & MPOL_F_MOF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) enum zone_type dynamic_policy_zone = policy_zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) * If policy->v.nodes has movable memory only,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) * we apply the policy only when gfp_zone(gfp) is ZONE_MOVABLE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) * policy->v.nodes is intersected with node_states[N_MEMORY],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) * so if the following test fails, it implies
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) * policy->v.nodes has movable memory only.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) dynamic_policy_zone = ZONE_MOVABLE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) return zone >= dynamic_policy_zone;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) * Return a nodemask representing a mempolicy for filtering nodes for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) * page allocation
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885) nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) /* Lower zones don't get a nodemask applied for MPOL_BIND */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) if (unlikely(policy->mode == MPOL_BIND) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889) apply_policy_zone(policy, gfp_zone(gfp)) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) return &policy->v.nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) /* Return the node id preferred by the given mempolicy, or the given id */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899) if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) nd = policy->v.preferred_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) * __GFP_THISNODE shouldn't even be used with the bind policy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) * because we might easily break the expectation to stay on the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) * requested node and not break the policy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) return nd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) /* Do dynamic interleaving for a process */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) static unsigned interleave_nodes(struct mempolicy *policy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) unsigned next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) struct task_struct *me = current;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) next = next_node_in(me->il_prev, policy->v.nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) if (next < MAX_NUMNODES)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) me->il_prev = next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) return next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) * Depending on the memory policy provide a node from which to allocate the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) * next slab entry.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) unsigned int mempolicy_slab_node(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) struct mempolicy *policy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) int node = numa_mem_id();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) if (in_interrupt())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) return node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) policy = current->mempolicy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) if (!policy || policy->flags & MPOL_F_LOCAL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) return node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) switch (policy->mode) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) case MPOL_PREFERRED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) * handled MPOL_F_LOCAL above
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) return policy->v.preferred_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) case MPOL_INTERLEAVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) return interleave_nodes(policy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) case MPOL_BIND: {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) * Follow bind policy behavior and start allocation at the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) * first node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) struct zonelist *zonelist;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) z = first_zones_zonelist(zonelist, highest_zoneidx,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) &policy->v.nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) return z->zone ? zone_to_nid(z->zone) : node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) * Do static interleaving for a VMA with known offset @n. Returns the n'th
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) * number of present nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) unsigned nnodes = nodes_weight(pol->v.nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) unsigned target;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) if (!nnodes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) return numa_node_id();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) target = (unsigned int)n % nnodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) nid = first_node(pol->v.nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) for (i = 0; i < target; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) nid = next_node(nid, pol->v.nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) return nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) }
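
/*
 * Standalone sketch (not part of this file) of the calculation above,
 * using a plain 64-bit mask in place of nodemask_t: pick the
 * (n % weight)'th set bit, so a given offset always maps to the same
 * node no matter which CPU faults the page.
 *
 *   #include <stdio.h>
 *
 *   static unsigned int pick_il_node(unsigned long mask, unsigned long n)
 *   {
 *           unsigned int nnodes = __builtin_popcountl(mask);
 *           unsigned int target, i = 0, nid;
 *
 *           if (!nnodes)
 *                   return 0;          // the kernel returns the local node here
 *           target = n % nnodes;
 *           for (nid = 0; ; nid++) {   // walk set bits up to the target'th one
 *                   if (!(mask & (1UL << nid)))
 *                           continue;
 *                   if (i++ == target)
 *                           return nid;
 *           }
 *   }
 *
 *   int main(void)
 *   {
 *           // nodes {0,2,3}: offsets 0..3 map to nodes 0, 2, 3, 0
 *           for (unsigned long n = 0; n < 4; n++)
 *                   printf("%lu -> %u\n", n, pick_il_node(0xd, n));
 *           return 0;
 *   }
 */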
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) /* Determine a node number for interleave */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) static inline unsigned interleave_nid(struct mempolicy *pol,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) struct vm_area_struct *vma, unsigned long addr, int shift)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) if (vma) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) unsigned long off;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) * for small pages, there is no difference between
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) * shift and PAGE_SHIFT, so the bit-shift is safe.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) * for huge pages, since vm_pgoff is in units of small
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) * pages, we need to shift off the always 0 bits to get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) * a useful offset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) BUG_ON(shift < PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) off += (addr - vma->vm_start) >> shift;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) return offset_il_node(pol, off);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) return interleave_nodes(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) }
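
/*
 * Worked example (not from this file) of the offset computation above
 * for a 2MB huge page mapping, assuming a 4K base page (PAGE_SHIFT 12)
 * and shift == 21; the addresses and file offset are made up.
 *
 *   #include <stdio.h>
 *
 *   int main(void)
 *   {
 *           const unsigned int page_shift = 12, shift = 21;
 *           unsigned long vm_pgoff = 0x400;            // 1024 small pages = 4MB into the file
 *           unsigned long vm_start = 0x7f0000000000UL;
 *           unsigned long addr     = 0x7f0000600000UL; // 6MB into the VMA
 *
 *           // drop the always-zero low bits of vm_pgoff, then add the
 *           // huge-page index of addr within the VMA: 2 + 3 = 5
 *           unsigned long off = vm_pgoff >> (shift - page_shift);
 *           off += (addr - vm_start) >> shift;
 *           printf("interleave offset = %lu\n", off);
 *           return 0;
 *   }
 */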
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) #ifdef CONFIG_HUGETLBFS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) * @vma: virtual memory area whose policy is sought
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) * @addr: address in @vma for shared policy lookup and interleave policy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) * @gfp_flags: for requested zone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) * @mpol: pointer to mempolicy pointer for reference counted mempolicy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) * Returns a nid suitable for a huge page allocation and a pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) * to the struct mempolicy for conditional unref after allocation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) * If the effective policy is 'BIND', returns a pointer to the mempolicy's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) * @nodemask for filtering the zonelist.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) * Must be protected by read_mems_allowed_begin()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) struct mempolicy **mpol, nodemask_t **nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) *mpol = get_vma_policy(vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) *nodemask = NULL; /* assume !MPOL_BIND */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) nid = interleave_nid(*mpol, vma, addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) huge_page_shift(hstate_vma(vma)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) nid = policy_node(gfp_flags, *mpol, numa_node_id());
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) if ((*mpol)->mode == MPOL_BIND)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) *nodemask = &(*mpol)->v.nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) return nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) }
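
/*
 * Sketch (not from this file) of the calling convention described above,
 * loosely following the hugetlb fault path: the helper
 * alloc_one_huge_page_on() is hypothetical and stands in for whatever
 * actually consumes the nid/nodemask pair.
 *
 *   static struct page *example_alloc_huge(struct vm_area_struct *vma,
 *                                          unsigned long addr, gfp_t gfp)
 *   {
 *           struct mempolicy *mpol;
 *           nodemask_t *nodemask;
 *           struct page *page;
 *           unsigned int cpuset_mems_cookie;
 *           int nid;
 *
 *           do {
 *                   cpuset_mems_cookie = read_mems_allowed_begin();
 *                   nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
 *                   page = alloc_one_huge_page_on(nid, nodemask, gfp);
 *                   mpol_cond_put(mpol);   // drop ref taken for shared policies
 *           } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
 *
 *           return page;
 *   }
 */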
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) * init_nodemask_of_mempolicy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) * If the current task's mempolicy is "default" [NULL], return 'false'
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) * to indicate default policy. Otherwise, extract the policy nodemask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) * for 'bind' or 'interleave' policy into the argument nodemask, or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) * initialize the argument nodemask to contain the single node for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) * 'preferred' or 'local' policy and return 'true' to indicate presence
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) * of non-default mempolicy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) * We don't bother with reference counting the mempolicy [mpol_get/put]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) * because the current task is examining its own mempolicy and a task's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) * mempolicy is only ever changed by the task itself.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) * N.B., it is the caller's responsibility to free a returned nodemask.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) bool init_nodemask_of_mempolicy(nodemask_t *mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) struct mempolicy *mempolicy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) int nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) if (!(mask && current->mempolicy))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) task_lock(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) mempolicy = current->mempolicy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) switch (mempolicy->mode) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) case MPOL_PREFERRED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) if (mempolicy->flags & MPOL_F_LOCAL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) nid = numa_node_id();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) nid = mempolicy->v.preferred_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) init_nodemask_of_node(mask, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) case MPOL_BIND:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) case MPOL_INTERLEAVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) *mask = mempolicy->v.nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) task_unlock(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) * mempolicy_nodemask_intersects
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) * policy. Otherwise, check for intersection between mask and the policy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104) * policy, always return true since it may allocate elsewhere on fallback.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) * Takes task_lock(tsk) to prevent freeing of its mempolicy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108) bool mempolicy_nodemask_intersects(struct task_struct *tsk,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) const nodemask_t *mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111) struct mempolicy *mempolicy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) bool ret = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) if (!mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) task_lock(tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) mempolicy = tsk->mempolicy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) if (!mempolicy)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) switch (mempolicy->mode) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) case MPOL_PREFERRED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) * allocate from; the task may fall back to other nodes when OOM.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) * Thus, it's possible for tsk to have allocated memory from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) * nodes in mask.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) case MPOL_BIND:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) case MPOL_INTERLEAVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) ret = nodes_intersects(mempolicy->v.nodes, *mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) task_unlock(tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) /* Allocate a page in interleaved policy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) Own path because it needs to do special accounting. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) unsigned nid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) page = __alloc_pages(gfp, order, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) if (!static_branch_likely(&vm_numa_stat_key))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) if (page && page_to_nid(page) == nid) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) preempt_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) preempt_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) * alloc_pages_vma - Allocate a page for a VMA.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) * @gfp:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) * %GFP_USER user allocation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) * %GFP_KERNEL kernel allocations,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) * %GFP_HIGHMEM highmem/user allocations,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) * %GFP_FS allocation should not call back into a file system.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) * %GFP_ATOMIC don't sleep.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) * @order: Order of the GFP allocation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) * @vma: Pointer to VMA or NULL if not available.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) * @addr: Virtual Address of the allocation. Must be inside the VMA.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) * @node: Which node to prefer for allocation (modulo policy).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) * @hugepage: for hugepages try only the preferred node if possible
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) * This function allocates a page from the kernel page pool and applies
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) * a NUMA policy associated with the VMA or the current process.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) * When VMA is not NULL caller must read-lock the mmap_lock of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) * mm_struct of the VMA to prevent it from going away. Should be used for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) * all allocations for pages that will be mapped into user space. Returns
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) * NULL when no page can be allocated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) struct page *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) unsigned long addr, int node, bool hugepage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) struct mempolicy *pol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) int preferred_nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) nodemask_t *nmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) pol = get_vma_policy(vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) if (pol->mode == MPOL_INTERLEAVE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) unsigned nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) mpol_cond_put(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) page = alloc_page_interleave(gfp, order, nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) int hpage_node = node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) * For hugepage allocation and non-interleave policy which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) * allows the current node (or other explicitly preferred
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) * node) we only try to allocate from the current/preferred
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) * node and don't fall back to other nodes, as the cost of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) * remote accesses would likely offset THP benefits.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) * If the policy is interleave, or does not allow the current
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) * node in its nodemask, we allocate the standard way.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) hpage_node = pol->v.preferred_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) nmask = policy_nodemask(gfp, pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) if (!nmask || node_isset(hpage_node, *nmask)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) mpol_cond_put(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) * First, try to allocate THP only on local node, but
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) * don't reclaim unnecessarily, just compact.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) page = __alloc_pages_node(hpage_node,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) gfp | __GFP_THISNODE | __GFP_NORETRY, order);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) * If hugepage allocations are configured to always use
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) * synchronous compaction or the vma has been madvised to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) * prefer hugepage backing, retry allowing remote memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) * with both reclaim and compaction as well.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) if (!page && (gfp & __GFP_DIRECT_RECLAIM))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) page = __alloc_pages_nodemask(gfp, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) hpage_node, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) nmask = policy_nodemask(gfp, pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) preferred_nid = policy_node(gfp, pol, node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) mpol_cond_put(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) EXPORT_SYMBOL(alloc_pages_vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) * alloc_pages_current - Allocate pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) * @gfp:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) * %GFP_USER user allocation,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) * %GFP_KERNEL kernel allocation,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) * %GFP_HIGHMEM highmem allocation,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) * %GFP_FS don't call back into a file system.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) * %GFP_ATOMIC don't sleep.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) * @order: Order of the allocation: 2^@order contiguous pages. 0 is a single page.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) * Allocate a page from the kernel page pool. When not in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) * interrupt context, apply the current process' NUMA policy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) * Returns NULL when no page can be allocated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) struct page *alloc_pages_current(gfp_t gfp, unsigned order)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) struct mempolicy *pol = &default_policy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) struct page *page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) if (!in_interrupt() && !(gfp & __GFP_THISNODE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) pol = get_task_policy(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) * No reference counting needed for current->mempolicy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) * nor system default_policy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) if (pol->mode == MPOL_INTERLEAVE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) page = __alloc_pages_nodemask(gfp, order,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) policy_node(gfp, pol, numa_node_id()),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) policy_nodemask(gfp, pol));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) return page;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) EXPORT_SYMBOL(alloc_pages_current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) struct mempolicy *pol = mpol_dup(vma_policy(src));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295) if (IS_ERR(pol))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) return PTR_ERR(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) dst->vm_policy = pol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) * with the mems_allowed returned by cpuset_mems_allowed(). This
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) * keeps mempolicies cpuset relative after its cpuset moves. See
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) * further kernel/cpuset.c update_nodemask().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) * current's mempolicy may be rebound by another task (the task that changes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) * the cpuset's mems), so we needn't do the rebind work for the current task.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) /* Slow path of a mempolicy duplicate */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) struct mempolicy *__mpol_dup(struct mempolicy *old)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) if (!new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) return ERR_PTR(-ENOMEM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) /* task's mempolicy is protected by alloc_lock */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) if (old == current->mempolicy) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) task_lock(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) *new = *old;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) task_unlock(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) *new = *old;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) if (current_cpuset_is_being_rebound()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) nodemask_t mems = cpuset_mems_allowed(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) mpol_rebind_policy(new, &mems);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) atomic_set(&new->refcnt, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) return new;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) /* Slow path of a mempolicy comparison */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) if (!a || !b)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) if (a->mode != b->mode)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) if (a->flags != b->flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) if (mpol_store_user_nodemask(a))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) switch (a->mode) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) case MPOL_BIND:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) case MPOL_INTERLEAVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) return !!nodes_equal(a->v.nodes, b->v.nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) case MPOL_PREFERRED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) /* a's ->flags is the same as b's */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) if (a->flags & MPOL_F_LOCAL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) return a->v.preferred_node == b->v.preferred_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) * Shared memory backing store policy support.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) * Remember policies even when nobody has shared memory mapped.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) * The policies are kept in Red-Black tree linked from the inode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) * They are protected by the sp->lock rwlock, which should be held
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) * for any accesses to the tree.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) * lookup first element intersecting start-end. Caller holds sp->lock for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) * reading or for writing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) static struct sp_node *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) struct rb_node *n = sp->root.rb_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) while (n) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) struct sp_node *p = rb_entry(n, struct sp_node, nd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) if (start >= p->end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) n = n->rb_right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) else if (end <= p->start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) n = n->rb_left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) if (!n)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) struct sp_node *w = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) struct rb_node *prev = rb_prev(n);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) if (!prev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399) w = rb_entry(prev, struct sp_node, nd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400) if (w->end <= start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402) n = prev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) return rb_entry(n, struct sp_node, nd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405) }
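
/*
 * Standalone sketch (not from this file) of the same lookup idea over a
 * sorted array instead of an rbtree: the search may land on any entry
 * intersecting [start, end), so walk backwards while the previous entry
 * still ends after @start to return the first intersection.
 *
 *   #include <stdio.h>
 *
 *   struct range { unsigned long start, end; };   // half-open [start, end)
 *
 *   static int first_intersecting(const struct range *r, int n,
 *                                 unsigned long start, unsigned long end)
 *   {
 *           int lo = 0, hi = n - 1, i = -1;
 *
 *           while (lo <= hi) {                     // like the rbtree descent
 *                   int mid = lo + (hi - lo) / 2;
 *
 *                   if (start >= r[mid].end)
 *                           lo = mid + 1;
 *                   else if (end <= r[mid].start)
 *                           hi = mid - 1;
 *                   else {
 *                           i = mid;               // some intersecting entry
 *                           break;
 *                   }
 *           }
 *           if (i < 0)
 *                   return -1;
 *           while (i > 0 && r[i - 1].end > start)  // walk back like sp_lookup()
 *                   i--;
 *           return i;
 *   }
 *
 *   int main(void)
 *   {
 *           const struct range r[] = { { 0, 4 }, { 4, 8 }, { 8, 12 } };
 *
 *           printf("%d\n", first_intersecting(r, 3, 1, 11));  // prints 0
 *           return 0;
 *   }
 */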
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408) * Insert a new shared policy into the list. Caller holds sp->lock for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) * writing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411) static void sp_insert(struct shared_policy *sp, struct sp_node *new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) struct rb_node **p = &sp->root.rb_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) struct rb_node *parent = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) struct sp_node *nd;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) while (*p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) parent = *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) nd = rb_entry(parent, struct sp_node, nd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) if (new->start < nd->start)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) p = &(*p)->rb_left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) else if (new->end > nd->end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) p = &(*p)->rb_right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) rb_link_node(&new->nd, parent, p);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) rb_insert_color(&new->nd, &sp->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) new->policy ? new->policy->mode : 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) /* Find shared policy intersecting idx */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) struct mempolicy *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) struct mempolicy *pol = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) struct sp_node *sn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) if (!sp->root.rb_node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) read_lock(&sp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) sn = sp_lookup(sp, idx, idx+1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) if (sn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) mpol_get(sn->policy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) pol = sn->policy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) read_unlock(&sp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) return pol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) }
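/*
 * Caller sketch (illustrative; the structure and variable names below are
 * not from this file): a shmem-style ->get_policy() hook would pass the
 * page index of the fault:
 *
 *	pol = mpol_shared_policy_lookup(&info->policy, index);
 *
 * A NULL return means "no shared policy for this offset"; a non-NULL
 * policy has MPOL_F_SHARED set (see sp_alloc()), so a later
 * mpol_cond_put() drops the reference taken here.
 */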
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452) static void sp_free(struct sp_node *n)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) mpol_put(n->policy);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455) kmem_cache_free(sn_cache, n);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459) * mpol_misplaced - check whether current page node is valid in policy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) * @page: page to be checked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462) * @vma: vm area where page mapped
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) * @addr: virtual address where page mapped
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) * Lookup current policy node id for vma,addr and compare it to the page's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466) * node id.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) * Returns:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) * -1 - not misplaced, page is in the right node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470) * node - node id where the page should be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) * Policy determination "mimics" alloc_page_vma().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) * Called from fault path where we know the vma and faulting address.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) struct mempolicy *pol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) struct zoneref *z;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479) int curnid = page_to_nid(page);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) unsigned long pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481) int thiscpu = raw_smp_processor_id();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) int thisnid = cpu_to_node(thiscpu);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) int polnid = NUMA_NO_NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) int ret = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) pol = get_vma_policy(vma, addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487) if (!(pol->flags & MPOL_F_MOF))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) switch (pol->mode) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) case MPOL_INTERLEAVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) pgoff = vma->vm_pgoff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) polnid = offset_il_node(pol, pgoff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497) case MPOL_PREFERRED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498) if (pol->flags & MPOL_F_LOCAL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) polnid = numa_node_id();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) polnid = pol->v.preferred_node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) case MPOL_BIND:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) * MPOL_BIND allows binding to multiple nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) * Use the current page's node if it is in the policy nodemask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) * else select the nearest allowed node, if any.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510) * If there are no allowed nodes, use the current node [!misplaced].
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) if (node_isset(curnid, pol->v.nodes))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) z = first_zones_zonelist(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) node_zonelist(numa_node_id(), GFP_HIGHUSER),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) gfp_zone(GFP_HIGHUSER),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) &pol->v.nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) polnid = zone_to_nid(z->zone);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) /* Migrate the page towards the node whose CPU is referencing it */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526) if (pol->flags & MPOL_F_MORON) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) polnid = thisnid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) if (curnid != polnid)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) ret = polnid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) mpol_cond_put(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539) }
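/*
 * Caller sketch (illustrative; the real NUMA-hinting fault handler lives
 * in mm/memory.c): the return value is either -1, i.e. leave the page
 * where it is, or a target node to migrate towards:
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid == -1)
 *		return;				page is already well placed
 *	... try to migrate the page to target_nid ...
 */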
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) * Drop the (possibly final) reference to task->mempolicy. It needs to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543) * dropped after task->mempolicy is set to NULL so that any allocation done as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) * policy.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) void mpol_put_task_policy(struct task_struct *task)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549) struct mempolicy *pol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551) task_lock(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) pol = task->mempolicy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) task->mempolicy = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554) task_unlock(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) mpol_put(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) static void sp_delete(struct shared_policy *sp, struct sp_node *n)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) pr_debug("deleting %lx-%lx\n", n->start, n->end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561) rb_erase(&n->nd, &sp->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) sp_free(n);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565) static void sp_node_init(struct sp_node *node, unsigned long start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) unsigned long end, struct mempolicy *pol)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) node->start = start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) node->end = end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) node->policy = pol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574) struct mempolicy *pol)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576) struct sp_node *n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) struct mempolicy *newpol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579) n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) if (!n)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) newpol = mpol_dup(pol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) if (IS_ERR(newpol)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585) kmem_cache_free(sn_cache, n);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) newpol->flags |= MPOL_F_SHARED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) sp_node_init(n, start, end, newpol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591) return n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594) /* Replace a policy range. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595) static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) unsigned long end, struct sp_node *new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2597) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2598) struct sp_node *n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599) struct sp_node *n_new = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600) struct mempolicy *mpol_new = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) write_lock(&sp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) n = sp_lookup(sp, start, end);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) /* Take care of old policies in the same range. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) while (n && n->start < end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) struct rb_node *next = rb_next(&n->nd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609) if (n->start >= start) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) if (n->end <= end)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) sp_delete(sp, n);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) n->start = end;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) /* Old policy spanning the whole new range. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616) if (n->end > end) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617) if (!n_new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618) goto alloc_new;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) *mpol_new = *n->policy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) atomic_set(&mpol_new->refcnt, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) sp_node_init(n_new, end, n->end, mpol_new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) n->end = start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) sp_insert(sp, n_new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) n_new = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626) mpol_new = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) n->end = start;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) if (!next)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633) n = rb_entry(next, struct sp_node, nd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635) if (new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636) sp_insert(sp, new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) write_unlock(&sp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) err_out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) if (mpol_new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642) mpol_put(mpol_new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) if (n_new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644) kmem_cache_free(sn_cache, n_new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648) alloc_new:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649) write_unlock(&sp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650) ret = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652) if (!n_new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) goto err_out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655) if (!mpol_new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) goto err_out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657) goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) * mpol_shared_policy_init - initialize shared policy for inode
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) * @sp: pointer to inode shared policy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) * @mpol: struct mempolicy to install
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) * Install non-NULL @mpol in inode's shared policy rb-tree.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666) * On entry, the current task has a reference on a non-NULL @mpol.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) * This must be released on exit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) * This is called at get_inode() time, so we can use GFP_KERNEL.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670) void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) sp->root = RB_ROOT; /* empty tree == default mempolicy */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) rwlock_init(&sp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) if (mpol) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) struct vm_area_struct pvma;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) struct mempolicy *new;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) NODEMASK_SCRATCH(scratch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) if (!scratch)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) goto put_mpol;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) /* contextualize the tmpfs mount point mempolicy */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686) if (IS_ERR(new))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687) goto free_scratch; /* no valid nodemask intersection */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) task_lock(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690) ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691) task_unlock(current);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693) goto put_new;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) /* Create pseudo-vma that contains just the policy */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) vma_init(&pvma, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) pvma.vm_end = TASK_SIZE; /* policy covers entire file */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698) mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700) put_new:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701) mpol_put(new); /* drop initial ref */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) free_scratch:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703) NODEMASK_SCRATCH_FREE(scratch);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704) put_mpol:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) mpol_put(mpol); /* drop our incoming ref on sb mpol */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) }
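/*
 * Caller sketch (illustrative; tmpfs is the in-tree user, and "info" and
 * "sb_mpol" below are made-up names): at inode creation the filesystem
 * hands over its referenced superblock mempolicy, or NULL if it has none:
 *
 *	mpol_shared_policy_init(&info->policy, sb_mpol);
 *
 * Passing NULL only initialises an empty tree, which makes every offset
 * fall back to the vma/task policy.
 */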
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) int mpol_set_shared_policy(struct shared_policy *info,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) struct vm_area_struct *vma, struct mempolicy *npol)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) struct sp_node *new = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714) unsigned long sz = vma_pages(vma);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) vma->vm_pgoff,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) sz, npol ? npol->mode : -1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719) npol ? npol->flags : -1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) if (npol) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724) if (!new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727) err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) if (err && new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) sp_free(new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) /* Free a backing policy store on inode delete. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) void mpol_free_shared_policy(struct shared_policy *p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) struct sp_node *n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) struct rb_node *next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) if (!p->root.rb_node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741) write_lock(&p->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) next = rb_first(&p->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) while (next) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744) n = rb_entry(next, struct sp_node, nd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) next = rb_next(&n->nd);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) sp_delete(p, n);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) write_unlock(&p->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751) #ifdef CONFIG_NUMA_BALANCING
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) static int __initdata numabalancing_override;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754) static void __init check_numabalancing_enable(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) bool numabalancing_default = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) numabalancing_default = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) if (numabalancing_override)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) set_numabalancing_state(numabalancing_override == 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) if (num_online_nodes() > 1 && !numabalancing_override) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) numabalancing_default ? "Enabling" : "Disabling");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768) set_numabalancing_state(numabalancing_default);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) static int __init setup_numabalancing(char *str)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) if (!str)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) if (!strcmp(str, "enable")) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) numabalancing_override = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780) ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) } else if (!strcmp(str, "disable")) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) numabalancing_override = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787) pr_warn("Unable to parse numa_balancing=\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791) __setup("numa_balancing=", setup_numabalancing);
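/*
 * Example (kernel command line): "numa_balancing=enable" or
 * "numa_balancing=disable" forces the state and makes
 * check_numabalancing_enable() skip the config-based default; any other
 * value triggers the warning above and leaves the default in place.
 */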
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792) #else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793) static inline void __init check_numabalancing_enable(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796) #endif /* CONFIG_NUMA_BALANCING */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) /* assumes fs == KERNEL_DS */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799) void __init numa_policy_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) nodemask_t interleave_nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802) unsigned long largest = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803) int nid, prefer = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) policy_cache = kmem_cache_create("numa_policy",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) sizeof(struct mempolicy),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) 0, SLAB_PANIC, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) sn_cache = kmem_cache_create("shared_policy_node",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810) sizeof(struct sp_node),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) 0, SLAB_PANIC, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) for_each_node(nid) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) preferred_node_policy[nid] = (struct mempolicy) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) .refcnt = ATOMIC_INIT(1),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816) .mode = MPOL_PREFERRED,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) .flags = MPOL_F_MOF | MPOL_F_MORON,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) .v = { .preferred_node = nid, },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) * Set interleaving policy for system init. Interleaving is only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824) * enabled across suitably sized nodes (default is >= 16MB), falling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) * back to the largest node if they're all smaller.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) nodes_clear(interleave_nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) for_each_node_state(nid, N_MEMORY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829) unsigned long total_pages = node_present_pages(nid);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) /* Preserve the largest node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832) if (largest < total_pages) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) largest = total_pages;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) prefer = nid;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) /* Interleave this node? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) if ((total_pages << PAGE_SHIFT) >= (16 << 20))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) node_set(nid, interleave_nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) /* All too small, use the largest */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) if (unlikely(nodes_empty(interleave_nodes)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) node_set(prefer, interleave_nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847) pr_err("%s: interleaving failed\n", __func__);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849) check_numabalancing_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) /* Reset policy of current process to default */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) void numa_default_policy(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) * Parse and format mempolicy from/to strings
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) * "local" is implemented internally by MPOL_PREFERRED with the MPOL_F_LOCAL flag.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865) static const char * const policy_modes[] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) [MPOL_DEFAULT] = "default",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) [MPOL_PREFERRED] = "prefer",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869) [MPOL_BIND] = "bind",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) [MPOL_INTERLEAVE] = "interleave",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) [MPOL_LOCAL] = "local",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875) #ifdef CONFIG_TMPFS
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878) * @str: string containing mempolicy to parse
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879) * @mpol: pointer to struct mempolicy pointer, returned on success.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) * Format of input:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882) * <mode>[=<flags>][:<nodelist>]
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884) * Returns 0 on success, 1 on failure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2885) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2886) int mpol_parse_str(char *str, struct mempolicy **mpol)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2887) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2888) struct mempolicy *new = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889) unsigned short mode_flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) nodemask_t nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) char *nodelist = strchr(str, ':');
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) char *flags = strchr(str, '=');
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) int err = 1, mode;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) if (flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) *flags++ = '\0'; /* terminate mode string */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) if (nodelist) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) /* NUL-terminate mode or flags string */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900) *nodelist++ = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901) if (nodelist_parse(nodelist, nodes))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2903) if (!nodes_subset(nodes, node_states[N_MEMORY]))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905) } else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) nodes_clear(nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908) mode = match_string(policy_modes, MPOL_MAX, str);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) if (mode < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) switch (mode) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913) case MPOL_PREFERRED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) * Insist on a nodelist of one node only; later we use first_node(nodes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) * to grab that single node, so the nodelist (and hence nodes) must not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917) * be empty here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) if (nodelist) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920) char *rest = nodelist;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921) while (isdigit(*rest))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) rest++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) if (*rest)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) if (nodes_empty(nodes))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) case MPOL_INTERLEAVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) * Default to online nodes with memory if no nodelist
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) if (!nodelist)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934) nodes = node_states[N_MEMORY];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936) case MPOL_LOCAL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938) * Don't allow a nodelist; mpol_new() checks flags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) if (nodelist)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) mode = MPOL_PREFERRED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) case MPOL_DEFAULT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946) * Insist on an empty nodelist
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) if (!nodelist)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949) err = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) case MPOL_BIND:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) * Insist on a nodelist
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955) if (!nodelist)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) mode_flags = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960) if (flags) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962) * Currently, we only support two mutually exclusive
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) * mode flags.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) if (!strcmp(flags, "static"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) mode_flags |= MPOL_F_STATIC_NODES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) else if (!strcmp(flags, "relative"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968) mode_flags |= MPOL_F_RELATIVE_NODES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) new = mpol_new(mode, mode_flags, &nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974) if (IS_ERR(new))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) * Save nodes for mpol_to_str() to show the tmpfs mount options
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979) * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981) if (mode != MPOL_PREFERRED)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982) new->v.nodes = nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983) else if (nodelist)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984) new->v.preferred_node = first_node(nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986) new->flags |= MPOL_F_LOCAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989) * Save nodes for contextualization: this will be used to "clone"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990) * the mempolicy in a specific context [cpuset] at a later time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) new->w.user_nodemask = nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994) err = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997) /* Restore string for error message */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) if (nodelist)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) *--nodelist = ':';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) if (flags)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001) *--flags = '=';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) if (!err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003) *mpol = new;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) }
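/*
 * Worked examples of accepted @str values (derived from the parsing rules
 * above, assuming the listed nodes are online and have memory):
 *
 *	"bind:0-3"		MPOL_BIND over nodes 0-3
 *	"interleave"		MPOL_INTERLEAVE over all nodes with memory
 *	"prefer=static:1"	MPOL_PREFERRED node 1, MPOL_F_STATIC_NODES
 *	"local"			MPOL_PREFERRED with MPOL_F_LOCAL set
 *	"default"		success, *mpol set to NULL (no policy)
 */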
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006) #endif /* CONFIG_TMPFS */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) * mpol_to_str - format a mempolicy structure for printing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010) * @buffer: to contain formatted mempolicy string
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011) * @maxlen: length of @buffer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012) * @pol: pointer to mempolicy to be formatted
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) * Convert @pol into a string. If @buffer is too short, truncate the string.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015) * Recommend a @maxlen of at least 32: enough for the longest mode
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3016) * ("interleave"), the longest flag ("relative") and a few node ids.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3017) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3018) void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3019) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3020) char *p = buffer;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3021) nodemask_t nodes = NODE_MASK_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3022) unsigned short mode = MPOL_DEFAULT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3023) unsigned short flags = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3024)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3025) if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3026) mode = pol->mode;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3027) flags = pol->flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3028) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030) switch (mode) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031) case MPOL_DEFAULT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033) case MPOL_PREFERRED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034) if (flags & MPOL_F_LOCAL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035) mode = MPOL_LOCAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037) node_set(pol->v.preferred_node, nodes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039) case MPOL_BIND:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040) case MPOL_INTERLEAVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041) nodes = pol->v.nodes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044) WARN_ON_ONCE(1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045) snprintf(p, maxlen, "unknown");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049) p += snprintf(p, maxlen, "%s", policy_modes[mode]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051) if (flags & MPOL_MODE_FLAGS) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052) p += snprintf(p, buffer + maxlen - p, "=");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055) * Currently, the only defined flags are mutually exclusive
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057) if (flags & MPOL_F_STATIC_NODES)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058) p += snprintf(p, buffer + maxlen - p, "static");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059) else if (flags & MPOL_F_RELATIVE_NODES)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060) p += snprintf(p, buffer + maxlen - p, "relative");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063) if (!nodes_empty(nodes))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064) p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065) nodemask_pr_args(&nodes));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) }
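/*
 * Example outputs (worked from the cases above, for illustration):
 *
 *	MPOL_DEFAULT					"default"
 *	MPOL_PREFERRED + MPOL_F_LOCAL			"local"
 *	MPOL_PREFERRED, node 1				"prefer:1"
 *	MPOL_BIND, nodes 0 and 2			"bind:0,2"
 *	MPOL_INTERLEAVE + MPOL_F_RELATIVE_NODES, 0-3	"interleave=relative:0-3"
 */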