/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

#include <trace/hooks/sched.h>
#include <trace/hooks/cgroup.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/* See "Frequency meter" comments, below. */

struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time64_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset. They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective masks.
	 */
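	/*
	 * Illustrative example (not taken from this file): if a child's
	 * configured cpus_allowed is 0-3 while the parent's effective_cpus
	 * is 2-5, the child's effective_cpus becomes 2-3; if the
	 * intersection were empty, the child would instead fall back to
	 * the parent's effective mask.
	 */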

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	cpumask_var_t cpus_requested;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * CPUs allocated to child sub-partitions (default hierarchy only)
	 * - CPUs granted by the parent = effective_cpus U subparts_cpus
	 * - effective_cpus and subparts_cpus are mutually exclusive.
	 *
	 * effective_cpus contains only onlined CPUs, but subparts_cpus
	 * may have offlined ones.
	 */
	cpumask_var_t subparts_cpus;

	/*
	 * These are the old Memory Nodes that tasks in this cpuset took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* number of CPUs in subparts_cpus */
	int nr_subparts_cpus;

	/* partition root state */
	int partition_root_state;

	/*
	 * Default hierarchy only:
	 * use_parent_ecpus - set if using parent's effective_cpus
	 * child_ecpus_count - # of children with use_parent_ecpus set
	 */
	int use_parent_ecpus;
	int child_ecpus_count;
};

/*
 * Partition root states:
 *
 *   0 - not a partition root
 *
 *   1 - partition root
 *
 *  -1 - invalid partition root
 *       None of the cpus in cpus_allowed can be put into the parent's
 *       subparts_cpus. In this case, the cpuset is not a real partition
 *       root anymore.  However, the CPU_EXCLUSIVE bit will still be set
 *       and the cpuset can be restored back to a partition root if the
 *       parent cpuset can give more CPUs back to this child cpuset.
 */
#define PRS_DISABLED		0
#define PRS_ENABLED		1
#define PRS_ERROR		-1
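/*
 * Illustrative mapping (a sketch of the cgroup v2 cpuset interface, not
 * defined in this file): writing "root" to cpuset.cpus.partition moves a
 * cpuset to PRS_ENABLED, writing "member" moves it back to PRS_DISABLED,
 * and a partition root that can no longer satisfy its constraints is
 * reported as invalid, corresponding to PRS_ERROR.
 */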

/*
 * Temporary cpumasks for working with partitions.  They are passed among
 * functions to avoid memory allocation in inner functions.
 */
struct tmpmasks {
	cpumask_var_t addmask, delmask;	/* For partition root */
	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
};

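/* Retrieve the cpuset embedding a cgroup_subsys_state, or NULL if @css is NULL */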
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return css_cs(task_css(task, cpuset_cgrp_id));
}

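/* Retrieve the parent cpuset, or NULL if @cs is the top_cpuset */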
static inline struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static inline int is_partition_root(const struct cpuset *cs)
{
	return cs->partition_root_state > 0;
}

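/*
 * The top cpuset is the root of the cpuset hierarchy.  It covers all
 * online CPUs and memory nodes and is always treated as a valid
 * partition root.
 */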
static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
	.partition_root_state = PRS_ENABLED,
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 * iteration and the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
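/*
 * Usage sketch for the iterators above (illustrative, not a call site in
 * this file):
 *
 *	struct cpuset *cp;
 *	struct cgroup_subsys_state *pos_css;
 *
 *	rcu_read_lock();
 *	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
 *		...	// inspect cp; may skip its subtree with
 *			// pos_css = css_rightmost_descendant(pos_css);
 *	}
 *	rcu_read_unlock();
 */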

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. We also require taking task_lock() when dereferencing a
 * task's cpuset pointer. See "The task_lock() exception", at the end of this
 * comment.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
 * is the only task able to also acquire callback_lock and be able to
 * modify cpusets.  It can perform various checks on the cpuset structure
 * first, knowing nothing will change.  It can also allocate memory while
 * just holding cpuset_mutex.  While it is performing these checks, various
 * callback routines can briefly acquire callback_lock to query cpusets.
 * Once it is ready to make the changes, it takes callback_lock, blocking
 * everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by another task, so we use alloc_lock in the task_struct to protect
 * them.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */
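/*
 * Lock ordering sketch for a typical cpuset update (illustrative only;
 * the real update paths appear later in this file):
 *
 *	mutex_lock(&cpuset_mutex);	// serialize against other writers
 *	... validate the change, allocate memory ...
 *	spin_lock_irq(&callback_lock);	// block readers while publishing
 *	... update the cpuset masks ...
 *	spin_unlock_irq(&callback_lock);
 *	mutex_unlock(&cpuset_mutex);
 */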

static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously,
 * while resume_cpus is handled synchronously.
 */
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
 * With v2 behavior, "cpus" and "mems" are always what the users have
 * requested and won't be changed by hotplug events. Only the effective
 * cpus or mems will be affected.
 */
static inline bool is_in_v2_mode(void)
{
	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}

/*
 * Return in pmask the portion of a task's cpuset's cpus_allowed that
 * are online and are capable of running the task.  If none are found,
 * walk up the cpuset hierarchy until we find one that does have some
 * appropriate cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_active_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct task_struct *tsk,
				  struct cpumask *pmask)
{
	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
	struct cpuset *cs;

	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
		cpumask_copy(pmask, cpu_active_mask);

	rcu_read_lock();
	cs = task_cs(tsk);

	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * The top cpuset doesn't have any online cpu as a
			 * consequence of a race between cpuset_hotplug_work
			 * and cpu hotplug notifier.  But we know the top
			 * cpuset's effective_cpus is on its way to being
			 * identical to cpu_online_mask.
			 */
			goto out_unlock;
		}
	}
	cpumask_and(pmask, pmask, cs->effective_cpus);

out_unlock:
	rcu_read_unlock();
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * Update a task's page/slab spread flags to match the cpuset's spread flags.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_requested, q->cpus_requested) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}
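/*
 * Example (illustrative): a cpuset requesting CPUs 0-1 and node 0 is a
 * subset of one requesting CPUs 0-3 and nodes 0-1, as long as it does not
 * set a cpu/mem exclusive flag that the larger cpuset leaves clear.
 */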

/**
 * alloc_cpumasks - allocate the cpumasks for a cpuset or a tmpmasks structure
 * @cs: the cpuset that has cpumasks to be allocated.
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	cpumask_var_t *pmask1, *pmask2, *pmask3;

	if (cs) {
		pmask1 = &cs->cpus_allowed;
		pmask2 = &cs->effective_cpus;
		pmask3 = &cs->subparts_cpus;
	} else {
		pmask1 = &tmp->new_cpus;
		pmask2 = &tmp->addmask;
		pmask3 = &tmp->delmask;
	}

	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
		goto free_one;

	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
		goto free_two;

	if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
		goto free_three;

	return 0;

free_three:
	free_cpumask_var(*pmask3);
free_two:
	free_cpumask_var(*pmask2);
free_one:
	free_cpumask_var(*pmask1);
	return -ENOMEM;
}

/**
 * free_cpumasks - free cpumasks in a cpuset or a tmpmasks structure
 * @cs: the cpuset that has cpumasks to be freed.
 * @tmp: the tmpmasks structure pointer
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	if (cs) {
		free_cpumask_var(cs->cpus_allowed);
		free_cpumask_var(cs->cpus_requested);
		free_cpumask_var(cs->effective_cpus);
		free_cpumask_var(cs->subparts_cpus);
	}
	if (tmp) {
		free_cpumask_var(tmp->new_cpus);
		free_cpumask_var(tmp->addmask);
		free_cpumask_var(tmp->delmask);
	}
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (alloc_cpumasks(trial, NULL)) {
		kfree(trial);
		return NULL;
	}

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->cpus_requested, cs->cpus_requested);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	return trial;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 */
static inline void free_cpuset(struct cpuset *cs)
{
	free_cpumasks(cs, NULL);
	kfree(cs);
}
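/*
 * Typical trial-cpuset pattern used by the update paths later in this
 * file (sketch):
 *
 *	trial = alloc_trial_cpuset(cs);
 *	if (!trial)
 *		return -ENOMEM;
 *	... modify trial->cpus_allowed / trial->mems_allowed / flags ...
 *	err = validate_change(cs, trial);
 *	if (!err)
 *		... commit the change to cs under callback_lock ...
 *	free_cpuset(trial);
 */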

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_requested, c->cpus_requested))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

/* Must be called with cpuset_mutex held. */
static inline int nr_cpusets(void)
{
	/* jump label reference count + the top-level cpuset */
	return static_key_count(&cpusets_enabled_key.key) + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    cp - cpuset pointer, used (together with pos_css) to perform a
 *	   top-down scan of all cpusets.  For our purposes, rebuilding
 *	   the schedulers sched domains, we can ignore !is_sched_load_
 *	   balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e. the set of domains (subsets) of CPUs such that the
 *	   cpus_allowed of every cpuset marked is_sched_load_balance
 *	   is a subset of one of these domains, while there are as
 *	   many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *	   the kernel/sched/core.c routine partition_sched_domains() in a
 *	   convenient format, that can be easily compared to the prior
 *	   value to determine what partition elements (sched domains)
 *	   were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *	The triple nested loops below over i, j, k scan over the
 *	load balanced cpusets (using the array of cpuset pointers in
 *	csa[]) looking for pairs of cpusets that have overlapping
 *	cpus_allowed, but which don't have the same 'pn' partition
 *	number, and assigns them the same partition number.  It keeps
 *	looping on the 'restart' label until it can no longer find
 *	any such pairs.
 *
 *	The union of the cpus_allowed masks from the set of
 *	all cpusets having the same 'pn' value then form the one
 *	element of the partition (one sched domain) to be passed to
 *	partition_sched_domains().
 */
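/*
 * Worked example (illustrative): with load balancing disabled in the top
 * cpuset and two non-overlapping child cpusets, one allowed CPUs 0-3 and
 * the other CPUs 4-7, both marked sched_load_balance, this function
 * produces ndoms == 2 with doms[0] = 0-3 and doms[1] = 4-7, and the
 * scheduler then balances each domain independently.
 */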
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* top-down scan of cpusets */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;
	bool root_load_balance = is_sched_load_balance(&top_cpuset);

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    housekeeping_cpumask(HK_FLAG_DOMAIN));

		goto done;
	}

	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	if (root_load_balance)
		csa[csn++] = &top_cpuset;
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;
		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The former is obvious.  The
		 * latter: All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 *
		 * If root is load-balancing, we can skip @cp if it
		 * is a subset of the root's effective_cpus.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) housekeeping_cpumask(HK_FLAG_DOMAIN))))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) if (root_load_balance &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) if (is_sched_load_balance(cp) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) !cpumask_empty(cp->effective_cpus))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) csa[csn++] = cp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) /* skip @cp's subtree if not a partition root */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) if (!is_partition_root(cp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) pos_css = css_rightmost_descendant(pos_css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) for (i = 0; i < csn; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) csa[i]->pn = i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) ndoms = csn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) restart:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) /* Find the best partition (set of sched domains) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) for (i = 0; i < csn; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) struct cpuset *a = csa[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) int apn = a->pn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) for (j = 0; j < csn; j++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) struct cpuset *b = csa[j];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) int bpn = b->pn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) if (apn != bpn && cpusets_overlap(a, b)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) for (k = 0; k < csn; k++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) struct cpuset *c = csa[k];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) if (c->pn == bpn)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) c->pn = apn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) ndoms--; /* one less element */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) goto restart;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) * Now we know how many domains to create.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) doms = alloc_sched_domains(ndoms);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) if (!doms)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) * The rest of the code, including the scheduler, can deal with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) * dattr==NULL case. No need to abort if alloc fails.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) for (nslot = 0, i = 0; i < csn; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) struct cpuset *a = csa[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) struct cpumask *dp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) int apn = a->pn;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) if (apn < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) /* Skip completed partitions */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) dp = doms[nslot];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) if (nslot == ndoms) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) static int warnings = 10;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) if (warnings) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) nslot, ndoms, csn, i, apn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) warnings--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) cpumask_clear(dp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) if (dattr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) *(dattr + nslot) = SD_ATTR_INIT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) for (j = i; j < csn; j++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) struct cpuset *b = csa[j];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) if (apn == b->pn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) cpumask_or(dp, dp, b->effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) if (dattr)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) update_domain_attr_tree(dattr + nslot, b);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) /* Done with this partition */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) b->pn = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) nslot++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) BUG_ON(nslot != ndoms);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) kfree(csa);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) * Fallback to the default domain if kmalloc() failed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) * See comments in partition_sched_domains().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) if (doms == NULL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) ndoms = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) *domains = doms;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) *attributes = dattr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) return ndoms;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) }
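/*
 * Illustrative sketch (compiled out, not kernel code): the 'pn' merging
 * done by the triple nested loops above, shown on plain unsigned long
 * bitmasks instead of cpumasks.  The helper name and parameters are
 * made up for the example.  Overlapping masks end up sharing one
 * partition number; the count of distinct numbers left is the number
 * of sched domains.
 */
#if 0
static int merge_partitions_example(const unsigned long *mask, int *pn, int n)
{
	int i, j, k, ndoms = n;

	for (i = 0; i < n; i++)
		pn[i] = i;	/* each set starts as its own partition */
restart:
	for (i = 0; i < n; i++) {
		for (j = 0; j < n; j++) {
			int bpn = pn[j];

			if (pn[i] != bpn && (mask[i] & mask[j])) {
				/* fold j's partition into i's, as above */
				for (k = 0; k < n; k++)
					if (pn[k] == bpn)
						pn[k] = pn[i];
				ndoms--;	/* one less partition */
				goto restart;
			}
		}
	}
	return ndoms;	/* e.g. {0b0011, 0b0110, 0b1000} -> 2 domains */
}
#endif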
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) static void update_tasks_root_domain(struct cpuset *cs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) struct css_task_iter it;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) struct task_struct *task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) css_task_iter_start(&cs->css, 0, &it);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) while ((task = css_task_iter_next(&it)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) dl_add_task_root_domain(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) css_task_iter_end(&it);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) static void rebuild_root_domains(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) struct cpuset *cs = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) struct cgroup_subsys_state *pos_css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) lockdep_assert_held(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) lockdep_assert_cpus_held();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) lockdep_assert_held(&sched_domains_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) * Clear default root domain DL accounting; it will be computed again
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) * if a task belongs to it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) dl_clear_root_domain(&def_root_domain);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) if (cpumask_empty(cs->effective_cpus)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) pos_css = css_rightmost_descendant(pos_css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) css_get(&cs->css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) update_tasks_root_domain(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) css_put(&cs->css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) struct sched_domain_attr *dattr_new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) mutex_lock(&sched_domains_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) rebuild_root_domains();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) mutex_unlock(&sched_domains_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) * Rebuild scheduler domains.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) * If the flag 'sched_load_balance' of any cpuset with non-empty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) * which has that flag enabled, or if any cpuset with a non-empty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) * 'cpus' is removed, then call this routine to rebuild the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) * scheduler's dynamic sched domains.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) * Call with cpuset_mutex held. Takes get_online_cpus().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) static void rebuild_sched_domains_locked(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) struct cgroup_subsys_state *pos_css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) struct sched_domain_attr *attr;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) cpumask_var_t *doms;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) struct cpuset *cs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) int ndoms;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) lockdep_assert_held(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) * If we have raced with CPU hotplug, return early to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) * passing doms with offlined cpu to partition_sched_domains().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) * Anyway, cpuset_hotplug_workfn() will rebuild the sched domains.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) * With no CPUs in any subpartitions, top_cpuset's effective CPUs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) * should be the same as the active CPUs, so checking only top_cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) * is enough to detect racing CPU offlines.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) if (!top_cpuset.nr_subparts_cpus &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) * With subpartition CPUs, however, the effective CPUs of a partition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) * root should be only a subset of the active CPUs. Since a CPU in any
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) * partition root could be offlined, all must be checked.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) if (top_cpuset.nr_subparts_cpus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) if (!is_partition_root(cs)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) pos_css = css_rightmost_descendant(pos_css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) if (!cpumask_subset(cs->effective_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) cpu_active_mask)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) /* Generate domain masks and attrs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) ndoms = generate_sched_domains(&doms, &attr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) /* Have scheduler rebuild the domains */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) partition_and_rebuild_sched_domains(ndoms, doms, attr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) #else /* !CONFIG_SMP */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) static void rebuild_sched_domains_locked(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) #endif /* CONFIG_SMP */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) void rebuild_sched_domains(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) get_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) rebuild_sched_domains_locked();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) put_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) const struct cpumask *new_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) int ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) trace_android_rvh_update_cpus_allowed(p, cs->cpus_requested, new_mask, &ret);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) if (!ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) return set_cpus_allowed_ptr(p, new_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) * Iterate through each task of @cs updating its cpus_allowed to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) * effective cpuset's. As this function is called with cpuset_mutex held,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) * cpuset membership stays stable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) static void update_tasks_cpumask(struct cpuset *cs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) struct css_task_iter it;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) struct task_struct *task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) css_task_iter_start(&cs->css, 0, &it);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) while ((task = css_task_iter_next(&it)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) update_cpus_allowed(cs, task, cs->effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) css_task_iter_end(&it);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) * compute_effective_cpumask - Compute the effective cpumask of the cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) * @new_cpus: the temp variable for the new effective_cpus mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) * @cs: the cpuset that needs to recompute the new effective_cpus mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) * @parent: the parent cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) * If the parent has subpartition CPUs, include them in the list of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) * allowable CPUs in computing the new effective_cpus mask. Since offlined
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) * to mask those out.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) static void compute_effective_cpumask(struct cpumask *new_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) struct cpuset *cs, struct cpuset *parent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) if (parent->nr_subparts_cpus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) cpumask_or(new_cpus, parent->effective_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) parent->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) cpumask_and(new_cpus, new_cpus, cs->cpus_requested);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) cpumask_and(new_cpus, new_cpus, cpu_active_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) }
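/*
 * Worked example for compute_effective_cpumask() (illustrative only;
 * the 4-bit mask values below are made up).  With subpartition CPUs in
 * the parent, the code above computes
 *
 *	new_cpus = (parent->effective_cpus | parent->subparts_cpus)
 *		   & cs->cpus_requested & cpu_active_mask
 *
 * e.g.	effective_cpus = 0b0011, subparts_cpus = 0b1100,
 *	cpus_requested = 0b0110, cpu_active_mask = 0b1111
 *	=> new_cpus    = 0b0110
 *
 * Without subpartition CPUs it reduces to
 *	new_cpus = cs->cpus_requested & parent->effective_cpus.
 */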
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) * Commands for update_parent_subparts_cpumask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) enum subparts_cmd {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) partcmd_enable, /* Enable partition root */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) partcmd_disable, /* Disable partition root */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) partcmd_update, /* Update parent's subparts_cpus */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) * @cpuset: The cpuset that requests change in partition root state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) * @cmd: Partition root state change command
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) * @newmask: Optional new cpumask for partcmd_update
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) * @tmp: Temporary addmask and delmask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) * Return: 0, 1 or an error code
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) * For partcmd_enable, the cpuset is being transformed from a non-partition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) * root to a partition root. The cpus_allowed mask of the given cpuset will
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) * be put into parent's subparts_cpus and taken away from parent's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) * effective_cpus. The function will return 0 if all the CPUs listed in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) * cpus_allowed can be granted or an error code will be returned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) * For partcmd_disable, the cpuset is being transformed from a partition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) * root back to a non-partition root. Any CPUs in cpus_allowed that are in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) * parent's subparts_cpus will be taken away from that cpumask and put back
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) * into parent's effective_cpus. 0 should always be returned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) * For partcmd_update, if the optional newmask is specified, the cpu
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) * list is to be changed from cpus_allowed to newmask. Otherwise,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) * cpus_allowed is assumed to remain the same. The cpuset should either
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) * be a partition root or an invalid partition root. The partition root
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) * state may change if newmask is NULL and none of the requested CPUs can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) * be granted by the parent. The function will return 1 if changes to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) * Error code should only be returned when newmask is non-NULL.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) * The partcmd_enable and partcmd_disable commands are used by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) * update_prstate(). The partcmd_update command is used by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) * update_cpumasks_hier() with newmask NULL and update_cpumask() with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) * newmask set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) * The checking is more strict when enabling partition root than the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) * other two commands.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) * Because of the implicit cpu exclusive nature of a partition root,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) * cpumask changes that violate the cpu exclusivity rule will not be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) * permitted when checked by validate_change(). The validate_change()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) * function will also prevent any changes to the cpu list if it is not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) * a superset of children's cpu lists.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) struct cpumask *newmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) struct tmpmasks *tmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) struct cpuset *parent = parent_cs(cpuset);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) int adding; /* Moving cpus from effective_cpus to subparts_cpus */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) int new_prs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) bool part_error = false; /* Partition error? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) lockdep_assert_held(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) * The parent must be a partition root.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) * The new cpumask, if present, or the current cpus_allowed must
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) * not be empty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) if (!is_partition_root(parent) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) (newmask && cpumask_empty(newmask)) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) (!newmask && cpumask_empty(cpuset->cpus_allowed)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) * Enabling/disabling partition root is not allowed if there are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) * online children.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) return -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) * Enabling partition root is not allowed unless all the CPUs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) * can be granted from parent's effective_cpus and at least one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) * CPU would still be left in the parent after that.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) if ((cmd == partcmd_enable) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) * A cpumask update cannot make parent's effective_cpus become empty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) adding = deleting = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) new_prs = cpuset->partition_root_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) if (cmd == partcmd_enable) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) adding = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) } else if (cmd == partcmd_disable) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) parent->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) } else if (newmask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) * partcmd_update with newmask:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) * addmask = newmask & parent->effective_cpus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) * & ~parent->subparts_cpus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) deleting = cpumask_and(tmp->delmask, tmp->delmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) parent->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) adding = cpumask_andnot(tmp->addmask, tmp->addmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) parent->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) * Return error if the new effective_cpus could become empty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) if (adding &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) cpumask_equal(parent->effective_cpus, tmp->addmask)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) if (!deleting)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) * As some of the CPUs in subparts_cpus might have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) * been offlined, we need to compute the real delmask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) * to confirm that.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) if (!cpumask_and(tmp->addmask, tmp->delmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) cpu_active_mask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) cpumask_copy(tmp->addmask, parent->effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) * partcmd_update w/o newmask:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) * addmask = cpus_allowed & parent->effective_cpus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) * Note that parent's subparts_cpus may have been
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) * pre-shrunk in case there is a change in the cpu list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) * So no deletion is needed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) parent->effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) part_error = cpumask_equal(tmp->addmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) parent->effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) if (cmd == partcmd_update) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) int prev_prs = cpuset->partition_root_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) * Check for possible transition between PRS_ENABLED
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) * and PRS_ERROR.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) switch (cpuset->partition_root_state) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) case PRS_ENABLED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) if (part_error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) new_prs = PRS_ERROR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) case PRS_ERROR:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) if (!part_error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) new_prs = PRS_ENABLED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) * Set part_error if previously in invalid state.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) part_error = (prev_prs == PRS_ERROR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) if (!part_error && (new_prs == PRS_ERROR))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) return 0; /* Nothing needs to be done */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) if (new_prs == PRS_ERROR) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) * Remove all its cpus from parent's subparts_cpus.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) adding = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) parent->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) if (!adding && !deleting && (new_prs == cpuset->partition_root_state))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) * Change the parent's subparts_cpus.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) * Newly added CPUs will be removed from effective_cpus and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) * newly deleted ones will be added back to effective_cpus.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) if (adding) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) cpumask_or(parent->subparts_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) parent->subparts_cpus, tmp->addmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) cpumask_andnot(parent->effective_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) parent->effective_cpus, tmp->addmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) if (deleting) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) cpumask_andnot(parent->subparts_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) parent->subparts_cpus, tmp->delmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) * Some of the CPUs in subparts_cpus might have been offlined.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) cpumask_or(parent->effective_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) parent->effective_cpus, tmp->delmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) if (cpuset->partition_root_state != new_prs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) cpuset->partition_root_state = new_prs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333) return cmd == partcmd_update;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) }
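/*
 * Worked example for the partcmd_update-with-newmask case above
 * (illustrative only; all mask values are made up):
 *
 *	cpuset->cpus_allowed   = 0b0111		(current request)
 *	newmask                = 0b0110		(new request)
 *	parent->subparts_cpus  = 0b0111
 *	parent->effective_cpus = 0b1000
 *
 *	delmask = cpus_allowed & ~newmask & parent->subparts_cpus = 0b0001
 *	addmask = newmask & parent->effective_cpus
 *		  & ~parent->subparts_cpus                        = 0b0000
 *
 * CPU 0 is handed back to the parent's effective_cpus, nothing new is
 * taken away from it, and the function returns 1 because the parent's
 * subparts_cpus and effective_cpus both changed.
 */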
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338) * @cs: the cpuset to consider
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) * @tmp: temp variables for calculating effective_cpus & partition setup
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) * When the configured cpumask is changed, the effective cpumasks of this cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) * and all its descendants need to be updated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) * On the legacy hierarchy, effective_cpus will be the same as cpus_allowed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) * Called with cpuset_mutex held
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) struct cpuset *cp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) struct cgroup_subsys_state *pos_css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) bool need_rebuild_sched_domains = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) int new_prs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) cpuset_for_each_descendant_pre(cp, pos_css, cs) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) struct cpuset *parent = parent_cs(cp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) compute_effective_cpumask(tmp->new_cpus, cp, parent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) * If it becomes empty, inherit the effective mask of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) * parent, which is guaranteed to have some CPUs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) cpumask_copy(tmp->new_cpus, parent->effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) if (!cp->use_parent_ecpus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) cp->use_parent_ecpus = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) parent->child_ecpus_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) } else if (cp->use_parent_ecpus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) cp->use_parent_ecpus = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) WARN_ON_ONCE(!parent->child_ecpus_count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) parent->child_ecpus_count--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) * Skip the whole subtree if the cpumask remains the same
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) * and has no partition root state.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) if (!cp->partition_root_state &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) pos_css = css_rightmost_descendant(pos_css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) * update_parent_subparts_cpumask() should have been called
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) * for cs already in update_cpumask(). We should also call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) * update_tasks_cpumask() again for tasks in the parent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) * cpuset if the parent's subparts_cpus changes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) new_prs = cp->partition_root_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) if ((cp != cs) && new_prs) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) switch (parent->partition_root_state) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) case PRS_DISABLED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) * If parent is not a partition root or an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) * invalid partition root, clear its state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) * and its CS_CPU_EXCLUSIVE flag.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) WARN_ON_ONCE(cp->partition_root_state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) != PRS_ERROR);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) new_prs = PRS_DISABLED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) * clear_bit() is an atomic operation and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) * readers aren't interested in the state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) * of CS_CPU_EXCLUSIVE anyway. So we can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) * just update the flag without holding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) * the callback_lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) case PRS_ENABLED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) update_tasks_cpumask(parent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) case PRS_ERROR:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) * When the parent is an invalid partition root, this cpuset has to be one too.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) new_prs = PRS_ERROR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) if (!css_tryget_online(&cp->css))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) cpumask_copy(cp->effective_cpus, tmp->new_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) cp->nr_subparts_cpus = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) cpumask_clear(cp->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) } else if (cp->nr_subparts_cpus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) * Make sure that effective_cpus & subparts_cpus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) * are mutually exclusive.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) * In the unlikely event that effective_cpus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) * becomes empty, we clear cp->nr_subparts_cpus and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447) * let its child partition roots compete for the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) * CPUs again.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) cp->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) if (cpumask_empty(cp->effective_cpus)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) cpumask_copy(cp->effective_cpus, tmp->new_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) cpumask_clear(cp->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) cp->nr_subparts_cpus = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) } else if (!cpumask_subset(cp->subparts_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) tmp->new_cpus)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) cpumask_andnot(cp->subparts_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) cp->subparts_cpus, tmp->new_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) cp->nr_subparts_cpus
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) = cpumask_weight(cp->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) if (new_prs != cp->partition_root_state)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) cp->partition_root_state = new_prs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) WARN_ON(!is_in_v2_mode() &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) update_tasks_cpumask(cp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) * On legacy hierarchy, if the effective cpumask of any non-
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) * empty cpuset is changed, we need to rebuild sched domains.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) * On default hierarchy, the cpuset needs to be a partition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) * root as well.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) if (!cpumask_empty(cp->cpus_allowed) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) is_sched_load_balance(cp) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) is_partition_root(cp)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) need_rebuild_sched_domains = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) css_put(&cp->css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) if (need_rebuild_sched_domains)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) rebuild_sched_domains_locked();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) * update_sibling_cpumasks - Update siblings' cpumasks
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) * @parent: Parent cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) * @cs: Current cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) * @tmp: Temp variables
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) struct tmpmasks *tmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) struct cpuset *sibling;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) struct cgroup_subsys_state *pos_css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) lockdep_assert_held(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) * Check all its siblings and call update_cpumasks_hier()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) * if their use_parent_ecpus flag is set in order for them
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) * to use the right effective_cpus value.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) * The update_cpumasks_hier() function may sleep. So we have to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) * release the RCU read lock before calling it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) cpuset_for_each_child(sibling, pos_css, parent) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) if (sibling == cs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) if (!sibling->use_parent_ecpus)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) if (!css_tryget_online(&sibling->css))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) update_cpumasks_hier(sibling, tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) css_put(&sibling->css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) }
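/*
 * Illustrative sketch (compiled out): the drop-RCU-before-sleeping
 * pattern used above.  do_sleeping_work() is a made-up stand-in for
 * any function, such as update_cpumasks_hier(), that may sleep while
 * the iterated css is kept alive by the extra reference.
 */
#if 0
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
	if (!css_tryget_online(&sibling->css))
		continue;			/* css is dying, skip it */

	rcu_read_unlock();			/* may not sleep under RCU */
	do_sleeping_work(sibling);		/* safe: css reference held */
	rcu_read_lock();
	css_put(&sibling->css);
}
rcu_read_unlock();
#endif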
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) * @cs: the cpuset to consider
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) * @trialcs: trial cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) * @buf: buffer of cpu numbers written to this cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) const char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) int retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) struct tmpmasks tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) if (cs == &top_cpuset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) return -EACCES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) * An empty cpus_requested is ok only if the cpuset has no tasks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) * Since cpulist_parse() fails on an empty mask, we special case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) * that parsing. The validate_change() call ensures that cpusets
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) * with tasks have cpus.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) if (!*buf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) cpumask_clear(trialcs->cpus_requested);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) retval = cpulist_parse(buf, trialcs->cpus_requested);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) if (retval < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) return retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) /* Nothing to do if the cpus didn't change */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) retval = validate_change(cs, trialcs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) if (retval < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) return retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) #ifdef CONFIG_CPUMASK_OFFSTACK
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) * Use the cpumasks in trialcs for tmpmasks when they are pointers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) * to allocated cpumasks.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) tmp.addmask = trialcs->subparts_cpus;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) tmp.delmask = trialcs->effective_cpus;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) tmp.new_cpus = trialcs->cpus_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) if (cs->partition_root_state) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) /* Cpumask of a partition root cannot be empty */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) if (cpumask_empty(trialcs->cpus_allowed))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) if (update_parent_subparts_cpumask(cs, partcmd_update,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) trialcs->cpus_allowed, &tmp) < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) cpumask_copy(cs->cpus_requested, trialcs->cpus_requested);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) * Make sure that subparts_cpus is a subset of cpus_allowed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) if (cs->nr_subparts_cpus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) update_cpumasks_hier(cs, &tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) if (cs->partition_root_state) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) struct cpuset *parent = parent_cs(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) * For partition root, update the cpumasks of sibling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) * cpusets if they use parent's effective_cpus.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) if (parent->child_ecpus_count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) update_sibling_cpumasks(parent, cs, &tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) }
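/*
 * Worked example (illustrative only, not part of the logic above): writing
 * the cpulist "0-3,6" makes cpulist_parse() set cpus_requested to
 * {0,1,2,3,6}; cpus_allowed then becomes the intersection of that mask with
 * cpu_active_mask, so if CPU 6 is present but currently offline the cpuset
 * still remembers the request while cpus_allowed ends up as {0,1,2,3}.
 */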
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) * Migrate memory region from one set of nodes to another. This is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) * performed asynchronously as it can be called from process migration path
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) * holding locks involved in process management. All mm migrations are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) * performed in the queued order and can be waited for by flushing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) * cpuset_migrate_mm_wq.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) struct cpuset_migrate_mm_work {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) struct work_struct work;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) struct mm_struct *mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) nodemask_t from;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) nodemask_t to;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) static void cpuset_migrate_mm_workfn(struct work_struct *work)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) struct cpuset_migrate_mm_work *mwork =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) container_of(work, struct cpuset_migrate_mm_work, work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) /* on a wq worker, no need to worry about %current's mems_allowed */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) mmput(mwork->mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) kfree(mwork);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) const nodemask_t *to)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) struct cpuset_migrate_mm_work *mwork;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) if (mwork) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) mwork->mm = mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) mwork->from = *from;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) mwork->to = *to;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) queue_work(cpuset_migrate_mm_wq, &mwork->work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) mmput(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) static void cpuset_post_attach(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) flush_workqueue(cpuset_migrate_mm_wq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) * @tsk: the task to change
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676)  * @newmems: the new set of nodes that the task will be allowed to use
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678)  * We use the mems_allowed_seq seqcount to safely update both tsk->mems_allowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679)  * and, if the task has one, its mempolicy. If the task is allocating in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680)  * parallel, it might temporarily see an empty intersection, which results in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681)  * a seqcount check and retry before OOM or allocation failure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) static void cpuset_change_task_nodemask(struct task_struct *tsk,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) nodemask_t *newmems)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) task_lock(tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) local_irq_disable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) write_seqcount_begin(&tsk->mems_allowed_seq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) mpol_rebind_task(tsk, newmems);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) tsk->mems_allowed = *newmems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) write_seqcount_end(&tsk->mems_allowed_seq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) local_irq_enable();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) task_unlock(tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) }
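/*
 * Reader-side sketch (illustrative only): consumers of tsk->mems_allowed
 * pair with the seqcount written above, e.g. via read_mems_allowed_begin()
 * and read_mems_allowed_retry(), roughly as
 *
 *	unsigned int seq;
 *
 *	do {
 *		seq = read_mems_allowed_begin();
 *		... compute an allocation nodemask from current->mems_allowed ...
 *	} while (read_mems_allowed_retry(seq));
 *
 * Because the union (old | new) is written before the final assignment, a
 * racing reader sees the old mask, the union, or the new mask - never an
 * empty one.
 */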
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) static void *cpuset_being_rebound;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) * Iterate through each task of @cs updating its mems_allowed to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) * effective cpuset's. As this function is called with cpuset_mutex held,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) * cpuset membership stays stable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) static void update_tasks_nodemask(struct cpuset *cs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) static nodemask_t newmems; /* protected by cpuset_mutex */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) struct css_task_iter it;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) struct task_struct *task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) guarantee_online_mems(cs, &newmems);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) * take while holding tasklist_lock. Forks can happen - the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) * mpol_dup() cpuset_being_rebound check will catch such forks,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) * and rebind their vma mempolicies too. Because we still hold
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) * the global cpuset_mutex, we know that no other rebind effort
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) * will be contending for the global variable cpuset_being_rebound.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) * It's ok if we rebind the same mm twice; mpol_rebind_mm()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) * is idempotent. Also migrate pages in each mm to new nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) css_task_iter_start(&cs->css, 0, &it);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) while ((task = css_task_iter_next(&it))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) struct mm_struct *mm;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) bool migrate;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) cpuset_change_task_nodemask(task, &newmems);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) mm = get_task_mm(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) if (!mm)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) migrate = is_memory_migrate(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) mpol_rebind_mm(mm, &cs->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) if (migrate)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) mmput(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) css_task_iter_end(&it);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) * All the tasks' nodemasks have been updated, update
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) * cs->old_mems_allowed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) cs->old_mems_allowed = newmems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) /* We're done rebinding vmas to this cpuset's new mems_allowed. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) cpuset_being_rebound = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) * @cs: the cpuset to consider
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) * @new_mems: a temp variable for calculating new effective_mems
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) * When configured nodemask is changed, the effective nodemasks of this cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) * and all its descendants need to be updated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770)  * On legacy hierarchy, effective_mems will be the same as mems_allowed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) * Called with cpuset_mutex held
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) struct cpuset *cp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) struct cgroup_subsys_state *pos_css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) cpuset_for_each_descendant_pre(cp, pos_css, cs) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) struct cpuset *parent = parent_cs(cp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) * If it becomes empty, inherit the effective mask of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) * parent, which is guaranteed to have some MEMs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) if (is_in_v2_mode() && nodes_empty(*new_mems))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) *new_mems = parent->effective_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) /* Skip the whole subtree if the nodemask remains the same. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) if (nodes_equal(*new_mems, cp->effective_mems)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) pos_css = css_rightmost_descendant(pos_css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) if (!css_tryget_online(&cp->css))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) cp->effective_mems = *new_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) WARN_ON(!is_in_v2_mode() &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) !nodes_equal(cp->mems_allowed, cp->effective_mems));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) update_tasks_nodemask(cp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) css_put(&cp->css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) }
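/*
 * Worked example (illustrative only): if a parent's effective_mems is {0,1}
 * and a child's mems_allowed is {1,2}, the child's new effective_mems is the
 * intersection {1}. If the intersection were empty (say the child requested
 * only node 2), then in cgroup v2 mode the child simply inherits the
 * parent's effective_mems {0,1}, so its tasks always keep some memory nodes.
 */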
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818)  * Handle user request to change the 'mems' memory placement
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819)  * of a cpuset. Needs to validate the request, update the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820)  * cpuset's mems_allowed, and for each task in the cpuset,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821)  * update mems_allowed, rebind the task's mempolicy and any vma
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822)  * mempolicies, and, if the cpuset is marked 'memory_migrate',
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823)  * migrate the task's pages to the new memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825)  * Call with cpuset_mutex held. May take callback_lock during call.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826)  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827)  * lock each such task's mm->mmap_lock, scan its vmas and rebind
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828)  * their mempolicies to the cpuset's new mems_allowed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) const char *buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) int retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836)  * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) * it's read-only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) if (cs == &top_cpuset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) retval = -EACCES;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) * An empty mems_allowed is ok iff there are no tasks in the cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) * Since nodelist_parse() fails on an empty mask, we special case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) * that parsing. The validate_change() call ensures that cpusets
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) * with tasks have memory.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) if (!*buf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) nodes_clear(trialcs->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) retval = nodelist_parse(buf, trialcs->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) if (retval < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) if (!nodes_subset(trialcs->mems_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) top_cpuset.mems_allowed)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) retval = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) retval = 0; /* Too easy - nothing to do */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) retval = validate_change(cs, trialcs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) if (retval < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) cs->mems_allowed = trialcs->mems_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) /* use trialcs->mems_allowed as a temp variable */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) update_nodemasks_hier(cs, &trialcs->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) return retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1881)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1882) bool current_cpuset_is_being_rebound(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1883) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1884) bool ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1885)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1887) ret = task_cs(current) == cpuset_being_rebound;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1888) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1889)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1890) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1891) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1893) static int update_relax_domain_level(struct cpuset *cs, s64 val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1894) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1895) #ifdef CONFIG_SMP
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1896) if (val < -1 || val >= sched_domain_level_max)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1897) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1900) if (val != cs->relax_domain_level) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1901) cs->relax_domain_level = val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1902) if (!cpumask_empty(cs->cpus_allowed) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1903) is_sched_load_balance(cs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) rebuild_sched_domains_locked();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1907) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1908) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1909)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1910) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1911) * update_tasks_flags - update the spread flags of tasks in the cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1912)  * @cs: the cpuset in which each task's spread flags need to be changed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1913) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) * Iterate through each task of @cs updating its spread flags. As this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) * function is called with cpuset_mutex held, cpuset membership stays
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) * stable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) static void update_tasks_flags(struct cpuset *cs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) struct css_task_iter it;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) struct task_struct *task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) css_task_iter_start(&cs->css, 0, &it);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) while ((task = css_task_iter_next(&it)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) cpuset_update_task_spread_flag(cs, task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) css_task_iter_end(&it);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930) * update_flag - read a 0 or a 1 in a file and update associated flag
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) * bit: the bit to update (see cpuset_flagbits_t)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) * cs: the cpuset to update
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) * turning_on: whether the flag is being set or cleared
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) * Call with cpuset_mutex held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) int turning_on)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) struct cpuset *trialcs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) int balance_flag_changed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) int spread_flag_changed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) int err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) trialcs = alloc_trial_cpuset(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) if (!trialcs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) if (turning_on)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) set_bit(bit, &trialcs->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) clear_bit(bit, &trialcs->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) err = validate_change(cs, trialcs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) if (err < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) balance_flag_changed = (is_sched_load_balance(cs) !=
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) is_sched_load_balance(trialcs));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) || (is_spread_page(cs) != is_spread_page(trialcs)));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) cs->flags = trialcs->flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) rebuild_sched_domains_locked();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) if (spread_flag_changed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) update_tasks_flags(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) free_cpuset(trialcs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980)  * update_prstate - update partition_root_state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) * cs: the cpuset to update
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) * new_prs: new partition root state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) * Call with cpuset_mutex held.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) static int update_prstate(struct cpuset *cs, int new_prs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) int err, old_prs = cs->partition_root_state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) struct cpuset *parent = parent_cs(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) struct tmpmasks tmpmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) if (old_prs == new_prs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) * Cannot force a partial or invalid partition root to a full
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) * partition root.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) if (new_prs && (old_prs == PRS_ERROR))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) if (alloc_cpumasks(NULL, &tmpmask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) return -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) err = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) if (!old_prs) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) * Turning on partition root requires setting the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010)  * cannot be empty.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) if (cpumask_empty(cs->cpus_allowed))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) err = update_parent_subparts_cpumask(cs, partcmd_enable,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) NULL, &tmpmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) if (err) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) update_flag(CS_CPU_EXCLUSIVE, cs, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) * Turning off partition root will clear the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) * CS_CPU_EXCLUSIVE bit.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) if (old_prs == PRS_ERROR) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) update_flag(CS_CPU_EXCLUSIVE, cs, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) err = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) err = update_parent_subparts_cpumask(cs, partcmd_disable,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) NULL, &tmpmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) if (err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) /* Turning off CS_CPU_EXCLUSIVE will not return error */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) update_flag(CS_CPU_EXCLUSIVE, cs, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) * Update cpumask of parent's tasks except when it is the top
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) * cpuset as some system daemons cannot be mapped to other CPUs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) if (parent != &top_cpuset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) update_tasks_cpumask(parent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) if (parent->child_ecpus_count)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) update_sibling_cpumasks(parent, cs, &tmpmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) rebuild_sched_domains_locked();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) if (!err) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) cs->partition_root_state = new_prs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) free_cpumasks(NULL, &tmpmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) return err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) }
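/*
 * Flow summary for update_prstate() above (descriptive only): enabling a
 * partition root first sets CS_CPU_EXCLUSIVE via update_flag(), then asks
 * the parent to donate this cpuset's CPUs with partcmd_enable; on failure
 * the exclusive bit is cleared again and the old state is kept. Disabling
 * goes through partcmd_disable and then clears CS_CPU_EXCLUSIVE, except
 * that an already-invalid (PRS_ERROR) partition only needs the bit cleared.
 * In both directions the new partition_root_state is only committed, under
 * callback_lock, when err is zero.
 */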
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) * Frequency meter - How fast is some event occurring?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) * These routines manage a digitally filtered, constant time based,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) * event frequency meter. There are four routines:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) * fmeter_init() - initialize a frequency meter.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) * fmeter_markevent() - called each time the event happens.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) * fmeter_getrate() - returns the recent rate of such events.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) * fmeter_update() - internal routine used to update fmeter.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) * A common data structure is passed to each of these routines,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) * which is used to keep track of the state required to manage the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) * frequency meter and its digital filter.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) * The filter works on the number of events marked per unit time.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) * The filter is single-pole low-pass recursive (IIR). The time unit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) * is 1 second. Arithmetic is done using 32-bit integers scaled to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) * simulate 3 decimal digits of precision (multiplied by 1000).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) * With an FM_COEF of 933, and a time base of 1 second, the filter
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) * has a half-life of 10 seconds, meaning that if the events quit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088)  * happening, then the rate returned from fmeter_getrate()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089)  * will be cut in half every 10 seconds, until it converges to zero.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) * It is not worth doing a real infinitely recursive filter. If more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) * than FM_MAXTICKS ticks have elapsed since the last filter event,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) * just compute FM_MAXTICKS ticks worth, by which point the level
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) * will be stable.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) * arithmetic overflow in the fmeter_update() routine.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) * Given the simple 32 bit integer arithmetic used, this meter works
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) * best for reporting rates between one per millisecond (msec) and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) * one per 32 (approx) seconds. At constant rates faster than one
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2102) * per msec it maxes out at values just under 1,000,000. At constant
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2103) * rates between one per msec, and one per second it will stabilize
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2104) * to a value N*1000, where N is the rate of events per second.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2105) * At constant rates between one per second and one per 32 seconds,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2106) * it will be choppy, moving up on the seconds that have an event,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2107) * and then decaying until the next event. At rates slower than
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2108) * about one in 32 seconds, it decays all the way back to zero between
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2109) * each event.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2110) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2111)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2112) #define FM_COEF 933 /* coefficient for half-life of 10 secs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2113) #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2114) #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) #define FM_SCALE 1000 /* faux fixed point scale */
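/*
 * Worked example of the decay (illustrative only): with FM_COEF = 933 and
 * FM_SCALE = 1000, each elapsed second multiplies the filtered value by
 * 0.933. Starting from val = 1000 (one event per second) with no further
 * events, after 10 seconds val is roughly 1000 * 0.933^10, i.e. about 500,
 * which is the 10 second half-life described above.
 */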
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) /* Initialize a frequency meter */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) static void fmeter_init(struct fmeter *fmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) fmp->cnt = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) fmp->val = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) fmp->time = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) spin_lock_init(&fmp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) /* Internal meter update - process cnt events and update value */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) static void fmeter_update(struct fmeter *fmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) time64_t now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) u32 ticks;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) now = ktime_get_seconds();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) ticks = now - fmp->time;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) if (ticks == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) ticks = min(FM_MAXTICKS, ticks);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) while (ticks-- > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) fmp->time = now;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) fmp->cnt = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) /* Process any previous ticks, then bump cnt by one (times scale). */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) static void fmeter_markevent(struct fmeter *fmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) spin_lock(&fmp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) fmeter_update(fmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) spin_unlock(&fmp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) /* Process any previous ticks, then return current value. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) static int fmeter_getrate(struct fmeter *fmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) int val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) spin_lock(&fmp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) fmeter_update(fmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) val = fmp->val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) spin_unlock(&fmp->lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) return val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) }
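/*
 * Usage sketch (illustrative only, using just the helpers defined above):
 *
 *	struct fmeter fm;
 *	int rate;
 *
 *	fmeter_init(&fm);
 *	fmeter_markevent(&fm);		(once per event)
 *	rate = fmeter_getrate(&fm);	(roughly 1000 * events per second
 *					 for steady rates, per the comment
 *					 block above)
 *
 * In this file, the per-cpuset fmeter backs the memory_pressure reporting.
 */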
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) static struct cpuset *cpuset_attach_old_cs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) static int cpuset_can_attach(struct cgroup_taskset *tset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) struct cgroup_subsys_state *css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) struct cpuset *cs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) struct task_struct *task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) /* used later by cpuset_attach() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) cs = css_cs(css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) /* allow moving tasks into an empty cpuset if on default hierarchy */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) ret = -ENOSPC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) if (!is_in_v2_mode() &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) cgroup_taskset_for_each(task, css, tset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) ret = task_can_attach(task, cs->cpus_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) ret = security_task_setscheduler(task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200)  * Mark the attach as in progress. This makes validate_change() fail
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201)  * any changes that would empty cpus_allowed or mems_allowed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) cs->attach_in_progress++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) static void cpuset_cancel_attach(struct cgroup_taskset *tset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) struct cgroup_subsys_state *css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) cgroup_taskset_first(tset, &css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) css_cs(css)->attach_in_progress--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223)  * but we can't allocate it dynamically there. Define it as a global
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224)  * variable and allocate it from cpuset_init().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) static cpumask_var_t cpus_attach;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) static void cpuset_attach(struct cgroup_taskset *tset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) /* static buf protected by cpuset_mutex */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) static nodemask_t cpuset_attach_nodemask_to;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) struct task_struct *task;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) struct task_struct *leader;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) struct cgroup_subsys_state *css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) struct cpuset *cs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) struct cpuset *oldcs = cpuset_attach_old_cs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) cgroup_taskset_first(tset, &css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) cs = css_cs(css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) cpus_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) cgroup_taskset_for_each(task, css, tset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) if (cs != &top_cpuset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) guarantee_online_cpus(task, cpus_attach);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) * can_attach beforehand should guarantee that this doesn't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) * fail. TODO: have a better way to handle failure here
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) cpuset_update_task_spread_flag(cs, task);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262)  * Change mm for all threadgroup leaders. This is expensive and may
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263)  * sleep, so it should be moved outside the migration path proper.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) cpuset_attach_nodemask_to = cs->effective_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) cgroup_taskset_for_each_leader(leader, css, tset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) struct mm_struct *mm = get_task_mm(leader);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) if (mm) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) 			 * old_mems_allowed is the same as mems_allowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) 			 * here, except if this task is being moved
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) 			 * automatically due to hotplug. In that case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) 			 * @mems_allowed has been updated and is empty, so
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) 			 * @old_mems_allowed is the right nodemask to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) 			 * migrate the mm from.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) if (is_memory_migrate(cs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) &cpuset_attach_nodemask_to);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) mmput(mm);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) cs->old_mems_allowed = cpuset_attach_nodemask_to;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) cs->attach_in_progress--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) if (!cs->attach_in_progress)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) wake_up(&cpuset_attach_wq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295) cpus_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) /* The various types of files and directories in a cpuset file system */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) typedef enum {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) FILE_MEMORY_MIGRATE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) FILE_CPULIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) FILE_MEMLIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) FILE_EFFECTIVE_CPULIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) FILE_EFFECTIVE_MEMLIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) FILE_SUBPARTS_CPULIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) FILE_CPU_EXCLUSIVE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) FILE_MEM_EXCLUSIVE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) FILE_MEM_HARDWALL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) FILE_SCHED_LOAD_BALANCE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) FILE_PARTITION_ROOT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) FILE_SCHED_RELAX_DOMAIN_LEVEL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) FILE_MEMORY_PRESSURE_ENABLED,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) FILE_MEMORY_PRESSURE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) FILE_SPREAD_PAGE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) FILE_SPREAD_SLAB,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) } cpuset_filetype_t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) u64 val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) struct cpuset *cs = css_cs(css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) cpuset_filetype_t type = cft->private;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) int retval = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) get_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) if (!is_cpuset_online(cs)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) retval = -ENODEV;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) switch (type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) case FILE_CPU_EXCLUSIVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) case FILE_MEM_EXCLUSIVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) case FILE_MEM_HARDWALL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) retval = update_flag(CS_MEM_HARDWALL, cs, val);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) case FILE_SCHED_LOAD_BALANCE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) case FILE_MEMORY_MIGRATE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) case FILE_MEMORY_PRESSURE_ENABLED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) cpuset_memory_pressure_enabled = !!val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) case FILE_SPREAD_PAGE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) retval = update_flag(CS_SPREAD_PAGE, cs, val);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) case FILE_SPREAD_SLAB:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) retval = update_flag(CS_SPREAD_SLAB, cs, val);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) retval = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) put_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) return retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) s64 val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) struct cpuset *cs = css_cs(css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) cpuset_filetype_t type = cft->private;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) int retval = -ENODEV;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) get_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) if (!is_cpuset_online(cs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) switch (type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) case FILE_SCHED_RELAX_DOMAIN_LEVEL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) retval = update_relax_domain_level(cs, val);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) retval = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) put_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) return retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) * Common handling for a write to a "cpus" or "mems" file.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) char *buf, size_t nbytes, loff_t off)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400) struct cpuset *cs = css_cs(of_css(of));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401) struct cpuset *trialcs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402) int retval = -ENODEV;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) buf = strstrip(buf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) * CPU or memory hotunplug may leave @cs w/o any execution
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408) * resources, in which case the hotplug code asynchronously updates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) * configuration and transfers all tasks to the nearest ancestor
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) * which can execute.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) * As writes to "cpus" or "mems" may restore @cs's execution
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) * resources, wait for the previously scheduled operations before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) * proceeding, so that we don't end up repeatedly removing tasks added
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) * after execution capability is restored.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) * cpuset_hotplug_work calls back into cgroup core via
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) * cgroup_transfer_tasks() and waiting for it from a cgroupfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) * operation like this one can lead to a deadlock through kernfs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) * active_ref protection. Let's break the protection. Losing the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) * protection is okay as we check whether @cs is online after
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) * grabbing cpuset_mutex anyway. This only happens on the legacy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) * hierarchies.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) css_get(&cs->css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) kernfs_break_active_protection(of->kn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) flush_work(&cpuset_hotplug_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) get_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) if (!is_cpuset_online(cs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) trialcs = alloc_trial_cpuset(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) if (!trialcs) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) retval = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) switch (of_cft(of)->private) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) case FILE_CPULIST:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) retval = update_cpumask(cs, trialcs, buf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) case FILE_MEMLIST:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) retval = update_nodemask(cs, trialcs, buf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) retval = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452) free_cpuset(trialcs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455) put_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) kernfs_unbreak_active_protection(of->kn);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) css_put(&cs->css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) flush_workqueue(cpuset_migrate_mm_wq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459) return retval ?: nbytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) * These ASCII lists should be read in a single call, by using a user
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) * buffer large enough to hold the entire map. If read in smaller
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) * chunks, there is no guarantee of atomicity. Since the display format
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466) * used, a list of ranges of sequential numbers, is variable length,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) * and since these maps can change value dynamically, one could read
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) * gibberish by doing partial reads while a list was changing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) */
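/*
 * Illustrative userspace sketch (not part of this file; the mount path
 * and cgroup name are assumed): reading the whole list in one call
 * returns a consistent snapshot, e.g.:
 *
 *	char buf[4096];
 *	int fd = open("/sys/fs/cgroup/cpuset/foo/cpuset.cpus", O_RDONLY);
 *	ssize_t n = read(fd, buf, sizeof(buf));	// one call, one snapshot
 *
 * Two smaller reads racing with a concurrent update could instead
 * return fragments of two different values.
 */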
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470) static int cpuset_common_seq_show(struct seq_file *sf, void *v)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) struct cpuset *cs = css_cs(seq_css(sf));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) cpuset_filetype_t type = seq_cft(sf)->private;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) int ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) switch (type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479) case FILE_CPULIST:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) case FILE_MEMLIST:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485) case FILE_EFFECTIVE_CPULIST:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) case FILE_EFFECTIVE_MEMLIST:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) case FILE_SUBPARTS_CPULIST:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) ret = -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) struct cpuset *cs = css_cs(css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505) cpuset_filetype_t type = cft->private;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) switch (type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) case FILE_CPU_EXCLUSIVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) return is_cpu_exclusive(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) case FILE_MEM_EXCLUSIVE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510) return is_mem_exclusive(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) case FILE_MEM_HARDWALL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) return is_mem_hardwall(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513) case FILE_SCHED_LOAD_BALANCE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) return is_sched_load_balance(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) case FILE_MEMORY_MIGRATE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) return is_memory_migrate(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) case FILE_MEMORY_PRESSURE_ENABLED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) return cpuset_memory_pressure_enabled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) case FILE_MEMORY_PRESSURE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) return fmeter_getrate(&cs->fmeter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) case FILE_SPREAD_PAGE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) return is_spread_page(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) case FILE_SPREAD_SLAB:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) return is_spread_slab(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) /* Unreachable but makes gcc happy */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) struct cpuset *cs = css_cs(css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) cpuset_filetype_t type = cft->private;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) switch (type) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) case FILE_SCHED_RELAX_DOMAIN_LEVEL:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539) return cs->relax_domain_level;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) BUG();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) /* Unreachable but makes gcc happy */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) static int sched_partition_show(struct seq_file *seq, void *v)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) struct cpuset *cs = css_cs(seq_css(seq));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) switch (cs->partition_root_state) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) case PRS_ENABLED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554) seq_puts(seq, "root\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) case PRS_DISABLED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557) seq_puts(seq, "member\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) case PRS_ERROR:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) seq_puts(seq, "root invalid\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) size_t nbytes, loff_t off)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) struct cpuset *cs = css_cs(of_css(of));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) int val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) int retval = -ENODEV;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) buf = strstrip(buf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576) * Convert "root" to ENABLED, and convert "member" to DISABLED.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) if (!strcmp(buf, "root"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579) val = PRS_ENABLED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) else if (!strcmp(buf, "member"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) val = PRS_DISABLED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) return -EINVAL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585) css_get(&cs->css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) get_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) if (!is_cpuset_online(cs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591) retval = update_prstate(cs, val);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2594) put_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2595) css_put(&cs->css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2596) return retval ?: nbytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2597) }
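/*
 * Illustrative usage sketch (cgroup v2; the cgroup path is assumed, not
 * part of this file): a cpuset becomes a partition root when "root" is
 * written to its cpuset.cpus.partition file and a plain member again
 * when "member" is written, e.g. from userspace:
 *
 *	int fd = open("/sys/fs/cgroup/rt/cpuset.cpus.partition", O_WRONLY);
 *	write(fd, "root", 4);	// reaches update_prstate(cs, PRS_ENABLED)
 *
 * Reading the same file back reports "root", "member" or "root invalid"
 * as printed by sched_partition_show() above.
 */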
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2598)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2599) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2600) * for the common functions, 'private' gives the type of file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2601) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2602)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2603) static struct cftype legacy_files[] = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) .name = "cpus",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) .seq_show = cpuset_common_seq_show,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) .write = cpuset_write_resmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) .max_write_len = (100U + 6 * NR_CPUS),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609) .private = FILE_CPULIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) .name = "mems",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) .seq_show = cpuset_common_seq_show,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) .write = cpuset_write_resmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616) .max_write_len = (100U + 6 * MAX_NUMNODES),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617) .private = FILE_MEMLIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) .name = "effective_cpus",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) .seq_show = cpuset_common_seq_show,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) .private = FILE_EFFECTIVE_CPULIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627) .name = "effective_mems",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628) .seq_show = cpuset_common_seq_show,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) .private = FILE_EFFECTIVE_MEMLIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633) .name = "cpu_exclusive",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) .read_u64 = cpuset_read_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635) .write_u64 = cpuset_write_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636) .private = FILE_CPU_EXCLUSIVE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) .name = "mem_exclusive",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) .read_u64 = cpuset_read_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642) .write_u64 = cpuset_write_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) .private = FILE_MEM_EXCLUSIVE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) .name = "mem_hardwall",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648) .read_u64 = cpuset_read_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649) .write_u64 = cpuset_write_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650) .private = FILE_MEM_HARDWALL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) .name = "sched_load_balance",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655) .read_u64 = cpuset_read_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) .write_u64 = cpuset_write_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657) .private = FILE_SCHED_LOAD_BALANCE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) .name = "sched_relax_domain_level",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) .read_s64 = cpuset_read_s64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) .write_s64 = cpuset_write_s64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) .name = "memory_migrate",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) .read_u64 = cpuset_read_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670) .write_u64 = cpuset_write_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) .private = FILE_MEMORY_MIGRATE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) .name = "memory_pressure",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676) .read_u64 = cpuset_read_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) .private = FILE_MEMORY_PRESSURE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) .name = "memory_spread_page",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) .read_u64 = cpuset_read_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) .write_u64 = cpuset_write_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) .private = FILE_SPREAD_PAGE,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688) .name = "memory_spread_slab",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) .read_u64 = cpuset_read_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690) .write_u64 = cpuset_write_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691) .private = FILE_SPREAD_SLAB,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) .name = "memory_pressure_enabled",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) .flags = CFTYPE_ONLY_ON_ROOT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) .read_u64 = cpuset_read_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698) .write_u64 = cpuset_write_u64,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) .private = FILE_MEMORY_PRESSURE_ENABLED,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) { } /* terminate */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703) };
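/*
 * Illustrative sketch (legacy hierarchy; the mount point and cgroup name
 * are assumed): when the cpuset controller is mounted as a v1 hierarchy,
 * each entry above appears as a "cpuset."-prefixed file in every cgroup
 * directory, e.g.:
 *
 *	int fd = open("/sys/fs/cgroup/cpuset/rt/cpuset.memory_migrate",
 *		      O_WRONLY);
 *	write(fd, "1", 1);	// cpuset_write_u64() -> update_flag(CS_MEMORY_MIGRATE, cs, 1)
 *
 * Writes to the "cpus" and "mems" files instead go through
 * cpuset_write_resmask().
 */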
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) * This is currently a minimal set for the default hierarchy. It can be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) * expanded later on by migrating more features and control files from v1.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) static struct cftype dfl_files[] = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) .name = "cpus",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) .seq_show = cpuset_common_seq_show,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) .write = cpuset_write_resmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714) .max_write_len = (100U + 6 * NR_CPUS),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) .private = FILE_CPULIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) .flags = CFTYPE_NOT_ON_ROOT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) .name = "mems",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) .seq_show = cpuset_common_seq_show,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) .write = cpuset_write_resmask,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) .max_write_len = (100U + 6 * MAX_NUMNODES),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724) .private = FILE_MEMLIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) .flags = CFTYPE_NOT_ON_ROOT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) .name = "cpus.effective",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730) .seq_show = cpuset_common_seq_show,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) .private = FILE_EFFECTIVE_CPULIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) .name = "mems.effective",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) .seq_show = cpuset_common_seq_show,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) .private = FILE_EFFECTIVE_MEMLIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741) .name = "cpus.partition",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) .seq_show = sched_partition_show,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) .write = sched_partition_write,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744) .private = FILE_PARTITION_ROOT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) .flags = CFTYPE_NOT_ON_ROOT,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) .name = "cpus.subpartitions",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) .seq_show = cpuset_common_seq_show,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751) .private = FILE_SUBPARTS_CPULIST,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) .flags = CFTYPE_DEBUG,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) },
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) { } /* terminate */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) };
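/*
 * Illustrative sketch (cgroup v2; the cgroup path is assumed): on the
 * default hierarchy the entries above appear with a "cpuset." prefix,
 * e.g. cpuset.cpus, cpuset.cpus.effective and cpuset.cpus.partition.
 * The ".effective" entries have no .write method, so they are read-only
 * views served by cpuset_common_seq_show(), e.g.:
 *
 *	char buf[256];
 *	int fd = open("/sys/fs/cgroup/rt/cpuset.cpus.effective", O_RDONLY);
 *	read(fd, buf, sizeof(buf));	// e.g. "0-3\n"
 */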
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) * cpuset_css_alloc - allocate a cpuset css
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) * @parent_css: css of the control group that the new cpuset will be part of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) static struct cgroup_subsys_state *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) struct cpuset *cs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) if (!parent_css)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) return &top_cpuset.css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) cs = kzalloc(sizeof(*cs), GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) if (!cs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774) return ERR_PTR(-ENOMEM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776) if (alloc_cpumasks(cs, NULL)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777) kfree(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) return ERR_PTR(-ENOMEM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) nodes_clear(cs->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) nodes_clear(cs->effective_mems);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784) fmeter_init(&cs->fmeter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) cs->relax_domain_level = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2787) return &cs->css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2788) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2790) static int cpuset_css_online(struct cgroup_subsys_state *css)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2791) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2792) struct cpuset *cs = css_cs(css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2793) struct cpuset *parent = parent_cs(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2794) struct cpuset *tmp_cs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2795) struct cgroup_subsys_state *pos_css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2796)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) if (!parent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800) get_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803) set_bit(CS_ONLINE, &cs->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) if (is_spread_page(parent))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) set_bit(CS_SPREAD_PAGE, &cs->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) if (is_spread_slab(parent))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) set_bit(CS_SPREAD_SLAB, &cs->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) cpuset_inc();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812) if (is_in_v2_mode()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) cpumask_copy(cs->effective_cpus, parent->effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) cs->effective_mems = parent->effective_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) cs->use_parent_ecpus = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816) parent->child_ecpus_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824) * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) * set. This flag handling is implemented in cgroup core for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826) * historical reasons - the flag may be specified during mount.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) * Currently, if any sibling cpusets have exclusive cpus or mem, we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829) * refuse to clone the configuration - thereby refusing to let the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) * task enter, and as a result failing the sys_unshare() or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) * clone() which initiated it. If this becomes a problem for some
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832) * users who wish to allow that scenario, then this could be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) * (and likewise for mems) to the new cgroup.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) cpuset_for_each_child(tmp_cs, pos_css, parent) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) goto out_unlock;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) cs->mems_allowed = parent->mems_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847) cs->effective_mems = parent->mems_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849) cpumask_copy(cs->cpus_requested, parent->cpus_requested);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) out_unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) put_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) * If the cpuset being removed has its flag 'sched_load_balance'
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) * enabled, then simulate turning sched_load_balance off, which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861) * will call rebuild_sched_domains_locked(). That is not needed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) * in the default hierarchy where only changes in partition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) * will cause repartitioning.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865) * If the cpuset has the 'sched.partition' flag enabled, simulate
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866) * turning 'sched.partition' off.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869) static void cpuset_css_offline(struct cgroup_subsys_state *css)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) struct cpuset *cs = css_cs(css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) get_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876) if (is_partition_root(cs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) update_prstate(cs, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879) if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880) is_sched_load_balance(cs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) if (cs->use_parent_ecpus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884) struct cpuset *parent = parent_cs(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2885)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2886) cs->use_parent_ecpus = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2887) parent->child_ecpus_count--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2888) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) cpuset_dec();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) clear_bit(CS_ONLINE, &cs->flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894) put_online_cpus();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897) static void cpuset_css_free(struct cgroup_subsys_state *css)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) struct cpuset *cs = css_cs(css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901) free_cpuset(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2903)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) static void cpuset_bind(struct cgroup_subsys_state *root_css)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) if (is_in_v2_mode()) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910) cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911) top_cpuset.mems_allowed = node_possible_map;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913) cpumask_copy(top_cpuset.cpus_allowed,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914) top_cpuset.effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) top_cpuset.mems_allowed = top_cpuset.effective_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) * Make sure the new task conforms to the current state of its parent,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924) * which could have been changed by cpuset just after it inherits the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) * state from the parent and before it sits on the cgroup's task list.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) static void cpuset_fork(struct task_struct *task)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) int inherit_cpus = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) if (task_css_is_root(task, cpuset_cgrp_id))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) trace_android_rvh_cpuset_fork(task, &inherit_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934) if (!inherit_cpus)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) set_cpus_allowed_ptr(task, current->cpus_ptr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936) task->mems_allowed = current->mems_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939) struct cgroup_subsys cpuset_cgrp_subsys = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) .css_alloc = cpuset_css_alloc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941) .css_online = cpuset_css_online,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) .css_offline = cpuset_css_offline,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943) .css_free = cpuset_css_free,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) .can_attach = cpuset_can_attach,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945) .cancel_attach = cpuset_cancel_attach,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946) .attach = cpuset_attach,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) .post_attach = cpuset_post_attach,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) .bind = cpuset_bind,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949) .fork = cpuset_fork,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) .legacy_cftypes = legacy_files,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) .dfl_cftypes = dfl_files,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952) .early_init = true,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) .threaded = true,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957) * cpuset_init - initialize cpusets at system boot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) * Description: Initialize top_cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960) **/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962) int __init cpuset_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964) BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) cpumask_setall(top_cpuset.cpus_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970) cpumask_setall(top_cpuset.cpus_requested);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) nodes_setall(top_cpuset.mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) cpumask_setall(top_cpuset.effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) nodes_setall(top_cpuset.effective_mems);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) fmeter_init(&top_cpuset.fmeter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976) set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) top_cpuset.relax_domain_level = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979) BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985) * If the CPU and/or memory hotplug handlers below unplug any CPUs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986) * or memory nodes, we need to walk over the cpuset hierarchy,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987) * removing those CPUs or nodes from all cpusets. If this removes the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) * last CPU or node from a cpuset, then move the tasks in the empty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989) * cpuset to its next-highest non-empty parent.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990) */
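/*
 * For example (illustrative names): if cpuset /A/B has cpuset.cpus "3"
 * and CPU 3 is hot-unplugged, B is left without CPUs. On the legacy
 * hierarchy the tasks in B are then transferred to the nearest non-empty
 * ancestor (here /A); on the default hierarchy B keeps its configuration
 * and only its effective_cpus falls back to the parent's - see
 * hotplug_update_tasks_legacy() and hotplug_update_tasks() below.
 */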
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991) static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) struct cpuset *parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) * Find its next-highest non-empty parent (the top cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997) * has online cpus, so it can't be empty).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) parent = parent_cs(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) while (cpumask_empty(parent->cpus_allowed) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001) nodes_empty(parent->mems_allowed))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) parent = parent_cs(parent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006) pr_cont_cgroup_name(cs->css.cgroup);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007) pr_cont("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012) hotplug_update_tasks_legacy(struct cpuset *cs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013) struct cpumask *new_cpus, nodemask_t *new_mems,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) bool cpus_updated, bool mems_updated)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3016) bool is_empty;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3017)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3018) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3019) cpumask_copy(cs->cpus_allowed, new_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3020) cpumask_copy(cs->effective_cpus, new_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3021) cs->mems_allowed = *new_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3022) cs->effective_mems = *new_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3023) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3024)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3025) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3026) * Don't call update_tasks_cpumask() if the cpuset becomes empty,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3027) * as the tasks will be migrated to an ancestor.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3028) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029) if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030) update_tasks_cpumask(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031) if (mems_updated && !nodes_empty(cs->mems_allowed))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032) update_tasks_nodemask(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034) is_empty = cpumask_empty(cs->cpus_allowed) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035) nodes_empty(cs->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040) * Move tasks to the nearest ancestor with execution resources.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041) * This is a full cgroup operation which will also call back into
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042) * cpuset. It should be done outside any lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044) if (is_empty)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045) remove_tasks_in_empty_cpuset(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050) static void
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051) hotplug_update_tasks(struct cpuset *cs,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052) struct cpumask *new_cpus, nodemask_t *new_mems,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053) bool cpus_updated, bool mems_updated)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055) if (cpumask_empty(new_cpus))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056) cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057) if (nodes_empty(*new_mems))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058) *new_mems = parent_cs(cs)->effective_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061) cpumask_copy(cs->effective_cpus, new_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062) cs->effective_mems = *new_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065) if (cpus_updated)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) update_tasks_cpumask(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3067) if (mems_updated)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3068) update_tasks_nodemask(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3069) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3070)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3071) static bool force_rebuild;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3072)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3073) void cpuset_force_rebuild(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3074) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3075) force_rebuild = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3076) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3077)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3078) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3079) * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3080) * @cs: cpuset in interest
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3081) * @tmp: the tmpmasks structure pointer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3082) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3083) * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3084) * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3085) * all its tasks are moved to the nearest ancestor with both resources.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3086) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3087) static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3088) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3089) static cpumask_t new_cpus;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3090) static nodemask_t new_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3091) bool cpus_updated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3092) bool mems_updated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3093) struct cpuset *parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3094) retry:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3095) wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3096)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3097) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3098)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3099) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3100) * We have raced with task attaching. We wait until attaching
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3101) * is finished, so we won't attach a task to an empty cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3102) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3103) if (cs->attach_in_progress) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3104) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3105) goto retry;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3106) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3108) parent = parent_cs(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3109) compute_effective_cpumask(&new_cpus, cs, parent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3110) nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3111)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3112) if (cs->nr_subparts_cpus)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3113) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3114) * Make sure that CPUs allocated to child partitions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3115) * do not show up in effective_cpus.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3116) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3117) cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3118)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3119) if (!tmp || !cs->partition_root_state)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3120) goto update_tasks;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3122) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3123) * In the unlikely event that a partition root has empty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3124) * effective_cpus or its parent becomes erroneous, we have to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3125) * transition it to the erroneous state.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3126) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3127) if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3128) (parent->partition_root_state == PRS_ERROR))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3129) if (cs->nr_subparts_cpus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3130) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3131) cs->nr_subparts_cpus = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3132) cpumask_clear(cs->subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3133) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3134) compute_effective_cpumask(&new_cpus, cs, parent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3135) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3136)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3137) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3138) * If the effective_cpus is empty because the child
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3139) * partitions take away all the CPUs, we can keep
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3140) * the current partition and let the child partitions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3141) * fight for available CPUs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3142) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3143) if ((parent->partition_root_state == PRS_ERROR) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3144) cpumask_empty(&new_cpus)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3145) update_parent_subparts_cpumask(cs, partcmd_disable,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3146) NULL, tmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3147) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3148) cs->partition_root_state = PRS_ERROR;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3149) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3150) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3151) cpuset_force_rebuild();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3152) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3153)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3154) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3155) * On the other hand, an erroneous partition root may be transitioned
 * back to a regular one, or a partition root with no CPU allocated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3157) * from the parent may change to erroneous.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3158) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3159) if (is_partition_root(parent) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3160) ((cs->partition_root_state == PRS_ERROR) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3161) !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3162) update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3163) cpuset_force_rebuild();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3164)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3165) update_tasks:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3166) cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3167) mems_updated = !nodes_equal(new_mems, cs->effective_mems);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3168)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3169) if (is_in_v2_mode())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3170) hotplug_update_tasks(cs, &new_cpus, &new_mems,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3171) cpus_updated, mems_updated);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3172) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3173) hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3174) cpus_updated, mems_updated);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3176) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3177) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3178)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3179) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3180) * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3181) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3182) * This function is called after either CPU or memory configuration has
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3183) * changed and updates cpuset accordingly. The top_cpuset is always
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3184) * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no effect) on systems that are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3186) * actively using CPU hotplug but making no active use of cpusets.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3187) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3188) * Non-root cpusets are only affected by offlining. If any CPUs or memory
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3189) * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3190) * all descendants.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3191) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3192) * Note that CPU offlining during suspend is ignored. We don't modify
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3193) * cpusets across suspend/resume cycles at all.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3194) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3195) void cpuset_hotplug_workfn(struct work_struct *work)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3196) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3197) static cpumask_t new_cpus;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3198) static nodemask_t new_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3199) bool cpus_updated, mems_updated;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3200) bool on_dfl = is_in_v2_mode();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3201) struct tmpmasks tmp, *ptmp = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3202)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3203) if (on_dfl && !alloc_cpumasks(NULL, &tmp))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3204) ptmp = &tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3205)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3206) mutex_lock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3207)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3208) /* fetch the available cpus/mems and find out which changed how */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3209) cpumask_copy(&new_cpus, cpu_active_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3210) new_mems = node_states[N_MEMORY];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3212) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3213) * If subparts_cpus is populated, it is likely that the check below
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3214) * will produce a false positive on cpus_updated when the cpu list
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3215) * isn't changed. It is extra work, but it is better to be safe.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3216) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3217) cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3218) mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3219)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3220) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3221) * In the rare case that hotplug removes all the cpus in subparts_cpus,
 * we assume that cpus are updated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3223) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3224) if (!cpus_updated && top_cpuset.nr_subparts_cpus)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3225) cpus_updated = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3226)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3227) /* synchronize cpus_allowed to cpu_active_mask */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3228) if (cpus_updated) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3229) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3230) if (!on_dfl)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3231) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3232) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3233) * Make sure that CPUs allocated to child partitions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3234) * do not show up in effective_cpus. If no CPU is left,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3235) * we clear the subparts_cpus & let the child partitions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3236) * fight for the CPUs again.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3237) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3238) if (top_cpuset.nr_subparts_cpus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3239) if (cpumask_subset(&new_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3240) top_cpuset.subparts_cpus)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3241) top_cpuset.nr_subparts_cpus = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3242) cpumask_clear(top_cpuset.subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3243) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3244) cpumask_andnot(&new_cpus, &new_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3245) top_cpuset.subparts_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3246) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3247) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3248) cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3249) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3250) /* we don't mess with cpumasks of tasks in top_cpuset */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3251) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3253) /* synchronize mems_allowed to N_MEMORY */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3254) if (mems_updated) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3255) spin_lock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3256) if (!on_dfl)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3257) top_cpuset.mems_allowed = new_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3258) top_cpuset.effective_mems = new_mems;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3259) spin_unlock_irq(&callback_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3260) update_tasks_nodemask(&top_cpuset);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3261) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3262)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3263) mutex_unlock(&cpuset_mutex);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3264)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3265) /* if cpus or mems changed, we need to propagate to descendants */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3266) if (cpus_updated || mems_updated) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3267) struct cpuset *cs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3268) struct cgroup_subsys_state *pos_css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3269)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3270) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3271) cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3272) if (cs == &top_cpuset || !css_tryget_online(&cs->css))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3273) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3274) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3275)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3276) cpuset_hotplug_update_tasks(cs, ptmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3277)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3278) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3279) css_put(&cs->css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3280) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3281) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3282) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3283)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3284) /* rebuild sched domains if cpus_allowed has changed */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3285) if (cpus_updated || force_rebuild) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3286) force_rebuild = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3287) rebuild_sched_domains();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3288) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3289)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3290) free_cpumasks(NULL, ptmp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3291) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3292)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3293) void cpuset_update_active_cpus(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3294) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3295) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3296) * We're inside cpu hotplug critical region which usually nests
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3297) * inside cgroup synchronization. Bounce actual hotplug processing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3298) * to a work item to avoid reverse locking order.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3299) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3300) schedule_work(&cpuset_hotplug_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3301) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3302)
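/*
 * Like cpuset_update_active_cpus(), but queue the hotplug work on the given
 * CPU so that the hotplug processing runs affine to it.
 */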
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3303) void cpuset_update_active_cpus_affine(int cpu)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3304) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3305) schedule_work_on(cpu, &cpuset_hotplug_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3306) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3307)
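/*
 * Wait for any previously scheduled cpuset hotplug work to finish, so that
 * callers observe the hierarchy only after the update has completed.
 */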
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3308) void cpuset_wait_for_hotplug(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3309) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3310) flush_work(&cpuset_hotplug_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3311) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3312)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3313) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3314) * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3315) * Call this routine anytime after node_states[N_MEMORY] changes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3316) * See cpuset_update_active_cpus() for CPU hotplug handling.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3317) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3318) static int cpuset_track_online_nodes(struct notifier_block *self,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3319) unsigned long action, void *arg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3320) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3321) schedule_work(&cpuset_hotplug_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3322) return NOTIFY_OK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3323) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3324)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3325) static struct notifier_block cpuset_track_online_nodes_nb = {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3326) .notifier_call = cpuset_track_online_nodes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3327) .priority = 10, /* ??! */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3328) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3329)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3330) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3331) * cpuset_init_smp - initialize cpus_allowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3332) *
 * Description: Finish top cpuset setup after the cpu and node maps are initialized
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3334) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3335) void __init cpuset_init_smp(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3336) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3337) cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3338) top_cpuset.mems_allowed = node_states[N_MEMORY];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3339) top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3340)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3341) cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3342) top_cpuset.effective_mems = node_states[N_MEMORY];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3343)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3344) register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3345)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3346) cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3347) BUG_ON(!cpuset_migrate_mm_wq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3348) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3349)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3350) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3351) * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3352) * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3353) * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3354) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3355) * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3356) * attached to the specified @tsk. Guaranteed to return some non-empty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3357) * subset of cpu_online_mask, even if this means going outside the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3358) * tasks cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3359) **/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3360)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3361) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3362) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3363) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3364)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3365) spin_lock_irqsave(&callback_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3366) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3367) guarantee_online_cpus(tsk, pmask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3368) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3369) spin_unlock_irqrestore(&callback_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3370) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3371) EXPORT_SYMBOL_GPL(cpuset_cpus_allowed);
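
/*
 * Illustrative sketch (hypothetical caller, not part of this file): the
 * caller owns @pmask and always gets back something usable, e.g.:
 *
 *	cpumask_var_t mask;
 *
 *	if (alloc_cpumask_var(&mask, GFP_KERNEL)) {
 *		cpuset_cpus_allowed(tsk, mask);
 *		// mask now holds a non-empty subset of online CPUs
 *		free_cpumask_var(mask);
 *	}
 */
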
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3372) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3373) * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3374) * @tsk: pointer to task_struct with which the scheduler is struggling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3375) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3376) * Description: In the case that the scheduler cannot find an allowed cpu in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3377) * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3378) * mode however, this value is the same as task_cs(tsk)->effective_cpus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3379) * which will not contain a sane cpumask during cases such as cpu hotplugging.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3380) * This is the absolute last resort for the scheduler and it is only used if
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3381) * _every_ other avenue has been traveled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3382) **/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3384) void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3385) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3386) const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3387) const struct cpumask *cs_mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3388)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3389) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3390) cs_mask = task_cs(tsk)->cpus_allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3391)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3392) if (!is_in_v2_mode() || !cpumask_subset(cs_mask, possible_mask))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3393) goto unlock; /* select_fallback_rq will try harder */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3394)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3395) do_set_cpus_allowed(tsk, cs_mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3396) unlock:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3397) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3398)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3399) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3400) * We own tsk->cpus_allowed, nobody can change it under us.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3401) *
 * But we used cs and cs->cpus_allowed locklessly and thus can
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3403) * race with cgroup_attach_task() or update_cpumask() and get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3404) * the wrong tsk->cpus_allowed. However, both cases imply the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3405) * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3406) * which takes task_rq_lock().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3407) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3408) * If we are called after it dropped the lock we must see all
 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporarily
 * set any mask even if it is not right from the task_cs() point of
 * view; the pending set_cpus_allowed_ptr() will fix things.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3412) *
 * select_fallback_rq() will fix things up and set cpu_possible_mask
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3414) * if required.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3415) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3416) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3417)
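/*
 * Allow the early boot task to allocate from any memory node; cpuset
 * constraints are applied to tasks later through the normal update paths.
 */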
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3418) void __init cpuset_init_current_mems_allowed(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3419) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3420) nodes_setall(current->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3421) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3422)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3423) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3424) * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3425) * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3426) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3427) * Description: Returns the nodemask_t mems_allowed of the cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3428) * attached to the specified @tsk. Guaranteed to return some non-empty
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3429) * subset of node_states[N_MEMORY], even if this means going outside the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3430) * tasks cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3431) **/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3432)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3433) nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3434) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3435) nodemask_t mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3436) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3437)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3438) spin_lock_irqsave(&callback_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3439) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3440) guarantee_online_mems(task_cs(tsk), &mask);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3441) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3442) spin_unlock_irqrestore(&callback_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3443)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3444) return mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3445) }
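
/*
 * Hypothetical usage sketch (not part of this file): callers can snapshot
 * the allowed nodes and test membership, e.g.:
 *
 *	nodemask_t nodes = cpuset_mems_allowed(tsk);
 *
 *	if (node_isset(nid, nodes))
 *		...	(nid was allowed for @tsk at the time of the call)
 */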
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3446)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3447) /**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3449) * @nodemask: the nodemask to be checked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3450) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3451) * Are any of the nodes in the nodemask allowed in current->mems_allowed?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3452) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3453) int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3454) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3455) return nodes_intersects(*nodemask, current->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3456) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3457)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3458) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3459) * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3460) * mem_hardwall ancestor to the specified cpuset. Call holding
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3461) * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3462) * (an unusual configuration), then returns the root cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3463) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3464) static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3465) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3466) while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3467) cs = parent_cs(cs);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3468) return cs;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3469) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3470)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3471) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3472) * cpuset_node_allowed - Can we allocate on a memory node?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3473) * @node: is this an allowed node?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3474) * @gfp_mask: memory allocation flags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3475) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3476) * If we're in interrupt, yes, we can always allocate. If @node is set in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3477) * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3478) * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3479) * yes. If current has access to memory reserves as an oom victim, yes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3480) * Otherwise, no.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3481) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3482) * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3483) * and do not allow allocations outside the current tasks cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3484) * unless the task has been OOM killed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3485) * GFP_KERNEL allocations are not so marked, so can escape to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3486) * nearest enclosing hardwalled ancestor cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3487) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3488) * Scanning up parent cpusets requires callback_lock. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3489) * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3490) * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3491) * current tasks mems_allowed came up empty on the first pass over
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3492) * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3493) * cpuset are short of memory, might require taking the callback_lock.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3494) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3495) * The first call here from mm/page_alloc:get_page_from_freelist()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3496) * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3497) * so no allocation on a node outside the cpuset is allowed (unless
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3498) * in interrupt, of course).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3499) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3500) * The second pass through get_page_from_freelist() doesn't even call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3501) * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3502) * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3503) * in alloc_flags. That logic and the checks below have the combined
 * effect that:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3505) * in_interrupt - any node ok (current task context irrelevant)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3506) * GFP_ATOMIC - any node ok
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3507) * tsk_is_oom_victim - any node ok
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3508) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3509) * GFP_USER - only nodes in current tasks mems allowed ok.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3510) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3511) bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3512) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3513) struct cpuset *cs; /* current cpuset ancestors */
	int allowed;			/* is allocation on this node allowed? */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3515) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3516)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3517) if (in_interrupt())
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3518) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3519) if (node_isset(node, current->mems_allowed))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3520) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3521) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3522) * Allow tasks that have access to memory reserves because they have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3523) * been OOM killed to get memory anywhere.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3524) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3525) if (unlikely(tsk_is_oom_victim(current)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3526) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3527) if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3528) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3529)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3530) if (current->flags & PF_EXITING) /* Let dying task have memory */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3531) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3532)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3533) /* Not hardwall and node outside mems_allowed: scan up cpusets */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3534) spin_lock_irqsave(&callback_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3535)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3536) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3537) cs = nearest_hardwall_ancestor(task_cs(current));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3538) allowed = node_isset(node, cs->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3539) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3540)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3541) spin_unlock_irqrestore(&callback_lock, flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3542) return allowed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3543) }
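
/*
 * Summary sketch of the rules above (illustrative, not a new API): a
 * __GFP_HARDWALL request such as GFP_USER only succeeds for nodes in
 * current->mems_allowed, while GFP_KERNEL may also fall back to the
 * nearest mem_exclusive/mem_hardwall ancestor:
 *
 *	bool user_ok   = __cpuset_node_allowed(nid, GFP_USER);
 *	bool kernel_ok = __cpuset_node_allowed(nid, GFP_KERNEL);
 *
 * user_ok implies kernel_ok, but not vice versa (ignoring the
 * in_interrupt(), OOM-victim and PF_EXITING escape hatches).
 */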
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3544)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3545) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3546) * cpuset_mem_spread_node() - On which node to begin search for a file page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3547) * cpuset_slab_spread_node() - On which node to begin search for a slab page
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3548) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3549) * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3550) * tasks in a cpuset with is_spread_page or is_spread_slab set),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3551) * and if the memory allocation used cpuset_mem_spread_node()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3552) * to determine on which node to start looking, as it will for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3553) * certain page cache or slab cache pages such as used for file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3554) * system buffers and inode caches, then instead of starting on the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3555) * local node to look for a free page, rather spread the starting
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3556) * node around the tasks mems_allowed nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3557) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3558) * We don't have to worry about the returned node being offline
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3559) * because "it can't happen", and even if it did, it would be ok.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3560) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3561) * The routines calling guarantee_online_mems() are careful to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3562) * only set nodes in task->mems_allowed that are online. So it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3563) * should not be possible for the following code to return an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3564) * offline node. But if it did, that would be ok, as this routine
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3565) * is not returning the node where the allocation must be, only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3566) * the node where the search should start. The zonelist passed to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3567) * __alloc_pages() will include all nodes. If the slab allocator
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3568) * is passed an offline node, it will fall back to the local node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3569) * See kmem_cache_alloc_node().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3570) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3571)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3572) static int cpuset_spread_node(int *rotor)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3573) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3574) return *rotor = next_node_in(*rotor, current->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3575) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3576)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3577) int cpuset_mem_spread_node(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3578) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3579) if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3580) current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3582)
	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3584) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3585)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3586) int cpuset_slab_spread_node(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3587) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3588) if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3589) current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3591)
	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3593) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3594)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3595) EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
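
/*
 * Illustrative sketch (hypothetical caller, not kernel code): an allocation
 * path that honors page spreading picks its starting node roughly like this:
 *
 *	int nid = cpuset_do_page_mem_spread() ? cpuset_mem_spread_node()
 *					      : numa_node_id();
 *
 *	page = __alloc_pages_node(nid, gfp_mask, order);
 */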
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3597) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3598) * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3599) * @tsk1: pointer to task_struct of some task.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3600) * @tsk2: pointer to task_struct of some other task.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3601) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3602) * Description: Return true if @tsk1's mems_allowed intersects the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3603) * mems_allowed of @tsk2. Used by the OOM killer to determine if
 * one task's memory usage might impact the memory available
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3605) * to the other.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3606) **/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3607)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3608) int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3609) const struct task_struct *tsk2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3610) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3611) return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3612) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3613)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3614) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3615) * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3616) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3617) * Description: Prints current's name, cpuset name, and cached copy of its
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3618) * mems_allowed to the kernel log.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3619) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3620) void cpuset_print_current_mems_allowed(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3621) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3622) struct cgroup *cgrp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3623)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3624) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3625)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3626) cgrp = task_cs(current)->css.cgroup;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3627) pr_cont(",cpuset=");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3628) pr_cont_cgroup_name(cgrp);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3629) pr_cont(",mems_allowed=%*pbl",
		nodemask_pr_args(&current->mems_allowed));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3631)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3632) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3633) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3634)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3635) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3636) * Collection of memory_pressure is suppressed unless
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3637) * this flag is enabled by writing "1" to the special
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3638) * cpuset file 'memory_pressure_enabled' in the root cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3639) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3640)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3641) int cpuset_memory_pressure_enabled __read_mostly;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3642)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3643) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3644) * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3645) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3646) * Keep a running average of the rate of synchronous (direct)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3647) * page reclaim efforts initiated by tasks in each cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3648) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3649) * This represents the rate at which some task in the cpuset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3650) * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3652) * create more free memory by tossing clean pages or swapping
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3653) * or writing dirty pages.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3654) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3655) * Display to user space in the per-cpuset read-only file
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3656) * "memory_pressure". Value displayed is an integer
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3657) * representing the recent rate of entry into the synchronous
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3658) * (direct) page reclaim by any task attached to the cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3659) **/
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3660)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3661) void __cpuset_memory_pressure_bump(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3662) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3663) rcu_read_lock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3664) fmeter_markevent(&task_cs(current)->fmeter);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3665) rcu_read_unlock();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3666) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3667)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3668) #ifdef CONFIG_PROC_PID_CPUSET
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3669) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3670) * proc_cpuset_show()
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3671) * - Print tasks cpuset path into seq_file.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3672) * - Used for /proc/<pid>/cpuset.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3673) * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3674) * doesn't really matter if tsk->cpuset changes after we read it,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3675) * and we take cpuset_mutex, keeping cpuset_attach() from changing it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3676) * anyway.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3677) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3678) int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3679) struct pid *pid, struct task_struct *tsk)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3680) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3681) char *buf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3682) struct cgroup_subsys_state *css;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3683) int retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3684)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3685) retval = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3686) buf = kmalloc(PATH_MAX, GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3687) if (!buf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3688) goto out;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3689)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3690) css = task_get_css(tsk, cpuset_cgrp_id);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3691) retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3692) current->nsproxy->cgroup_ns);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3693) css_put(css);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3694) if (retval >= PATH_MAX)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3695) retval = -ENAMETOOLONG;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3696) if (retval < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3697) goto out_free;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3698) seq_puts(m, buf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3699) seq_putc(m, '\n');
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3700) retval = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3701) out_free:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3702) kfree(buf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3703) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3704) return retval;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3705) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3706) #endif /* CONFIG_PROC_PID_CPUSET */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3707)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3708) /* Display task mems_allowed in /proc/<pid>/status file. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3709) void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3710) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3711) seq_printf(m, "Mems_allowed:\t%*pb\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3712) nodemask_pr_args(&task->mems_allowed));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3713) seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3714) nodemask_pr_args(&task->mems_allowed));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3715) }