/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */
#include "cpuset-internal.h"

#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/security.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/task_work.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/*
 * There could be abnormal cpuset configurations for cpu or memory
 * node binding; this key provides a quick, low-cost check for such
 * situations.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);

static const char * const perr_strings[] = {
	[PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus.exclusive",
	[PERR_INVPARENT] = "Parent is an invalid partition root",
	[PERR_NOTPART]   = "Parent is not a partition root",
	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
	[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
	[PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
	[PERR_ACCESS]    = "Enable partition not permitted",
	[PERR_REMOTE]    = "Have remote partition underneath",
};

/*
 * For local partitions, update to subpartitions_cpus & isolated_cpus is done
 * in update_parent_effective_cpumask(). For remote partitions, it is done in
 * the remote_partition_*() and remote_cpus_update() helpers.
 */
/*
 * Exclusive CPUs distributed out to local or remote sub-partitions of
 * top_cpuset
 */
static cpumask_var_t	subpartitions_cpus;

/*
 * Exclusive CPUs in isolated partitions
 */
static cpumask_var_t	isolated_cpus;

/*
 * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
 */
static cpumask_var_t	boot_hk_cpus;
static bool		have_boot_isolcpus;

/* List of remote partition root children */
static struct list_head remote_children;

/*
 * A flag to force sched domain rebuild at the end of an operation.
 * It can be set in
 *  - update_partition_sd_lb()
 *  - update_cpumasks_hier()
 *  - cpuset_update_flag()
 *  - cpuset_hotplug_update_tasks()
 *  - cpuset_handle_hotplug()
 *
 * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
 *
 * Note that update_relax_domain_level() in cpuset-v1.c can still call
 * rebuild_sched_domains_locked() directly without using this flag.
 */
static bool force_sd_rebuild;

/*
 * Partition root states:
 *
 *   0 - member (not a partition root)
 *   1 - partition root
 *   2 - partition root without load balancing (isolated)
 *  -1 - invalid partition root
 *  -2 - invalid isolated partition root
 *
 *  There are 2 types of partitions - local or remote. Local partitions are
 *  those whose parents are partition roots themselves. Setting
 *  cpuset.cpus.exclusive is optional when setting up a local partition.
 *  Remote partitions are those whose parents are not partition roots. Passing
 *  down exclusive CPUs by setting cpuset.cpus.exclusive along the ancestor
 *  nodes is mandatory when creating a remote partition.
 *
 *  For simplicity, a local partition can be created under a local or remote
 *  partition but a remote partition cannot have any partition root in its
 *  ancestor chain except the cgroup root.
 */
#define PRS_MEMBER		0
#define PRS_ROOT		1
#define PRS_ISOLATED		2
#define PRS_INVALID_ROOT	-1
#define PRS_INVALID_ISOLATED	-2

/*
 * Temporary cpumasks for working with partitions that are passed among
 * functions to avoid memory allocation in inner functions.
 */
struct tmpmasks {
	cpumask_var_t addmask, delmask;	/* For partition root */
	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
};

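/*
 * Per-cpuset count of SCHED_DEADLINE tasks.  Used to skip cpusets without
 * any deadline tasks when deadline root-domain accounting is rebuilt, see
 * dl_update_tasks_root_domain().
 */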
void inc_dl_tasks_cs(struct task_struct *p)
{
	struct cpuset *cs = task_cs(p);

	cs->nr_deadline_tasks++;
}

void dec_dl_tasks_cs(struct task_struct *p)
{
	struct cpuset *cs = task_cs(p);

	cs->nr_deadline_tasks--;
}

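/*
 * Helpers for querying partition_root_state: a positive value means a valid
 * partition root, a negative value an invalid partition root, and PRS_MEMBER
 * an ordinary (non-partition) cpuset.
 */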
static inline bool is_partition_valid(const struct cpuset *cs)
{
	return cs->partition_root_state > 0;
}

static inline bool is_partition_invalid(const struct cpuset *cs)
{
	return cs->partition_root_state < 0;
}

static inline bool cs_is_member(const struct cpuset *cs)
{
	return cs->partition_root_state == PRS_MEMBER;
}

/*
 * Callers should hold callback_lock to modify partition_root_state.
 */
static inline void make_partition_invalid(struct cpuset *cs)
{
	if (cs->partition_root_state > 0)
		cs->partition_root_state = -cs->partition_root_state;
}

/*
 * Send a notification event whenever partition_root_state changes.
 */
static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{
	if (old_prs == cs->partition_root_state)
		return;
	cgroup_file_notify(&cs->partition_file);

	/* Reset prs_err if not invalid */
	if (is_partition_valid(cs))
		WRITE_ONCE(cs->prs_err, PERR_NONE);
}

/*
 * The top_cpuset is always synchronized to cpu_active_mask and we should avoid
 * using cpu_online_mask as much as possible. An active CPU is always an online
 * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ
 * during hotplug operations. A CPU is marked active at the last stage of CPU
 * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code
 * will be called to update the sched domains so that the scheduler can move
 * a normal task to a newly active CPU or remove tasks away from a newly
 * inactivated CPU. The online bit is set much earlier in the CPU bringup
 * process and cleared much later in CPU teardown.
 *
 * If cpu_online_mask is used while a hotunplug operation is happening in
 * parallel, we may leave an offline CPU in cpus_allowed or some other masks.
 */
static struct cpuset top_cpuset = {
	.flags = BIT(CS_CPU_EXCLUSIVE) |
		 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
	.partition_root_state = PRS_ROOT,
	.relax_domain_level = -1,
	.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
};

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
 * subsystems can use cpuset_lock()/cpuset_unlock() to prevent changes to
 * cpuset structures. Note that cpuset_mutex needs to be a mutex as it is used
 * in paths that rely on priority inheritance (e.g. scheduler - on RT) for
 * correctness.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, it blocks others, ensuring that it is the only task able to
 * also acquire callback_lock and be able to modify cpusets.  It can perform
 * various checks on the cpuset structure first, knowing nothing will change.
 * It can also allocate memory while just holding cpuset_mutex.  While it is
 * performing these checks, various callback routines can briefly acquire
 * callback_lock to query cpusets.  Once it is ready to make the changes, it
 * takes callback_lock, blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by another task, so we use alloc_lock in the task_struct to protect
 * them.
 *
 * The cpuset_common_seq_show() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 */

static DEFINE_MUTEX(cpuset_mutex);

/**
 * cpuset_lock - Acquire the global cpuset mutex
 *
 * This locks the global cpuset mutex to prevent modifications to the cpuset
 * hierarchy and configurations. This helper alone is not enough to make
 * modifications.
 */
void cpuset_lock(void)
{
	mutex_lock(&cpuset_mutex);
}

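/**
 * cpuset_unlock - Release the global cpuset mutex
 */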
void cpuset_unlock(void)
{
	mutex_unlock(&cpuset_mutex);
}

/**
 * cpuset_full_lock - Acquire full protection for cpuset modification
 *
 * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex
 * to safely modify cpuset data.
 */
void cpuset_full_lock(void)
{
	cpus_read_lock();
	mutex_lock(&cpuset_mutex);
}

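/**
 * cpuset_full_unlock - Release the cpuset mutex and the CPU hotplug read lock
 */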
void cpuset_full_unlock(void)
{
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
}

static DEFINE_SPINLOCK(callback_lock);

void cpuset_callback_lock_irq(void)
{
	spin_lock_irq(&callback_lock);
}

void cpuset_callback_unlock_irq(void)
{
	spin_unlock_irq(&callback_lock);
}

static struct workqueue_struct *cpuset_migrate_mm_wq;

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

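/*
 * Flag the cpusets_insane_config static key (and print an informational
 * message) when a cpuset's mems consist of movable-only nodes, since
 * allocations in such a configuration may fail even with plenty of memory.
 */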
static inline void check_insane_mems_config(nodemask_t *nodes)
{
	if (!cpusets_insane_config() &&
	    movable_only_nodes(nodes)) {
		static_branch_enable_cpuslocked(&cpusets_insane_config_key);
		pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
			"Cpuset allocations might fail even with a lot of memory available.\n",
			nodemask_pr_args(nodes));
	}
}

/*
 * Decrease cs->attach_in_progress and wake up cpuset_attach_wq when it
 * reaches 0.
 */
static inline void dec_attach_in_progress_locked(struct cpuset *cs)
{
	lockdep_assert_held(&cpuset_mutex);

	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);
}

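/*
 * Same as dec_attach_in_progress_locked() except that it acquires and
 * releases cpuset_mutex itself.
 */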
static inline void dec_attach_in_progress(struct cpuset *cs)
{
	mutex_lock(&cpuset_mutex);
	dec_attach_in_progress_locked(cs);
	mutex_unlock(&cpuset_mutex);
}

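/*
 * Return true when the v2 (default hierarchy) cpuset behavior applies, i.e.
 * cpuset v1 support is compiled out or cpuset is on the default cgroup
 * hierarchy.
 */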
static inline bool cpuset_v2(void)
{
	return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
		cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
}

/*
 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
 * With v2 behavior, "cpus" and "mems" are always what the users have
 * requested and won't be changed by hotplug events. Only the effective
 * cpus or mems will be affected.
 */
static inline bool is_in_v2_mode(void)
{
	return cpuset_v2() ||
		(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}

/**
 * partition_is_populated - check if partition has tasks
 * @cs: partition root to be checked
 * @excluded_child: a child cpuset to be excluded in task checking
 * Return: true if there are tasks, false otherwise
 *
 * It is assumed that @cs is a valid partition root. @excluded_child should
 * be non-NULL when this cpuset is going to become a partition itself.
 */
static inline bool partition_is_populated(struct cpuset *cs,
					  struct cpuset *excluded_child)
{
	struct cgroup_subsys_state *css;
	struct cpuset *child;

	if (cs->css.cgroup->nr_populated_csets)
		return true;
	if (!excluded_child && !cs->nr_subparts)
		return cgroup_is_populated(cs->css.cgroup);

	rcu_read_lock();
	cpuset_for_each_child(child, css, cs) {
		if (child == excluded_child)
			continue;
		if (is_partition_valid(child))
			continue;
		if (cgroup_is_populated(child->css.cgroup)) {
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();
	return false;
}

/*
 * Return in pmask the portion of a task's cpuset's cpus_allowed that
 * are online and are capable of running the task.  If none are found,
 * walk up the cpuset hierarchy until we find one that does have some
 * appropriate cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_active_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_active_cpus(struct task_struct *tsk,
				  struct cpumask *pmask)
{
	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
	struct cpuset *cs;

	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
		cpumask_copy(pmask, cpu_active_mask);

	rcu_read_lock();
	cs = task_cs(tsk);

	while (!cpumask_intersects(cs->effective_cpus, pmask))
		cs = parent_cs(cs);

	cpumask_and(pmask, pmask, cs->effective_cpus);
	rcu_read_unlock();
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/**
 * alloc_cpumasks - Allocate an array of cpumask variables
 * @pmasks: Pointer to array of cpumask_var_t pointers
 * @size: Number of cpumasks to allocate
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Allocates @size cpumasks and initializes them to empty. Returns 0 on
 * success, -ENOMEM on allocation failure. On failure, any previously
 * allocated cpumasks are freed.
 */
static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size)
{
	int i;

	for (i = 0; i < size; i++) {
		if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) {
			while (--i >= 0)
				free_cpumask_var(*pmasks[i]);
			return -ENOMEM;
		}
	}
	return 0;
}

/**
 * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations.
 * @tmp: Pointer to tmpmasks structure to populate
 * Return: 0 on success, -ENOMEM on allocation failure
 */
static inline int alloc_tmpmasks(struct tmpmasks *tmp)
{
	/*
	 * Array of pointers to the three cpumask_var_t fields in tmpmasks.
	 * Note: Array size must match actual number of masks (3)
	 */
	cpumask_var_t *pmask[3] = {
		&tmp->new_cpus,
		&tmp->addmask,
		&tmp->delmask
	};

	return alloc_cpumasks(pmask, ARRAY_SIZE(pmask));
}

/**
 * free_tmpmasks - free cpumasks in a tmpmasks structure
 * @tmp: the tmpmasks structure pointer
 */
static inline void free_tmpmasks(struct tmpmasks *tmp)
{
	if (!tmp)
		return;

	free_cpumask_var(tmp->new_cpus);
	free_cpumask_var(tmp->addmask);
	free_cpumask_var(tmp->delmask);
}

/**
 * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset
 * @cs: Source cpuset to duplicate (NULL for a fresh allocation)
 *
 * Creates a new cpuset by either:
 * 1. Duplicating an existing cpuset (if @cs is non-NULL), or
 * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL)
 *
 * Return: Pointer to newly allocated cpuset on success, NULL on failure
 */
static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	/* Allocate base structure */
	trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) :
		     kzalloc(sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	/* Setup cpumask pointer array */
	cpumask_var_t *pmask[4] = {
		&trial->cpus_allowed,
		&trial->effective_cpus,
		&trial->effective_xcpus,
		&trial->exclusive_cpus
	};

	if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) {
		kfree(trial);
		return NULL;
	}

	/* Copy masks if duplicating */
	if (cs) {
		cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
		cpumask_copy(trial->effective_cpus, cs->effective_cpus);
		cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
		cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
	}

	return trial;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 */
static inline void free_cpuset(struct cpuset *cs)
{
	free_cpumask_var(cs->cpus_allowed);
	free_cpumask_var(cs->effective_cpus);
	free_cpumask_var(cs->effective_xcpus);
	free_cpumask_var(cs->exclusive_cpus);
	kfree(cs);
}

/* Return user specified exclusive CPUs */
static inline struct cpumask *user_xcpus(struct cpuset *cs)
{
	return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
						 : cs->exclusive_cpus;
}

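/* Return true if neither cpuset.cpus nor cpuset.cpus.exclusive is set */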
static inline bool xcpus_empty(struct cpuset *cs)
{
	return cpumask_empty(cs->cpus_allowed) &&
	       cpumask_empty(cs->exclusive_cpus);
}

/*
 * cpusets_are_exclusive() - check if two cpusets are exclusive
 *
 * Return true if exclusive, false if not
 */
static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
{
	struct cpumask *xcpus1 = user_xcpus(cs1);
	struct cpumask *xcpus2 = user_xcpus(cs2);

	if (cpumask_intersects(xcpus1, xcpus2))
		return false;
	return true;
}

/**
 * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
 * @cs1: first cpuset to check
 * @cs2: second cpuset to check
 *
 * Returns: true if a CPU exclusivity conflict exists, false otherwise
 *
 * Conflict detection rules:
 * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
 * 2. exclusive_cpus masks cannot intersect between cpusets
 * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
 */
static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
{
	/* If either cpuset is exclusive, check if they are mutually exclusive */
	if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
		return !cpusets_are_exclusive(cs1, cs2);

	/* Exclusive_cpus cannot intersect */
	if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
		return true;

	/* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
	if (!cpumask_empty(cs1->cpus_allowed) &&
	    cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
		return true;

	if (!cpumask_empty(cs2->cpus_allowed) &&
	    cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
		return true;

	return false;
}

static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
{
	if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2)))
		return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
	return false;
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret = 0;

	rcu_read_lock();

	if (!is_in_v2_mode())
		ret = cpuset1_validate_change(cur, trial);
	if (ret)
		goto out;

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks. This check is not done when scheduling is disabled as the
	 * users should know what they are doing.
	 *
	 * For v1, effective_cpus == cpus_allowed & user_xcpus() returns
	 * cpus_allowed.
	 *
	 * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only
	 * for non-isolated partition root. At this point, the target
	 * effective_cpus isn't computed yet. user_xcpus() is the best
	 * approximation.
	 *
	 * TBD: May need to precompute the real effective_cpus here in case
	 * incorrect scheduling of SCHED_DEADLINE tasks in a partition
	 * becomes an issue.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
	    !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap. exclusive_cpus cannot overlap with each other if set.
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if (c == cur)
			continue;
		if (cpus_excl_conflict(trial, c))
			goto out;
		if (mems_excl_conflict(trial, c))
			goto out;
	}

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

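/* Raise dattr's relax_domain_level to at least that of cpuset @c */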
static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

/* Must be called with cpuset_mutex held.  */
static inline int nr_cpusets(void)
{
	/* jump label reference count + the top-level cpuset */
	return static_key_count(&cpusets_enabled_key.key) + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    cp - cpuset pointer, used (together with pos_css) to perform a
 *	   top-down scan of all cpusets. For our purposes, rebuilding
 *	   the schedulers sched domains, we can ignore !is_sched_load_
 *	   balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e. the set of domains (subsets) of CPUs such that the
 *	   cpus_allowed of every cpuset marked is_sched_load_balance
 *	   is a subset of one of these domains, while there are as
 *	   many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *	   the kernel/sched/core.c routine partition_sched_domains() in a
 *	   convenient format, that can be easily compared to the prior
 *	   value to determine what partition elements (sched domains)
 *	   were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *	The double nested loops below over i, j scan over the load
 *	balanced cpusets (using the array of cpuset pointers in csa[])
 *	looking for pairs of cpusets that have overlapping cpus_allowed
 *	and merging them using a union-find algorithm.
 *
 *	The union of the cpus_allowed masks from the set of all cpusets
 *	having the same root then form the one element of the partition
 *	(one sched domain) to be passed to partition_sched_domains().
 *
 */
static int generate_sched_domains(cpumask_var_t **domains,
				  struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* top-down scan of cpusets */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;
	bool root_load_balance = is_sched_load_balance(&top_cpuset);
	bool cgrpv2 = cpuset_v2();
	int nslot_update;

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
single_root_domain:
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    housekeeping_cpumask(HK_TYPE_DOMAIN));

		goto done;
	}

	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	if (root_load_balance)
		csa[csn++] = &top_cpuset;
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;

		if (cgrpv2)
			goto v2;

		/*
		 * v1:
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The former is obvious.  The
		 * latter: All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed,
					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
			continue;

		if (is_sched_load_balance(cp) &&
		    !cpumask_empty(cp->effective_cpus))
			csa[csn++] = cp;

		/* skip @cp's subtree */
		pos_css = css_rightmost_descendant(pos_css);
		continue;

v2:
		/*
		 * Only valid partition roots that are not isolated and with
		 * non-empty effective_cpus will be saved into csa[].
		 */
		if ((cp->partition_root_state == PRS_ROOT) &&
		    !cpumask_empty(cp->effective_cpus))
			csa[csn++] = cp;

		/*
		 * Skip @cp's subtree if not a partition root and has no
		 * exclusive CPUs to be granted to child cpusets.
		 */
		if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
			pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	/*
	 * If there are only isolated partitions underneath the cgroup root,
	 * we can optimize out unneeded sched domains scanning.
	 */
	if (root_load_balance && (csn == 1))
		goto single_root_domain;

	for (i = 0; i < csn; i++)
		uf_node_init(&csa[i]->node);

	/* Merge overlapping cpusets */
	for (i = 0; i < csn; i++) {
		for (j = i + 1; j < csn; j++) {
			if (cpusets_overlap(csa[i], csa[j])) {
				/*
				 * Cgroup v2 shouldn't pass down overlapping
				 * partition root cpusets.
				 */
				WARN_ON_ONCE(cgrpv2);
				uf_union(&csa[i]->node, &csa[j]->node);
			}
		}
	}

	/* Count the total number of domains */
	for (i = 0; i < csn; i++) {
		if (uf_find(&csa[i]->node) == &csa[i]->node)
			ndoms++;
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
			      GFP_KERNEL);

	/*
	 * Cgroup v2 doesn't support domain attributes, just set all of them
	 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
	 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
	 */
	if (cgrpv2) {
		for (i = 0; i < ndoms; i++) {
			/*
			 * The top cpuset may contain some boot time isolated
			 * CPUs that need to be excluded from the sched domain.
			 */
			if (csa[i] == &top_cpuset)
				cpumask_and(doms[i], csa[i]->effective_cpus,
					    housekeeping_cpumask(HK_TYPE_DOMAIN));
			else
				cpumask_copy(doms[i], csa[i]->effective_cpus);
			if (dattr)
				dattr[i] = SD_ATTR_INIT;
		}
		goto done;
	}

	for (nslot = 0, i = 0; i < csn; i++) {
		nslot_update = 0;
		for (j = i; j < csn; j++) {
			if (uf_find(&csa[j]->node) == &csa[i]->node) {
				struct cpumask *dp = doms[nslot];

				if (i == j) {
					nslot_update = 1;
					cpumask_clear(dp);
					if (dattr)
						*(dattr + nslot) = SD_ATTR_INIT;
				}
				cpumask_or(dp, dp, csa[j]->effective_cpus);
				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
				if (dattr)
					update_domain_attr_tree(dattr + nslot, csa[j]);
			}
		}
		if (nslot_update)
			nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}

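/*
 * Re-attach the SCHED_DEADLINE tasks of @cs to their root domain so that
 * their bandwidth is accounted in the (possibly new) root domain.
 */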
static void dl_update_tasks_root_domain(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	if (cs->nr_deadline_tasks == 0)
		return;

	css_task_iter_start(&cs->css, 0, &it);

	while ((task = css_task_iter_next(&it)))
		dl_add_task_root_domain(task);

	css_task_iter_end(&it);
}

void dl_rebuild_rd_accounting(void)
{
	struct cpuset *cs = NULL;
	struct cgroup_subsys_state *pos_css;
	int cpu;
	u64 cookie = ++dl_cookie;

	lockdep_assert_held(&cpuset_mutex);
	lockdep_assert_cpus_held();
	lockdep_assert_held(&sched_domains_mutex);

	rcu_read_lock();

	for_each_possible_cpu(cpu) {
		if (dl_bw_visited(cpu, cookie))
			continue;

		dl_clear_root_domain_cpu(cpu);
	}

	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {

		if (cpumask_empty(cs->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		css_get(&cs->css);

		rcu_read_unlock();

		dl_update_tasks_root_domain(cs);

		rcu_read_lock();
		css_put(&cs->css);
	}
	rcu_read_unlock();
}

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes cpus_read_lock().
 */
void rebuild_sched_domains_locked(void)
{
	struct cgroup_subsys_state *pos_css;
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	struct cpuset *cs;
	int ndoms;

	lockdep_assert_cpus_held();
	lockdep_assert_held(&cpuset_mutex);
	force_sd_rebuild = false;

	/*
	 * If we have raced with CPU hotplug, return early to avoid
	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
	 *
	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
	 * should be the same as the active CPUs, so checking only top_cpuset
	 * is enough to detect racing CPU offlines.
	 */
	if (cpumask_empty(subpartitions_cpus) &&
	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
		return;

	/*
	 * With subpartition CPUs, however, the effective CPUs of a partition
	 * root should be only a subset of the active CPUs.  Since a CPU in any
	 * partition root could be offlined, all must be checked.
	 */
	if (!cpumask_empty(subpartitions_cpus)) {
		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (!is_partition_valid(cs)) {
				pos_css = css_rightmost_descendant(pos_css);
				continue;
			}
			if (!cpumask_subset(cs->effective_cpus,
					    cpu_active_mask)) {
				rcu_read_unlock();
				return;
			}
		}
		rcu_read_unlock();
	}

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

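/* Rebuild sched domains; the caller must already hold cpus_read_lock() */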
|---|
| 1136 | static void rebuild_sched_domains_cpuslocked(void) | 
|---|
| 1137 | { | 
|---|
| 1138 | mutex_lock(lock: &cpuset_mutex); | 
|---|
| 1139 | rebuild_sched_domains_locked(); | 
|---|
| 1140 | mutex_unlock(lock: &cpuset_mutex); | 
|---|
| 1141 | } | 
|---|
| 1142 |  | 
|---|
| 1143 | void rebuild_sched_domains(void) | 
|---|
| 1144 | { | 
|---|
| 1145 | cpus_read_lock(); | 
|---|
| 1146 | rebuild_sched_domains_cpuslocked(); | 
|---|
| 1147 | cpus_read_unlock(); | 
|---|
| 1148 | } | 
|---|
| 1149 |  | 
|---|
| 1150 | void cpuset_reset_sched_domains(void) | 
|---|
| 1151 | { | 
|---|
| 1152 | mutex_lock(lock: &cpuset_mutex); | 
|---|
| 1153 | partition_sched_domains(ndoms_new: 1, NULL, NULL); | 
|---|
| 1154 | mutex_unlock(lock: &cpuset_mutex); | 
|---|
| 1155 | } | 
|---|
| 1156 |  | 
|---|
| 1157 | /** | 
|---|
| 1158 | * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. | 
|---|
| 1159 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | 
|---|
| 1160 | * @new_cpus: the temp variable for the new effective_cpus mask | 
|---|
| 1161 | * | 
|---|
| 1162 | * Iterate through each task of @cs updating its cpus_allowed to the | 
|---|
| 1163 | * effective cpuset's.  As this function is called with cpuset_mutex held, | 
|---|
| 1164 | * cpuset membership stays stable. | 
|---|
| 1165 | * | 
|---|
| 1166 | * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus | 
|---|
| 1167 | * to make sure all offline CPUs are also included as hotplug code won't | 
|---|
| 1168 | * update cpumasks for tasks in top_cpuset. | 
|---|
| 1169 | * | 
|---|
| 1170 | * As task_cpu_possible_mask() can be task dependent in arm64, we have to | 
|---|
| 1171 | * do cpu masking per task instead of doing it once for all. | 
|---|
| 1172 | */ | 
|---|
| 1173 | void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) | 
|---|
| 1174 | { | 
|---|
| 1175 | struct css_task_iter it; | 
|---|
| 1176 | struct task_struct *task; | 
|---|
| 1177 | bool top_cs = cs == &top_cpuset; | 
|---|
| 1178 |  | 
|---|
| 1179 | css_task_iter_start(css: &cs->css, flags: 0, it: &it); | 
|---|
| 1180 | while ((task = css_task_iter_next(it: &it))) { | 
|---|
| 1181 | const struct cpumask *possible_mask = task_cpu_possible_mask(task); | 
|---|
| 1182 |  | 
|---|
| 1183 | if (top_cs) { | 
|---|
| 1184 | /* | 
|---|
| 1185 | * PF_NO_SETAFFINITY tasks are ignored. | 
|---|
| 1186 | * All per cpu kthreads should have PF_NO_SETAFFINITY | 
|---|
| 1187 | * flag set, see kthread_set_per_cpu(). | 
|---|
| 1188 | */ | 
|---|
| 1189 | if (task->flags & PF_NO_SETAFFINITY) | 
|---|
| 1190 | continue; | 
|---|
| 1191 | cpumask_andnot(dstp: new_cpus, src1p: possible_mask, src2p: subpartitions_cpus); | 
|---|
| 1192 | } else { | 
|---|
| 1193 | cpumask_and(dstp: new_cpus, src1p: possible_mask, src2p: cs->effective_cpus); | 
|---|
| 1194 | } | 
|---|
| 1195 | set_cpus_allowed_ptr(p: task, new_mask: new_cpus); | 
|---|
| 1196 | } | 
|---|
| 1197 | css_task_iter_end(it: &it); | 
|---|
| 1198 | } | 
|---|
| 1199 |  | 
|---|
| 1200 | /** | 
|---|
| 1201 | * compute_effective_cpumask - Compute the effective cpumask of the cpuset | 
|---|
| 1202 | * @new_cpus: the temp variable for the new effective_cpus mask | 
|---|
| 1203 | * @cs: the cpuset the need to recompute the new effective_cpus mask | 
|---|
| 1204 | * @parent: the parent cpuset | 
|---|
| 1205 | * | 
|---|
| 1206 | * The result is valid only if the given cpuset isn't a partition root. | 
|---|
| 1207 | */ | 
|---|
| 1208 | static void compute_effective_cpumask(struct cpumask *new_cpus, | 
|---|
| 1209 | struct cpuset *cs, struct cpuset *parent) | 
|---|
| 1210 | { | 
|---|
| 1211 | cpumask_and(dstp: new_cpus, src1p: cs->cpus_allowed, src2p: parent->effective_cpus); | 
|---|
| 1212 | } | 
|---|
| 1213 |  | 
|---|
| 1214 | /* | 
|---|
| 1215 | * Commands for update_parent_effective_cpumask | 
|---|
| 1216 | */ | 
|---|
| 1217 | enum partition_cmd { | 
|---|
| 1218 | partcmd_enable,		/* Enable partition root	  */ | 
|---|
| 1219 | partcmd_enablei,	/* Enable isolated partition root */ | 
|---|
| 1220 | partcmd_disable,	/* Disable partition root	  */ | 
|---|
| 1221 | partcmd_update,		/* Update parent's effective_cpus */ | 
|---|
| 1222 | partcmd_invalidate,	/* Make partition invalid	  */ | 
|---|
| 1223 | }; | 
|---|
| 1224 |  | 
|---|
| 1225 | static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, | 
|---|
| 1226 | struct tmpmasks *tmp); | 
|---|
| 1227 |  | 
|---|
| 1228 | /* | 
|---|
| 1229 | * Update partition exclusive flag | 
|---|
| 1230 | * | 
|---|
| 1231 | * Return: 0 if successful, an error code otherwise | 
|---|
| 1232 | */ | 
|---|
| 1233 | static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs) | 
|---|
| 1234 | { | 
|---|
| 1235 | bool exclusive = (new_prs > PRS_MEMBER); | 
|---|
| 1236 |  | 
|---|
| 1237 | if (exclusive && !is_cpu_exclusive(cs)) { | 
|---|
| 1238 | if (cpuset_update_flag(bit: CS_CPU_EXCLUSIVE, cs, turning_on: 1)) | 
|---|
| 1239 | return PERR_NOTEXCL; | 
|---|
| 1240 | } else if (!exclusive && is_cpu_exclusive(cs)) { | 
|---|
| 1241 | /* Turning off CS_CPU_EXCLUSIVE will not return error */ | 
|---|
| 1242 | cpuset_update_flag(bit: CS_CPU_EXCLUSIVE, cs, turning_on: 0); | 
|---|
| 1243 | } | 
|---|
| 1244 | return 0; | 
|---|
| 1245 | } | 
|---|
| 1246 |  | 
|---|
| 1247 | /* | 
|---|
| 1248 | * Update partition load balance flag and/or rebuild sched domain | 
|---|
| 1249 | * | 
|---|
| 1250 | * Changing the load balance flag will force the sched domains to be | 
|---|
| 1251 | * rebuilt (via cpuset_force_rebuild()). | 
|---|
| 1252 | * This function is for cgroup v2 only. | 
|---|
| 1253 | */ | 
|---|
| 1254 | static void update_partition_sd_lb(struct cpuset *cs, int old_prs) | 
|---|
| 1255 | { | 
|---|
| 1256 | int new_prs = cs->partition_root_state; | 
|---|
| 1257 | bool rebuild_domains = (new_prs > 0) || (old_prs > 0); | 
|---|
| 1258 | bool new_lb; | 
|---|
| 1259 |  | 
|---|
| 1260 | /* | 
|---|
| 1261 | * If cs is not a valid partition root, the load balance state | 
|---|
| 1262 | * will follow its parent. | 
|---|
| 1263 | */ | 
|---|
| 1264 | if (new_prs > 0) { | 
|---|
| 1265 | new_lb = (new_prs != PRS_ISOLATED); | 
|---|
| 1266 | } else { | 
|---|
| 1267 | new_lb = is_sched_load_balance(cs: parent_cs(cs)); | 
|---|
| 1268 | } | 
|---|
| 1269 | if (new_lb != !!is_sched_load_balance(cs)) { | 
|---|
| 1270 | rebuild_domains = true; | 
|---|
| 1271 | if (new_lb) | 
|---|
| 1272 | set_bit(nr: CS_SCHED_LOAD_BALANCE, addr: &cs->flags); | 
|---|
| 1273 | else | 
|---|
| 1274 | clear_bit(nr: CS_SCHED_LOAD_BALANCE, addr: &cs->flags); | 
|---|
| 1275 | } | 
|---|
| 1276 |  | 
|---|
| 1277 | if (rebuild_domains) | 
|---|
| 1278 | cpuset_force_rebuild(); | 
|---|
| 1279 | } | 
|---|
| 1280 |  | 
|---|
| 1281 | /* | 
|---|
| 1282 | * tasks_nocpu_error - Return true if tasks will have no effective_cpus | 
|---|
| 1283 | */ | 
|---|
| 1284 | static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, | 
|---|
| 1285 | struct cpumask *xcpus) | 
|---|
| 1286 | { | 
|---|
| 1287 | /* | 
|---|
| 1288 | * A populated partition (cs or parent) can't have empty effective_cpus | 
|---|
| 1289 | */ | 
|---|
| 1290 | return (cpumask_subset(src1p: parent->effective_cpus, src2p: xcpus) && | 
|---|
| 1291 | partition_is_populated(cs: parent, excluded_child: cs)) || | 
|---|
| 1292 | (!cpumask_intersects(src1p: xcpus, cpu_active_mask) && | 
|---|
| 1293 | partition_is_populated(cs, NULL)); | 
|---|
| 1294 | } | 
|---|
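|  |  | 
|---|
|  | /* | 
|---|
|  | * Example (illustrative): if parent->effective_cpus is 2-3 and the | 
|---|
|  | * prospective partition asks for xcpus = 0-3, the parent would be left | 
|---|
|  | * with no CPUs, which is an error whenever the parent (excluding cs) | 
|---|
|  | * still has tasks.  Similarly, an xcpus containing only offline CPUs is | 
|---|
|  | * an error when cs itself has tasks. | 
|---|
|  | */ | 
|---|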
| 1295 |  | 
|---|
| 1296 | static void reset_partition_data(struct cpuset *cs) | 
|---|
| 1297 | { | 
|---|
| 1298 | struct cpuset *parent = parent_cs(cs); | 
|---|
| 1299 |  | 
|---|
| 1300 | if (!cpuset_v2()) | 
|---|
| 1301 | return; | 
|---|
| 1302 |  | 
|---|
| 1303 | lockdep_assert_held(&callback_lock); | 
|---|
| 1304 |  | 
|---|
| 1305 | cs->nr_subparts = 0; | 
|---|
| 1306 | if (cpumask_empty(srcp: cs->exclusive_cpus)) { | 
|---|
| 1307 | cpumask_clear(dstp: cs->effective_xcpus); | 
|---|
| 1308 | if (is_cpu_exclusive(cs)) | 
|---|
| 1309 | clear_bit(nr: CS_CPU_EXCLUSIVE, addr: &cs->flags); | 
|---|
| 1310 | } | 
|---|
| 1311 | if (!cpumask_and(dstp: cs->effective_cpus, src1p: parent->effective_cpus, src2p: cs->cpus_allowed)) | 
|---|
| 1312 | cpumask_copy(dstp: cs->effective_cpus, srcp: parent->effective_cpus); | 
|---|
| 1313 | } | 
|---|
| 1314 |  | 
|---|
| 1315 | /* | 
|---|
| 1316 | * isolated_cpus_update - Update the isolated_cpus mask | 
|---|
| 1317 | * @old_prs: old partition_root_state | 
|---|
| 1318 | * @new_prs: new partition_root_state | 
|---|
| 1319 | * @xcpus: exclusive CPUs with state change | 
|---|
| 1320 | */ | 
|---|
| 1321 | static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) | 
|---|
| 1322 | { | 
|---|
| 1323 | WARN_ON_ONCE(old_prs == new_prs); | 
|---|
| 1324 | if (new_prs == PRS_ISOLATED) | 
|---|
| 1325 | cpumask_or(dstp: isolated_cpus, src1p: isolated_cpus, src2p: xcpus); | 
|---|
| 1326 | else | 
|---|
| 1327 | cpumask_andnot(dstp: isolated_cpus, src1p: isolated_cpus, src2p: xcpus); | 
|---|
| 1328 | } | 
|---|
| 1329 |  | 
|---|
| 1330 | /* | 
|---|
| 1331 | * partition_xcpus_add - Add new exclusive CPUs to partition | 
|---|
| 1332 | * @new_prs: new partition_root_state | 
|---|
| 1333 | * @parent: parent cpuset | 
|---|
| 1334 | * @xcpus: exclusive CPUs to be added | 
|---|
| 1335 | * Return: true if isolated_cpus modified, false otherwise | 
|---|
| 1336 | * | 
|---|
| 1337 | * Remote partition if parent == NULL | 
|---|
| 1338 | */ | 
|---|
| 1339 | static bool partition_xcpus_add(int new_prs, struct cpuset *parent, | 
|---|
| 1340 | struct cpumask *xcpus) | 
|---|
| 1341 | { | 
|---|
| 1342 | bool isolcpus_updated; | 
|---|
| 1343 |  | 
|---|
| 1344 | WARN_ON_ONCE(new_prs < 0); | 
|---|
| 1345 | lockdep_assert_held(&callback_lock); | 
|---|
| 1346 | if (!parent) | 
|---|
| 1347 | parent = &top_cpuset; | 
|---|
| 1348 |  | 
|---|
| 1349 |  | 
|---|
| 1350 | if (parent == &top_cpuset) | 
|---|
| 1351 | cpumask_or(dstp: subpartitions_cpus, src1p: subpartitions_cpus, src2p: xcpus); | 
|---|
| 1352 |  | 
|---|
| 1353 | isolcpus_updated = (new_prs != parent->partition_root_state); | 
|---|
| 1354 | if (isolcpus_updated) | 
|---|
| 1355 | isolated_cpus_update(old_prs: parent->partition_root_state, new_prs, | 
|---|
| 1356 | xcpus); | 
|---|
| 1357 |  | 
|---|
| 1358 | cpumask_andnot(dstp: parent->effective_cpus, src1p: parent->effective_cpus, src2p: xcpus); | 
|---|
| 1359 | return isolcpus_updated; | 
|---|
| 1360 | } | 
|---|
| 1361 |  | 
|---|
| 1362 | /* | 
|---|
| 1363 | * partition_xcpus_del - Remove exclusive CPUs from partition | 
|---|
| 1364 | * @old_prs: old partition_root_state | 
|---|
| 1365 | * @parent: parent cpuset | 
|---|
| 1366 | * @xcpus: exclusive CPUs to be removed | 
|---|
| 1367 | * Return: true if isolated_cpus modified, false otherwise | 
|---|
| 1368 | * | 
|---|
| 1369 | * Remote partition if parent == NULL | 
|---|
| 1370 | */ | 
|---|
| 1371 | static bool partition_xcpus_del(int old_prs, struct cpuset *parent, | 
|---|
| 1372 | struct cpumask *xcpus) | 
|---|
| 1373 | { | 
|---|
| 1374 | bool isolcpus_updated; | 
|---|
| 1375 |  | 
|---|
| 1376 | WARN_ON_ONCE(old_prs < 0); | 
|---|
| 1377 | lockdep_assert_held(&callback_lock); | 
|---|
| 1378 | if (!parent) | 
|---|
| 1379 | parent = &top_cpuset; | 
|---|
| 1380 |  | 
|---|
| 1381 | if (parent == &top_cpuset) | 
|---|
| 1382 | cpumask_andnot(dstp: subpartitions_cpus, src1p: subpartitions_cpus, src2p: xcpus); | 
|---|
| 1383 |  | 
|---|
| 1384 | isolcpus_updated = (old_prs != parent->partition_root_state); | 
|---|
| 1385 | if (isolcpus_updated) | 
|---|
| 1386 | isolated_cpus_update(old_prs, new_prs: parent->partition_root_state, | 
|---|
| 1387 | xcpus); | 
|---|
| 1388 |  | 
|---|
| 1389 | cpumask_and(dstp: xcpus, src1p: xcpus, cpu_active_mask); | 
|---|
| 1390 | cpumask_or(dstp: parent->effective_cpus, src1p: parent->effective_cpus, src2p: xcpus); | 
|---|
| 1391 | return isolcpus_updated; | 
|---|
| 1392 | } | 
|---|
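|  |  | 
|---|
|  | /* | 
|---|
|  | * Worked example (illustrative): turning CPUs 2-3 of a cpuset directly | 
|---|
|  | * under top_cpuset into an isolated partition ends up in | 
|---|
|  | * partition_xcpus_add(PRS_ISOLATED, &top_cpuset, <mask 2-3>): | 
|---|
|  | * subpartitions_cpus and isolated_cpus both gain 2-3 (the new state | 
|---|
|  | * differs from top_cpuset's, normally PRS_ROOT) and top_cpuset loses 2-3 | 
|---|
|  | * from its effective_cpus.  partition_xcpus_del() reverses this, adding | 
|---|
|  | * back only the CPUs that are still online. | 
|---|
|  | */ | 
|---|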
| 1393 |  | 
|---|
| 1394 | static void update_unbound_workqueue_cpumask(bool isolcpus_updated) | 
|---|
| 1395 | { | 
|---|
| 1396 | int ret; | 
|---|
| 1397 |  | 
|---|
| 1398 | lockdep_assert_cpus_held(); | 
|---|
| 1399 |  | 
|---|
| 1400 | if (!isolcpus_updated) | 
|---|
| 1401 | return; | 
|---|
| 1402 |  | 
|---|
| 1403 | ret = workqueue_unbound_exclude_cpumask(cpumask: isolated_cpus); | 
|---|
| 1404 | WARN_ON_ONCE(ret < 0); | 
|---|
| 1405 | } | 
|---|
| 1406 |  | 
|---|
| 1407 | /** | 
|---|
| 1408 | * cpuset_cpu_is_isolated - Check if the given CPU is isolated | 
|---|
| 1409 | * @cpu: the CPU number to be checked | 
|---|
| 1410 | * Return: true if CPU is used in an isolated partition, false otherwise | 
|---|
| 1411 | */ | 
|---|
| 1412 | bool cpuset_cpu_is_isolated(int cpu) | 
|---|
| 1413 | { | 
|---|
| 1414 | return cpumask_test_cpu(cpu, cpumask: isolated_cpus); | 
|---|
| 1415 | } | 
|---|
| 1416 | EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); | 
|---|
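|  |  | 
|---|
|  | /* | 
|---|
|  | * Hypothetical caller (editor's sketch, not taken from this file): a | 
|---|
|  | * subsystem that wants to keep housekeeping work off isolated CPUs might | 
|---|
|  | * do | 
|---|
|  | * | 
|---|
|  | *     if (!cpuset_cpu_is_isolated(cpu)) | 
|---|
|  | *             queue_work_on(cpu, wq, &work); | 
|---|
|  | * | 
|---|
|  | * which is one reason the helper is exported above. | 
|---|
|  | */ | 
|---|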
| 1417 |  | 
|---|
| 1418 | /** | 
|---|
| 1419 | * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets | 
|---|
| 1420 | * @parent: Parent cpuset containing all siblings | 
|---|
| 1421 | * @cs: Current cpuset (will be skipped) | 
|---|
| 1422 | * @excpus:  exclusive effective CPU mask to modify | 
|---|
| 1423 | * | 
|---|
| 1424 | * This function ensures the given @excpus mask doesn't include any CPUs that | 
|---|
| 1425 | * are exclusively allocated to sibling cpusets. It walks through all siblings | 
|---|
| 1426 | * of @cs under @parent and removes their exclusive CPUs from @excpus. | 
|---|
| 1427 | */ | 
|---|
| 1428 | static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, | 
|---|
| 1429 | struct cpumask *excpus) | 
|---|
| 1430 | { | 
|---|
| 1431 | struct cgroup_subsys_state *css; | 
|---|
| 1432 | struct cpuset *sibling; | 
|---|
| 1433 | int retval = 0; | 
|---|
| 1434 |  | 
|---|
| 1435 | if (cpumask_empty(srcp: excpus)) | 
|---|
| 1436 | return retval; | 
|---|
| 1437 |  | 
|---|
| 1438 | /* | 
|---|
| 1439 | * Exclude exclusive CPUs from siblings | 
|---|
| 1440 | */ | 
|---|
| 1441 | rcu_read_lock(); | 
|---|
| 1442 | cpuset_for_each_child(sibling, css, parent) { | 
|---|
| 1443 | if (sibling == cs) | 
|---|
| 1444 | continue; | 
|---|
| 1445 |  | 
|---|
| 1446 | if (cpumask_intersects(src1p: excpus, src2p: sibling->exclusive_cpus)) { | 
|---|
| 1447 | cpumask_andnot(dstp: excpus, src1p: excpus, src2p: sibling->exclusive_cpus); | 
|---|
| 1448 | retval++; | 
|---|
| 1449 | continue; | 
|---|
| 1450 | } | 
|---|
| 1451 | if (cpumask_intersects(src1p: excpus, src2p: sibling->effective_xcpus)) { | 
|---|
| 1452 | cpumask_andnot(dstp: excpus, src1p: excpus, src2p: sibling->effective_xcpus); | 
|---|
| 1453 | retval++; | 
|---|
| 1454 | } | 
|---|
| 1455 | } | 
|---|
| 1456 | rcu_read_unlock(); | 
|---|
| 1457 |  | 
|---|
| 1458 | return retval; | 
|---|
| 1459 | } | 
|---|
| 1460 |  | 
|---|
| 1461 | /* | 
|---|
| 1462 | * compute_excpus - compute effective exclusive CPUs | 
|---|
| 1463 | * @cs: cpuset | 
|---|
| 1464 | * @xcpus: effective exclusive CPUs value to be set | 
|---|
| 1465 | * Return: 0 if there is no sibling conflict, > 0 otherwise | 
|---|
| 1466 | * | 
|---|
| 1467 | * If exclusive_cpus isn't explicitly set, we have to scan the sibling cpusets | 
|---|
| 1468 | * and exclude their exclusive_cpus or effective_xcpus as well. | 
|---|
| 1469 | */ | 
|---|
| 1470 | static int compute_excpus(struct cpuset *cs, struct cpumask *excpus) | 
|---|
| 1471 | { | 
|---|
| 1472 | struct cpuset *parent = parent_cs(cs); | 
|---|
| 1473 |  | 
|---|
| 1474 | cpumask_and(dstp: excpus, src1p: user_xcpus(cs), src2p: parent->effective_xcpus); | 
|---|
| 1475 |  | 
|---|
| 1476 | if (!cpumask_empty(srcp: cs->exclusive_cpus)) | 
|---|
| 1477 | return 0; | 
|---|
| 1478 |  | 
|---|
| 1479 | return rm_siblings_excl_cpus(parent, cs, excpus); | 
|---|
| 1480 | } | 
|---|
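|  |  | 
|---|
|  | /* | 
|---|
|  | * Worked example (illustrative): with parent->effective_xcpus = 0-7, an | 
|---|
|  | * empty cs->exclusive_cpus and cs->cpus_allowed = 0-5, user_xcpus() falls | 
|---|
|  | * back to cpus_allowed and excpus starts out as 0-5.  If a sibling | 
|---|
|  | * already owns 4-5 exclusively, rm_siblings_excl_cpus() trims excpus to | 
|---|
|  | * 0-3 and the non-zero return value reports the conflict. | 
|---|
|  | */ | 
|---|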
| 1481 |  | 
|---|
| 1482 | /* | 
|---|
| 1483 | * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset | 
|---|
| 1484 | * @trialcs: The trial cpuset containing the proposed new configuration | 
|---|
| 1485 | * @cs: The original cpuset that the trial configuration is based on | 
|---|
| 1486 | * Return: 0 if successful with no sibling conflict, >0 if a conflict is found | 
|---|
| 1487 | * | 
|---|
| 1488 | * Computes the effective_xcpus for a trial configuration. @cs is provided to | 
|---|
| 1489 | * represent the real cpuset that the trial configuration is based on. | 
|---|
| 1490 | */ | 
|---|
| 1491 | static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) | 
|---|
| 1492 | { | 
|---|
| 1493 | struct cpuset *parent = parent_cs(cs: trialcs); | 
|---|
| 1494 | struct cpumask *excpus = trialcs->effective_xcpus; | 
|---|
| 1495 |  | 
|---|
| 1496 | /* trialcs is member, cpuset.cpus has no impact to excpus */ | 
|---|
| 1497 | if (cs_is_member(cs)) | 
|---|
| 1498 | cpumask_and(dstp: excpus, src1p: trialcs->exclusive_cpus, | 
|---|
| 1499 | src2p: parent->effective_xcpus); | 
|---|
| 1500 | else | 
|---|
| 1501 | cpumask_and(dstp: excpus, src1p: user_xcpus(cs: trialcs), src2p: parent->effective_xcpus); | 
|---|
| 1502 |  | 
|---|
| 1503 | return rm_siblings_excl_cpus(parent, cs, excpus); | 
|---|
| 1504 | } | 
|---|
| 1505 |  | 
|---|
| 1506 | static inline bool is_remote_partition(struct cpuset *cs) | 
|---|
| 1507 | { | 
|---|
| 1508 | return !list_empty(head: &cs->remote_sibling); | 
|---|
| 1509 | } | 
|---|
| 1510 |  | 
|---|
| 1511 | static inline bool is_local_partition(struct cpuset *cs) | 
|---|
| 1512 | { | 
|---|
| 1513 | return is_partition_valid(cs) && !is_remote_partition(cs); | 
|---|
| 1514 | } | 
|---|
| 1515 |  | 
|---|
| 1516 | /* | 
|---|
| 1517 | * remote_partition_enable - Enable current cpuset as a remote partition root | 
|---|
| 1518 | * @cs: the cpuset to update | 
|---|
| 1519 | * @new_prs: new partition_root_state | 
|---|
| 1520 | * @tmp: temporary masks | 
|---|
| 1521 | * Return: 0 if successful, errcode if error | 
|---|
| 1522 | * | 
|---|
| 1523 | * Enable the current cpuset to become a remote partition root taking CPUs | 
|---|
| 1524 | * directly from the top cpuset. cpuset_mutex must be held by the caller. | 
|---|
| 1525 | */ | 
|---|
| 1526 | static int remote_partition_enable(struct cpuset *cs, int new_prs, | 
|---|
| 1527 | struct tmpmasks *tmp) | 
|---|
| 1528 | { | 
|---|
| 1529 | bool isolcpus_updated; | 
|---|
| 1530 |  | 
|---|
| 1531 | /* | 
|---|
| 1532 | * The user must have sysadmin privilege. | 
|---|
| 1533 | */ | 
|---|
| 1534 | if (!capable(CAP_SYS_ADMIN)) | 
|---|
| 1535 | return PERR_ACCESS; | 
|---|
| 1536 |  | 
|---|
| 1537 | /* | 
|---|
| 1538 | * The requested exclusive_cpus must not be allocated to other | 
|---|
| 1539 | * partitions and it can't use up all the root's effective_cpus. | 
|---|
| 1540 | * | 
|---|
| 1541 | * The effective_xcpus mask can contain offline CPUs, but there must | 
|---|
| 1542 | * be at least one online CPU present before it can be enabled. | 
|---|
| 1543 | * | 
|---|
| 1544 | * Note that creating a remote partition with any local partition root | 
|---|
| 1545 | * above it or remote partition root underneath it is not allowed. | 
|---|
| 1546 | */ | 
|---|
| 1547 | compute_excpus(cs, excpus: tmp->new_cpus); | 
|---|
| 1548 | WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); | 
|---|
| 1549 | if (!cpumask_intersects(src1p: tmp->new_cpus, cpu_active_mask) || | 
|---|
| 1550 | cpumask_subset(src1p: top_cpuset.effective_cpus, src2p: tmp->new_cpus)) | 
|---|
| 1551 | return PERR_INVCPUS; | 
|---|
| 1552 |  | 
|---|
| 1553 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 1554 | isolcpus_updated = partition_xcpus_add(new_prs, NULL, xcpus: tmp->new_cpus); | 
|---|
| 1555 | list_add(new: &cs->remote_sibling, head: &remote_children); | 
|---|
| 1556 | cpumask_copy(dstp: cs->effective_xcpus, srcp: tmp->new_cpus); | 
|---|
| 1557 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 1558 | update_unbound_workqueue_cpumask(isolcpus_updated); | 
|---|
| 1559 | cpuset_force_rebuild(); | 
|---|
| 1560 | cs->prs_err = 0; | 
|---|
| 1561 |  | 
|---|
| 1562 | /* | 
|---|
| 1563 | * Propagate changes in top_cpuset's effective_cpus down the hierarchy. | 
|---|
| 1564 | */ | 
|---|
| 1565 | cpuset_update_tasks_cpumask(cs: &top_cpuset, new_cpus: tmp->new_cpus); | 
|---|
| 1566 | update_sibling_cpumasks(parent: &top_cpuset, NULL, tmp); | 
|---|
| 1567 | return 0; | 
|---|
| 1568 | } | 
|---|
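|  |  | 
|---|
|  | /* | 
|---|
|  | * Editor's note (hedged): from userspace a remote partition is typically | 
|---|
|  | * created on a cgroup that is not a direct child of a partition root by | 
|---|
|  | * first populating "cpuset.cpus.exclusive" and then writing "root" or | 
|---|
|  | * "isolated" to "cpuset.cpus.partition"; the CPUs are pulled straight | 
|---|
|  | * from top_cpuset as described above. | 
|---|
|  | */ | 
|---|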
| 1569 |  | 
|---|
| 1570 | /* | 
|---|
| 1571 | * remote_partition_disable - Remove current cpuset from remote partition list | 
|---|
| 1572 | * @cs: the cpuset to update | 
|---|
| 1573 | * @tmp: temporary masks | 
|---|
| 1574 | * | 
|---|
| 1575 | * The effective_cpus is also updated. | 
|---|
| 1576 | * | 
|---|
| 1577 | * cpuset_mutex must be held by the caller. | 
|---|
| 1578 | */ | 
|---|
| 1579 | static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) | 
|---|
| 1580 | { | 
|---|
| 1581 | bool isolcpus_updated; | 
|---|
| 1582 |  | 
|---|
| 1583 | WARN_ON_ONCE(!is_remote_partition(cs)); | 
|---|
| 1584 | WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); | 
|---|
| 1585 |  | 
|---|
| 1586 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 1587 | list_del_init(entry: &cs->remote_sibling); | 
|---|
| 1588 | isolcpus_updated = partition_xcpus_del(old_prs: cs->partition_root_state, | 
|---|
| 1589 | NULL, xcpus: cs->effective_xcpus); | 
|---|
| 1590 | if (cs->prs_err) | 
|---|
| 1591 | cs->partition_root_state = -cs->partition_root_state; | 
|---|
| 1592 | else | 
|---|
| 1593 | cs->partition_root_state = PRS_MEMBER; | 
|---|
| 1594 |  | 
|---|
| 1595 | /* effective_xcpus may need to be changed */ | 
|---|
| 1596 | compute_excpus(cs, excpus: cs->effective_xcpus); | 
|---|
| 1597 | reset_partition_data(cs); | 
|---|
| 1598 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 1599 | update_unbound_workqueue_cpumask(isolcpus_updated); | 
|---|
| 1600 | cpuset_force_rebuild(); | 
|---|
| 1601 |  | 
|---|
| 1602 | /* | 
|---|
| 1603 | * Propagate changes in top_cpuset's effective_cpus down the hierarchy. | 
|---|
| 1604 | */ | 
|---|
| 1605 | cpuset_update_tasks_cpumask(cs: &top_cpuset, new_cpus: tmp->new_cpus); | 
|---|
| 1606 | update_sibling_cpumasks(parent: &top_cpuset, NULL, tmp); | 
|---|
| 1607 | } | 
|---|
| 1608 |  | 
|---|
| 1609 | /* | 
|---|
| 1610 | * remote_cpus_update - cpus_exclusive change of remote partition | 
|---|
| 1611 | * @cs: the cpuset to be updated | 
|---|
| 1612 | * @xcpus: the new exclusive_cpus mask, if non-NULL | 
|---|
| 1613 | * @excpus: the new effective_xcpus mask | 
|---|
| 1614 | * @tmp: temporary masks | 
|---|
| 1615 | * | 
|---|
| 1616 | * top_cpuset and subpartitions_cpus will be updated or partition can be | 
|---|
| 1617 | * invalidated. | 
|---|
| 1618 | */ | 
|---|
| 1619 | static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, | 
|---|
| 1620 | struct cpumask *excpus, struct tmpmasks *tmp) | 
|---|
| 1621 | { | 
|---|
| 1622 | bool adding, deleting; | 
|---|
| 1623 | int prs = cs->partition_root_state; | 
|---|
| 1624 | int isolcpus_updated = 0; | 
|---|
| 1625 |  | 
|---|
| 1626 | if (WARN_ON_ONCE(!is_remote_partition(cs))) | 
|---|
| 1627 | return; | 
|---|
| 1628 |  | 
|---|
| 1629 | WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); | 
|---|
| 1630 |  | 
|---|
| 1631 | if (cpumask_empty(srcp: excpus)) { | 
|---|
| 1632 | cs->prs_err = PERR_CPUSEMPTY; | 
|---|
| 1633 | goto invalidate; | 
|---|
| 1634 | } | 
|---|
| 1635 |  | 
|---|
| 1636 | adding   = cpumask_andnot(dstp: tmp->addmask, src1p: excpus, src2p: cs->effective_xcpus); | 
|---|
| 1637 | deleting = cpumask_andnot(dstp: tmp->delmask, src1p: cs->effective_xcpus, src2p: excpus); | 
|---|
| 1638 |  | 
|---|
| 1639 | /* | 
|---|
| 1640 | * Adding remote CPUs is only allowed if those CPUs are | 
|---|
| 1641 | * not allocated to other partitions and there are effective_cpus | 
|---|
| 1642 | * left in the top cpuset. | 
|---|
| 1643 | */ | 
|---|
| 1644 | if (adding) { | 
|---|
| 1645 | WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus)); | 
|---|
| 1646 | if (!capable(CAP_SYS_ADMIN)) | 
|---|
| 1647 | cs->prs_err = PERR_ACCESS; | 
|---|
| 1648 | else if (cpumask_intersects(src1p: tmp->addmask, src2p: subpartitions_cpus) || | 
|---|
| 1649 | cpumask_subset(src1p: top_cpuset.effective_cpus, src2p: tmp->addmask)) | 
|---|
| 1650 | cs->prs_err = PERR_NOCPUS; | 
|---|
| 1651 | if (cs->prs_err) | 
|---|
| 1652 | goto invalidate; | 
|---|
| 1653 | } | 
|---|
| 1654 |  | 
|---|
| 1655 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 1656 | if (adding) | 
|---|
| 1657 | isolcpus_updated += partition_xcpus_add(new_prs: prs, NULL, xcpus: tmp->addmask); | 
|---|
| 1658 | if (deleting) | 
|---|
| 1659 | isolcpus_updated += partition_xcpus_del(old_prs: prs, NULL, xcpus: tmp->delmask); | 
|---|
| 1660 | /* | 
|---|
| 1661 | * Need to update effective_xcpus and exclusive_cpus now as | 
|---|
| 1662 | * update_sibling_cpumasks() below may iterate back to the same cs. | 
|---|
| 1663 | */ | 
|---|
| 1664 | cpumask_copy(dstp: cs->effective_xcpus, srcp: excpus); | 
|---|
| 1665 | if (xcpus) | 
|---|
| 1666 | cpumask_copy(dstp: cs->exclusive_cpus, srcp: xcpus); | 
|---|
| 1667 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 1668 | update_unbound_workqueue_cpumask(isolcpus_updated); | 
|---|
| 1669 | if (adding || deleting) | 
|---|
| 1670 | cpuset_force_rebuild(); | 
|---|
| 1671 |  | 
|---|
| 1672 | /* | 
|---|
| 1673 | * Propagate changes in top_cpuset's effective_cpus down the hierarchy. | 
|---|
| 1674 | */ | 
|---|
| 1675 | cpuset_update_tasks_cpumask(cs: &top_cpuset, new_cpus: tmp->new_cpus); | 
|---|
| 1676 | update_sibling_cpumasks(parent: &top_cpuset, NULL, tmp); | 
|---|
| 1677 | return; | 
|---|
| 1678 |  | 
|---|
| 1679 | invalidate: | 
|---|
| 1680 | remote_partition_disable(cs, tmp); | 
|---|
| 1681 | } | 
|---|
| 1682 |  | 
|---|
| 1683 | /* | 
|---|
| 1684 | * prstate_housekeeping_conflict - check for partition & housekeeping conflicts | 
|---|
| 1685 | * @prstate: partition root state to be checked | 
|---|
| 1686 | * @new_cpus: cpu mask | 
|---|
| 1687 | * Return: true if there is conflict, false otherwise | 
|---|
| 1688 | * | 
|---|
| 1689 | * CPUs outside of boot_hk_cpus, if defined, can only be used in an | 
|---|
| 1690 | * isolated partition. | 
|---|
| 1691 | */ | 
|---|
| 1692 | static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) | 
|---|
| 1693 | { | 
|---|
| 1694 | if (!have_boot_isolcpus) | 
|---|
| 1695 | return false; | 
|---|
| 1696 |  | 
|---|
| 1697 | if ((prstate != PRS_ISOLATED) && !cpumask_subset(src1p: new_cpus, src2p: boot_hk_cpus)) | 
|---|
| 1698 | return true; | 
|---|
| 1699 |  | 
|---|
| 1700 | return false; | 
|---|
| 1701 | } | 
|---|
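|  |  | 
|---|
|  | /* | 
|---|
|  | * Example (illustrative): booting with "isolcpus=6,7" leaves boot_hk_cpus | 
|---|
|  | * at roughly 0-5.  A normal "root" partition requesting CPUs 4-7 then | 
|---|
|  | * conflicts here, while an "isolated" partition with the same CPUs does | 
|---|
|  | * not, since only isolated partitions may use CPUs outside the boot-time | 
|---|
|  | * housekeeping set. | 
|---|
|  | */ | 
|---|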
| 1702 |  | 
|---|
| 1703 | /** | 
|---|
| 1704 | * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset | 
|---|
| 1705 | * @cs:      The cpuset that requests change in partition root state | 
|---|
| 1706 | * @cmd:     Partition root state change command | 
|---|
| 1707 | * @newmask: Optional new cpumask for partcmd_update | 
|---|
| 1708 | * @tmp:     Temporary addmask and delmask | 
|---|
| 1709 | * Return:   0 or a partition root state error code | 
|---|
| 1710 | * | 
|---|
| 1711 | * For partcmd_enable*, the cpuset is being transformed from a non-partition | 
|---|
| 1712 | * root to a partition root. The effective_xcpus (cpus_allowed if | 
|---|
| 1713 | * effective_xcpus not set) mask of the given cpuset will be taken away from | 
|---|
| 1714 | * parent's effective_cpus. The function will return 0 if all the CPUs listed | 
|---|
| 1715 | * in effective_xcpus can be granted or an error code will be returned. | 
|---|
| 1716 | * | 
|---|
| 1717 | * For partcmd_disable, the cpuset is being transformed from a partition | 
|---|
| 1718 | * root back to a non-partition root. Any CPUs in effective_xcpus will be | 
|---|
| 1719 | * given back to parent's effective_cpus. 0 will always be returned. | 
|---|
| 1720 | * | 
|---|
| 1721 | * For partcmd_update, if the optional newmask is specified, the cpu list is | 
|---|
| 1722 | * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is | 
|---|
| 1723 | * assumed to remain the same. The cpuset should either be a valid or invalid | 
|---|
| 1724 | * partition root. The partition root state may change from valid to invalid | 
|---|
| 1725 | * or vice versa. An error code will be returned if transitioning from | 
|---|
| 1726 | * invalid to valid violates the exclusivity rule. | 
|---|
| 1727 | * | 
|---|
| 1728 | * For partcmd_invalidate, the current partition will be made invalid. | 
|---|
| 1729 | * | 
|---|
| 1730 | * The partcmd_enable* and partcmd_disable commands are used by | 
|---|
| 1731 | * update_prstate(). An error code may be returned and the caller will check | 
|---|
| 1732 | * for error. | 
|---|
| 1733 | * | 
|---|
| 1734 | * The partcmd_update command is used by update_cpumasks_hier() with newmask | 
|---|
| 1735 | * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used | 
|---|
| 1736 | * by update_cpumask() with NULL newmask. In both cases, the callers won't | 
|---|
| 1737 | * check for error and so partition_root_state and prs_err will be updated | 
|---|
| 1738 | * directly. | 
|---|
| 1739 | */ | 
|---|
| 1740 | static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, | 
|---|
| 1741 | struct cpumask *newmask, | 
|---|
| 1742 | struct tmpmasks *tmp) | 
|---|
| 1743 | { | 
|---|
| 1744 | struct cpuset *parent = parent_cs(cs); | 
|---|
| 1745 | int adding;	/* Adding cpus to parent's effective_cpus	*/ | 
|---|
| 1746 | int deleting;	/* Deleting cpus from parent's effective_cpus	*/ | 
|---|
| 1747 | int old_prs, new_prs; | 
|---|
| 1748 | int part_error = PERR_NONE;	/* Partition error? */ | 
|---|
| 1749 | int subparts_delta = 0; | 
|---|
| 1750 | int isolcpus_updated = 0; | 
|---|
| 1751 | struct cpumask *xcpus = user_xcpus(cs); | 
|---|
| 1752 | bool nocpu; | 
|---|
| 1753 |  | 
|---|
| 1754 | lockdep_assert_held(&cpuset_mutex); | 
|---|
| 1755 | WARN_ON_ONCE(is_remote_partition(cs));	/* For local partition only */ | 
|---|
| 1756 |  | 
|---|
| 1757 | /* | 
|---|
| 1758 | * new_prs will only be changed for the partcmd_update and | 
|---|
| 1759 | * partcmd_invalidate commands. | 
|---|
| 1760 | */ | 
|---|
| 1761 | adding = deleting = false; | 
|---|
| 1762 | old_prs = new_prs = cs->partition_root_state; | 
|---|
| 1763 |  | 
|---|
| 1764 | if (cmd == partcmd_invalidate) { | 
|---|
| 1765 | if (is_partition_invalid(cs)) | 
|---|
| 1766 | return 0; | 
|---|
| 1767 |  | 
|---|
| 1768 | /* | 
|---|
| 1769 | * Make the current partition invalid. | 
|---|
| 1770 | */ | 
|---|
| 1771 | if (is_partition_valid(cs: parent)) | 
|---|
| 1772 | adding = cpumask_and(dstp: tmp->addmask, | 
|---|
| 1773 | src1p: xcpus, src2p: parent->effective_xcpus); | 
|---|
| 1774 | if (old_prs > 0) { | 
|---|
| 1775 | new_prs = -old_prs; | 
|---|
| 1776 | subparts_delta--; | 
|---|
| 1777 | } | 
|---|
| 1778 | goto write_error; | 
|---|
| 1779 | } | 
|---|
| 1780 |  | 
|---|
| 1781 | /* | 
|---|
| 1782 | * The parent must be a partition root. | 
|---|
| 1783 | * The new cpumask, if present, or the current cpus_allowed must | 
|---|
| 1784 | * not be empty. | 
|---|
| 1785 | */ | 
|---|
| 1786 | if (!is_partition_valid(cs: parent)) { | 
|---|
| 1787 | return is_partition_invalid(cs: parent) | 
|---|
| 1788 | ? PERR_INVPARENT : PERR_NOTPART; | 
|---|
| 1789 | } | 
|---|
| 1790 | if (!newmask && xcpus_empty(cs)) | 
|---|
| 1791 | return PERR_CPUSEMPTY; | 
|---|
| 1792 |  | 
|---|
| 1793 | nocpu = tasks_nocpu_error(parent, cs, xcpus); | 
|---|
| 1794 |  | 
|---|
| 1795 | if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { | 
|---|
| 1796 | /* | 
|---|
| 1797 | * Need to call compute_excpus() in case | 
|---|
| 1798 | * exclusive_cpus is not set. A sibling conflict should only happen | 
|---|
| 1799 | * if exclusive_cpus isn't set. | 
|---|
| 1800 | */ | 
|---|
| 1801 | xcpus = tmp->delmask; | 
|---|
| 1802 | if (compute_excpus(cs, excpus: xcpus)) | 
|---|
| 1803 | WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); | 
|---|
| 1804 | new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; | 
|---|
| 1805 |  | 
|---|
| 1806 | /* | 
|---|
| 1807 | * Enabling partition root is not allowed if its | 
|---|
| 1808 | * effective_xcpus is empty. | 
|---|
| 1809 | */ | 
|---|
| 1810 | if (cpumask_empty(srcp: xcpus)) | 
|---|
| 1811 | return PERR_INVCPUS; | 
|---|
| 1812 |  | 
|---|
| 1813 | if (prstate_housekeeping_conflict(prstate: new_prs, new_cpus: xcpus)) | 
|---|
| 1814 | return PERR_HKEEPING; | 
|---|
| 1815 |  | 
|---|
| 1816 | if (tasks_nocpu_error(parent, cs, xcpus)) | 
|---|
| 1817 | return PERR_NOCPUS; | 
|---|
| 1818 |  | 
|---|
| 1819 | /* | 
|---|
| 1820 | * This function will only be called when all the preliminary | 
|---|
| 1821 | * checks have passed. At this point, the following condition | 
|---|
| 1822 | * should hold. | 
|---|
| 1823 | * | 
|---|
| 1824 | * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus | 
|---|
| 1825 | * | 
|---|
| 1826 | * Warn if it is not the case. | 
|---|
| 1827 | */ | 
|---|
| 1828 | cpumask_and(dstp: tmp->new_cpus, src1p: xcpus, cpu_active_mask); | 
|---|
| 1829 | WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); | 
|---|
| 1830 |  | 
|---|
| 1831 | deleting = true; | 
|---|
| 1832 | subparts_delta++; | 
|---|
| 1833 | } else if (cmd == partcmd_disable) { | 
|---|
| 1834 | /* | 
|---|
| 1835 | * May need to add cpus back to parent's effective_cpus | 
|---|
| 1836 | * (and maybe removed from subpartitions_cpus/isolated_cpus) | 
|---|
| 1837 | * for valid partition root. xcpus may contain CPUs that | 
|---|
| 1838 | * shouldn't be removed from the two global cpumasks. | 
|---|
| 1839 | */ | 
|---|
| 1840 | if (is_partition_valid(cs)) { | 
|---|
| 1841 | cpumask_copy(dstp: tmp->addmask, srcp: cs->effective_xcpus); | 
|---|
| 1842 | adding = true; | 
|---|
| 1843 | subparts_delta--; | 
|---|
| 1844 | } | 
|---|
| 1845 | new_prs = PRS_MEMBER; | 
|---|
| 1846 | } else if (newmask) { | 
|---|
| 1847 | /* | 
|---|
| 1848 | * Empty cpumask is not allowed | 
|---|
| 1849 | */ | 
|---|
| 1850 | if (cpumask_empty(srcp: newmask)) { | 
|---|
| 1851 | part_error = PERR_CPUSEMPTY; | 
|---|
| 1852 | goto write_error; | 
|---|
| 1853 | } | 
|---|
| 1854 |  | 
|---|
| 1855 | /* Check newmask again, whether cpus are available for parent/cs */ | 
|---|
| 1856 | nocpu |= tasks_nocpu_error(parent, cs, xcpus: newmask); | 
|---|
| 1857 |  | 
|---|
| 1858 | /* | 
|---|
| 1859 | * partcmd_update with newmask: | 
|---|
| 1860 | * | 
|---|
| 1861 | * Compute add/delete mask to/from effective_cpus | 
|---|
| 1862 | * | 
|---|
| 1863 | * For valid partition: | 
|---|
| 1864 | *   addmask = exclusive_cpus & ~newmask | 
|---|
| 1865 | *			      & parent->effective_xcpus | 
|---|
| 1866 | *   delmask = newmask & ~exclusive_cpus | 
|---|
| 1867 | *		       & parent->effective_xcpus | 
|---|
| 1868 | * | 
|---|
| 1869 | * For invalid partition: | 
|---|
| 1870 | *   delmask = newmask & parent->effective_xcpus | 
|---|
| 1871 | */ | 
|---|
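|  | /* | 
|---|
|  | * Worked example (editor's illustration): for a valid partition with | 
|---|
|  | * xcpus = 2-5, newmask = 3-6 and parent->effective_xcpus = 0-7, the | 
|---|
|  | * formulas above give addmask = 2 (returned to the parent) and | 
|---|
|  | * delmask = 6 (newly taken from the parent). | 
|---|
|  | */ | 
|---|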
| 1872 | if (is_partition_invalid(cs)) { | 
|---|
| 1873 | adding = false; | 
|---|
| 1874 | deleting = cpumask_and(dstp: tmp->delmask, | 
|---|
| 1875 | src1p: newmask, src2p: parent->effective_xcpus); | 
|---|
| 1876 | } else { | 
|---|
| 1877 | cpumask_andnot(dstp: tmp->addmask, src1p: xcpus, src2p: newmask); | 
|---|
| 1878 | adding = cpumask_and(dstp: tmp->addmask, src1p: tmp->addmask, | 
|---|
| 1879 | src2p: parent->effective_xcpus); | 
|---|
| 1880 |  | 
|---|
| 1881 | cpumask_andnot(dstp: tmp->delmask, src1p: newmask, src2p: xcpus); | 
|---|
| 1882 | deleting = cpumask_and(dstp: tmp->delmask, src1p: tmp->delmask, | 
|---|
| 1883 | src2p: parent->effective_xcpus); | 
|---|
| 1884 | } | 
|---|
| 1885 | /* | 
|---|
| 1886 | * The new CPUs to be removed from parent's effective CPUs | 
|---|
| 1887 | * must be present. | 
|---|
| 1888 | */ | 
|---|
| 1889 | if (deleting) { | 
|---|
| 1890 | cpumask_and(dstp: tmp->new_cpus, src1p: tmp->delmask, cpu_active_mask); | 
|---|
| 1891 | WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); | 
|---|
| 1892 | } | 
|---|
| 1893 |  | 
|---|
| 1894 | /* | 
|---|
| 1895 | * Make partition invalid if parent's effective_cpus could | 
|---|
| 1896 | * become empty and there are tasks in the parent. | 
|---|
| 1897 | */ | 
|---|
| 1898 | if (nocpu && (!adding || | 
|---|
| 1899 | !cpumask_intersects(src1p: tmp->addmask, cpu_active_mask))) { | 
|---|
| 1900 | part_error = PERR_NOCPUS; | 
|---|
| 1901 | deleting = false; | 
|---|
| 1902 | adding = cpumask_and(dstp: tmp->addmask, | 
|---|
| 1903 | src1p: xcpus, src2p: parent->effective_xcpus); | 
|---|
| 1904 | } | 
|---|
| 1905 | } else { | 
|---|
| 1906 | /* | 
|---|
| 1907 | * partcmd_update w/o newmask | 
|---|
| 1908 | * | 
|---|
| 1909 | * delmask = effective_xcpus & parent->effective_cpus | 
|---|
| 1910 | * | 
|---|
| 1911 | * This can be called from: | 
|---|
| 1912 | * 1) update_cpumasks_hier() | 
|---|
| 1913 | * 2) cpuset_hotplug_update_tasks() | 
|---|
| 1914 | * | 
|---|
| 1915 | * Check to see if it can be transitioned from valid to | 
|---|
| 1916 | * invalid partition or vice versa. | 
|---|
| 1917 | * | 
|---|
| 1918 | * A partition error happens when parent has tasks and all | 
|---|
| 1919 | * its effective CPUs will have to be distributed out. | 
|---|
| 1920 | */ | 
|---|
| 1921 | if (nocpu) { | 
|---|
| 1922 | part_error = PERR_NOCPUS; | 
|---|
| 1923 | if (is_partition_valid(cs)) | 
|---|
| 1924 | adding = cpumask_and(dstp: tmp->addmask, | 
|---|
| 1925 | src1p: xcpus, src2p: parent->effective_xcpus); | 
|---|
| 1926 | } else if (is_partition_invalid(cs) && !cpumask_empty(srcp: xcpus) && | 
|---|
| 1927 | cpumask_subset(src1p: xcpus, src2p: parent->effective_xcpus)) { | 
|---|
| 1928 | struct cgroup_subsys_state *css; | 
|---|
| 1929 | struct cpuset *child; | 
|---|
| 1930 | bool exclusive = true; | 
|---|
| 1931 |  | 
|---|
| 1932 | /* | 
|---|
| 1933 | * Converting an invalid partition to a valid one has to | 
|---|
| 1934 | * pass the cpu exclusivity test. | 
|---|
| 1935 | */ | 
|---|
| 1936 | rcu_read_lock(); | 
|---|
| 1937 | cpuset_for_each_child(child, css, parent) { | 
|---|
| 1938 | if (child == cs) | 
|---|
| 1939 | continue; | 
|---|
| 1940 | if (!cpusets_are_exclusive(cs1: cs, cs2: child)) { | 
|---|
| 1941 | exclusive = false; | 
|---|
| 1942 | break; | 
|---|
| 1943 | } | 
|---|
| 1944 | } | 
|---|
| 1945 | rcu_read_unlock(); | 
|---|
| 1946 | if (exclusive) | 
|---|
| 1947 | deleting = cpumask_and(dstp: tmp->delmask, | 
|---|
| 1948 | src1p: xcpus, src2p: parent->effective_cpus); | 
|---|
| 1949 | else | 
|---|
| 1950 | part_error = PERR_NOTEXCL; | 
|---|
| 1951 | } | 
|---|
| 1952 | } | 
|---|
| 1953 |  | 
|---|
| 1954 | write_error: | 
|---|
| 1955 | if (part_error) | 
|---|
| 1956 | WRITE_ONCE(cs->prs_err, part_error); | 
|---|
| 1957 |  | 
|---|
| 1958 | if (cmd == partcmd_update) { | 
|---|
| 1959 | /* | 
|---|
| 1960 | * Check for possible transition between valid and invalid | 
|---|
| 1961 | * partition root. | 
|---|
| 1962 | */ | 
|---|
| 1963 | switch (cs->partition_root_state) { | 
|---|
| 1964 | case PRS_ROOT: | 
|---|
| 1965 | case PRS_ISOLATED: | 
|---|
| 1966 | if (part_error) { | 
|---|
| 1967 | new_prs = -old_prs; | 
|---|
| 1968 | subparts_delta--; | 
|---|
| 1969 | } | 
|---|
| 1970 | break; | 
|---|
| 1971 | case PRS_INVALID_ROOT: | 
|---|
| 1972 | case PRS_INVALID_ISOLATED: | 
|---|
| 1973 | if (!part_error) { | 
|---|
| 1974 | new_prs = -old_prs; | 
|---|
| 1975 | subparts_delta++; | 
|---|
| 1976 | } | 
|---|
| 1977 | break; | 
|---|
| 1978 | } | 
|---|
| 1979 | } | 
|---|
| 1980 |  | 
|---|
| 1981 | if (!adding && !deleting && (new_prs == old_prs)) | 
|---|
| 1982 | return 0; | 
|---|
| 1983 |  | 
|---|
| 1984 | /* | 
|---|
| 1985 | * Transitioning between invalid to valid or vice versa may require | 
|---|
| 1986 | * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, | 
|---|
| 1987 | * validate_change() has already been successfully called and | 
|---|
| 1988 | * CPU lists in cs haven't been updated yet. So defer it to later. | 
|---|
| 1989 | */ | 
|---|
| 1990 | if ((old_prs != new_prs) && (cmd != partcmd_update))  { | 
|---|
| 1991 | int err = update_partition_exclusive_flag(cs, new_prs); | 
|---|
| 1992 |  | 
|---|
| 1993 | if (err) | 
|---|
| 1994 | return err; | 
|---|
| 1995 | } | 
|---|
| 1996 |  | 
|---|
| 1997 | /* | 
|---|
| 1998 | * Change the parent's effective_cpus & effective_xcpus (top cpuset | 
|---|
| 1999 | * only). | 
|---|
| 2000 | * | 
|---|
| 2001 | * Newly added CPUs will be removed from effective_cpus and | 
|---|
| 2002 | * newly deleted ones will be added back to effective_cpus. | 
|---|
| 2003 | */ | 
|---|
| 2004 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 2005 | if (old_prs != new_prs) { | 
|---|
| 2006 | cs->partition_root_state = new_prs; | 
|---|
| 2007 | if (new_prs <= 0) | 
|---|
| 2008 | cs->nr_subparts = 0; | 
|---|
| 2009 | } | 
|---|
| 2010 | /* | 
|---|
| 2011 | * Adding to parent's effective_cpus means deleting CPUs from cs | 
|---|
| 2012 | * and vice versa. | 
|---|
| 2013 | */ | 
|---|
| 2014 | if (adding) | 
|---|
| 2015 | isolcpus_updated += partition_xcpus_del(old_prs, parent, | 
|---|
| 2016 | xcpus: tmp->addmask); | 
|---|
| 2017 | if (deleting) | 
|---|
| 2018 | isolcpus_updated += partition_xcpus_add(new_prs, parent, | 
|---|
| 2019 | xcpus: tmp->delmask); | 
|---|
| 2020 |  | 
|---|
| 2021 | if (is_partition_valid(cs: parent)) { | 
|---|
| 2022 | parent->nr_subparts += subparts_delta; | 
|---|
| 2023 | WARN_ON_ONCE(parent->nr_subparts < 0); | 
|---|
| 2024 | } | 
|---|
| 2025 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 2026 | update_unbound_workqueue_cpumask(isolcpus_updated); | 
|---|
| 2027 |  | 
|---|
| 2028 | if ((old_prs != new_prs) && (cmd == partcmd_update)) | 
|---|
| 2029 | update_partition_exclusive_flag(cs, new_prs); | 
|---|
| 2030 |  | 
|---|
| 2031 | if (adding || deleting) { | 
|---|
| 2032 | cpuset_update_tasks_cpumask(cs: parent, new_cpus: tmp->addmask); | 
|---|
| 2033 | update_sibling_cpumasks(parent, cs, tmp); | 
|---|
| 2034 | } | 
|---|
| 2035 |  | 
|---|
| 2036 | /* | 
|---|
| 2037 | * For partcmd_update without newmask, it is being called from | 
|---|
| 2038 | * cpuset_handle_hotplug(). Update the load balance flag and | 
|---|
| 2039 | * scheduling domain accordingly. | 
|---|
| 2040 | */ | 
|---|
| 2041 | if ((cmd == partcmd_update) && !newmask) | 
|---|
| 2042 | update_partition_sd_lb(cs, old_prs); | 
|---|
| 2043 |  | 
|---|
| 2044 | notify_partition_change(cs, old_prs); | 
|---|
| 2045 | return 0; | 
|---|
| 2046 | } | 
|---|
| 2047 |  | 
|---|
| 2048 | /** | 
|---|
| 2049 | * compute_partition_effective_cpumask - compute effective_cpus for partition | 
|---|
| 2050 | * @cs: partition root cpuset | 
|---|
| 2051 | * @new_ecpus: previously computed effective_cpus to be updated | 
|---|
| 2052 | * | 
|---|
| 2053 | * Compute the effective_cpus of a partition root by scanning effective_xcpus | 
|---|
| 2054 | * of child partition roots and excluding their effective_xcpus. | 
|---|
| 2055 | * | 
|---|
| 2056 | * This has the side effect of invalidating valid child partition roots, | 
|---|
| 2057 | * if necessary. Since it is called from either cpuset_hotplug_update_tasks() | 
|---|
| 2058 | * or update_cpumasks_hier() where parent and children are modified | 
|---|
| 2059 | * successively, we don't need to call update_parent_effective_cpumask() | 
|---|
| 2060 | * and the child's effective_cpus will be updated in later iterations. | 
|---|
| 2061 | * | 
|---|
| 2062 | * Note that rcu_read_lock() is assumed to be held. | 
|---|
| 2063 | */ | 
|---|
| 2064 | static void compute_partition_effective_cpumask(struct cpuset *cs, | 
|---|
| 2065 | struct cpumask *new_ecpus) | 
|---|
| 2066 | { | 
|---|
| 2067 | struct cgroup_subsys_state *css; | 
|---|
| 2068 | struct cpuset *child; | 
|---|
| 2069 | bool populated = partition_is_populated(cs, NULL); | 
|---|
| 2070 |  | 
|---|
| 2071 | /* | 
|---|
| 2072 | * Check child partition roots to see if they should be | 
|---|
| 2073 | * invalidated when | 
|---|
| 2074 | *  1) child effective_xcpus not a subset of new | 
|---|
| 2075 | *     exclusive_cpus | 
|---|
| 2076 | *  2) All the effective_cpus will be used up and cs | 
|---|
| 2077 | *     has tasks | 
|---|
| 2078 | */ | 
|---|
| 2079 | compute_excpus(cs, excpus: new_ecpus); | 
|---|
| 2080 | cpumask_and(dstp: new_ecpus, src1p: new_ecpus, cpu_active_mask); | 
|---|
| 2081 |  | 
|---|
| 2082 | rcu_read_lock(); | 
|---|
| 2083 | cpuset_for_each_child(child, css, cs) { | 
|---|
| 2084 | if (!is_partition_valid(cs: child)) | 
|---|
| 2085 | continue; | 
|---|
| 2086 |  | 
|---|
| 2087 | /* | 
|---|
| 2088 | * There shouldn't be a remote partition underneath another | 
|---|
| 2089 | * partition root. | 
|---|
| 2090 | */ | 
|---|
| 2091 | WARN_ON_ONCE(is_remote_partition(child)); | 
|---|
| 2092 | child->prs_err = 0; | 
|---|
| 2093 | if (!cpumask_subset(src1p: child->effective_xcpus, | 
|---|
| 2094 | src2p: cs->effective_xcpus)) | 
|---|
| 2095 | child->prs_err = PERR_INVCPUS; | 
|---|
| 2096 | else if (populated && | 
|---|
| 2097 | cpumask_subset(src1p: new_ecpus, src2p: child->effective_xcpus)) | 
|---|
| 2098 | child->prs_err = PERR_NOCPUS; | 
|---|
| 2099 |  | 
|---|
| 2100 | if (child->prs_err) { | 
|---|
| 2101 | int old_prs = child->partition_root_state; | 
|---|
| 2102 |  | 
|---|
| 2103 | /* | 
|---|
| 2104 | * Invalidate child partition | 
|---|
| 2105 | */ | 
|---|
| 2106 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 2107 | make_partition_invalid(cs: child); | 
|---|
| 2108 | cs->nr_subparts--; | 
|---|
| 2109 | child->nr_subparts = 0; | 
|---|
| 2110 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 2111 | notify_partition_change(cs: child, old_prs); | 
|---|
| 2112 | continue; | 
|---|
| 2113 | } | 
|---|
| 2114 | cpumask_andnot(dstp: new_ecpus, src1p: new_ecpus, | 
|---|
| 2115 | src2p: child->effective_xcpus); | 
|---|
| 2116 | } | 
|---|
| 2117 | rcu_read_unlock(); | 
|---|
| 2118 | } | 
|---|
| 2119 |  | 
|---|
| 2120 | /* | 
|---|
| 2121 | * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree | 
|---|
| 2122 | * @cs:  the cpuset to consider | 
|---|
| 2123 | * @tmp: temp variables for calculating effective_cpus & partition setup | 
|---|
| 2124 | * @force: don't skip any descendant cpusets if set | 
|---|
| 2125 | * | 
|---|
| 2126 | * When configured cpumask is changed, the effective cpumasks of this cpuset | 
|---|
| 2127 | * and all its descendants need to be updated. | 
|---|
| 2128 | * | 
|---|
| 2129 | * On legacy hierarchy, effective_cpus will be the same as cpus_allowed. | 
|---|
| 2130 | * | 
|---|
| 2131 | * Called with cpuset_mutex held | 
|---|
| 2132 | */ | 
|---|
| 2133 | static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, | 
|---|
| 2134 | bool force) | 
|---|
| 2135 | { | 
|---|
| 2136 | struct cpuset *cp; | 
|---|
| 2137 | struct cgroup_subsys_state *pos_css; | 
|---|
| 2138 | bool need_rebuild_sched_domains = false; | 
|---|
| 2139 | int old_prs, new_prs; | 
|---|
| 2140 |  | 
|---|
| 2141 | rcu_read_lock(); | 
|---|
| 2142 | cpuset_for_each_descendant_pre(cp, pos_css, cs) { | 
|---|
| 2143 | struct cpuset *parent = parent_cs(cs: cp); | 
|---|
| 2144 | bool remote = is_remote_partition(cs: cp); | 
|---|
| 2145 | bool update_parent = false; | 
|---|
| 2146 |  | 
|---|
| 2147 | old_prs = new_prs = cp->partition_root_state; | 
|---|
| 2148 |  | 
|---|
| 2149 | /* | 
|---|
| 2150 | * For child remote partition root (!= cs), we need to call | 
|---|
| 2151 | * remote_cpus_update() if effective_xcpus will be changed. | 
|---|
| 2152 | * Otherwise, we can skip the whole subtree. | 
|---|
| 2153 | * | 
|---|
| 2154 | * remote_cpus_update() will reuse tmp->new_cpus only after | 
|---|
| 2155 | * its value is being processed. | 
|---|
| 2156 | */ | 
|---|
| 2157 | if (remote && (cp != cs)) { | 
|---|
| 2158 | compute_excpus(cs: cp, excpus: tmp->new_cpus); | 
|---|
| 2159 | if (cpumask_equal(src1p: cp->effective_xcpus, src2p: tmp->new_cpus)) { | 
|---|
| 2160 | pos_css = css_rightmost_descendant(pos: pos_css); | 
|---|
| 2161 | continue; | 
|---|
| 2162 | } | 
|---|
| 2163 | rcu_read_unlock(); | 
|---|
| 2164 | remote_cpus_update(cs: cp, NULL, excpus: tmp->new_cpus, tmp); | 
|---|
| 2165 | rcu_read_lock(); | 
|---|
| 2166 |  | 
|---|
| 2167 | /* Remote partition may be invalidated */ | 
|---|
| 2168 | new_prs = cp->partition_root_state; | 
|---|
| 2169 | remote = (new_prs == old_prs); | 
|---|
| 2170 | } | 
|---|
| 2171 |  | 
|---|
| 2172 | if (remote || (is_partition_valid(cs: parent) && is_partition_valid(cs: cp))) | 
|---|
| 2173 | compute_partition_effective_cpumask(cs: cp, new_ecpus: tmp->new_cpus); | 
|---|
| 2174 | else | 
|---|
| 2175 | compute_effective_cpumask(new_cpus: tmp->new_cpus, cs: cp, parent); | 
|---|
| 2176 |  | 
|---|
| 2177 | if (remote) | 
|---|
| 2178 | goto get_css;	/* Ready to update cpuset data */ | 
|---|
| 2179 |  | 
|---|
| 2180 | /* | 
|---|
| 2181 | * A partition with no effective_cpus is allowed as long as | 
|---|
| 2182 | * there is no task associated with it. Call | 
|---|
| 2183 | * update_parent_effective_cpumask() to check it. | 
|---|
| 2184 | */ | 
|---|
| 2185 | if (is_partition_valid(cs: cp) && cpumask_empty(srcp: tmp->new_cpus)) { | 
|---|
| 2186 | update_parent = true; | 
|---|
| 2187 | goto update_parent_effective; | 
|---|
| 2188 | } | 
|---|
| 2189 |  | 
|---|
| 2190 | /* | 
|---|
| 2191 | * If it becomes empty, inherit the effective mask of the | 
|---|
| 2192 | * parent, which is guaranteed to have some CPUs unless | 
|---|
| 2193 | * it is a partition root that has explicitly distributed | 
|---|
| 2194 | * out all its CPUs. | 
|---|
| 2195 | */ | 
|---|
| 2196 | if (is_in_v2_mode() && !remote && cpumask_empty(srcp: tmp->new_cpus)) | 
|---|
| 2197 | cpumask_copy(dstp: tmp->new_cpus, srcp: parent->effective_cpus); | 
|---|
| 2198 |  | 
|---|
| 2199 | /* | 
|---|
| 2200 | * Skip the whole subtree if | 
|---|
| 2201 | * 1) the cpumask remains the same, | 
|---|
| 2202 | * 2) has no partition root state, | 
|---|
| 2203 | * 3) force flag not set, and | 
|---|
| 2204 | * 4) for v2 load balance state same as its parent. | 
|---|
| 2205 | */ | 
|---|
| 2206 | if (!cp->partition_root_state && !force && | 
|---|
| 2207 | cpumask_equal(src1p: tmp->new_cpus, src2p: cp->effective_cpus) && | 
|---|
| 2208 | (!cpuset_v2() || | 
|---|
| 2209 | (is_sched_load_balance(cs: parent) == is_sched_load_balance(cs: cp)))) { | 
|---|
| 2210 | pos_css = css_rightmost_descendant(pos: pos_css); | 
|---|
| 2211 | continue; | 
|---|
| 2212 | } | 
|---|
| 2213 |  | 
|---|
| 2214 | update_parent_effective: | 
|---|
| 2215 | /* | 
|---|
| 2216 | * update_parent_effective_cpumask() should have been called | 
|---|
| 2217 | * for cs already in update_cpumask(). We should also call | 
|---|
| 2218 | * cpuset_update_tasks_cpumask() again for tasks in the parent | 
|---|
| 2219 | * cpuset if the parent's effective_cpus changes. | 
|---|
| 2220 | */ | 
|---|
| 2221 | if ((cp != cs) && old_prs) { | 
|---|
| 2222 | switch (parent->partition_root_state) { | 
|---|
| 2223 | case PRS_ROOT: | 
|---|
| 2224 | case PRS_ISOLATED: | 
|---|
| 2225 | update_parent = true; | 
|---|
| 2226 | break; | 
|---|
| 2227 |  | 
|---|
| 2228 | default: | 
|---|
| 2229 | /* | 
|---|
| 2230 | * When parent is not a partition root or is | 
|---|
| 2231 | * invalid, child partition roots become | 
|---|
| 2232 | * invalid too. | 
|---|
| 2233 | */ | 
|---|
| 2234 | if (is_partition_valid(cs: cp)) | 
|---|
| 2235 | new_prs = -cp->partition_root_state; | 
|---|
| 2236 | WRITE_ONCE(cp->prs_err, | 
|---|
| 2237 | is_partition_invalid(parent) | 
|---|
| 2238 | ? PERR_INVPARENT : PERR_NOTPART); | 
|---|
| 2239 | break; | 
|---|
| 2240 | } | 
|---|
| 2241 | } | 
|---|
| 2242 | get_css: | 
|---|
| 2243 | if (!css_tryget_online(css: &cp->css)) | 
|---|
| 2244 | continue; | 
|---|
| 2245 | rcu_read_unlock(); | 
|---|
| 2246 |  | 
|---|
| 2247 | if (update_parent) { | 
|---|
| 2248 | update_parent_effective_cpumask(cs: cp, cmd: partcmd_update, NULL, tmp); | 
|---|
| 2249 | /* | 
|---|
| 2250 | * The cpuset partition_root_state may become | 
|---|
| 2251 | * invalid. Capture it. | 
|---|
| 2252 | */ | 
|---|
| 2253 | new_prs = cp->partition_root_state; | 
|---|
| 2254 | } | 
|---|
| 2255 |  | 
|---|
| 2256 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 2257 | cpumask_copy(dstp: cp->effective_cpus, srcp: tmp->new_cpus); | 
|---|
| 2258 | cp->partition_root_state = new_prs; | 
|---|
| 2259 | if (!cpumask_empty(srcp: cp->exclusive_cpus) && (cp != cs)) | 
|---|
| 2260 | compute_excpus(cs: cp, excpus: cp->effective_xcpus); | 
|---|
| 2261 |  | 
|---|
| 2262 | /* | 
|---|
| 2263 | * Make sure effective_xcpus is properly set for a valid | 
|---|
| 2264 | * partition root. | 
|---|
| 2265 | */ | 
|---|
| 2266 | if ((new_prs > 0) && cpumask_empty(srcp: cp->exclusive_cpus)) | 
|---|
| 2267 | cpumask_and(dstp: cp->effective_xcpus, | 
|---|
| 2268 | src1p: cp->cpus_allowed, src2p: parent->effective_xcpus); | 
|---|
| 2269 | else if (new_prs < 0) | 
|---|
| 2270 | reset_partition_data(cs: cp); | 
|---|
| 2271 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 2272 |  | 
|---|
| 2273 | notify_partition_change(cs: cp, old_prs); | 
|---|
| 2274 |  | 
|---|
| 2275 | WARN_ON(!is_in_v2_mode() && | 
|---|
| 2276 | !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); | 
|---|
| 2277 |  | 
|---|
| 2278 | cpuset_update_tasks_cpumask(cs: cp, new_cpus: cp->effective_cpus); | 
|---|
| 2279 |  | 
|---|
| 2280 | /* | 
|---|
| 2281 | * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE | 
|---|
| 2282 | * from parent if current cpuset isn't a valid partition root | 
|---|
| 2283 | * and their load balance states differ. | 
|---|
| 2284 | */ | 
|---|
| 2285 | if (cpuset_v2() && !is_partition_valid(cs: cp) && | 
|---|
| 2286 | (is_sched_load_balance(cs: parent) != is_sched_load_balance(cs: cp))) { | 
|---|
| 2287 | if (is_sched_load_balance(cs: parent)) | 
|---|
| 2288 | set_bit(nr: CS_SCHED_LOAD_BALANCE, addr: &cp->flags); | 
|---|
| 2289 | else | 
|---|
| 2290 | clear_bit(nr: CS_SCHED_LOAD_BALANCE, addr: &cp->flags); | 
|---|
| 2291 | } | 
|---|
| 2292 |  | 
|---|
| 2293 | /* | 
|---|
| 2294 | * On legacy hierarchy, if the effective cpumask of any non- | 
|---|
| 2295 | * empty cpuset is changed, we need to rebuild sched domains. | 
|---|
| 2296 | * On default hierarchy, the cpuset needs to be a partition | 
|---|
| 2297 | * root as well. | 
|---|
| 2298 | */ | 
|---|
| 2299 | if (!cpumask_empty(srcp: cp->cpus_allowed) && | 
|---|
| 2300 | is_sched_load_balance(cs: cp) && | 
|---|
| 2301 | (!cpuset_v2() || is_partition_valid(cs: cp))) | 
|---|
| 2302 | need_rebuild_sched_domains = true; | 
|---|
| 2303 |  | 
|---|
| 2304 | rcu_read_lock(); | 
|---|
| 2305 | css_put(css: &cp->css); | 
|---|
| 2306 | } | 
|---|
| 2307 | rcu_read_unlock(); | 
|---|
| 2308 |  | 
|---|
| 2309 | if (need_rebuild_sched_domains) | 
|---|
| 2310 | cpuset_force_rebuild(); | 
|---|
| 2311 | } | 
|---|
| 2312 |  | 
|---|
| 2313 | /** | 
|---|
| 2314 | * update_sibling_cpumasks - Update siblings cpumasks | 
|---|
| 2315 | * @parent:  Parent cpuset | 
|---|
| 2316 | * @cs:      Current cpuset | 
|---|
| 2317 | * @tmp:     Temp variables | 
|---|
| 2318 | */ | 
|---|
| 2319 | static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, | 
|---|
| 2320 | struct tmpmasks *tmp) | 
|---|
| 2321 | { | 
|---|
| 2322 | struct cpuset *sibling; | 
|---|
| 2323 | struct cgroup_subsys_state *pos_css; | 
|---|
| 2324 |  | 
|---|
| 2325 | lockdep_assert_held(&cpuset_mutex); | 
|---|
| 2326 |  | 
|---|
| 2327 | /* | 
|---|
| 2328 | * Check all its siblings and call update_cpumasks_hier() | 
|---|
| 2329 | * if their effective_cpus will need to be changed. | 
|---|
| 2330 | * | 
|---|
| 2331 | * It is possible that a change in the parent's effective_cpus | 
|---|
| 2332 | * due to a change in a child partition's effective_xcpus will impact | 
|---|
| 2333 | * its siblings even if they do not inherit parent's effective_cpus | 
|---|
| 2334 | * directly. | 
|---|
| 2335 | * | 
|---|
| 2336 | * The update_cpumasks_hier() function may sleep. So we have to | 
|---|
| 2337 | * release the RCU read lock before calling it. | 
|---|
| 2338 | */ | 
|---|
| 2339 | rcu_read_lock(); | 
|---|
| 2340 | cpuset_for_each_child(sibling, pos_css, parent) { | 
|---|
| 2341 | if (sibling == cs) | 
|---|
| 2342 | continue; | 
|---|
| 2343 | if (!is_partition_valid(cs: sibling)) { | 
|---|
| 2344 | compute_effective_cpumask(new_cpus: tmp->new_cpus, cs: sibling, | 
|---|
| 2345 | parent); | 
|---|
| 2346 | if (cpumask_equal(src1p: tmp->new_cpus, src2p: sibling->effective_cpus)) | 
|---|
| 2347 | continue; | 
|---|
| 2348 | } else if (is_remote_partition(cs: sibling)) { | 
|---|
| 2349 | /* | 
|---|
| 2350 | * Change in a sibling cpuset won't affect a remote | 
|---|
| 2351 | * partition root. | 
|---|
| 2352 | */ | 
|---|
| 2353 | continue; | 
|---|
| 2354 | } | 
|---|
| 2355 |  | 
|---|
| 2356 | if (!css_tryget_online(css: &sibling->css)) | 
|---|
| 2357 | continue; | 
|---|
| 2358 |  | 
|---|
| 2359 | rcu_read_unlock(); | 
|---|
| 2360 | update_cpumasks_hier(cs: sibling, tmp, force: false); | 
|---|
| 2361 | rcu_read_lock(); | 
|---|
| 2362 | css_put(css: &sibling->css); | 
|---|
| 2363 | } | 
|---|
| 2364 | rcu_read_unlock(); | 
|---|
| 2365 | } | 
|---|
| 2366 |  | 
|---|
| 2367 | static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) | 
|---|
| 2368 | { | 
|---|
| 2369 | int retval; | 
|---|
| 2370 |  | 
|---|
| 2371 | retval = cpulist_parse(buf, dstp: out_mask); | 
|---|
| 2372 | if (retval < 0) | 
|---|
| 2373 | return retval; | 
|---|
| 2374 | if (!cpumask_subset(src1p: out_mask, src2p: top_cpuset.cpus_allowed)) | 
|---|
| 2375 | return -EINVAL; | 
|---|
| 2376 |  | 
|---|
| 2377 | return 0; | 
|---|
| 2378 | } | 
|---|
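|  |  | 
|---|
|  | /* | 
|---|
|  | * Example (illustrative): a write of "0-3,8" to a cpuset's cpu list is | 
|---|
|  | * parsed by cpulist_parse() into the mask {0,1,2,3,8}; the write is then | 
|---|
|  | * rejected with -EINVAL if any of those CPUs lies outside | 
|---|
|  | * top_cpuset.cpus_allowed. | 
|---|
|  | */ | 
|---|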
| 2379 |  | 
|---|
| 2380 | /** | 
|---|
| 2381 | * validate_partition - Validate a cpuset partition configuration | 
|---|
| 2382 | * @cs: The cpuset to validate | 
|---|
| 2383 | * @trialcs: The trial cpuset containing proposed configuration changes | 
|---|
| 2384 | * | 
|---|
| 2385 | * If any validation check fails, the appropriate error code is set in the | 
|---|
| 2386 | * cpuset's prs_err field. | 
|---|
| 2387 | * | 
|---|
| 2388 | * Return: PRS error code (0 if valid, non-zero error code if invalid) | 
|---|
| 2389 | */ | 
|---|
| 2390 | static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs) | 
|---|
| 2391 | { | 
|---|
| 2392 | struct cpuset *parent = parent_cs(cs); | 
|---|
| 2393 |  | 
|---|
| 2394 | if (cs_is_member(cs: trialcs)) | 
|---|
| 2395 | return PERR_NONE; | 
|---|
| 2396 |  | 
|---|
| 2397 | if (cpumask_empty(srcp: trialcs->effective_xcpus)) | 
|---|
| 2398 | return PERR_INVCPUS; | 
|---|
| 2399 |  | 
|---|
| 2400 | if (prstate_housekeeping_conflict(prstate: trialcs->partition_root_state, | 
|---|
| 2401 | new_cpus: trialcs->effective_xcpus)) | 
|---|
| 2402 | return PERR_HKEEPING; | 
|---|
| 2403 |  | 
|---|
| 2404 | if (tasks_nocpu_error(parent, cs, xcpus: trialcs->effective_xcpus)) | 
|---|
| 2405 | return PERR_NOCPUS; | 
|---|
| 2406 |  | 
|---|
| 2407 | return PERR_NONE; | 
|---|
| 2408 | } | 
|---|
| 2409 |  | 
|---|
| 2410 | static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, | 
|---|
| 2411 | struct tmpmasks *tmp) | 
|---|
| 2412 | { | 
|---|
| 2413 | int retval; | 
|---|
| 2414 | struct cpuset *parent = parent_cs(cs); | 
|---|
| 2415 |  | 
|---|
| 2416 | retval = validate_change(cur: cs, trial: trialcs); | 
|---|
| 2417 |  | 
|---|
| 2418 | if ((retval == -EINVAL) && cpuset_v2()) { | 
|---|
| 2419 | struct cgroup_subsys_state *css; | 
|---|
| 2420 | struct cpuset *cp; | 
|---|
| 2421 |  | 
|---|
| 2422 | /* | 
|---|
| 2423 | * The -EINVAL error code indicates that partition sibling | 
|---|
| 2424 | * CPU exclusivity rule has been violated. We still allow | 
|---|
| 2425 | * the cpumask change to proceed while invalidating the | 
|---|
| 2426 | * partition. However, any conflicting sibling partitions | 
|---|
| 2427 | * have to be marked as invalid too. | 
|---|
| 2428 | */ | 
|---|
| 2429 | trialcs->prs_err = PERR_NOTEXCL; | 
|---|
| 2430 | rcu_read_lock(); | 
|---|
| 2431 | cpuset_for_each_child(cp, css, parent) { | 
|---|
| 2432 | struct cpumask *xcpus = user_xcpus(cs: trialcs); | 
|---|
| 2433 |  | 
|---|
| 2434 | if (is_partition_valid(cs: cp) && | 
|---|
| 2435 | cpumask_intersects(src1p: xcpus, src2p: cp->effective_xcpus)) { | 
|---|
| 2436 | rcu_read_unlock(); | 
|---|
| 2437 | update_parent_effective_cpumask(cs: cp, cmd: partcmd_invalidate, NULL, tmp); | 
|---|
| 2438 | rcu_read_lock(); | 
|---|
| 2439 | } | 
|---|
| 2440 | } | 
|---|
| 2441 | rcu_read_unlock(); | 
|---|
| 2442 | retval = 0; | 
|---|
| 2443 | } | 
|---|
| 2444 | return retval; | 
|---|
| 2445 | } | 
|---|
| 2446 |  | 
|---|
| 2447 | /** | 
|---|
| 2448 | * partition_cpus_change - Handle partition state changes due to CPU mask updates | 
|---|
| 2449 | * @cs: The target cpuset being modified | 
|---|
| 2450 | * @trialcs: The trial cpuset containing proposed configuration changes | 
|---|
| 2451 | * @tmp: Temporary masks for intermediate calculations | 
|---|
| 2452 | * | 
|---|
| 2453 | * This function handles partition state transitions triggered by CPU mask changes. | 
|---|
| 2454 | * CPU modifications may cause a partition to be disabled or require state updates. | 
|---|
| 2455 | */ | 
|---|
| 2456 | static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs, | 
|---|
| 2457 | struct tmpmasks *tmp) | 
|---|
| 2458 | { | 
|---|
| 2459 | enum prs_errcode prs_err; | 
|---|
| 2460 |  | 
|---|
| 2461 | if (cs_is_member(cs)) | 
|---|
| 2462 | return; | 
|---|
| 2463 |  | 
|---|
| 2464 | prs_err = validate_partition(cs, trialcs); | 
|---|
| 2465 | if (prs_err) | 
|---|
| 2466 | trialcs->prs_err = cs->prs_err = prs_err; | 
|---|
| 2467 |  | 
|---|
| 2468 | if (is_remote_partition(cs)) { | 
|---|
| 2469 | if (trialcs->prs_err) | 
|---|
| 2470 | remote_partition_disable(cs, tmp); | 
|---|
| 2471 | else | 
|---|
| 2472 | remote_cpus_update(cs, xcpus: trialcs->exclusive_cpus, | 
|---|
| 2473 | excpus: trialcs->effective_xcpus, tmp); | 
|---|
| 2474 | } else { | 
|---|
| 2475 | if (trialcs->prs_err) | 
|---|
| 2476 | update_parent_effective_cpumask(cs, cmd: partcmd_invalidate, | 
|---|
| 2477 | NULL, tmp); | 
|---|
| 2478 | else | 
|---|
| 2479 | update_parent_effective_cpumask(cs, cmd: partcmd_update, | 
|---|
| 2480 | newmask: trialcs->effective_xcpus, tmp); | 
|---|
| 2481 | } | 
|---|
| 2482 | } | 
|---|
| 2483 |  | 
|---|
| 2484 | /** | 
|---|
| 2485 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it | 
|---|
| 2486 | * @cs: the cpuset to consider | 
|---|
| 2487 | * @trialcs: trial cpuset | 
|---|
| 2488 | * @buf: buffer of cpu numbers written to this cpuset | 
|---|
| 2489 | */ | 
|---|
| 2490 | static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | 
|---|
| 2491 | const char *buf) | 
|---|
| 2492 | { | 
|---|
| 2493 | int retval; | 
|---|
| 2494 | struct tmpmasks tmp; | 
|---|
| 2495 | bool force = false; | 
|---|
| 2496 | int old_prs = cs->partition_root_state; | 
|---|
| 2497 |  | 
|---|
| 2498 | retval = parse_cpuset_cpulist(buf, out_mask: trialcs->cpus_allowed); | 
|---|
| 2499 | if (retval < 0) | 
|---|
| 2500 | return retval; | 
|---|
| 2501 |  | 
|---|
| 2502 | /* Nothing to do if the cpus didn't change */ | 
|---|
| 2503 | if (cpumask_equal(src1p: cs->cpus_allowed, src2p: trialcs->cpus_allowed)) | 
|---|
| 2504 | return 0; | 
|---|
| 2505 |  | 
|---|
| 2506 | if (alloc_tmpmasks(tmp: &tmp)) | 
|---|
| 2507 | return -ENOMEM; | 
|---|
| 2508 |  | 
|---|
| 2509 | compute_trialcs_excpus(trialcs, cs); | 
|---|
| 2510 | trialcs->prs_err = PERR_NONE; | 
|---|
| 2511 |  | 
|---|
| 2512 | retval = cpus_allowed_validate_change(cs, trialcs, tmp: &tmp); | 
|---|
| 2513 | if (retval < 0) | 
|---|
| 2514 | goto out_free; | 
|---|
| 2515 |  | 
|---|
| 2516 | /* | 
|---|
| 2517 | * Check all the descendants in update_cpumasks_hier() if | 
|---|
| 2518 | * effective_xcpus is to be changed. | 
|---|
| 2519 | */ | 
|---|
| 2520 | force = !cpumask_equal(src1p: cs->effective_xcpus, src2p: trialcs->effective_xcpus); | 
|---|
| 2521 |  | 
|---|
| 2522 | partition_cpus_change(cs, trialcs, tmp: &tmp); | 
|---|
| 2523 |  | 
|---|
| 2524 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 2525 | cpumask_copy(dstp: cs->cpus_allowed, srcp: trialcs->cpus_allowed); | 
|---|
| 2526 | cpumask_copy(dstp: cs->effective_xcpus, srcp: trialcs->effective_xcpus); | 
|---|
| 2527 | if ((old_prs > 0) && !is_partition_valid(cs)) | 
|---|
| 2528 | reset_partition_data(cs); | 
|---|
| 2529 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 2530 |  | 
|---|
| 2531 | /* effective_cpus/effective_xcpus will be updated here */ | 
|---|
| 2532 | update_cpumasks_hier(cs, tmp: &tmp, force); | 
|---|
| 2533 |  | 
|---|
| 2534 | /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ | 
|---|
| 2535 | if (cs->partition_root_state) | 
|---|
| 2536 | update_partition_sd_lb(cs, old_prs); | 
|---|
| 2537 | out_free: | 
|---|
| 2538 | free_tmpmasks(tmp: &tmp); | 
|---|
| 2539 | return retval; | 
|---|
| 2540 | } | 
|---|
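|  | /* | 
|---|
|  | * Rough flow of a cpuset.cpus write, as implemented above (summary only): | 
|---|
|  | * | 
|---|
|  | *  1. parse_cpuset_cpulist()         - parse and bound-check the new mask | 
|---|
|  | *  2. compute_trialcs_excpus()       - derive the trial effective_xcpus | 
|---|
|  | *  3. cpus_allowed_validate_change() - exclusivity checks; conflicting | 
|---|
|  | *                                      sibling partitions may be invalidated | 
|---|
|  | *  4. partition_cpus_change()        - adjust local/remote partition state | 
|---|
|  | *  5. copy the masks under callback_lock and let update_cpumasks_hier() | 
|---|
|  | *     propagate effective_cpus/effective_xcpus down the subtree | 
|---|
|  | */ | 
|---|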
| 2541 |  | 
|---|
| 2542 | /** | 
|---|
| 2543 | * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset | 
|---|
| 2544 | * @cs: the cpuset to consider | 
|---|
| 2545 | * @trialcs: trial cpuset | 
|---|
| 2546 | * @buf: buffer of cpu numbers written to this cpuset | 
|---|
| 2547 | * | 
|---|
| 2548 | * The tasks' cpumask will be updated if cs is a valid partition root. | 
|---|
| 2549 | */ | 
|---|
| 2550 | static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, | 
|---|
| 2551 | const char *buf) | 
|---|
| 2552 | { | 
|---|
| 2553 | int retval; | 
|---|
| 2554 | struct tmpmasks tmp; | 
|---|
| 2555 | bool force = false; | 
|---|
| 2556 | int old_prs = cs->partition_root_state; | 
|---|
| 2557 |  | 
|---|
| 2558 | retval = parse_cpuset_cpulist(buf, out_mask: trialcs->exclusive_cpus); | 
|---|
| 2559 | if (retval < 0) | 
|---|
| 2560 | return retval; | 
|---|
| 2561 |  | 
|---|
| 2562 | /* Nothing to do if the CPUs didn't change */ | 
|---|
| 2563 | if (cpumask_equal(src1p: cs->exclusive_cpus, src2p: trialcs->exclusive_cpus)) | 
|---|
| 2564 | return 0; | 
|---|
| 2565 |  | 
|---|
| 2566 | /* | 
|---|
| 2567 | * Reject the change if the exclusive CPUs conflict with those | 
|---|
| 2568 | * of the siblings. | 
|---|
| 2569 | */ | 
|---|
| 2570 | if (compute_trialcs_excpus(trialcs, cs)) | 
|---|
| 2571 | return -EINVAL; | 
|---|
| 2572 |  | 
|---|
| 2573 | /* | 
|---|
| 2574 | * Check all the descendants in update_cpumasks_hier() if | 
|---|
| 2575 | * effective_xcpus is to be changed. | 
|---|
| 2576 | */ | 
|---|
| 2577 | force = !cpumask_equal(src1p: cs->effective_xcpus, src2p: trialcs->effective_xcpus); | 
|---|
| 2578 |  | 
|---|
| 2579 | retval = validate_change(cur: cs, trial: trialcs); | 
|---|
| 2580 | if (retval) | 
|---|
| 2581 | return retval; | 
|---|
| 2582 |  | 
|---|
| 2583 | if (alloc_tmpmasks(tmp: &tmp)) | 
|---|
| 2584 | return -ENOMEM; | 
|---|
| 2585 |  | 
|---|
| 2586 | trialcs->prs_err = PERR_NONE; | 
|---|
| 2587 | partition_cpus_change(cs, trialcs, tmp: &tmp); | 
|---|
| 2588 |  | 
|---|
| 2589 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 2590 | cpumask_copy(dstp: cs->exclusive_cpus, srcp: trialcs->exclusive_cpus); | 
|---|
| 2591 | cpumask_copy(dstp: cs->effective_xcpus, srcp: trialcs->effective_xcpus); | 
|---|
| 2592 | if ((old_prs > 0) && !is_partition_valid(cs)) | 
|---|
| 2593 | reset_partition_data(cs); | 
|---|
| 2594 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 2595 |  | 
|---|
| 2596 | /* | 
|---|
| 2597 | * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus | 
|---|
| 2598 | * of the subtree when it is a valid partition root or effective_xcpus | 
|---|
| 2599 | * is updated. | 
|---|
| 2600 | */ | 
|---|
| 2601 | if (is_partition_valid(cs) || force) | 
|---|
| 2602 | update_cpumasks_hier(cs, tmp: &tmp, force); | 
|---|
| 2603 |  | 
|---|
| 2604 | /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ | 
|---|
| 2605 | if (cs->partition_root_state) | 
|---|
| 2606 | update_partition_sd_lb(cs, old_prs); | 
|---|
| 2607 |  | 
|---|
| 2608 | free_tmpmasks(tmp: &tmp); | 
|---|
| 2609 | return 0; | 
|---|
| 2610 | } | 
|---|
| 2611 |  | 
|---|
| 2612 | /* | 
|---|
| 2613 | * Migrate memory region from one set of nodes to another.  This is | 
|---|
| 2614 | * performed asynchronously as it can be called from the process migration | 
|---|
| 2615 | * path while holding locks involved in process management.  All mm migrations are | 
|---|
| 2616 | * performed in the queued order and can be waited for by flushing | 
|---|
| 2617 | * cpuset_migrate_mm_wq. | 
|---|
| 2618 | */ | 
|---|
| 2619 |  | 
|---|
| 2620 | struct cpuset_migrate_mm_work { | 
|---|
| 2621 | struct work_struct	work; | 
|---|
| 2622 | struct mm_struct	*mm; | 
|---|
| 2623 | nodemask_t		from; | 
|---|
| 2624 | nodemask_t		to; | 
|---|
| 2625 | }; | 
|---|
| 2626 |  | 
|---|
| 2627 | static void cpuset_migrate_mm_workfn(struct work_struct *work) | 
|---|
| 2628 | { | 
|---|
| 2629 | struct cpuset_migrate_mm_work *mwork = | 
|---|
| 2630 | container_of(work, struct cpuset_migrate_mm_work, work); | 
|---|
| 2631 |  | 
|---|
| 2632 | /* on a wq worker, no need to worry about %current's mems_allowed */ | 
|---|
| 2633 | do_migrate_pages(mm: mwork->mm, from: &mwork->from, to: &mwork->to, MPOL_MF_MOVE_ALL); | 
|---|
| 2634 | mmput(mwork->mm); | 
|---|
| 2635 | kfree(objp: mwork); | 
|---|
| 2636 | } | 
|---|
| 2637 |  | 
|---|
| 2638 | static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | 
|---|
| 2639 | const nodemask_t *to) | 
|---|
| 2640 | { | 
|---|
| 2641 | struct cpuset_migrate_mm_work *mwork; | 
|---|
| 2642 |  | 
|---|
| 2643 | if (nodes_equal(*from, *to)) { | 
|---|
| 2644 | mmput(mm); | 
|---|
| 2645 | return; | 
|---|
| 2646 | } | 
|---|
| 2647 |  | 
|---|
| 2648 | mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); | 
|---|
| 2649 | if (mwork) { | 
|---|
| 2650 | mwork->mm = mm; | 
|---|
| 2651 | mwork->from = *from; | 
|---|
| 2652 | mwork->to = *to; | 
|---|
| 2653 | INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); | 
|---|
| 2654 | queue_work(wq: cpuset_migrate_mm_wq, work: &mwork->work); | 
|---|
| 2655 | } else { | 
|---|
| 2656 | mmput(mm); | 
|---|
| 2657 | } | 
|---|
| 2658 | } | 
|---|
| 2659 |  | 
|---|
| 2660 | static void flush_migrate_mm_task_workfn(struct callback_head *head) | 
|---|
| 2661 | { | 
|---|
| 2662 | flush_workqueue(cpuset_migrate_mm_wq); | 
|---|
| 2663 | kfree(objp: head); | 
|---|
| 2664 | } | 
|---|
| 2665 |  | 
|---|
| 2666 | static void schedule_flush_migrate_mm(void) | 
|---|
| 2667 | { | 
|---|
| 2668 | struct callback_head *flush_cb; | 
|---|
| 2669 |  | 
|---|
| 2670 | flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); | 
|---|
| 2671 | if (!flush_cb) | 
|---|
| 2672 | return; | 
|---|
| 2673 |  | 
|---|
| 2674 | init_task_work(twork: flush_cb, func: flush_migrate_mm_task_workfn); | 
|---|
| 2675 |  | 
|---|
| 2676 | if (task_work_add(current, twork: flush_cb, mode: TWA_RESUME)) | 
|---|
| 2677 | kfree(objp: flush_cb); | 
|---|
| 2678 | } | 
|---|
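|  | /* | 
|---|
|  | * Illustrative note: deferring the flush through task_work means the task | 
|---|
|  | * that queued the migration drains cpuset_migrate_mm_wq on its way back to | 
|---|
|  | * userspace, after the cpuset locks taken in the write/attach paths have | 
|---|
|  | * been released, instead of flushing synchronously in those paths. | 
|---|
|  | */ | 
|---|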
| 2679 |  | 
|---|
| 2680 | /* | 
|---|
| 2681 | * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy | 
|---|
| 2682 | * @tsk: the task to change | 
|---|
| 2683 | * @newmems: new nodes that the task will be set | 
|---|
| 2684 | * | 
|---|
| 2685 | * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed | 
|---|
| 2686 | * and rebind an eventual tasks' mempolicy. If the task is allocating in | 
|---|
| 2687 | * parallel, it might temporarily see an empty intersection, which results in | 
|---|
| 2688 | * a seqlock check and retry before OOM or allocation failure. | 
|---|
| 2689 | */ | 
|---|
| 2690 | static void cpuset_change_task_nodemask(struct task_struct *tsk, | 
|---|
| 2691 | nodemask_t *newmems) | 
|---|
| 2692 | { | 
|---|
| 2693 | task_lock(p: tsk); | 
|---|
| 2694 |  | 
|---|
| 2695 | local_irq_disable(); | 
|---|
| 2696 | write_seqcount_begin(&tsk->mems_allowed_seq); | 
|---|
| 2697 |  | 
|---|
| 2698 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); | 
|---|
| 2699 | mpol_rebind_task(tsk, new: newmems); | 
|---|
| 2700 | tsk->mems_allowed = *newmems; | 
|---|
| 2701 |  | 
|---|
| 2702 | write_seqcount_end(&tsk->mems_allowed_seq); | 
|---|
| 2703 | local_irq_enable(); | 
|---|
| 2704 |  | 
|---|
| 2705 | task_unlock(p: tsk); | 
|---|
| 2706 | } | 
|---|
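|  | /* | 
|---|
|  | * Reader-side sketch (illustration only): allocators pair with the seqcount | 
|---|
|  | * above roughly as follows, using the existing helpers from | 
|---|
|  | * include/linux/cpuset.h: | 
|---|
|  | * | 
|---|
|  | *	unsigned int seq; | 
|---|
|  | *	do { | 
|---|
|  | *		seq = read_mems_allowed_begin(); | 
|---|
|  | *		... allocate from current->mems_allowed ... | 
|---|
|  | *	} while (read_mems_allowed_retry(seq)); | 
|---|
|  | * | 
|---|
|  | * The retry loop covers the transient window in which mems_allowed and the | 
|---|
|  | * task's mempolicy may briefly disagree while the writer updates both. | 
|---|
|  | */ | 
|---|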
| 2707 |  | 
|---|
| 2708 | static void *cpuset_being_rebound; | 
|---|
| 2709 |  | 
|---|
| 2710 | /** | 
|---|
| 2711 | * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. | 
|---|
| 2712 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed | 
|---|
| 2713 | * | 
|---|
| 2714 | * Iterate through each task of @cs updating its mems_allowed to the | 
|---|
| 2715 | * effective cpuset's.  As this function is called with cpuset_mutex held, | 
|---|
| 2716 | * cpuset membership stays stable. | 
|---|
| 2717 | */ | 
|---|
| 2718 | void cpuset_update_tasks_nodemask(struct cpuset *cs) | 
|---|
| 2719 | { | 
|---|
| 2720 | static nodemask_t newmems;	/* protected by cpuset_mutex */ | 
|---|
| 2721 | struct css_task_iter it; | 
|---|
| 2722 | struct task_struct *task; | 
|---|
| 2723 |  | 
|---|
| 2724 | cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */ | 
|---|
| 2725 |  | 
|---|
| 2726 | guarantee_online_mems(cs, pmask: &newmems); | 
|---|
| 2727 |  | 
|---|
| 2728 | /* | 
|---|
| 2729 | * The mpol_rebind_mm() call takes mmap_lock, which we couldn't | 
|---|
| 2730 | * take while holding tasklist_lock.  Forks can happen - the | 
|---|
| 2731 | * mpol_dup() cpuset_being_rebound check will catch such forks, | 
|---|
| 2732 | * and rebind their vma mempolicies too.  Because we still hold | 
|---|
| 2733 | * the global cpuset_mutex, we know that no other rebind effort | 
|---|
| 2734 | * will be contending for the global variable cpuset_being_rebound. | 
|---|
| 2735 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 
|---|
| 2736 | * is idempotent.  Also migrate pages in each mm to new nodes. | 
|---|
| 2737 | */ | 
|---|
| 2738 | css_task_iter_start(css: &cs->css, flags: 0, it: &it); | 
|---|
| 2739 | while ((task = css_task_iter_next(it: &it))) { | 
|---|
| 2740 | struct mm_struct *mm; | 
|---|
| 2741 | bool migrate; | 
|---|
| 2742 |  | 
|---|
| 2743 | cpuset_change_task_nodemask(tsk: task, newmems: &newmems); | 
|---|
| 2744 |  | 
|---|
| 2745 | mm = get_task_mm(task); | 
|---|
| 2746 | if (!mm) | 
|---|
| 2747 | continue; | 
|---|
| 2748 |  | 
|---|
| 2749 | migrate = is_memory_migrate(cs); | 
|---|
| 2750 |  | 
|---|
| 2751 | mpol_rebind_mm(mm, new: &cs->mems_allowed); | 
|---|
| 2752 | if (migrate) | 
|---|
| 2753 | cpuset_migrate_mm(mm, from: &cs->old_mems_allowed, to: &newmems); | 
|---|
| 2754 | else | 
|---|
| 2755 | mmput(mm); | 
|---|
| 2756 | } | 
|---|
| 2757 | css_task_iter_end(it: &it); | 
|---|
| 2758 |  | 
|---|
| 2759 | /* | 
|---|
| 2760 | * All the tasks' nodemasks have been updated, update | 
|---|
| 2761 | * cs->old_mems_allowed. | 
|---|
| 2762 | */ | 
|---|
| 2763 | cs->old_mems_allowed = newmems; | 
|---|
| 2764 |  | 
|---|
| 2765 | /* We're done rebinding vmas to this cpuset's new mems_allowed. */ | 
|---|
| 2766 | cpuset_being_rebound = NULL; | 
|---|
| 2767 | } | 
|---|
| 2768 |  | 
|---|
| 2769 | /* | 
|---|
| 2770 | * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree | 
|---|
| 2771 | * @cs: the cpuset to consider | 
|---|
| 2772 | * @new_mems: a temp variable for calculating new effective_mems | 
|---|
| 2773 | * | 
|---|
| 2774 | * When configured nodemask is changed, the effective nodemasks of this cpuset | 
|---|
| 2775 | * and all its descendants need to be updated. | 
|---|
| 2776 | * | 
|---|
| 2777 | * On the legacy hierarchy, effective_mems will be the same as mems_allowed. | 
|---|
| 2778 | * | 
|---|
| 2779 | * Called with cpuset_mutex held | 
|---|
| 2780 | */ | 
|---|
| 2781 | static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) | 
|---|
| 2782 | { | 
|---|
| 2783 | struct cpuset *cp; | 
|---|
| 2784 | struct cgroup_subsys_state *pos_css; | 
|---|
| 2785 |  | 
|---|
| 2786 | rcu_read_lock(); | 
|---|
| 2787 | cpuset_for_each_descendant_pre(cp, pos_css, cs) { | 
|---|
| 2788 | struct cpuset *parent = parent_cs(cs: cp); | 
|---|
| 2789 |  | 
|---|
| 2790 | nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); | 
|---|
| 2791 |  | 
|---|
| 2792 | /* | 
|---|
| 2793 | * If it becomes empty, inherit the effective mask of the | 
|---|
| 2794 | * parent, which is guaranteed to have some MEMs. | 
|---|
| 2795 | */ | 
|---|
| 2796 | if (is_in_v2_mode() && nodes_empty(*new_mems)) | 
|---|
| 2797 | *new_mems = parent->effective_mems; | 
|---|
| 2798 |  | 
|---|
| 2799 | /* Skip the whole subtree if the nodemask remains the same. */ | 
|---|
| 2800 | if (nodes_equal(*new_mems, cp->effective_mems)) { | 
|---|
| 2801 | pos_css = css_rightmost_descendant(pos: pos_css); | 
|---|
| 2802 | continue; | 
|---|
| 2803 | } | 
|---|
| 2804 |  | 
|---|
| 2805 | if (!css_tryget_online(css: &cp->css)) | 
|---|
| 2806 | continue; | 
|---|
| 2807 | rcu_read_unlock(); | 
|---|
| 2808 |  | 
|---|
| 2809 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 2810 | cp->effective_mems = *new_mems; | 
|---|
| 2811 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 2812 |  | 
|---|
| 2813 | WARN_ON(!is_in_v2_mode() && | 
|---|
| 2814 | !nodes_equal(cp->mems_allowed, cp->effective_mems)); | 
|---|
| 2815 |  | 
|---|
| 2816 | cpuset_update_tasks_nodemask(cs: cp); | 
|---|
| 2817 |  | 
|---|
| 2818 | rcu_read_lock(); | 
|---|
| 2819 | css_put(css: &cp->css); | 
|---|
| 2820 | } | 
|---|
| 2821 | rcu_read_unlock(); | 
|---|
| 2822 | } | 
|---|
| 2823 |  | 
|---|
| 2824 | /* | 
|---|
| 2825 | * Handle user request to change the 'mems' memory placement | 
|---|
| 2826 | * of a cpuset.  Needs to validate the request, update the | 
|---|
| 2827 | * cpuset's mems_allowed, and for each task in the cpuset, | 
|---|
| 2828 | * update mems_allowed and rebind task's mempolicy and any vma | 
|---|
| 2829 | * mempolicies and if the cpuset is marked 'memory_migrate', | 
|---|
| 2830 | * migrate the task's pages to the new memory. | 
|---|
| 2831 | * | 
|---|
| 2832 | * Call with cpuset_mutex held. May take callback_lock during call. | 
|---|
| 2833 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | 
|---|
| 2834 | * lock each such task's mm->mmap_lock, scan its VMAs and rebind | 
|---|
| 2835 | * their mempolicies to the cpuset's new mems_allowed. | 
|---|
| 2836 | */ | 
|---|
| 2837 | static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | 
|---|
| 2838 | const char *buf) | 
|---|
| 2839 | { | 
|---|
| 2840 | int retval; | 
|---|
| 2841 |  | 
|---|
| 2842 | /* | 
|---|
| 2843 | * An empty mems_allowed is ok iff there are no tasks in the cpuset. | 
|---|
| 2844 | * The validate_change() call ensures that cpusets with tasks have memory. | 
|---|
| 2845 | */ | 
|---|
| 2846 | retval = nodelist_parse(buf, trialcs->mems_allowed); | 
|---|
| 2847 | if (retval < 0) | 
|---|
| 2848 | goto done; | 
|---|
| 2849 |  | 
|---|
| 2850 | if (!nodes_subset(trialcs->mems_allowed, | 
|---|
| 2851 | top_cpuset.mems_allowed)) { | 
|---|
| 2852 | retval = -EINVAL; | 
|---|
| 2853 | goto done; | 
|---|
| 2854 | } | 
|---|
| 2855 |  | 
|---|
| 2856 | if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { | 
|---|
| 2857 | retval = 0;		/* Too easy - nothing to do */ | 
|---|
| 2858 | goto done; | 
|---|
| 2859 | } | 
|---|
| 2860 | retval = validate_change(cur: cs, trial: trialcs); | 
|---|
| 2861 | if (retval < 0) | 
|---|
| 2862 | goto done; | 
|---|
| 2863 |  | 
|---|
| 2864 | check_insane_mems_config(nodes: &trialcs->mems_allowed); | 
|---|
| 2865 |  | 
|---|
| 2866 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 2867 | cs->mems_allowed = trialcs->mems_allowed; | 
|---|
| 2868 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 2869 |  | 
|---|
| 2870 | /* use trialcs->mems_allowed as a temp variable */ | 
|---|
| 2871 | update_nodemasks_hier(cs, new_mems: &trialcs->mems_allowed); | 
|---|
| 2872 | done: | 
|---|
| 2873 | return retval; | 
|---|
| 2874 | } | 
|---|
| 2875 |  | 
|---|
| 2876 | bool current_cpuset_is_being_rebound(void) | 
|---|
| 2877 | { | 
|---|
| 2878 | bool ret; | 
|---|
| 2879 |  | 
|---|
| 2880 | rcu_read_lock(); | 
|---|
| 2881 | ret = task_cs(current) == cpuset_being_rebound; | 
|---|
| 2882 | rcu_read_unlock(); | 
|---|
| 2883 |  | 
|---|
| 2884 | return ret; | 
|---|
| 2885 | } | 
|---|
| 2886 |  | 
|---|
| 2887 | /* | 
|---|
| 2888 | * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag | 
|---|
| 2889 | * bit:		the bit to update (see cpuset_flagbits_t) | 
|---|
| 2890 | * cs:		the cpuset to update | 
|---|
| 2891 | * turning_on: 	whether the flag is being set or cleared | 
|---|
| 2892 | * | 
|---|
| 2893 | * Call with cpuset_mutex held. | 
|---|
| 2894 | */ | 
|---|
| 2895 |  | 
|---|
| 2896 | int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | 
|---|
| 2897 | int turning_on) | 
|---|
| 2898 | { | 
|---|
| 2899 | struct cpuset *trialcs; | 
|---|
| 2900 | int balance_flag_changed; | 
|---|
| 2901 | int spread_flag_changed; | 
|---|
| 2902 | int err; | 
|---|
| 2903 |  | 
|---|
| 2904 | trialcs = dup_or_alloc_cpuset(cs); | 
|---|
| 2905 | if (!trialcs) | 
|---|
| 2906 | return -ENOMEM; | 
|---|
| 2907 |  | 
|---|
| 2908 | if (turning_on) | 
|---|
| 2909 | set_bit(nr: bit, addr: &trialcs->flags); | 
|---|
| 2910 | else | 
|---|
| 2911 | clear_bit(nr: bit, addr: &trialcs->flags); | 
|---|
| 2912 |  | 
|---|
| 2913 | err = validate_change(cur: cs, trial: trialcs); | 
|---|
| 2914 | if (err < 0) | 
|---|
| 2915 | goto out; | 
|---|
| 2916 |  | 
|---|
| 2917 | balance_flag_changed = (is_sched_load_balance(cs) != | 
|---|
| 2918 | is_sched_load_balance(cs: trialcs)); | 
|---|
| 2919 |  | 
|---|
| 2920 | spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(cs: trialcs)) | 
|---|
| 2921 | || (is_spread_page(cs) != is_spread_page(cs: trialcs))); | 
|---|
| 2922 |  | 
|---|
| 2923 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 2924 | cs->flags = trialcs->flags; | 
|---|
| 2925 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 2926 |  | 
|---|
| 2927 | if (!cpumask_empty(srcp: trialcs->cpus_allowed) && balance_flag_changed) { | 
|---|
| 2928 | if (cpuset_v2()) | 
|---|
| 2929 | cpuset_force_rebuild(); | 
|---|
| 2930 | else | 
|---|
| 2931 | rebuild_sched_domains_locked(); | 
|---|
| 2932 | } | 
|---|
| 2933 |  | 
|---|
| 2934 | if (spread_flag_changed) | 
|---|
| 2935 | cpuset1_update_tasks_flags(cs); | 
|---|
| 2936 | out: | 
|---|
| 2937 | free_cpuset(cs: trialcs); | 
|---|
| 2938 | return err; | 
|---|
| 2939 | } | 
|---|
| 2940 |  | 
|---|
| 2941 | /** | 
|---|
| 2942 | * update_prstate - update partition_root_state | 
|---|
| 2943 | * @cs: the cpuset to update | 
|---|
| 2944 | * @new_prs: new partition root state | 
|---|
| 2945 | * Return: 0 if successful, != 0 if error | 
|---|
| 2946 | * | 
|---|
| 2947 | * Call with cpuset_mutex held. | 
|---|
| 2948 | */ | 
|---|
| 2949 | static int update_prstate(struct cpuset *cs, int new_prs) | 
|---|
| 2950 | { | 
|---|
| 2951 | int err = PERR_NONE, old_prs = cs->partition_root_state; | 
|---|
| 2952 | struct cpuset *parent = parent_cs(cs); | 
|---|
| 2953 | struct tmpmasks tmpmask; | 
|---|
| 2954 | bool isolcpus_updated = false; | 
|---|
| 2955 |  | 
|---|
| 2956 | if (old_prs == new_prs) | 
|---|
| 2957 | return 0; | 
|---|
| 2958 |  | 
|---|
| 2959 | /* | 
|---|
| 2960 | * Treat a previously invalid partition root as if it is a "member". | 
|---|
| 2961 | */ | 
|---|
| 2962 | if (new_prs && is_partition_invalid(cs)) | 
|---|
| 2963 | old_prs = PRS_MEMBER; | 
|---|
| 2964 |  | 
|---|
| 2965 | if (alloc_tmpmasks(tmp: &tmpmask)) | 
|---|
| 2966 | return -ENOMEM; | 
|---|
| 2967 |  | 
|---|
| 2968 | err = update_partition_exclusive_flag(cs, new_prs); | 
|---|
| 2969 | if (err) | 
|---|
| 2970 | goto out; | 
|---|
| 2971 |  | 
|---|
| 2972 | if (!old_prs) { | 
|---|
| 2973 | /* | 
|---|
| 2974 | * cpus_allowed and exclusive_cpus cannot both be empty. | 
|---|
| 2975 | */ | 
|---|
| 2976 | if (xcpus_empty(cs)) { | 
|---|
| 2977 | err = PERR_CPUSEMPTY; | 
|---|
| 2978 | goto out; | 
|---|
| 2979 | } | 
|---|
| 2980 |  | 
|---|
| 2981 | /* | 
|---|
| 2982 | * We don't support the creation of a new local partition with | 
|---|
| 2983 | * a remote partition underneath it. This unsupported | 
|---|
| 2984 | * setting can happen only if parent is the top_cpuset because | 
|---|
| 2985 | * a remote partition cannot be created underneath an existing | 
|---|
| 2986 | * local or remote partition. | 
|---|
| 2987 | */ | 
|---|
| 2988 | if ((parent == &top_cpuset) && | 
|---|
| 2989 | cpumask_intersects(src1p: cs->exclusive_cpus, src2p: subpartitions_cpus)) { | 
|---|
| 2990 | err = PERR_REMOTE; | 
|---|
| 2991 | goto out; | 
|---|
| 2992 | } | 
|---|
| 2993 |  | 
|---|
| 2994 | /* | 
|---|
| 2995 | * If the parent is a valid partition root, enable a local partition. | 
|---|
| 2996 | * Otherwise, enable a remote partition. | 
|---|
| 2997 | */ | 
|---|
| 2998 | if (is_partition_valid(cs: parent)) { | 
|---|
| 2999 | enum partition_cmd cmd = (new_prs == PRS_ROOT) | 
|---|
| 3000 | ? partcmd_enable : partcmd_enablei; | 
|---|
| 3001 |  | 
|---|
| 3002 | err = update_parent_effective_cpumask(cs, cmd, NULL, tmp: &tmpmask); | 
|---|
| 3003 | } else { | 
|---|
| 3004 | err = remote_partition_enable(cs, new_prs, tmp: &tmpmask); | 
|---|
| 3005 | } | 
|---|
| 3006 | } else if (old_prs && new_prs) { | 
|---|
| 3007 | /* | 
|---|
| 3008 | * A change in load balance state only, no change in cpumasks. | 
|---|
| 3009 | * Need to update isolated_cpus. | 
|---|
| 3010 | */ | 
|---|
| 3011 | isolcpus_updated = true; | 
|---|
| 3012 | } else { | 
|---|
| 3013 | /* | 
|---|
| 3014 | * Switching back to member is always allowed even if it | 
|---|
| 3015 | * disables child partitions. | 
|---|
| 3016 | */ | 
|---|
| 3017 | if (is_remote_partition(cs)) | 
|---|
| 3018 | remote_partition_disable(cs, tmp: &tmpmask); | 
|---|
| 3019 | else | 
|---|
| 3020 | update_parent_effective_cpumask(cs, cmd: partcmd_disable, | 
|---|
| 3021 | NULL, tmp: &tmpmask); | 
|---|
| 3022 |  | 
|---|
| 3023 | /* | 
|---|
| 3024 | * Invalidation of child partitions will be done in | 
|---|
| 3025 | * update_cpumasks_hier(). | 
|---|
| 3026 | */ | 
|---|
| 3027 | } | 
|---|
| 3028 | out: | 
|---|
| 3029 | /* | 
|---|
| 3030 | * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error | 
|---|
| 3031 | * happens. | 
|---|
| 3032 | */ | 
|---|
| 3033 | if (err) { | 
|---|
| 3034 | new_prs = -new_prs; | 
|---|
| 3035 | update_partition_exclusive_flag(cs, new_prs); | 
|---|
| 3036 | } | 
|---|
| 3037 |  | 
|---|
| 3038 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 3039 | cs->partition_root_state = new_prs; | 
|---|
| 3040 | WRITE_ONCE(cs->prs_err, err); | 
|---|
| 3041 | if (!is_partition_valid(cs)) | 
|---|
| 3042 | reset_partition_data(cs); | 
|---|
| 3043 | else if (isolcpus_updated) | 
|---|
| 3044 | isolated_cpus_update(old_prs, new_prs, xcpus: cs->effective_xcpus); | 
|---|
| 3045 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 3046 | update_unbound_workqueue_cpumask(isolcpus_updated); | 
|---|
| 3047 |  | 
|---|
| 3048 | /* Force update if switching back to member & update effective_xcpus */ | 
|---|
| 3049 | update_cpumasks_hier(cs, tmp: &tmpmask, force: !new_prs); | 
|---|
| 3050 |  | 
|---|
| 3051 | /* A newly created partition must have effective_xcpus set */ | 
|---|
| 3052 | WARN_ON_ONCE(!old_prs && (new_prs > 0) | 
|---|
| 3053 | && cpumask_empty(cs->effective_xcpus)); | 
|---|
| 3054 |  | 
|---|
| 3055 | /* Update sched domains and load balance flag */ | 
|---|
| 3056 | update_partition_sd_lb(cs, old_prs); | 
|---|
| 3057 |  | 
|---|
| 3058 | notify_partition_change(cs, old_prs); | 
|---|
| 3059 | if (force_sd_rebuild) | 
|---|
| 3060 | rebuild_sched_domains_locked(); | 
|---|
| 3061 | free_tmpmasks(tmp: &tmpmask); | 
|---|
| 3062 | return 0; | 
|---|
| 3063 | } | 
|---|
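|  | /* | 
|---|
|  | * Orientation summary of the transitions handled above: | 
|---|
|  | * | 
|---|
|  | *  member -> root/isolated : partcmd_enable/partcmd_enablei when the parent | 
|---|
|  | *                            is a valid partition root, otherwise | 
|---|
|  | *                            remote_partition_enable() | 
|---|
|  | *  root <-> isolated       : load-balance change only; isolated_cpus is | 
|---|
|  | *                            refreshed via isolated_cpus_update() | 
|---|
|  | *  root/isolated -> member : remote_partition_disable() or partcmd_disable | 
|---|
|  | *                            on the parent | 
|---|
|  | * | 
|---|
|  | * On failure the requested state is stored negated (i.e. as an invalid PRS | 
|---|
|  | * state) and the partition data is reset. | 
|---|
|  | */ | 
|---|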
| 3064 |  | 
|---|
| 3065 | static struct cpuset *cpuset_attach_old_cs; | 
|---|
| 3066 |  | 
|---|
| 3067 | /* | 
|---|
| 3068 | * Check to see if a cpuset can accept a new task | 
|---|
| 3069 | * For v1, cpus_allowed and mems_allowed can't be empty. | 
|---|
| 3070 | * For v2, effective_cpus can't be empty. | 
|---|
| 3071 | * Note that in v1, effective_cpus = cpus_allowed. | 
|---|
| 3072 | */ | 
|---|
| 3073 | static int cpuset_can_attach_check(struct cpuset *cs) | 
|---|
| 3074 | { | 
|---|
| 3075 | if (cpumask_empty(srcp: cs->effective_cpus) || | 
|---|
| 3076 | (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))) | 
|---|
| 3077 | return -ENOSPC; | 
|---|
| 3078 | return 0; | 
|---|
| 3079 | } | 
|---|
| 3080 |  | 
|---|
| 3081 | static void reset_migrate_dl_data(struct cpuset *cs) | 
|---|
| 3082 | { | 
|---|
| 3083 | cs->nr_migrate_dl_tasks = 0; | 
|---|
| 3084 | cs->sum_migrate_dl_bw = 0; | 
|---|
| 3085 | } | 
|---|
| 3086 |  | 
|---|
| 3087 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ | 
|---|
| 3088 | static int cpuset_can_attach(struct cgroup_taskset *tset) | 
|---|
| 3089 | { | 
|---|
| 3090 | struct cgroup_subsys_state *css; | 
|---|
| 3091 | struct cpuset *cs, *oldcs; | 
|---|
| 3092 | struct task_struct *task; | 
|---|
| 3093 | bool cpus_updated, mems_updated; | 
|---|
| 3094 | int ret; | 
|---|
| 3095 |  | 
|---|
| 3096 | /* used later by cpuset_attach() */ | 
|---|
| 3097 | cpuset_attach_old_cs = task_cs(task: cgroup_taskset_first(tset, dst_cssp: &css)); | 
|---|
| 3098 | oldcs = cpuset_attach_old_cs; | 
|---|
| 3099 | cs = css_cs(css); | 
|---|
| 3100 |  | 
|---|
| 3101 | mutex_lock(lock: &cpuset_mutex); | 
|---|
| 3102 |  | 
|---|
| 3103 | /* Check to see if task is allowed in the cpuset */ | 
|---|
| 3104 | ret = cpuset_can_attach_check(cs); | 
|---|
| 3105 | if (ret) | 
|---|
| 3106 | goto out_unlock; | 
|---|
| 3107 |  | 
|---|
| 3108 | cpus_updated = !cpumask_equal(src1p: cs->effective_cpus, src2p: oldcs->effective_cpus); | 
|---|
| 3109 | mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); | 
|---|
| 3110 |  | 
|---|
| 3111 | cgroup_taskset_for_each(task, css, tset) { | 
|---|
| 3112 | ret = task_can_attach(p: task); | 
|---|
| 3113 | if (ret) | 
|---|
| 3114 | goto out_unlock; | 
|---|
| 3115 |  | 
|---|
| 3116 | /* | 
|---|
| 3117 | * Skip the rights-over-task check in v2 when nothing changes; | 
|---|
| 3118 | * migration permission derives from hierarchy ownership in | 
|---|
| 3119 | * cgroup_procs_write_permission(). | 
|---|
| 3120 | */ | 
|---|
| 3121 | if (!cpuset_v2() || (cpus_updated || mems_updated)) { | 
|---|
| 3122 | ret = security_task_setscheduler(p: task); | 
|---|
| 3123 | if (ret) | 
|---|
| 3124 | goto out_unlock; | 
|---|
| 3125 | } | 
|---|
| 3126 |  | 
|---|
| 3127 | if (dl_task(p: task)) { | 
|---|
| 3128 | cs->nr_migrate_dl_tasks++; | 
|---|
| 3129 | cs->sum_migrate_dl_bw += task->dl.dl_bw; | 
|---|
| 3130 | } | 
|---|
| 3131 | } | 
|---|
| 3132 |  | 
|---|
| 3133 | if (!cs->nr_migrate_dl_tasks) | 
|---|
| 3134 | goto out_success; | 
|---|
| 3135 |  | 
|---|
| 3136 | if (!cpumask_intersects(src1p: oldcs->effective_cpus, src2p: cs->effective_cpus)) { | 
|---|
| 3137 | int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); | 
|---|
| 3138 |  | 
|---|
| 3139 | if (unlikely(cpu >= nr_cpu_ids)) { | 
|---|
| 3140 | reset_migrate_dl_data(cs); | 
|---|
| 3141 | ret = -EINVAL; | 
|---|
| 3142 | goto out_unlock; | 
|---|
| 3143 | } | 
|---|
| 3144 |  | 
|---|
| 3145 | ret = dl_bw_alloc(cpu, dl_bw: cs->sum_migrate_dl_bw); | 
|---|
| 3146 | if (ret) { | 
|---|
| 3147 | reset_migrate_dl_data(cs); | 
|---|
| 3148 | goto out_unlock; | 
|---|
| 3149 | } | 
|---|
| 3150 | } | 
|---|
| 3151 |  | 
|---|
| 3152 | out_success: | 
|---|
| 3153 | /* | 
|---|
| 3154 | * Mark that an attach is in progress.  This makes validate_change() | 
|---|
| 3155 | * reject changes which would zero cpus/mems_allowed. | 
|---|
| 3156 | */ | 
|---|
| 3157 | cs->attach_in_progress++; | 
|---|
| 3158 | out_unlock: | 
|---|
| 3159 | mutex_unlock(lock: &cpuset_mutex); | 
|---|
| 3160 | return ret; | 
|---|
| 3161 | } | 
|---|
| 3162 |  | 
|---|
| 3163 | static void cpuset_cancel_attach(struct cgroup_taskset *tset) | 
|---|
| 3164 | { | 
|---|
| 3165 | struct cgroup_subsys_state *css; | 
|---|
| 3166 | struct cpuset *cs; | 
|---|
| 3167 |  | 
|---|
| 3168 | cgroup_taskset_first(tset, dst_cssp: &css); | 
|---|
| 3169 | cs = css_cs(css); | 
|---|
| 3170 |  | 
|---|
| 3171 | mutex_lock(lock: &cpuset_mutex); | 
|---|
| 3172 | dec_attach_in_progress_locked(cs); | 
|---|
| 3173 |  | 
|---|
| 3174 | if (cs->nr_migrate_dl_tasks) { | 
|---|
| 3175 | int cpu = cpumask_any(cs->effective_cpus); | 
|---|
| 3176 |  | 
|---|
| 3177 | dl_bw_free(cpu, dl_bw: cs->sum_migrate_dl_bw); | 
|---|
| 3178 | reset_migrate_dl_data(cs); | 
|---|
| 3179 | } | 
|---|
| 3180 |  | 
|---|
| 3181 | mutex_unlock(lock: &cpuset_mutex); | 
|---|
| 3182 | } | 
|---|
| 3183 |  | 
|---|
| 3184 | /* | 
|---|
| 3185 | * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() | 
|---|
| 3186 | * but we can't allocate it dynamically there.  Define it globally and | 
|---|
| 3187 | * allocate it from cpuset_init(). | 
|---|
| 3188 | */ | 
|---|
| 3189 | static cpumask_var_t cpus_attach; | 
|---|
| 3190 | static nodemask_t cpuset_attach_nodemask_to; | 
|---|
| 3191 |  | 
|---|
| 3192 | static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) | 
|---|
| 3193 | { | 
|---|
| 3194 | lockdep_assert_held(&cpuset_mutex); | 
|---|
| 3195 |  | 
|---|
| 3196 | if (cs != &top_cpuset) | 
|---|
| 3197 | guarantee_active_cpus(tsk: task, pmask: cpus_attach); | 
|---|
| 3198 | else | 
|---|
| 3199 | cpumask_andnot(dstp: cpus_attach, task_cpu_possible_mask(task), | 
|---|
| 3200 | src2p: subpartitions_cpus); | 
|---|
| 3201 | /* | 
|---|
| 3202 | * can_attach beforehand should guarantee that this doesn't | 
|---|
| 3203 | * fail.  TODO: have a better way to handle failure here | 
|---|
| 3204 | */ | 
|---|
| 3205 | WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); | 
|---|
| 3206 |  | 
|---|
| 3207 | cpuset_change_task_nodemask(tsk: task, newmems: &cpuset_attach_nodemask_to); | 
|---|
| 3208 | cpuset1_update_task_spread_flags(cs, tsk: task); | 
|---|
| 3209 | } | 
|---|
| 3210 |  | 
|---|
| 3211 | static void cpuset_attach(struct cgroup_taskset *tset) | 
|---|
| 3212 | { | 
|---|
| 3213 | struct task_struct *task; | 
|---|
| 3214 | struct task_struct *leader; | 
|---|
| 3215 | struct cgroup_subsys_state *css; | 
|---|
| 3216 | struct cpuset *cs; | 
|---|
| 3217 | struct cpuset *oldcs = cpuset_attach_old_cs; | 
|---|
| 3218 | bool cpus_updated, mems_updated; | 
|---|
| 3219 | bool queue_task_work = false; | 
|---|
| 3220 |  | 
|---|
| 3221 | cgroup_taskset_first(tset, dst_cssp: &css); | 
|---|
| 3222 | cs = css_cs(css); | 
|---|
| 3223 |  | 
|---|
| 3224 | lockdep_assert_cpus_held();	/* see cgroup_attach_lock() */ | 
|---|
| 3225 | mutex_lock(lock: &cpuset_mutex); | 
|---|
| 3226 | cpus_updated = !cpumask_equal(src1p: cs->effective_cpus, | 
|---|
| 3227 | src2p: oldcs->effective_cpus); | 
|---|
| 3228 | mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); | 
|---|
| 3229 |  | 
|---|
| 3230 | /* | 
|---|
| 3231 | * In the default hierarchy, enabling cpuset in the child cgroups | 
|---|
| 3232 | * will trigger a number of cpuset_attach() calls with no change | 
|---|
| 3233 | * in effective cpus and mems. In that case, we can optimize out | 
|---|
| 3234 | * by skipping the task iteration and update. | 
|---|
| 3235 | */ | 
|---|
| 3236 | if (cpuset_v2() && !cpus_updated && !mems_updated) { | 
|---|
| 3237 | cpuset_attach_nodemask_to = cs->effective_mems; | 
|---|
| 3238 | goto out; | 
|---|
| 3239 | } | 
|---|
| 3240 |  | 
|---|
| 3241 | guarantee_online_mems(cs, pmask: &cpuset_attach_nodemask_to); | 
|---|
| 3242 |  | 
|---|
| 3243 | cgroup_taskset_for_each(task, css, tset) | 
|---|
| 3244 | cpuset_attach_task(cs, task); | 
|---|
| 3245 |  | 
|---|
| 3246 | /* | 
|---|
| 3247 | * Change mm for all threadgroup leaders. This is expensive and may | 
|---|
| 3248 | * sleep and should be moved outside migration path proper. Skip it | 
|---|
| 3249 | * if there is no change in effective_mems and CS_MEMORY_MIGRATE is | 
|---|
| 3250 | * not set. | 
|---|
| 3251 | */ | 
|---|
| 3252 | cpuset_attach_nodemask_to = cs->effective_mems; | 
|---|
| 3253 | if (!is_memory_migrate(cs) && !mems_updated) | 
|---|
| 3254 | goto out; | 
|---|
| 3255 |  | 
|---|
| 3256 | cgroup_taskset_for_each_leader(leader, css, tset) { | 
|---|
| 3257 | struct mm_struct *mm = get_task_mm(task: leader); | 
|---|
| 3258 |  | 
|---|
| 3259 | if (mm) { | 
|---|
| 3260 | mpol_rebind_mm(mm, new: &cpuset_attach_nodemask_to); | 
|---|
| 3261 |  | 
|---|
| 3262 | /* | 
|---|
| 3263 | * old_mems_allowed is the same with mems_allowed | 
|---|
| 3264 | * here, except if this task is being moved | 
|---|
| 3265 | * automatically due to hotplug.  In that case | 
|---|
| 3266 | * @mems_allowed has been updated and is empty, so | 
|---|
| 3267 | * @old_mems_allowed is the right nodesets that we | 
|---|
| 3268 | * migrate mm from. | 
|---|
| 3269 | */ | 
|---|
| 3270 | if (is_memory_migrate(cs)) { | 
|---|
| 3271 | cpuset_migrate_mm(mm, from: &oldcs->old_mems_allowed, | 
|---|
| 3272 | to: &cpuset_attach_nodemask_to); | 
|---|
| 3273 | queue_task_work = true; | 
|---|
| 3274 | } else | 
|---|
| 3275 | mmput(mm); | 
|---|
| 3276 | } | 
|---|
| 3277 | } | 
|---|
| 3278 |  | 
|---|
| 3279 | out: | 
|---|
| 3280 | if (queue_task_work) | 
|---|
| 3281 | schedule_flush_migrate_mm(); | 
|---|
| 3282 | cs->old_mems_allowed = cpuset_attach_nodemask_to; | 
|---|
| 3283 |  | 
|---|
| 3284 | if (cs->nr_migrate_dl_tasks) { | 
|---|
| 3285 | cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; | 
|---|
| 3286 | oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; | 
|---|
| 3287 | reset_migrate_dl_data(cs); | 
|---|
| 3288 | } | 
|---|
| 3289 |  | 
|---|
| 3290 | dec_attach_in_progress_locked(cs); | 
|---|
| 3291 |  | 
|---|
| 3292 | mutex_unlock(lock: &cpuset_mutex); | 
|---|
| 3293 | } | 
|---|
| 3294 |  | 
|---|
| 3295 | /* | 
|---|
| 3296 | * Common handling for a write to a "cpus" or "mems" file. | 
|---|
| 3297 | */ | 
|---|
| 3298 | ssize_t cpuset_write_resmask(struct kernfs_open_file *of, | 
|---|
| 3299 | char *buf, size_t nbytes, loff_t off) | 
|---|
| 3300 | { | 
|---|
| 3301 | struct cpuset *cs = css_cs(css: of_css(of)); | 
|---|
| 3302 | struct cpuset *trialcs; | 
|---|
| 3303 | int retval = -ENODEV; | 
|---|
| 3304 |  | 
|---|
| 3305 | /* root is read-only */ | 
|---|
| 3306 | if (cs == &top_cpuset) | 
|---|
| 3307 | return -EACCES; | 
|---|
| 3308 |  | 
|---|
| 3309 | buf = strstrip(str: buf); | 
|---|
| 3310 | cpuset_full_lock(); | 
|---|
| 3311 | if (!is_cpuset_online(cs)) | 
|---|
| 3312 | goto out_unlock; | 
|---|
| 3313 |  | 
|---|
| 3314 | trialcs = dup_or_alloc_cpuset(cs); | 
|---|
| 3315 | if (!trialcs) { | 
|---|
| 3316 | retval = -ENOMEM; | 
|---|
| 3317 | goto out_unlock; | 
|---|
| 3318 | } | 
|---|
| 3319 |  | 
|---|
| 3320 | switch (of_cft(of)->private) { | 
|---|
| 3321 | case FILE_CPULIST: | 
|---|
| 3322 | retval = update_cpumask(cs, trialcs, buf); | 
|---|
| 3323 | break; | 
|---|
| 3324 | case FILE_EXCLUSIVE_CPULIST: | 
|---|
| 3325 | retval = update_exclusive_cpumask(cs, trialcs, buf); | 
|---|
| 3326 | break; | 
|---|
| 3327 | case FILE_MEMLIST: | 
|---|
| 3328 | retval = update_nodemask(cs, trialcs, buf); | 
|---|
| 3329 | break; | 
|---|
| 3330 | default: | 
|---|
| 3331 | retval = -EINVAL; | 
|---|
| 3332 | break; | 
|---|
| 3333 | } | 
|---|
| 3334 |  | 
|---|
| 3335 | free_cpuset(cs: trialcs); | 
|---|
| 3336 | if (force_sd_rebuild) | 
|---|
| 3337 | rebuild_sched_domains_locked(); | 
|---|
| 3338 | out_unlock: | 
|---|
| 3339 | cpuset_full_unlock(); | 
|---|
| 3340 | if (of_cft(of)->private == FILE_MEMLIST) | 
|---|
| 3341 | schedule_flush_migrate_mm(); | 
|---|
| 3342 | return retval ?: nbytes; | 
|---|
| 3343 | } | 
|---|
| 3344 |  | 
|---|
| 3345 | /* | 
|---|
| 3346 | * These ASCII lists should be read in a single call, using a user | 
|---|
| 3347 | * buffer large enough to hold the entire map.  If read in smaller | 
|---|
| 3348 | * chunks, there is no guarantee of atomicity.  Since the display format | 
|---|
| 3349 | * used (a list of ranges of sequential numbers) is variable length, | 
|---|
| 3350 | * and since these maps can change value dynamically, one could read | 
|---|
| 3351 | * gibberish by doing partial reads while a list was changing. | 
|---|
| 3352 | */ | 
|---|
| 3353 | int cpuset_common_seq_show(struct seq_file *sf, void *v) | 
|---|
| 3354 | { | 
|---|
| 3355 | struct cpuset *cs = css_cs(css: seq_css(seq: sf)); | 
|---|
| 3356 | cpuset_filetype_t type = seq_cft(seq: sf)->private; | 
|---|
| 3357 | int ret = 0; | 
|---|
| 3358 |  | 
|---|
| 3359 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 3360 |  | 
|---|
| 3361 | switch (type) { | 
|---|
| 3362 | case FILE_CPULIST: | 
|---|
| 3363 | seq_printf(m: sf, fmt: "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); | 
|---|
| 3364 | break; | 
|---|
| 3365 | case FILE_MEMLIST: | 
|---|
| 3366 | seq_printf(m: sf, fmt: "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); | 
|---|
| 3367 | break; | 
|---|
| 3368 | case FILE_EFFECTIVE_CPULIST: | 
|---|
| 3369 | seq_printf(m: sf, fmt: "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); | 
|---|
| 3370 | break; | 
|---|
| 3371 | case FILE_EFFECTIVE_MEMLIST: | 
|---|
| 3372 | seq_printf(m: sf, fmt: "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); | 
|---|
| 3373 | break; | 
|---|
| 3374 | case FILE_EXCLUSIVE_CPULIST: | 
|---|
| 3375 | seq_printf(m: sf, fmt: "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); | 
|---|
| 3376 | break; | 
|---|
| 3377 | case FILE_EFFECTIVE_XCPULIST: | 
|---|
| 3378 | seq_printf(m: sf, fmt: "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); | 
|---|
| 3379 | break; | 
|---|
| 3380 | case FILE_SUBPARTS_CPULIST: | 
|---|
| 3381 | seq_printf(m: sf, fmt: "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); | 
|---|
| 3382 | break; | 
|---|
| 3383 | case FILE_ISOLATED_CPULIST: | 
|---|
| 3384 | seq_printf(m: sf, fmt: "%*pbl\n", cpumask_pr_args(isolated_cpus)); | 
|---|
| 3385 | break; | 
|---|
| 3386 | default: | 
|---|
| 3387 | ret = -EINVAL; | 
|---|
| 3388 | } | 
|---|
| 3389 |  | 
|---|
| 3390 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 3391 | return ret; | 
|---|
| 3392 | } | 
|---|
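|  | /* | 
|---|
|  | * Example (illustrative; path and values are hypothetical): | 
|---|
|  | * | 
|---|
|  | *	# cat /sys/fs/cgroup/mygrp/cpuset.cpus.effective | 
|---|
|  | *	0-3,8-11 | 
|---|
|  | * | 
|---|
|  | * A single read() with a large enough buffer returns a consistent snapshot; | 
|---|
|  | * several small reads may interleave with concurrent updates. | 
|---|
|  | */ | 
|---|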
| 3393 |  | 
|---|
| 3394 | static int cpuset_partition_show(struct seq_file *seq, void *v) | 
|---|
| 3395 | { | 
|---|
| 3396 | struct cpuset *cs = css_cs(css: seq_css(seq)); | 
|---|
| 3397 | const char *err, *type = NULL; | 
|---|
| 3398 |  | 
|---|
| 3399 | switch (cs->partition_root_state) { | 
|---|
| 3400 | case PRS_ROOT: | 
|---|
| 3401 | seq_puts(m: seq, s: "root\n"); | 
|---|
| 3402 | break; | 
|---|
| 3403 | case PRS_ISOLATED: | 
|---|
| 3404 | seq_puts(m: seq, s: "isolated\n"); | 
|---|
| 3405 | break; | 
|---|
| 3406 | case PRS_MEMBER: | 
|---|
| 3407 | seq_puts(m: seq, s: "member\n"); | 
|---|
| 3408 | break; | 
|---|
| 3409 | case PRS_INVALID_ROOT: | 
|---|
| 3410 | type = "root"; | 
|---|
| 3411 | fallthrough; | 
|---|
| 3412 | case PRS_INVALID_ISOLATED: | 
|---|
| 3413 | if (!type) | 
|---|
| 3414 | type = "isolated"; | 
|---|
| 3415 | err = perr_strings[READ_ONCE(cs->prs_err)]; | 
|---|
| 3416 | if (err) | 
|---|
| 3417 | seq_printf(m: seq, fmt: "%s invalid (%s)\n", type, err); | 
|---|
| 3418 | else | 
|---|
| 3419 | seq_printf(m: seq, fmt: "%s invalid\n", type); | 
|---|
| 3420 | break; | 
|---|
| 3421 | } | 
|---|
| 3422 | return 0; | 
|---|
| 3423 | } | 
|---|
| 3424 |  | 
|---|
| 3425 | static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, | 
|---|
| 3426 | size_t nbytes, loff_t off) | 
|---|
| 3427 | { | 
|---|
| 3428 | struct cpuset *cs = css_cs(css: of_css(of)); | 
|---|
| 3429 | int val; | 
|---|
| 3430 | int retval = -ENODEV; | 
|---|
| 3431 |  | 
|---|
| 3432 | buf = strstrip(str: buf); | 
|---|
| 3433 |  | 
|---|
| 3434 | if (!strcmp(buf, "root")) | 
|---|
| 3435 | val = PRS_ROOT; | 
|---|
| 3436 | else if (!strcmp(buf, "member")) | 
|---|
| 3437 | val = PRS_MEMBER; | 
|---|
| 3438 | else if (!strcmp(buf, "isolated")) | 
|---|
| 3439 | val = PRS_ISOLATED; | 
|---|
| 3440 | else | 
|---|
| 3441 | return -EINVAL; | 
|---|
| 3442 |  | 
|---|
| 3443 | cpuset_full_lock(); | 
|---|
| 3444 | if (is_cpuset_online(cs)) | 
|---|
| 3445 | retval = update_prstate(cs, new_prs: val); | 
|---|
| 3446 | cpuset_full_unlock(); | 
|---|
| 3447 | return retval ?: nbytes; | 
|---|
| 3448 | } | 
|---|
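|  | /* | 
|---|
|  | * Example (illustrative; whether the switch succeeds also depends on the | 
|---|
|  | * exclusivity and hierarchy constraints checked by update_prstate()): | 
|---|
|  | * | 
|---|
|  | *	# echo "4-7" > /sys/fs/cgroup/rt/cpuset.cpus | 
|---|
|  | *	# echo isolated > /sys/fs/cgroup/rt/cpuset.cpus.partition | 
|---|
|  | *	# cat /sys/fs/cgroup/rt/cpuset.cpus.partition | 
|---|
|  | *	isolated | 
|---|
|  | * | 
|---|
|  | * Only "root", "isolated" and "member" are accepted; any other string | 
|---|
|  | * fails with -EINVAL. | 
|---|
|  | */ | 
|---|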
| 3449 |  | 
|---|
| 3450 | /* | 
|---|
| 3451 | * This is currently a minimal set for the default hierarchy. It can be | 
|---|
| 3452 | * expanded later on by migrating more features and control files from v1. | 
|---|
| 3453 | */ | 
|---|
| 3454 | static struct cftype dfl_files[] = { | 
|---|
| 3455 | { | 
|---|
| 3456 | .name = "cpus", | 
|---|
| 3457 | .seq_show = cpuset_common_seq_show, | 
|---|
| 3458 | .write = cpuset_write_resmask, | 
|---|
| 3459 | .max_write_len = (100U + 6 * NR_CPUS), | 
|---|
| 3460 | .private = FILE_CPULIST, | 
|---|
| 3461 | .flags = CFTYPE_NOT_ON_ROOT, | 
|---|
| 3462 | }, | 
|---|
| 3463 |  | 
|---|
| 3464 | { | 
|---|
| 3465 | .name = "mems", | 
|---|
| 3466 | .seq_show = cpuset_common_seq_show, | 
|---|
| 3467 | .write = cpuset_write_resmask, | 
|---|
| 3468 | .max_write_len = (100U + 6 * MAX_NUMNODES), | 
|---|
| 3469 | .private = FILE_MEMLIST, | 
|---|
| 3470 | .flags = CFTYPE_NOT_ON_ROOT, | 
|---|
| 3471 | }, | 
|---|
| 3472 |  | 
|---|
| 3473 | { | 
|---|
| 3474 | .name = "cpus.effective", | 
|---|
| 3475 | .seq_show = cpuset_common_seq_show, | 
|---|
| 3476 | .private = FILE_EFFECTIVE_CPULIST, | 
|---|
| 3477 | }, | 
|---|
| 3478 |  | 
|---|
| 3479 | { | 
|---|
| 3480 | .name = "mems.effective", | 
|---|
| 3481 | .seq_show = cpuset_common_seq_show, | 
|---|
| 3482 | .private = FILE_EFFECTIVE_MEMLIST, | 
|---|
| 3483 | }, | 
|---|
| 3484 |  | 
|---|
| 3485 | { | 
|---|
| 3486 | .name = "cpus.partition", | 
|---|
| 3487 | .seq_show = cpuset_partition_show, | 
|---|
| 3488 | .write = cpuset_partition_write, | 
|---|
| 3489 | .private = FILE_PARTITION_ROOT, | 
|---|
| 3490 | .flags = CFTYPE_NOT_ON_ROOT, | 
|---|
| 3491 | .file_offset = offsetof(struct cpuset, partition_file), | 
|---|
| 3492 | }, | 
|---|
| 3493 |  | 
|---|
| 3494 | { | 
|---|
| 3495 | .name = "cpus.exclusive", | 
|---|
| 3496 | .seq_show = cpuset_common_seq_show, | 
|---|
| 3497 | .write = cpuset_write_resmask, | 
|---|
| 3498 | .max_write_len = (100U + 6 * NR_CPUS), | 
|---|
| 3499 | .private = FILE_EXCLUSIVE_CPULIST, | 
|---|
| 3500 | .flags = CFTYPE_NOT_ON_ROOT, | 
|---|
| 3501 | }, | 
|---|
| 3502 |  | 
|---|
| 3503 | { | 
|---|
| 3504 | .name = "cpus.exclusive.effective", | 
|---|
| 3505 | .seq_show = cpuset_common_seq_show, | 
|---|
| 3506 | .private = FILE_EFFECTIVE_XCPULIST, | 
|---|
| 3507 | .flags = CFTYPE_NOT_ON_ROOT, | 
|---|
| 3508 | }, | 
|---|
| 3509 |  | 
|---|
| 3510 | { | 
|---|
| 3511 | .name = "cpus.subpartitions", | 
|---|
| 3512 | .seq_show = cpuset_common_seq_show, | 
|---|
| 3513 | .private = FILE_SUBPARTS_CPULIST, | 
|---|
| 3514 | .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, | 
|---|
| 3515 | }, | 
|---|
| 3516 |  | 
|---|
| 3517 | { | 
|---|
| 3518 | .name = "cpus.isolated", | 
|---|
| 3519 | .seq_show = cpuset_common_seq_show, | 
|---|
| 3520 | .private = FILE_ISOLATED_CPULIST, | 
|---|
| 3521 | .flags = CFTYPE_ONLY_ON_ROOT, | 
|---|
| 3522 | }, | 
|---|
| 3523 |  | 
|---|
| 3524 | { }	/* terminate */ | 
|---|
| 3525 | }; | 
|---|
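|  | /* | 
|---|
|  | * Note: on the default hierarchy, cgroup core prefixes these names with the | 
|---|
|  | * subsystem name, so they show up in cgroupfs as "cpuset.cpus", | 
|---|
|  | * "cpuset.mems", "cpuset.cpus.partition", etc. | 
|---|
|  | */ | 
|---|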
| 3526 |  | 
|---|
| 3527 |  | 
|---|
| 3528 | /** | 
|---|
| 3529 | * cpuset_css_alloc - Allocate a cpuset css | 
|---|
| 3530 | * @parent_css: Parent css of the control group that the new cpuset will be | 
|---|
| 3531 | *              part of | 
|---|
| 3532 | * Return: cpuset css on success, -ENOMEM on failure. | 
|---|
| 3533 | * | 
|---|
| 3534 | * Allocate and initialize a new cpuset css for a non-NULL @parent_css; return | 
|---|
| 3535 | * the top cpuset css otherwise. | 
|---|
| 3536 | */ | 
|---|
| 3537 | static struct cgroup_subsys_state * | 
|---|
| 3538 | cpuset_css_alloc(struct cgroup_subsys_state *parent_css) | 
|---|
| 3539 | { | 
|---|
| 3540 | struct cpuset *cs; | 
|---|
| 3541 |  | 
|---|
| 3542 | if (!parent_css) | 
|---|
| 3543 | return &top_cpuset.css; | 
|---|
| 3544 |  | 
|---|
| 3545 | cs = dup_or_alloc_cpuset(NULL); | 
|---|
| 3546 | if (!cs) | 
|---|
| 3547 | return ERR_PTR(error: -ENOMEM); | 
|---|
| 3548 |  | 
|---|
| 3549 | __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 
|---|
| 3550 | fmeter_init(fmp: &cs->fmeter); | 
|---|
| 3551 | cs->relax_domain_level = -1; | 
|---|
| 3552 | INIT_LIST_HEAD(list: &cs->remote_sibling); | 
|---|
| 3553 |  | 
|---|
| 3554 | /* Set CS_MEMORY_MIGRATE for default hierarchy */ | 
|---|
| 3555 | if (cpuset_v2()) | 
|---|
| 3556 | __set_bit(CS_MEMORY_MIGRATE, &cs->flags); | 
|---|
| 3557 |  | 
|---|
| 3558 | return &cs->css; | 
|---|
| 3559 | } | 
|---|
| 3560 |  | 
|---|
| 3561 | static int cpuset_css_online(struct cgroup_subsys_state *css) | 
|---|
| 3562 | { | 
|---|
| 3563 | struct cpuset *cs = css_cs(css); | 
|---|
| 3564 | struct cpuset *parent = parent_cs(cs); | 
|---|
| 3565 | struct cpuset *tmp_cs; | 
|---|
| 3566 | struct cgroup_subsys_state *pos_css; | 
|---|
| 3567 |  | 
|---|
| 3568 | if (!parent) | 
|---|
| 3569 | return 0; | 
|---|
| 3570 |  | 
|---|
| 3571 | cpuset_full_lock(); | 
|---|
| 3572 | if (is_spread_page(cs: parent)) | 
|---|
| 3573 | set_bit(nr: CS_SPREAD_PAGE, addr: &cs->flags); | 
|---|
| 3574 | if (is_spread_slab(cs: parent)) | 
|---|
| 3575 | set_bit(nr: CS_SPREAD_SLAB, addr: &cs->flags); | 
|---|
| 3576 | /* | 
|---|
| 3577 | * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated | 
|---|
| 3578 | */ | 
|---|
| 3579 | if (cpuset_v2() && !is_sched_load_balance(cs: parent)) | 
|---|
| 3580 | clear_bit(nr: CS_SCHED_LOAD_BALANCE, addr: &cs->flags); | 
|---|
| 3581 |  | 
|---|
| 3582 | cpuset_inc(); | 
|---|
| 3583 |  | 
|---|
| 3584 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 3585 | if (is_in_v2_mode()) { | 
|---|
| 3586 | cpumask_copy(dstp: cs->effective_cpus, srcp: parent->effective_cpus); | 
|---|
| 3587 | cs->effective_mems = parent->effective_mems; | 
|---|
| 3588 | } | 
|---|
| 3589 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 3590 |  | 
|---|
| 3591 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) | 
|---|
| 3592 | goto out_unlock; | 
|---|
| 3593 |  | 
|---|
| 3594 | /* | 
|---|
| 3595 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is | 
|---|
| 3596 | * set.  This flag handling is implemented in cgroup core for | 
|---|
| 3597 | * historical reasons - the flag may be specified during mount. | 
|---|
| 3598 | * | 
|---|
| 3599 | * Currently, if any sibling cpusets have exclusive cpus or mem, we | 
|---|
| 3600 | * refuse to clone the configuration - thereby refusing the task to | 
|---|
| 3601 | * be entered, and as a result refusing the sys_unshare() or | 
|---|
| 3602 | * clone() which initiated it.  If this becomes a problem for some | 
|---|
| 3603 | * users who wish to allow that scenario, then this could be | 
|---|
| 3604 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | 
|---|
| 3605 | * (and likewise for mems) to the new cgroup. | 
|---|
| 3606 | */ | 
|---|
| 3607 | rcu_read_lock(); | 
|---|
| 3608 | cpuset_for_each_child(tmp_cs, pos_css, parent) { | 
|---|
| 3609 | if (is_mem_exclusive(cs: tmp_cs) || is_cpu_exclusive(cs: tmp_cs)) { | 
|---|
| 3610 | rcu_read_unlock(); | 
|---|
| 3611 | goto out_unlock; | 
|---|
| 3612 | } | 
|---|
| 3613 | } | 
|---|
| 3614 | rcu_read_unlock(); | 
|---|
| 3615 |  | 
|---|
| 3616 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 3617 | cs->mems_allowed = parent->mems_allowed; | 
|---|
| 3618 | cs->effective_mems = parent->mems_allowed; | 
|---|
| 3619 | cpumask_copy(dstp: cs->cpus_allowed, srcp: parent->cpus_allowed); | 
|---|
| 3620 | cpumask_copy(dstp: cs->effective_cpus, srcp: parent->cpus_allowed); | 
|---|
| 3621 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 3622 | out_unlock: | 
|---|
| 3623 | cpuset_full_unlock(); | 
|---|
| 3624 | return 0; | 
|---|
| 3625 | } | 
|---|
| 3626 |  | 
|---|
| 3627 | /* | 
|---|
| 3628 | * If the cpuset being removed has its flag 'sched_load_balance' | 
|---|
| 3629 | * enabled, then simulate turning sched_load_balance off, which | 
|---|
| 3630 | * will call rebuild_sched_domains_locked(). That is not needed | 
|---|
| 3631 | * in the default hierarchy where only partition changes | 
|---|
| 3632 | * will cause repartitioning. | 
|---|
| 3633 | */ | 
|---|
| 3634 | static void cpuset_css_offline(struct cgroup_subsys_state *css) | 
|---|
| 3635 | { | 
|---|
| 3636 | struct cpuset *cs = css_cs(css); | 
|---|
| 3637 |  | 
|---|
| 3638 | cpuset_full_lock(); | 
|---|
| 3639 | if (!cpuset_v2() && is_sched_load_balance(cs)) | 
|---|
| 3640 | cpuset_update_flag(bit: CS_SCHED_LOAD_BALANCE, cs, turning_on: 0); | 
|---|
| 3641 |  | 
|---|
| 3642 | cpuset_dec(); | 
|---|
| 3643 | cpuset_full_unlock(); | 
|---|
| 3644 | } | 
|---|
| 3645 |  | 
|---|
| 3646 | /* | 
|---|
| 3647 | * If a dying cpuset has the 'cpus.partition' enabled, turn it off by | 
|---|
| 3648 | * changing it back to member to free its exclusive CPUs back to the pool to | 
|---|
| 3649 | * be used by other online cpusets. | 
|---|
| 3650 | */ | 
|---|
| 3651 | static void cpuset_css_killed(struct cgroup_subsys_state *css) | 
|---|
| 3652 | { | 
|---|
| 3653 | struct cpuset *cs = css_cs(css); | 
|---|
| 3654 |  | 
|---|
| 3655 | cpuset_full_lock(); | 
|---|
| 3656 | /* Reset valid partition back to member */ | 
|---|
| 3657 | if (is_partition_valid(cs)) | 
|---|
| 3658 | update_prstate(cs, PRS_MEMBER); | 
|---|
| 3659 | cpuset_full_unlock(); | 
|---|
| 3660 | } | 
|---|
| 3661 |  | 
|---|
| 3662 | static void cpuset_css_free(struct cgroup_subsys_state *css) | 
|---|
| 3663 | { | 
|---|
| 3664 | struct cpuset *cs = css_cs(css); | 
|---|
| 3665 |  | 
|---|
| 3666 | free_cpuset(cs); | 
|---|
| 3667 | } | 
|---|
| 3668 |  | 
|---|
| 3669 | static void cpuset_bind(struct cgroup_subsys_state *root_css) | 
|---|
| 3670 | { | 
|---|
| 3671 | mutex_lock(lock: &cpuset_mutex); | 
|---|
| 3672 | spin_lock_irq(lock: &callback_lock); | 
|---|
| 3673 |  | 
|---|
| 3674 | if (is_in_v2_mode()) { | 
|---|
| 3675 | cpumask_copy(dstp: top_cpuset.cpus_allowed, cpu_possible_mask); | 
|---|
| 3676 | cpumask_copy(dstp: top_cpuset.effective_xcpus, cpu_possible_mask); | 
|---|
| 3677 | top_cpuset.mems_allowed = node_possible_map; | 
|---|
| 3678 | } else { | 
|---|
| 3679 | cpumask_copy(dstp: top_cpuset.cpus_allowed, | 
|---|
| 3680 | srcp: top_cpuset.effective_cpus); | 
|---|
| 3681 | top_cpuset.mems_allowed = top_cpuset.effective_mems; | 
|---|
| 3682 | } | 
|---|
| 3683 |  | 
|---|
| 3684 | spin_unlock_irq(lock: &callback_lock); | 
|---|
| 3685 | mutex_unlock(lock: &cpuset_mutex); | 
|---|
| 3686 | } | 
|---|
| 3687 |  | 
|---|
| 3688 | /* | 
|---|
| 3689 | * In case the child is cloned into a cpuset different from its parent, | 
|---|
| 3690 | * additional checks are done to see if the move is allowed. | 
|---|
| 3691 | */ | 
|---|
| 3692 | static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) | 
|---|
| 3693 | { | 
|---|
| 3694 | struct cpuset *cs = css_cs(css: cset->subsys[cpuset_cgrp_id]); | 
|---|
| 3695 | bool same_cs; | 
|---|
| 3696 | int ret; | 
|---|
| 3697 |  | 
|---|
| 3698 | rcu_read_lock(); | 
|---|
| 3699 | same_cs = (cs == task_cs(current)); | 
|---|
| 3700 | rcu_read_unlock(); | 
|---|
| 3701 |  | 
|---|
| 3702 | if (same_cs) | 
|---|
| 3703 | return 0; | 
|---|
| 3704 |  | 
|---|
| 3705 | lockdep_assert_held(&cgroup_mutex); | 
|---|
| 3706 | mutex_lock(lock: &cpuset_mutex); | 
|---|
| 3707 |  | 
|---|
| 3708 | /* Check to see if task is allowed in the cpuset */ | 
|---|
| 3709 | ret = cpuset_can_attach_check(cs); | 
|---|
| 3710 | if (ret) | 
|---|
| 3711 | goto out_unlock; | 
|---|
| 3712 |  | 
|---|
| 3713 | ret = task_can_attach(p: task); | 
|---|
| 3714 | if (ret) | 
|---|
| 3715 | goto out_unlock; | 
|---|
| 3716 |  | 
|---|
| 3717 | ret = security_task_setscheduler(p: task); | 
|---|
| 3718 | if (ret) | 
|---|
| 3719 | goto out_unlock; | 
|---|
| 3720 |  | 
|---|
| 3721 | /* | 
|---|
| 3722 | * Mark attach is in progress.  This makes validate_change() fail | 
|---|
| 3723 | * changes which zero cpus/mems_allowed. | 
|---|
| 3724 | */ | 
|---|
| 3725 | cs->attach_in_progress++; | 
|---|
| 3726 | out_unlock: | 
|---|
| 3727 | mutex_unlock(lock: &cpuset_mutex); | 
|---|
| 3728 | return ret; | 
|---|
| 3729 | } | 
|---|
| 3730 |  | 
|---|
| 3731 | static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) | 
|---|
| 3732 | { | 
|---|
| 3733 | struct cpuset *cs = css_cs(css: cset->subsys[cpuset_cgrp_id]); | 
|---|
| 3734 | bool same_cs; | 
|---|
| 3735 |  | 
|---|
| 3736 | rcu_read_lock(); | 
|---|
| 3737 | same_cs = (cs == task_cs(current)); | 
|---|
| 3738 | rcu_read_unlock(); | 
|---|
| 3739 |  | 
|---|
| 3740 | if (same_cs) | 
|---|
| 3741 | return; | 
|---|
| 3742 |  | 
|---|
| 3743 | dec_attach_in_progress(cs); | 
|---|
| 3744 | } | 
|---|
| 3745 |  | 
|---|
| 3746 | /* | 
|---|
| 3747 | * Make sure the new task conforms to the current state of its parent, | 
|---|
| 3748 | * which could have been changed by cpuset just after it inherits the | 
|---|
| 3749 | * state from the parent and before it sits on the cgroup's task list. | 
|---|
| 3750 | */ | 
|---|
| 3751 | static void cpuset_fork(struct task_struct *task) | 
|---|
| 3752 | { | 
|---|
| 3753 | struct cpuset *cs; | 
|---|
| 3754 | bool same_cs; | 
|---|
| 3755 |  | 
|---|
| 3756 | rcu_read_lock(); | 
|---|
| 3757 | cs = task_cs(task); | 
|---|
| 3758 | same_cs = (cs == task_cs(current)); | 
|---|
| 3759 | rcu_read_unlock(); | 
|---|
| 3760 |  | 
|---|
| 3761 | if (same_cs) { | 
|---|
| 3762 | if (cs == &top_cpuset) | 
|---|
| 3763 | return; | 
|---|
| 3764 |  | 
|---|
| 3765 | set_cpus_allowed_ptr(task, current->cpus_ptr); | 
|---|
| 3766 | task->mems_allowed = current->mems_allowed; | 
|---|
| 3767 | return; | 
|---|
| 3768 | } | 
|---|
| 3769 |  | 
|---|
| 3770 | /* CLONE_INTO_CGROUP */ | 
|---|
| 3771 | mutex_lock(&cpuset_mutex); | 
|---|
| 3772 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | 
|---|
| 3773 | cpuset_attach_task(cs, task); | 
|---|
| 3774 |  | 
|---|
| 3775 | dec_attach_in_progress_locked(cs); | 
|---|
| 3776 | mutex_unlock(&cpuset_mutex); | 
|---|
| 3777 | } | 
|---|
| 3778 |  | 
|---|
| 3779 | struct cgroup_subsys cpuset_cgrp_subsys = { | 
|---|
| 3780 | .css_alloc	= cpuset_css_alloc, | 
|---|
| 3781 | .css_online	= cpuset_css_online, | 
|---|
| 3782 | .css_offline	= cpuset_css_offline, | 
|---|
| 3783 | .css_killed	= cpuset_css_killed, | 
|---|
| 3784 | .css_free	= cpuset_css_free, | 
|---|
| 3785 | .can_attach	= cpuset_can_attach, | 
|---|
| 3786 | .cancel_attach	= cpuset_cancel_attach, | 
|---|
| 3787 | .attach		= cpuset_attach, | 
|---|
| 3788 | .bind		= cpuset_bind, | 
|---|
| 3789 | .can_fork	= cpuset_can_fork, | 
|---|
| 3790 | .cancel_fork	= cpuset_cancel_fork, | 
|---|
| 3791 | .fork		= cpuset_fork, | 
|---|
| 3792 | #ifdef CONFIG_CPUSETS_V1 | 
|---|
| 3793 | .legacy_cftypes	= cpuset1_files, | 
|---|
| 3794 | #endif | 
|---|
| 3795 | .dfl_cftypes	= dfl_files, | 
|---|
| 3796 | .early_init	= true, | 
|---|
| 3797 | .threaded	= true, | 
|---|
| 3798 | }; | 
|---|
| 3799 |  | 
|---|
| 3800 | /** | 
|---|
| 3801 | * cpuset_init - initialize cpusets at system boot | 
|---|
| 3802 | * | 
|---|
| 3803 | * Description: Initialize top_cpuset | 
|---|
| 3804 | **/ | 
|---|
| 3805 |  | 
|---|
| 3806 | int __init cpuset_init(void) | 
|---|
| 3807 | { | 
|---|
| 3808 | BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); | 
|---|
| 3809 | BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); | 
|---|
| 3810 | BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); | 
|---|
| 3811 | BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); | 
|---|
| 3812 | BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); | 
|---|
| 3813 | BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); | 
|---|
| 3814 |  | 
|---|
| 3815 | cpumask_setall(top_cpuset.cpus_allowed); | 
|---|
| 3816 | nodes_setall(top_cpuset.mems_allowed); | 
|---|
| 3817 | cpumask_setall(top_cpuset.effective_cpus); | 
|---|
| 3818 | cpumask_setall(top_cpuset.effective_xcpus); | 
|---|
| 3819 | cpumask_setall(top_cpuset.exclusive_cpus); | 
|---|
| 3820 | nodes_setall(top_cpuset.effective_mems); | 
|---|
| 3821 |  | 
|---|
| 3822 | fmeter_init(&top_cpuset.fmeter); | 
|---|
| 3823 | INIT_LIST_HEAD(&remote_children); | 
|---|
| 3824 |  | 
|---|
| 3825 | BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); | 
|---|
| 3826 |  | 
|---|
| 3827 | have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); | 
|---|
| 3828 | if (have_boot_isolcpus) { | 
|---|
| 3829 | BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); | 
|---|
| 3830 | cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); | 
|---|
| 3831 | cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); | 
|---|
| 3832 | } | 
|---|
| 3833 |  | 
|---|
| 3834 | return 0; | 
|---|
| 3835 | } | 
|---|
| 3836 |  | 
|---|
| 3837 | static void | 
|---|
| 3838 | hotplug_update_tasks(struct cpuset *cs, | 
|---|
| 3839 | struct cpumask *new_cpus, nodemask_t *new_mems, | 
|---|
| 3840 | bool cpus_updated, bool mems_updated) | 
|---|
| 3841 | { | 
|---|
| 3842 | /* A partition root is allowed to have empty effective cpus */ | 
|---|
| 3843 | if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) | 
|---|
| 3844 | cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); | 
|---|
| 3845 | if (nodes_empty(*new_mems)) | 
|---|
| 3846 | *new_mems = parent_cs(cs)->effective_mems; | 
|---|
| 3847 |  | 
|---|
| 3848 | spin_lock_irq(&callback_lock); | 
|---|
| 3849 | cpumask_copy(cs->effective_cpus, new_cpus); | 
|---|
| 3850 | cs->effective_mems = *new_mems; | 
|---|
| 3851 | spin_unlock_irq(&callback_lock); | 
|---|
| 3852 |  | 
|---|
| 3853 | if (cpus_updated) | 
|---|
| 3854 | cpuset_update_tasks_cpumask(cs, new_cpus); | 
|---|
| 3855 | if (mems_updated) | 
|---|
| 3856 | cpuset_update_tasks_nodemask(cs); | 
|---|
| 3857 | } | 
|---|
| 3858 |  | 
|---|
| 3859 | void cpuset_force_rebuild(void) | 
|---|
| 3860 | { | 
|---|
| 3861 | force_sd_rebuild = true; | 
|---|
| 3862 | } | 
|---|
| 3863 |  | 
|---|
| 3864 | /** | 
|---|
| 3865 | * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug | 
|---|
| 3866 | * @cs: cpuset in interest | 
|---|
| 3867 | * @tmp: the tmpmasks structure pointer | 
|---|
| 3868 | * | 
|---|
| 3869 | * Compare @cs's cpu and mem masks against top_cpuset and if some have gone | 
|---|
| 3870 | * offline, update @cs accordingly.  If @cs ends up with no CPU or memory, | 
|---|
| 3871 | * all its tasks are moved to the nearest ancestor with both resources. | 
|---|
| 3872 | */ | 
|---|
| 3873 | static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) | 
|---|
| 3874 | { | 
|---|
| 3875 | static cpumask_t new_cpus; | 
|---|
| 3876 | static nodemask_t new_mems; | 
|---|
| 3877 | bool cpus_updated; | 
|---|
| 3878 | bool mems_updated; | 
|---|
| 3879 | bool remote; | 
|---|
| 3880 | int partcmd = -1; | 
|---|
| 3881 | struct cpuset *parent; | 
|---|
| 3882 | retry: | 
|---|
| 3883 | wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); | 
|---|
| 3884 |  | 
|---|
| 3885 | mutex_lock(&cpuset_mutex); | 
|---|
| 3886 |  | 
|---|
| 3887 | /* | 
|---|
| 3888 | * We have raced with task attaching. We wait until attaching | 
|---|
| 3889 | * is finished, so we won't attach a task to an empty cpuset. | 
|---|
| 3890 | */ | 
|---|
| 3891 | if (cs->attach_in_progress) { | 
|---|
| 3892 | mutex_unlock(&cpuset_mutex); | 
|---|
| 3893 | goto retry; | 
|---|
| 3894 | } | 
|---|
| 3895 |  | 
|---|
| 3896 | parent = parent_cs(cs); | 
|---|
| 3897 | compute_effective_cpumask(&new_cpus, cs, parent); | 
|---|
| 3898 | nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); | 
|---|
| 3899 |  | 
|---|
| 3900 | if (!tmp || !cs->partition_root_state) | 
|---|
| 3901 | goto update_tasks; | 
|---|
| 3902 |  | 
|---|
| 3903 | /* | 
|---|
| 3904 | * Compute effective_cpus for valid partition root, may invalidate | 
|---|
| 3905 | * child partition roots if necessary. | 
|---|
| 3906 | */ | 
|---|
| 3907 | remote = is_remote_partition(cs); | 
|---|
| 3908 | if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) | 
|---|
| 3909 | compute_partition_effective_cpumask(cs, &new_cpus); | 
|---|
| 3910 |  | 
|---|
| 3911 | if (remote && cpumask_empty(&new_cpus) && | 
|---|
| 3912 | partition_is_populated(cs, NULL)) { | 
|---|
| 3913 | cs->prs_err = PERR_HOTPLUG; | 
|---|
| 3914 | remote_partition_disable(cs, tmp); | 
|---|
| 3915 | compute_effective_cpumask(&new_cpus, cs, parent); | 
|---|
| 3916 | remote = false; | 
|---|
| 3917 | } | 
|---|
| 3918 |  | 
|---|
| 3919 | /* | 
|---|
| 3920 | * Force the partition to become invalid if either one of | 
|---|
| 3921 | * the following conditions hold: | 
|---|
| 3922 | * 1) empty effective cpus but not valid empty partition. | 
|---|
| 3923 | * 2) parent is invalid or doesn't grant any cpus to child | 
|---|
| 3924 | *    partitions. | 
|---|
| 3925 | */ | 
|---|
| 3926 | if (is_local_partition(cs) && (!is_partition_valid(parent) || | 
|---|
| 3927 | tasks_nocpu_error(parent, cs, &new_cpus))) | 
|---|
| 3928 | partcmd = partcmd_invalidate; | 
|---|
| 3929 | /* | 
|---|
| 3930 | * On the other hand, an invalid partition root may be transitioned | 
|---|
| 3931 | * back to a regular one with a non-empty effective xcpus. | 
|---|
| 3932 | */ | 
|---|
| 3933 | else if (is_partition_valid(parent) && is_partition_invalid(cs) && | 
|---|
| 3934 | !cpumask_empty(cs->effective_xcpus)) | 
|---|
| 3935 | partcmd = partcmd_update; | 
|---|
| 3936 |  | 
|---|
| 3937 | if (partcmd >= 0) { | 
|---|
| 3938 | update_parent_effective_cpumask(cs, partcmd, NULL, tmp); | 
|---|
| 3939 | if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { | 
|---|
| 3940 | compute_partition_effective_cpumask(cs, &new_cpus); | 
|---|
| 3941 | cpuset_force_rebuild(); | 
|---|
| 3942 | } | 
|---|
| 3943 | } | 
|---|
| 3944 |  | 
|---|
| 3945 | update_tasks: | 
|---|
| 3946 | cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); | 
|---|
| 3947 | mems_updated = !nodes_equal(new_mems, cs->effective_mems); | 
|---|
| 3948 | if (!cpus_updated && !mems_updated) | 
|---|
| 3949 | goto unlock;	/* Hotplug doesn't affect this cpuset */ | 
|---|
| 3950 |  | 
|---|
| 3951 | if (mems_updated) | 
|---|
| 3952 | check_insane_mems_config(&new_mems); | 
|---|
| 3953 |  | 
|---|
| 3954 | if (is_in_v2_mode()) | 
|---|
| 3955 | hotplug_update_tasks(cs, &new_cpus, &new_mems, | 
|---|
| 3956 | cpus_updated, mems_updated); | 
|---|
| 3957 | else | 
|---|
| 3958 | cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems, | 
|---|
| 3959 | cpus_updated, mems_updated); | 
|---|
| 3960 |  | 
|---|
| 3961 | unlock: | 
|---|
| 3962 | mutex_unlock(&cpuset_mutex); | 
|---|
| 3963 | } | 
|---|
| 3964 |  | 
|---|
| 3965 | /** | 
|---|
| 3966 | * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset | 
|---|
| 3967 | * | 
|---|
| 3968 | * This function is called after either CPU or memory configuration has | 
|---|
| 3969 | * changed and updates cpuset accordingly.  The top_cpuset is always | 
|---|
| 3970 | * synchronized to cpu_active_mask and N_MEMORY, which is necessary in | 
|---|
| 3971 | * order to make cpusets transparent (of no effect) on systems that are | 
|---|
| 3972 | * actively using CPU hotplug but making no active use of cpusets. | 
|---|
| 3973 | * | 
|---|
| 3974 | * Non-root cpusets are only affected by offlining.  If any CPUs or memory | 
|---|
| 3975 | * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on | 
|---|
| 3976 | * all descendants. | 
|---|
| 3977 | * | 
|---|
| 3978 | * Note that CPU offlining during suspend is ignored.  We don't modify | 
|---|
| 3979 | * cpusets across suspend/resume cycles at all. | 
|---|
| 3980 | * | 
|---|
| 3981 | * CPU / memory hotplug is handled synchronously. | 
|---|
| 3982 | */ | 
|---|
| 3983 | static void cpuset_handle_hotplug(void) | 
|---|
| 3984 | { | 
|---|
| 3985 | static cpumask_t new_cpus; | 
|---|
| 3986 | static nodemask_t new_mems; | 
|---|
| 3987 | bool cpus_updated, mems_updated; | 
|---|
| 3988 | bool on_dfl = is_in_v2_mode(); | 
|---|
| 3989 | struct tmpmasks tmp, *ptmp = NULL; | 
|---|
| 3990 |  | 
|---|
| 3991 | if (on_dfl && !alloc_tmpmasks(&tmp)) | 
|---|
| 3992 | ptmp = &tmp; | 
|---|
| 3993 |  | 
|---|
| 3994 | lockdep_assert_cpus_held(); | 
|---|
| 3995 | mutex_lock(&cpuset_mutex); | 
|---|
| 3996 |  | 
|---|
| 3997 | /* fetch the available cpus/mems and find out which changed how */ | 
|---|
| 3998 | cpumask_copy(&new_cpus, cpu_active_mask); | 
|---|
| 3999 | new_mems = node_states[N_MEMORY]; | 
|---|
| 4000 |  | 
|---|
| 4001 | /* | 
|---|
| 4002 | * If subpartitions_cpus is populated, it is likely that the check | 
|---|
| 4003 | * below will produce a false positive on cpus_updated when the cpu | 
|---|
| 4004 | * list isn't changed. It is extra work, but it is better to be safe. | 
|---|
| 4005 | */ | 
|---|
| 4006 | cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) || | 
|---|
| 4007 | !cpumask_empty(subpartitions_cpus); | 
|---|
| 4008 | mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); | 
|---|
| 4009 |  | 
|---|
| 4010 | /* For v1, synchronize cpus_allowed to cpu_active_mask */ | 
|---|
| 4011 | if (cpus_updated) { | 
|---|
| 4012 | cpuset_force_rebuild(); | 
|---|
| 4013 | spin_lock_irq(&callback_lock); | 
|---|
| 4014 | if (!on_dfl) | 
|---|
| 4015 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); | 
|---|
| 4016 | /* | 
|---|
| 4017 | * Make sure that CPUs allocated to child partitions | 
|---|
| 4018 | * do not show up in effective_cpus. If no CPU is left, | 
|---|
| 4019 | * we clear the subpartitions_cpus & let the child partitions | 
|---|
| 4020 | * fight for the CPUs again. | 
|---|
| 4021 | */ | 
|---|
| 4022 | if (!cpumask_empty(subpartitions_cpus)) { | 
|---|
| 4023 | if (cpumask_subset(&new_cpus, subpartitions_cpus)) { | 
|---|
| 4024 | top_cpuset.nr_subparts = 0; | 
|---|
| 4025 | cpumask_clear(subpartitions_cpus); | 
|---|
| 4026 | } else { | 
|---|
| 4027 | cpumask_andnot(&new_cpus, &new_cpus, | 
|---|
| 4028 | subpartitions_cpus); | 
|---|
| 4029 | } | 
|---|
| 4030 | } | 
|---|
| 4031 | cpumask_copy(top_cpuset.effective_cpus, &new_cpus); | 
|---|
| 4032 | spin_unlock_irq(&callback_lock); | 
|---|
| 4033 | /* we don't mess with cpumasks of tasks in top_cpuset */ | 
|---|
| 4034 | } | 
|---|
| 4035 |  | 
|---|
| 4036 | /* synchronize mems_allowed to N_MEMORY */ | 
|---|
| 4037 | if (mems_updated) { | 
|---|
| 4038 | spin_lock_irq(&callback_lock); | 
|---|
| 4039 | if (!on_dfl) | 
|---|
| 4040 | top_cpuset.mems_allowed = new_mems; | 
|---|
| 4041 | top_cpuset.effective_mems = new_mems; | 
|---|
| 4042 | spin_unlock_irq(&callback_lock); | 
|---|
| 4043 | cpuset_update_tasks_nodemask(&top_cpuset); | 
|---|
| 4044 | } | 
|---|
| 4045 |  | 
|---|
| 4046 | mutex_unlock(&cpuset_mutex); | 
|---|
| 4047 |  | 
|---|
| 4048 | /* if cpus or mems changed, we need to propagate to descendants */ | 
|---|
| 4049 | if (cpus_updated || mems_updated) { | 
|---|
| 4050 | struct cpuset *cs; | 
|---|
| 4051 | struct cgroup_subsys_state *pos_css; | 
|---|
| 4052 |  | 
|---|
| 4053 | rcu_read_lock(); | 
|---|
| 4054 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { | 
|---|
| 4055 | if (cs == &top_cpuset || !css_tryget_online(&cs->css)) | 
|---|
| 4056 | continue; | 
|---|
| 4057 | rcu_read_unlock(); | 
|---|
| 4058 |  | 
|---|
| 4059 | cpuset_hotplug_update_tasks(cs, ptmp); | 
|---|
| 4060 |  | 
|---|
| 4061 | rcu_read_lock(); | 
|---|
| 4062 | css_put(&cs->css); | 
|---|
| 4063 | } | 
|---|
| 4064 | rcu_read_unlock(); | 
|---|
| 4065 | } | 
|---|
| 4066 |  | 
|---|
| 4067 | /* rebuild sched domains if necessary */ | 
|---|
| 4068 | if (force_sd_rebuild) | 
|---|
| 4069 | rebuild_sched_domains_cpuslocked(); | 
|---|
| 4070 |  | 
|---|
| 4071 | free_tmpmasks(ptmp); | 
|---|
| 4072 | } | 
|---|
| 4073 |  | 
|---|
| 4074 | void cpuset_update_active_cpus(void) | 
|---|
| 4075 | { | 
|---|
| 4076 | /* | 
|---|
| 4077 | * We're inside the cpu hotplug critical region which usually nests | 
|---|
| 4078 | * inside cgroup synchronization.  Hotplug processing is handled | 
|---|
| 4079 | * synchronously here; see cpuset_handle_hotplug(). | 
|---|
| 4080 | */ | 
|---|
| 4081 | cpuset_handle_hotplug(); | 
|---|
| 4082 | } | 
|---|
| 4083 |  | 
|---|
| 4084 | /* | 
|---|
| 4085 | * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. | 
|---|
| 4086 | * Call this routine anytime after node_states[N_MEMORY] changes. | 
|---|
| 4087 | * See cpuset_update_active_cpus() for CPU hotplug handling. | 
|---|
| 4088 | */ | 
|---|
| 4089 | static int cpuset_track_online_nodes(struct notifier_block *self, | 
|---|
| 4090 | unsigned long action, void *arg) | 
|---|
| 4091 | { | 
|---|
| 4092 | cpuset_handle_hotplug(); | 
|---|
| 4093 | return NOTIFY_OK; | 
|---|
| 4094 | } | 
|---|
| 4095 |  | 
|---|
| 4096 | /** | 
|---|
| 4097 | * cpuset_init_smp - initialize cpus_allowed | 
|---|
| 4098 | * | 
|---|
| 4099 | * Description: Finish top cpuset after cpu, node maps are initialized | 
|---|
| 4100 | */ | 
|---|
| 4101 | void __init cpuset_init_smp(void) | 
|---|
| 4102 | { | 
|---|
| 4103 | /* | 
|---|
| 4104 | * cpus_allowed/mems_allowed set to v2 values in the initial | 
|---|
| 4105 | * cpuset_bind() call will be reset to v1 values in another | 
|---|
| 4106 | * cpuset_bind() call when v1 cpuset is mounted. | 
|---|
| 4107 | */ | 
|---|
| 4108 | top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; | 
|---|
| 4109 |  | 
|---|
| 4110 | cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); | 
|---|
| 4111 | top_cpuset.effective_mems = node_states[N_MEMORY]; | 
|---|
| 4112 |  | 
|---|
| 4113 | hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); | 
|---|
| 4114 |  | 
|---|
| 4115 | cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); | 
|---|
| 4116 | BUG_ON(!cpuset_migrate_mm_wq); | 
|---|
| 4117 | } | 
|---|
| 4118 |  | 
|---|
| 4119 | /** | 
|---|
| 4120 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. | 
|---|
| 4121 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. | 
|---|
| 4122 | * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. | 
|---|
| 4123 | * | 
|---|
| 4124 | * Description: Returns the cpumask_var_t cpus_allowed of the cpuset | 
|---|
| 4125 | * attached to the specified @tsk.  Guaranteed to return some non-empty | 
|---|
| 4126 | * subset of cpu_active_mask, even if this means going outside the | 
|---|
| 4127 | * tasks cpuset, except when the task is in the top cpuset. | 
|---|
| 4128 | **/ | 
|---|
| 4129 |  | 
|---|
| 4130 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) | 
|---|
| 4131 | { | 
|---|
| 4132 | unsigned long flags; | 
|---|
| 4133 | struct cpuset *cs; | 
|---|
| 4134 |  | 
|---|
| 4135 | spin_lock_irqsave(&callback_lock, flags); | 
|---|
| 4136 |  | 
|---|
| 4137 | cs = task_cs(tsk); | 
|---|
| 4138 | if (cs != &top_cpuset) | 
|---|
| 4139 | guarantee_active_cpus(tsk, pmask); | 
|---|
| 4140 | /* | 
|---|
| 4141 | * Tasks in the top cpuset won't get updates to their cpumasks | 
|---|
| 4142 | * when a hotplug online/offline event happens. So we include all | 
|---|
| 4143 | * offline cpus in the allowed cpu list. | 
|---|
| 4144 | */ | 
|---|
| 4145 | if ((cs == &top_cpuset) || cpumask_empty(pmask)) { | 
|---|
| 4146 | const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); | 
|---|
| 4147 |  | 
|---|
| 4148 | /* | 
|---|
| 4149 | * We first exclude cpus allocated to partitions. If there is no | 
|---|
| 4150 | * allowable online cpu left, we fall back to all possible cpus. | 
|---|
| 4151 | */ | 
|---|
| 4152 | cpumask_andnot(pmask, possible_mask, subpartitions_cpus); | 
|---|
| 4153 | if (!cpumask_intersects(pmask, cpu_active_mask)) | 
|---|
| 4154 | cpumask_copy(pmask, possible_mask); | 
|---|
| 4155 | } | 
|---|
| 4156 |  | 
|---|
| 4157 | spin_unlock_irqrestore(&callback_lock, flags); | 
|---|
| 4158 | } | 
|---|
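
A typical in-kernel consumer clamps a caller-requested affinity against this mask before applying it, much as the sched_setaffinity() path does. The helper below is only a simplified sketch of that pattern; the function name is invented and error handling is reduced to the essentials.

```c
/* Sketch only: restrict @requested to the CPUs @p's cpuset permits. */
static int apply_affinity_within_cpuset(struct task_struct *p,
					const struct cpumask *requested)
{
	cpumask_var_t allowed, new_mask;
	int ret;

	if (!alloc_cpumask_var(&allowed, GFP_KERNEL))
		return -ENOMEM;
	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
		free_cpumask_var(allowed);
		return -ENOMEM;
	}

	cpuset_cpus_allowed(p, allowed);	/* never an empty mask */
	if (cpumask_and(new_mask, requested, allowed))
		ret = set_cpus_allowed_ptr(p, new_mask);
	else
		ret = -EINVAL;			/* no overlap with the cpuset */

	free_cpumask_var(new_mask);
	free_cpumask_var(allowed);
	return ret;
}
```
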
| 4159 |  | 
|---|
| 4160 | /** | 
|---|
| 4161 | * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. | 
|---|
| 4162 | * @tsk: pointer to task_struct with which the scheduler is struggling | 
|---|
| 4163 | * | 
|---|
| 4164 | * Description: In the case that the scheduler cannot find an allowed cpu in | 
|---|
| 4165 | * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy | 
|---|
| 4166 | * mode however, this value is the same as task_cs(tsk)->effective_cpus, | 
|---|
| 4167 | * which will not contain a sane cpumask during cases such as cpu hotplugging. | 
|---|
| 4168 | * This is the absolute last resort for the scheduler and it is only used if | 
|---|
| 4169 | * _every_ other avenue has been traveled. | 
|---|
| 4170 | * | 
|---|
| 4171 | * Returns true if the affinity of @tsk was changed, false otherwise. | 
|---|
| 4172 | **/ | 
|---|
| 4173 |  | 
|---|
| 4174 | bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) | 
|---|
| 4175 | { | 
|---|
| 4176 | const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); | 
|---|
| 4177 | const struct cpumask *cs_mask; | 
|---|
| 4178 | bool changed = false; | 
|---|
| 4179 |  | 
|---|
| 4180 | rcu_read_lock(); | 
|---|
| 4181 | cs_mask = task_cs(tsk)->cpus_allowed; | 
|---|
| 4182 | if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { | 
|---|
| 4183 | do_set_cpus_allowed(tsk, cs_mask); | 
|---|
| 4184 | changed = true; | 
|---|
| 4185 | } | 
|---|
| 4186 | rcu_read_unlock(); | 
|---|
| 4187 |  | 
|---|
| 4188 | /* | 
|---|
| 4189 | * We own tsk->cpus_allowed, nobody can change it under us. | 
|---|
| 4190 | * | 
|---|
| 4191 | * But we used cs && cs->cpus_allowed lockless and thus can | 
|---|
| 4192 | * race with cgroup_attach_task() or update_cpumask() and get | 
|---|
| 4193 | * the wrong tsk->cpus_allowed. However, both cases imply the | 
|---|
| 4194 | * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() | 
|---|
| 4195 | * which takes task_rq_lock(). | 
|---|
| 4196 | * | 
|---|
| 4197 | * If we are called after it dropped the lock we must see all | 
|---|
| 4198 | * changes in task_cs()->cpus_allowed. Otherwise we can temporarily | 
|---|
| 4199 | * set any mask even if it is not right from task_cs() pov, | 
|---|
| 4200 | * the pending set_cpus_allowed_ptr() will fix things. | 
|---|
| 4201 | * | 
|---|
| 4202 | * select_fallback_rq() will fix things up and set cpu_possible_mask | 
|---|
| 4203 | * if required. | 
|---|
| 4204 | */ | 
|---|
| 4205 | return changed; | 
|---|
| 4206 | } | 
|---|
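
For orientation only: the kind of caller this serves widens a task's affinity in stages until a runnable CPU appears. The sketch below is loosely modeled on the scheduler's fallback logic and is not the real select_fallback_rq(); the helper name and simplifications are mine.

```c
/* Sketch: find some active CPU for @p, widening the mask as needed. */
static int pick_any_allowed_cpu(struct task_struct *p)
{
	unsigned int cpu = cpumask_any_and(p->cpus_ptr, cpu_active_mask);

	if (cpu < nr_cpu_ids)
		return cpu;

	/* Step 1: fall back to the cpuset's own cpus_allowed. */
	if (cpuset_cpus_allowed_fallback(p)) {
		cpu = cpumask_any_and(p->cpus_ptr, cpu_active_mask);
		if (cpu < nr_cpu_ids)
			return cpu;
	}

	/* Step 2: last resort, any CPU this task could possibly run on. */
	do_set_cpus_allowed(p, task_cpu_possible_mask(p));
	return cpumask_any_and(p->cpus_ptr, cpu_active_mask);
}
```
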
| 4207 |  | 
|---|
| 4208 | void __init cpuset_init_current_mems_allowed(void) | 
|---|
| 4209 | { | 
|---|
| 4210 | nodes_setall(current->mems_allowed); | 
|---|
| 4211 | } | 
|---|
| 4212 |  | 
|---|
| 4213 | /** | 
|---|
| 4214 | * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. | 
|---|
| 4215 | * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. | 
|---|
| 4216 | * | 
|---|
| 4217 | * Description: Returns the nodemask_t mems_allowed of the cpuset | 
|---|
| 4218 | * attached to the specified @tsk.  Guaranteed to return some non-empty | 
|---|
| 4219 | * subset of node_states[N_MEMORY], even if this means going outside the | 
|---|
| 4220 | * tasks cpuset. | 
|---|
| 4221 | **/ | 
|---|
| 4222 |  | 
|---|
| 4223 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) | 
|---|
| 4224 | { | 
|---|
| 4225 | nodemask_t mask; | 
|---|
| 4226 | unsigned long flags; | 
|---|
| 4227 |  | 
|---|
| 4228 | spin_lock_irqsave(&callback_lock, flags); | 
|---|
| 4229 | guarantee_online_mems(task_cs(tsk), &mask); | 
|---|
| 4230 | spin_unlock_irqrestore(&callback_lock, flags); | 
|---|
| 4231 |  | 
|---|
| 4232 | return mask; | 
|---|
| 4233 | } | 
|---|
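
As a usage illustration, a caller might intersect a user-supplied nodemask with the returned mask. The helper below is a minimal sketch, not a real mm/ function.

```c
/* Sketch: clamp @requested to the nodes @tsk's cpuset currently allows. */
static nodemask_t clamp_nodes_to_cpuset(struct task_struct *tsk,
					const nodemask_t *requested)
{
	nodemask_t allowed = cpuset_mems_allowed(tsk);

	nodes_and(allowed, allowed, *requested);
	return allowed;	/* empty means the request lies outside the cpuset */
}
```
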
| 4234 |  | 
|---|
| 4235 | /** | 
|---|
| 4236 | * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed | 
|---|
| 4237 | * @nodemask: the nodemask to be checked | 
|---|
| 4238 | * | 
|---|
| 4239 | * Are any of the nodes in the nodemask allowed in current->mems_allowed? | 
|---|
| 4240 | */ | 
|---|
| 4241 | int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) | 
|---|
| 4242 | { | 
|---|
| 4243 | return nodes_intersects(*nodemask, current->mems_allowed); | 
|---|
| 4244 | } | 
|---|
| 4245 |  | 
|---|
| 4246 | /* | 
|---|
| 4247 | * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or | 
|---|
| 4248 | * mem_hardwall ancestor to the specified cpuset.  Call holding | 
|---|
| 4249 | * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall | 
|---|
| 4250 | * (an unusual configuration), then returns the root cpuset. | 
|---|
| 4251 | */ | 
|---|
| 4252 | static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | 
|---|
| 4253 | { | 
|---|
| 4254 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) | 
|---|
| 4255 | cs = parent_cs(cs); | 
|---|
| 4256 | return cs; | 
|---|
| 4257 | } | 
|---|
| 4258 |  | 
|---|
| 4259 | /* | 
|---|
| 4260 | * cpuset_current_node_allowed - Can current task allocate on a memory node? | 
|---|
| 4261 | * @node: is this an allowed node? | 
|---|
| 4262 | * @gfp_mask: memory allocation flags | 
|---|
| 4263 | * | 
|---|
| 4264 | * If we're in interrupt, yes, we can always allocate.  If @node is set in | 
|---|
| 4265 | * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this | 
|---|
| 4266 | * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, | 
|---|
| 4267 | * yes.  If current has access to memory reserves as an oom victim, yes. | 
|---|
| 4268 | * Otherwise, no. | 
|---|
| 4269 | * | 
|---|
| 4270 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | 
|---|
| 4271 | * and do not allow allocations outside the current tasks cpuset | 
|---|
| 4272 | * unless the task has been OOM killed. | 
|---|
| 4273 | * GFP_KERNEL allocations are not so marked, so can escape to the | 
|---|
| 4274 | * nearest enclosing hardwalled ancestor cpuset. | 
|---|
| 4275 | * | 
|---|
| 4276 | * Scanning up parent cpusets requires callback_lock.  The | 
|---|
| 4277 | * __alloc_pages() routine only calls here with __GFP_HARDWALL bit | 
|---|
| 4278 | * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the | 
|---|
| 4279 | * current tasks mems_allowed came up empty on the first pass over | 
|---|
| 4280 | * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the | 
|---|
| 4281 | * cpuset are short of memory, might require taking the callback_lock. | 
|---|
| 4282 | * | 
|---|
| 4283 | * The first call here from mm/page_alloc:get_page_from_freelist() | 
|---|
| 4284 | * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, | 
|---|
| 4285 | * so no allocation on a node outside the cpuset is allowed (unless | 
|---|
| 4286 | * in interrupt, of course). | 
|---|
| 4287 | * | 
|---|
| 4288 | * The second pass through get_page_from_freelist() doesn't even call | 
|---|
| 4289 | * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages() | 
|---|
| 4290 | * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set | 
|---|
| 4291 | * in alloc_flags.  That logic and the checks below have the combined | 
|---|
| 4292 | * effect that: | 
|---|
| 4293 | *	in_interrupt - any node ok (current task context irrelevant) | 
|---|
| 4294 | *	GFP_ATOMIC   - any node ok | 
|---|
| 4295 | *	tsk_is_oom_victim   - any node ok | 
|---|
| 4296 | *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok | 
|---|
| 4297 | *	GFP_USER     - only nodes in current tasks mems allowed ok. | 
|---|
| 4298 | */ | 
|---|
| 4299 | bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) | 
|---|
| 4300 | { | 
|---|
| 4301 | struct cpuset *cs;		/* current cpuset ancestors */ | 
|---|
| 4302 | bool allowed;			/* is allocation in zone z allowed? */ | 
|---|
| 4303 | unsigned long flags; | 
|---|
| 4304 |  | 
|---|
| 4305 | if (in_interrupt()) | 
|---|
| 4306 | return true; | 
|---|
| 4307 | if (node_isset(node, current->mems_allowed)) | 
|---|
| 4308 | return true; | 
|---|
| 4309 | /* | 
|---|
| 4310 | * Allow tasks that have access to memory reserves because they have | 
|---|
| 4311 | * been OOM killed to get memory anywhere. | 
|---|
| 4312 | */ | 
|---|
| 4313 | if (unlikely(tsk_is_oom_victim(current))) | 
|---|
| 4314 | return true; | 
|---|
| 4315 | if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */ | 
|---|
| 4316 | return false; | 
|---|
| 4317 |  | 
|---|
| 4318 | if (current->flags & PF_EXITING) /* Let dying task have memory */ | 
|---|
| 4319 | return true; | 
|---|
| 4320 |  | 
|---|
| 4321 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 
|---|
| 4322 | spin_lock_irqsave(&callback_lock, flags); | 
|---|
| 4323 |  | 
|---|
| 4324 | cs = nearest_hardwall_ancestor(task_cs(current)); | 
|---|
| 4325 | allowed = node_isset(node, cs->mems_allowed); | 
|---|
| 4326 |  | 
|---|
| 4327 | spin_unlock_irqrestore(&callback_lock, flags); | 
|---|
| 4328 | return allowed; | 
|---|
| 4329 | } | 
|---|
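
To make the decision table above concrete, an allocation-path caller would gate a candidate zone roughly as follows. This is a sketch only; the real check sits in the page allocator behind the ALLOC_CPUSET flag and is not reproduced here.

```c
/* Sketch: may the current task take pages from @z's node for this GFP? */
static bool zone_allowed_by_cpuset(struct zone *z, gfp_t gfp_mask)
{
	if (!cpusets_enabled())
		return true;
	return cpuset_current_node_allowed(zone_to_nid(z), gfp_mask);
}
```
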
| 4330 |  | 
|---|
| 4331 | bool cpuset_node_allowed(struct cgroup *cgroup, int nid) | 
|---|
| 4332 | { | 
|---|
| 4333 | struct cgroup_subsys_state *css; | 
|---|
| 4334 | struct cpuset *cs; | 
|---|
| 4335 | bool allowed; | 
|---|
| 4336 |  | 
|---|
| 4337 | /* | 
|---|
| 4338 | * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy | 
|---|
| 4339 | * and mems_allowed is likely to be empty even if we could get to it, | 
|---|
| 4340 | * so return true to avoid taking a global lock on the empty check. | 
|---|
| 4341 | */ | 
|---|
| 4342 | if (!cpuset_v2()) | 
|---|
| 4343 | return true; | 
|---|
| 4344 |  | 
|---|
| 4345 | css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys); | 
|---|
| 4346 | if (!css) | 
|---|
| 4347 | return true; | 
|---|
| 4348 |  | 
|---|
| 4349 | /* | 
|---|
| 4350 | * Normally, accessing effective_mems would require the cpuset_mutex | 
|---|
| 4351 | * or callback_lock - but node_isset is atomic and the reference | 
|---|
| 4352 | * taken via cgroup_get_e_css is sufficient to protect css. | 
|---|
| 4353 | * | 
|---|
| 4354 | * Since this interface is intended for use by migration paths, we | 
|---|
| 4355 | * relax locking here to avoid taking global locks - while accepting | 
|---|
| 4356 | * there may be rare scenarios where the result may be inaccurate. | 
|---|
| 4357 | * | 
|---|
| 4358 | * Reclaim and migration are subject to these same race conditions, and | 
|---|
| 4359 | * cannot make strong isolation guarantees, so this is acceptable. | 
|---|
| 4360 | */ | 
|---|
| 4361 | cs = container_of(css, struct cpuset, css); | 
|---|
| 4362 | allowed = node_isset(nid, cs->effective_mems); | 
|---|
| 4363 | css_put(css); | 
|---|
| 4364 | return allowed; | 
|---|
| 4365 | } | 
|---|
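
A hypothetical consumer of this relaxed check would be a migration or demotion path that wants to know whether a memcg's tasks may use a target node. The helper below is purely illustrative (the name is invented; mem_cgroup_css() is the standard memcg accessor).

```c
/* Sketch: is @target_nid acceptable for folios charged to @memcg? */
static bool demotion_target_allowed(struct mem_cgroup *memcg, int target_nid)
{
	struct cgroup *cgrp = mem_cgroup_css(memcg)->cgroup;

	return cpuset_node_allowed(cgrp, target_nid);
}
```
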
| 4366 |  | 
|---|
| 4367 | /** | 
|---|
| 4368 | * cpuset_spread_node() - On which node to begin search for a page | 
|---|
| 4369 | * @rotor: round robin rotor | 
|---|
| 4370 | * | 
|---|
| 4371 | * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for | 
|---|
| 4372 | * tasks in a cpuset with is_spread_page or is_spread_slab set), | 
|---|
| 4373 | * and if the memory allocation used cpuset_mem_spread_node() | 
|---|
| 4374 | * to determine on which node to start looking, as it will for | 
|---|
| 4375 | * certain page cache or slab cache pages such as used for file | 
|---|
| 4376 | * system buffers and inode caches, then instead of starting on the | 
|---|
| 4377 | * local node to look for a free page, rather spread the starting | 
|---|
| 4378 | * node around the tasks mems_allowed nodes. | 
|---|
| 4379 | * | 
|---|
| 4380 | * We don't have to worry about the returned node being offline | 
|---|
| 4381 | * because "it can't happen", and even if it did, it would be ok. | 
|---|
| 4382 | * | 
|---|
| 4383 | * The routines calling guarantee_online_mems() are careful to | 
|---|
| 4384 | * only set nodes in task->mems_allowed that are online.  So it | 
|---|
| 4385 | * should not be possible for the following code to return an | 
|---|
| 4386 | * offline node.  But if it did, that would be ok, as this routine | 
|---|
| 4387 | * is not returning the node where the allocation must be, only | 
|---|
| 4388 | * the node where the search should start.  The zonelist passed to | 
|---|
| 4389 | * __alloc_pages() will include all nodes.  If the slab allocator | 
|---|
| 4390 | * is passed an offline node, it will fall back to the local node. | 
|---|
| 4391 | * See kmem_cache_alloc_node(). | 
|---|
| 4392 | */ | 
|---|
| 4393 | static int cpuset_spread_node(int *rotor) | 
|---|
| 4394 | { | 
|---|
| 4395 | return *rotor = next_node_in(*rotor, current->mems_allowed); | 
|---|
| 4396 | } | 
|---|
| 4397 |  | 
|---|
| 4398 | /** | 
|---|
| 4399 | * cpuset_mem_spread_node() - On which node to begin search for a file page | 
|---|
| 4400 | */ | 
|---|
| 4401 | int cpuset_mem_spread_node(void) | 
|---|
| 4402 | { | 
|---|
| 4403 | if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) | 
|---|
| 4404 | current->cpuset_mem_spread_rotor = | 
|---|
| 4405 | node_random(&current->mems_allowed); | 
|---|
| 4406 |  | 
|---|
| 4407 | return cpuset_spread_node(&current->cpuset_mem_spread_rotor); | 
|---|
| 4408 | } | 
|---|
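
A minimal sketch of a caller, modeled on how page cache allocations have used the rotor (the helper name is illustrative): when the task's cpuset requests page spreading, start the search on the rotor node rather than the local one.

```c
/* Sketch: allocate one page, honouring cpuset page spreading if enabled. */
static struct page *alloc_spread_page(gfp_t gfp)
{
	if (cpuset_do_page_mem_spread()) {
		int nid = cpuset_mem_spread_node();

		return alloc_pages_node(nid, gfp, 0);
	}
	return alloc_page(gfp);
}
```
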
| 4409 |  | 
|---|
| 4410 | /** | 
|---|
| 4411 | * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? | 
|---|
| 4412 | * @tsk1: pointer to task_struct of some task. | 
|---|
| 4413 | * @tsk2: pointer to task_struct of some other task. | 
|---|
| 4414 | * | 
|---|
| 4415 | * Description: Return true if @tsk1's mems_allowed intersects the | 
|---|
| 4416 | * mems_allowed of @tsk2.  Used by the OOM killer to determine if | 
|---|
| 4417 | * one of the task's memory usage might impact the memory available | 
|---|
| 4418 | * to the other. | 
|---|
| 4419 | **/ | 
|---|
| 4420 |  | 
|---|
| 4421 | int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, | 
|---|
| 4422 | const struct task_struct *tsk2) | 
|---|
| 4423 | { | 
|---|
| 4424 | return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); | 
|---|
| 4425 | } | 
|---|
| 4426 |  | 
|---|
| 4427 | /** | 
|---|
| 4428 | * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed | 
|---|
| 4429 | * | 
|---|
| 4430 | * Description: Prints current's name, cpuset name, and cached copy of its | 
|---|
| 4431 | * mems_allowed to the kernel log. | 
|---|
| 4432 | */ | 
|---|
| 4433 | void cpuset_print_current_mems_allowed(void) | 
|---|
| 4434 | { | 
|---|
| 4435 | struct cgroup *cgrp; | 
|---|
| 4436 |  | 
|---|
| 4437 | rcu_read_lock(); | 
|---|
| 4438 |  | 
|---|
| 4439 | cgrp = task_cs(current)->css.cgroup; | 
|---|
| 4440 | pr_cont(",cpuset="); | 
|---|
| 4441 | pr_cont_cgroup_name(cgrp); | 
|---|
| 4442 | pr_cont(",mems_allowed=%*pbl", | 
|---|
| 4443 | nodemask_pr_args(&current->mems_allowed)); | 
|---|
| 4444 |  | 
|---|
| 4445 | rcu_read_unlock(); | 
|---|
| 4446 | } | 
|---|
| 4447 |  | 
|---|
| 4448 | /* Display task mems_allowed in /proc/<pid>/status file. */ | 
|---|
| 4449 | void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) | 
|---|
| 4450 | { | 
|---|
| 4451 | seq_printf(m, "Mems_allowed:\t%*pb\n", | 
|---|
| 4452 | nodemask_pr_args(&task->mems_allowed)); | 
|---|
| 4453 | seq_printf(m, "Mems_allowed_list:\t%*pbl\n", | 
|---|
| 4454 | nodemask_pr_args(&task->mems_allowed)); | 
|---|
| 4455 | } | 
|---|
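
For reference, these two seq_printf() calls produce /proc/<pid>/status lines of the following shape; the values are purely illustrative and the width of the %*pb bitmap depends on MAX_NUMNODES.

```
Mems_allowed:	00000000,0000000f
Mems_allowed_list:	0-3
```
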
| 4456 |  | 
|---|