workqueue.c source code [Linux/kernel/workqueue.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* kernel/workqueue.c - generic async execution with shared worker pool
4	*
5	* Copyright (C) 2002 Ingo Molnar
6	*
7	* Derived from the taskqueue/keventd code by:
8	* David Woodhouse <dwmw2@infradead.org>
9	* Andrew Morton
10	* Kai Petzke <wpp@marie.physik.tu-berlin.de>
11	* Theodore Ts'o <tytso@mit.edu>
12	*
13	* Made to use alloc_percpu by Christoph Lameter.
14	*
15	* Copyright (C) 2010 SUSE Linux Products GmbH
16	* Copyright (C) 2010 Tejun Heo <tj@kernel.org>
17	*
18	* This is the generic async execution mechanism. Work items are
19	* executed in process context. The worker pool is shared and
20	* automatically managed. There are two worker pools for each CPU (one for
21	* normal work items and the other for high priority ones) and some extra
22	* pools for workqueues which are not bound to any specific CPU - the
23	* number of these backing pools is dynamic.
24	*
25	* Please read Documentation/core-api/workqueue.rst for details.
26	*/
27
28	#include <linux/export.h>
29	#include <linux/kernel.h>
30	#include <linux/sched.h>
31	#include <linux/init.h>
32	#include <linux/interrupt.h>
33	#include <linux/signal.h>
34	#include <linux/completion.h>
35	#include <linux/workqueue.h>
36	#include <linux/slab.h>
37	#include <linux/cpu.h>
38	#include <linux/notifier.h>
39	#include <linux/kthread.h>
40	#include <linux/hardirq.h>
41	#include <linux/mempolicy.h>
42	#include <linux/freezer.h>
43	#include <linux/debug_locks.h>
44	#include <linux/lockdep.h>
45	#include <linux/idr.h>
46	#include <linux/jhash.h>
47	#include <linux/hashtable.h>
48	#include <linux/rculist.h>
49	#include <linux/nodemask.h>
50	#include <linux/moduleparam.h>
51	#include <linux/uaccess.h>
52	#include <linux/sched/isolation.h>
53	#include <linux/sched/debug.h>
54	#include <linux/nmi.h>
55	#include <linux/kvm_para.h>
56	#include <linux/delay.h>
57	#include <linux/irq_work.h>
58
59	#include "workqueue_internal.h"
60
enum worker_pool_flags {
	/*
	 * worker_pool flags
	 *
	 * A bound pool is either associated or disassociated with its CPU.
	 * While associated (!DISASSOCIATED), all workers are bound to the
	 * CPU and none has %WORKER_UNBOUND set and concurrency management
	 * is in effect.
	 *
	 * While DISASSOCIATED, the cpu may be offline and all workers have
	 * %WORKER_UNBOUND set and concurrency management disabled, and may
	 * be executing on any CPU.  The pool behaves as an unbound one.
	 *
	 * Note that DISASSOCIATED should be flipped only while holding
	 * wq_pool_attach_mutex to avoid changing binding state while
	 * worker_attach_to_pool() is in progress.
	 *
	 * As there can only be one concurrent BH execution context per CPU, a
	 * BH pool is per-CPU and always DISASSOCIATED.
	 */
	POOL_BH			= 1 << 0,	/* is a BH pool */
	POOL_MANAGER_ACTIVE	= 1 << 1,	/* being managed */
	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
	POOL_BH_DRAINING	= 1 << 3,	/* draining after CPU offline */
};
86
enum worker_flags {
	/* worker flags */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */

	/* union of the flags combined under the NOT_RUNNING name */
	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
				  WORKER_UNBOUND | WORKER_REBOUND,
};
99
enum work_cancel_flags {
	WORK_CANCEL_DELAYED	= 1 << 0,	/* canceling a delayed_work */
	WORK_CANCEL_DISABLE	= 1 << 1,	/* canceling to disable */
};
104
105	enum wq_internal_consts {
106	NR_STD_WORKER_POOLS = `2`, / # standard pools per cpu /
107
108	UNBOUND_POOL_HASH_ORDER = `6`, / hashed by pool->attrs /
109	BUSY_WORKER_HASH_ORDER = `6`, / 64 pointers /
110
111	MAX_IDLE_WORKERS_RATIO = `4`, / 1/4 of busy can be idle /
112	IDLE_WORKER_TIMEOUT = `300` * HZ, / keep idle ones for 5 mins /
113
114	MAYDAY_INITIAL_TIMEOUT = HZ / `100` >= `2` ? HZ / `100` : `2`,
115	/ call for help after 10ms*
116	(min two ticks) /*
117	MAYDAY_INTERVAL = HZ / `10`, / and then every 100ms /
118	CREATE_COOLDOWN = HZ, / time to breath after fail /
119
120	/*
121	* Rescue workers are used only on emergencies and shared by
122	* all cpus. Give MIN_NICE.
123	*/
124	RESCUER_NICE_LEVEL = MIN_NICE,
125	HIGHPRI_NICE_LEVEL = MIN_NICE,
126
127	WQ_NAME_LEN = `32`,
128	WORKER_ID_LEN = `10` + WQ_NAME_LEN, / "kworker/R-" + WQ_NAME_LEN /
129	};
130
/*
 * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and
 * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because
 * msecs_to_jiffies() can't be an initializer.
 */
#define BH_WORKER_JIFFIES	msecs_to_jiffies(2)
#define BH_WORKER_RESTARTS	10
138
139	/*
140	* Structure fields follow one of the following exclusion rules.
141	*
142	* I: Modifiable by initialization/destruction paths and read-only for
143	* everyone else.
144	*
145	* P: Preemption protected. Disabling preemption is enough and should
146	* only be modified and accessed from the local cpu.
147	*
148	* L: pool->lock protected. Access with pool->lock held.
149	*
150	* LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
151	* reads.
152	*
153	* K: Only modified by worker while holding pool->lock. Can be safely read by
154	* self, while holding pool->lock or from IRQ context if %current is the
155	* kworker.
156	*
157	* S: Only modified by worker self.
158	*
159	* A: wq_pool_attach_mutex protected.
160	*
161	* PL: wq_pool_mutex protected.
162	*
163	* PR: wq_pool_mutex protected for writes. RCU protected for reads.
164	*
165	* PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
166	*
167	* PWR: wq_pool_mutex and wq->mutex protected for writes. Either or
168	* RCU for reads.
169	*
170	* WQ: wq->mutex protected.
171	*
172	* WR: wq->mutex protected for writes. RCU protected for reads.
173	*
174	* WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
175	* with READ_ONCE() without locking.
176	*
177	* MD: wq_mayday_lock protected.
178	*
179	* WD: Used internally by the watchdog.
180	*/
181
182	/* struct worker is defined in workqueue_internal.h */
183
184	struct worker_pool {
185	raw_spinlock_t lock; / the pool lock /
186	int cpu; / I: the associated cpu /
187	int node; / I: the associated node ID /
188	int id; / I: pool ID /
189	unsigned int flags; / L: flags /
190
191	unsigned long watchdog_ts; / L: watchdog timestamp /
192	bool cpu_stall; / WD: stalled cpu bound pool /
193
194	/*
195	* The counter is incremented in a process context on the associated CPU
196	* w/ preemption disabled, and decremented or reset in the same context
197	* but w/ pool->lock held. The readers grab pool->lock and are
198	* guaranteed to see if the counter reached zero.
199	*/
200	int nr_running;
201
202	struct list_head worklist; / L: list of pending works /
203
204	int nr_workers; / L: total number of workers /
205	int nr_idle; / L: currently idle workers /
206
207	struct list_head idle_list; / L: list of idle workers /
208	struct timer_list idle_timer; / L: worker idle timeout /
209	struct work_struct idle_cull_work; / L: worker idle cleanup /
210
211	struct timer_list mayday_timer; / L: SOS timer for workers /
212
213	/ a workers is either on busy_hash or idle_list, or the manager /
214	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
215	/ L: hash of busy workers /
216
217	struct worker manager; /* L: purely informational /
218	struct list_head workers; / A: attached workers /
219
220	struct ida worker_ida; / worker IDs for task name /
221
222	struct workqueue_attrs attrs; /* I: worker attributes /
223	struct hlist_node hash_node; / PL: unbound_pool_hash node /
224	int refcnt; / PL: refcnt for unbound pools /
225	#ifdef CONFIG_PREEMPT_RT
226	spinlock_t cb_lock; / BH worker cancel lock /
227	#endif
228	/*
229	* Destruction of pool is RCU protected to allow dereferences
230	* from get_work_pool().
231	*/
232	struct rcu_head rcu;
233	};
234
/*
 * Per-pool_workqueue statistics. These can be monitored using
 * tools/workqueue/wq_monitor.py.
 */
enum pool_workqueue_stats {
	PWQ_STAT_STARTED,	/* work items started execution */
	PWQ_STAT_COMPLETED,	/* work items completed execution */
	PWQ_STAT_CPU_TIME,	/* total CPU time consumed */
	PWQ_STAT_CPU_INTENSIVE,	/* wq_cpu_intensive_thresh_us violations */
	PWQ_STAT_CM_WAKEUP,	/* concurrency-management worker wakeups */
	PWQ_STAT_REPATRIATED,	/* unbound workers brought back into scope */
	PWQ_STAT_MAYDAY,	/* maydays to rescuer */
	PWQ_STAT_RESCUED,	/* linked work items executed by rescuer */

	PWQ_NR_STATS,		/* number of counters; sizes the stats[] array */
};
251
252	/*
253	* The per-pool workqueue. While queued, bits below WORK_PWQ_SHIFT
254	* of work_struct->data are used for flags and the remaining high bits
255	* point to the pwq; thus, pwqs need to be aligned at two's power of the
256	* number of flag bits.
257	*/
258	struct pool_workqueue {
259	struct worker_pool pool; /* I: the associated pool /
260	struct workqueue_struct wq; /* I: the owning workqueue /
261	int work_color; / L: current color /
262	int flush_color; / L: flushing color /
263	int refcnt; / L: reference count /
264	int nr_in_flight[WORK_NR_COLORS];
265	/ L: nr of in_flight works /
266	bool plugged; / L: execution suspended /
267
268	/*
269	* nr_active management and WORK_STRUCT_INACTIVE:
270	*
271	* When pwq->nr_active >= max_active, new work item is queued to
272	* pwq->inactive_works instead of pool->worklist and marked with
273	* WORK_STRUCT_INACTIVE.
274	*
275	* All work items marked with WORK_STRUCT_INACTIVE do not participate in
276	* nr_active and all work items in pwq->inactive_works are marked with
277	* WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
278	* in pwq->inactive_works. Some of them are ready to run in
279	* pool->worklist or worker->scheduled. Those work itmes are only struct
280	* wq_barrier which is used for flush_work() and should not participate
281	* in nr_active. For non-barrier work item, it is marked with
282	* WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
283	*/
284	int nr_active; / L: nr of active works /
285	struct list_head inactive_works; / L: inactive works /
286	struct list_head pending_node; / LN: node on wq_node_nr_active->pending_pwqs /
287	struct list_head pwqs_node; / WR: node on wq->pwqs /
288	struct list_head mayday_node; / MD: node on wq->maydays /
289
290	u64 stats[PWQ_NR_STATS];
291
292	/*
293	* Release of unbound pwq is punted to a kthread_worker. See put_pwq()
294	* and pwq_release_workfn() for details. pool_workqueue itself is also
295	* RCU protected so that the first pwq can be determined without
296	* grabbing wq->mutex.
297	*/
298	struct kthread_work release_work;
299	struct rcu_head rcu;
300	} __aligned(`1` << WORK_STRUCT_PWQ_SHIFT);
301
302	/*
303	* Structure used to wait for workqueue flush.
304	*/
305	struct wq_flusher {
306	struct list_head list; / WQ: list of flushers /
307	int flush_color; / WQ: flush color waiting for /
308	struct completion done; / flush completion /
309	};
310
311	struct wq_device;
312
313	/*
314	* Unlike in a per-cpu workqueue where max_active limits its concurrency level
315	* on each CPU, in an unbound workqueue, max_active applies to the whole system.
316	* As sharing a single nr_active across multiple sockets can be very expensive,
317	* the counting and enforcement is per NUMA node.
318	*
319	* The following struct is used to enforce per-node max_active. When a pwq wants
320	* to start executing a work item, it should increment ->nr using
321	* tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
322	* ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
323	* and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
324	* round-robin order.
325	*/
326	struct wq_node_nr_active {
327	int max; / per-node max_active /
328	atomic_t nr; / per-node nr_active /
329	raw_spinlock_t lock; / nests inside pool locks /
330	struct list_head pending_pwqs; / LN: pwqs with inactive works /
331	};
332
333	/*
334	* The externally visible workqueue. It relays the issued work items to
335	* the appropriate worker_pool through its pool_workqueues.
336	*/
337	struct workqueue_struct {
338	struct list_head pwqs; / WR: all pwqs of this wq /
339	struct list_head list; / PR: list of all workqueues /
340
341	struct mutex mutex; / protects this wq /
342	int work_color; / WQ: current work color /
343	int flush_color; / WQ: current flush color /
344	atomic_t nr_pwqs_to_flush; / flush in progress /
345	struct wq_flusher first_flusher; /* WQ: first flusher /
346	struct list_head flusher_queue; / WQ: flush waiters /
347	struct list_head flusher_overflow; / WQ: flush overflow list /
348
349	struct list_head maydays; / MD: pwqs requesting rescue /
350	struct worker rescuer; /* MD: rescue worker /
351
352	int nr_drainers; / WQ: drain in progress /
353
354	/ See alloc_workqueue() function comment for info on min/max_active /
355	int max_active; / WO: max active works /
356	int min_active; / WO: min active works /
357	int saved_max_active; / WQ: saved max_active /
358	int saved_min_active; / WQ: saved min_active /
359
360	struct workqueue_attrs unbound_attrs; /* PW: only for unbound wqs /
361	struct pool_workqueue __rcu dfl_pwq; /* PW: only for unbound wqs /
362
363	#ifdef CONFIG_SYSFS
364	struct wq_device wq_dev; /* I: for sysfs interface /
365	#endif
366	#ifdef CONFIG_LOCKDEP
367	char *lock_name;
368	struct lock_class_key key;
369	struct lockdep_map __lockdep_map;
370	struct lockdep_map *lockdep_map;
371	#endif
372	char name[WQ_NAME_LEN]; / I: workqueue name /
373
374	/*
375	* Destruction of workqueue_struct is RCU protected to allow walking
376	* the workqueues list without grabbing wq_pool_mutex.
377	* This is used to dump all workqueues from sysrq.
378	*/
379	struct rcu_head rcu;
380
381	/ hot fields used during command issue, aligned to cacheline /
382	unsigned int flags ____cacheline_aligned; / WQ: WQ_* flags /
383	struct pool_workqueue __rcu * __percpu cpu_pwq; /* I: per-cpu pwqs /
384	struct wq_node_nr_active node_nr_active[]; /* I: per-node nr_active /
385	};
386
387	/*
388	* Each pod type describes how CPUs should be grouped for unbound workqueues.
389	* See the comment above workqueue_attrs->affn_scope.
390	*/
391	struct wq_pod_type {
392	int nr_pods; / number of pods /
393	cpumask_var_t pod_cpus; /* pod -> cpus /
394	int pod_node; /* pod -> node /
395	int cpu_pod; /* cpu -> pod /
396	};
397
398	struct work_offq_data {
399	u32 pool_id;
400	u32 disable;
401	u32 flags;
402	};
403
404	static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
405	[WQ_AFFN_DFL] = "default",
406	[WQ_AFFN_CPU] = "cpu",
407	[WQ_AFFN_SMT] = "smt",
408	[WQ_AFFN_CACHE] = "cache",
409	[WQ_AFFN_NUMA] = "numa",
410	[WQ_AFFN_SYSTEM] = "system",
411	};
412
413	/*
414	* Per-cpu work items which run for longer than the following threshold are
415	* automatically considered CPU intensive and excluded from concurrency
416	* management to prevent them from noticeably delaying other per-cpu work items.
417	* ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
418	* The actual value is initialized in wq_cpu_intensive_thresh_init().
419	*/
420	static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
421	module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, `0644`);
422	#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
423	static unsigned int wq_cpu_intensive_warning_thresh = `4`;
424	module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, `0644`);
425	#endif
426
427	/ see the comment above the definition of WQ_POWER_EFFICIENT /
428	static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
429	module_param_named(power_efficient, wq_power_efficient, bool, `0444`);
430
431	static bool wq_online; / can kworkers be created yet? /
432	static bool wq_topo_initialized __read_mostly = false;
433
434	static struct kmem_cache *pwq_cache;
435
436	static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
437	static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
438
439	/ buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion /
440	static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf;
441
442	static DEFINE_MUTEX(wq_pool_mutex); / protects pools and workqueues list /
443	static DEFINE_MUTEX(wq_pool_attach_mutex); / protects worker attach/detach /
444	static DEFINE_RAW_SPINLOCK(wq_mayday_lock); / protects wq->maydays list /
445	/ wait for manager to go away /
446	static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
447
448	static LIST_HEAD(workqueues); / PR: list of all workqueues /
449	static bool workqueue_freezing; / PL: have wqs started freezing? /
450
451	/ PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging /
452	static cpumask_var_t wq_online_cpumask;
453
454	/ PL&A: allowable cpus for unbound wqs and work items /
455	static cpumask_var_t wq_unbound_cpumask;
456
457	/ PL: user requested unbound cpumask via sysfs /
458	static cpumask_var_t wq_requested_unbound_cpumask;
459
460	/ PL: isolated cpumask to be excluded from unbound cpumask /
461	static cpumask_var_t wq_isolated_cpumask;
462
463	/ for further constrain wq_unbound_cpumask by cmdline parameter/
464	static struct cpumask wq_cmdline_cpumask __initdata;
465
466	/ CPU where unbound work was last round robin scheduled from this CPU /
467	static DEFINE_PER_CPU(int, wq_rr_cpu_last);
468
469	/*
470	* Local execution of unbound work items is no longer guaranteed. The
471	* following always forces round-robin CPU selection on unbound work items
472	* to uncover usages which depend on it.
473	*/
474	#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
475	static bool wq_debug_force_rr_cpu = true;
476	#else
477	static bool wq_debug_force_rr_cpu = false;
478	#endif
479	module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, `0644`);
480
481	/ to raise softirq for the BH worker pools on other CPUs /
482	static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works);
483
484	/ the BH worker pools /
485	static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools);
486
487	/ the per-cpu worker pools /
488	static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
489
490	static DEFINE_IDR(worker_pool_idr); / PR: idr of all pools /
491
492	/ PL: hash of all unbound pools keyed by pool->attrs /
493	static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
494
495	/ I: attributes used when instantiating standard unbound pools on demand /
496	static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
497
498	/ I: attributes used when instantiating ordered pools on demand /
499	static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
500
501	/*
502	* I: kthread_worker to release pwq's. pwq release needs to be bounced to a
503	* process context while holding a pool lock. Bounce to a dedicated kthread
504	* worker to avoid A-A deadlocks.
505	*/
506	static struct kthread_worker *pwq_release_worker __ro_after_init;
507
508	struct workqueue_struct *system_wq __ro_after_init;
509	EXPORT_SYMBOL(system_wq);
510	struct workqueue_struct *system_percpu_wq __ro_after_init;
511	EXPORT_SYMBOL(system_percpu_wq);
512	struct workqueue_struct *system_highpri_wq __ro_after_init;
513	EXPORT_SYMBOL_GPL(system_highpri_wq);
514	struct workqueue_struct *system_long_wq __ro_after_init;
515	EXPORT_SYMBOL_GPL(system_long_wq);
516	struct workqueue_struct *system_unbound_wq __ro_after_init;
517	EXPORT_SYMBOL_GPL(system_unbound_wq);
518	struct workqueue_struct *system_dfl_wq __ro_after_init;
519	EXPORT_SYMBOL_GPL(system_dfl_wq);
520	struct workqueue_struct *system_freezable_wq __ro_after_init;
521	EXPORT_SYMBOL_GPL(system_freezable_wq);
522	struct workqueue_struct *system_power_efficient_wq __ro_after_init;
523	EXPORT_SYMBOL_GPL(system_power_efficient_wq);
524	struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
525	EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
526	struct workqueue_struct *system_bh_wq;
527	EXPORT_SYMBOL_GPL(system_bh_wq);
528	struct workqueue_struct *system_bh_highpri_wq;
529	EXPORT_SYMBOL_GPL(system_bh_highpri_wq);
530
/* forward declarations of functions defined later in this file */
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);
static void show_one_worker_pool(struct worker_pool *pool);
535
536	#define CREATE_TRACE_POINTS
537	#include <trace/events/workqueue.h>
538
/* lockdep: warn unless an RCU read lock or wq_pool_mutex is held */
#define assert_rcu_or_pool_mutex()					\
	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&			\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "RCU or wq_pool_mutex should be held")
543
/*
 * lockdep: warn unless an RCU read lock, @wq->mutex or wq_pool_mutex is
 * held.  @wq is parenthesized so that any pointer expression may be passed.
 */
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&			\
			 !lockdep_is_held(&(wq)->mutex) &&		\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "RCU, wq->mutex or wq_pool_mutex should be held")
549
/* iterate @pool over the NR_STD_WORKER_POOLS BH worker pools of @cpu */
#define for_each_bh_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(bh_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)
554
/* iterate @pool over the NR_STD_WORKER_POOLS per-cpu worker pools of @cpu */
#define for_each_cpu_worker_pool(pool, cpu)				\
	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\
	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
	     (pool)++)
559
/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * This must be called either with wq_pool_mutex held or RCU read
 * locked.  If the pool needs to be used beyond the locking in effect, the
 * caller is responsible for guaranteeing that the pool stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool(pool, pi)						\
	idr_for_each_entry(&worker_pool_idr, pool, pi)			\
		if (({ assert_rcu_or_pool_mutex(); false; })) { }	\
		else
576
/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * This must be called with wq_pool_attach_mutex held.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool_worker(worker, pool)				\
	list_for_each_entry((worker), &(pool)->workers, node)		\
		if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
		else
591
/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * This must be called either with wq->mutex held or RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The locking requirement is passed to list_for_each_entry_rcu() as its
 * lockdep condition.
 */
#define for_each_pwq(pwq, wq)						\
	list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,		\
				 lockdep_is_held(&(wq)->mutex))
607
#ifdef CONFIG_DEBUG_OBJECTS_WORK

static const struct debug_obj_descr work_debug_descr;

/* debug hint: report the work item's function pointer */
static void *work_debug_hint(void *addr)
{
	return ((struct work_struct *) addr)->func;
}

/* a work item is a static object iff WORK_STRUCT_STATIC_BIT is set in its data */
static bool work_is_static_object(void *addr)
{
	struct work_struct *work = addr;

	return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_init(work, &work_debug_descr);
		return true;
	default:
		return false;
	}
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_free(work, &work_debug_descr);
		return true;
	default:
		return false;
	}
}

static const struct debug_obj_descr work_debug_descr = {
	.name		= "work_struct",
	.debug_hint	= work_debug_hint,
	.is_static_object = work_is_static_object,
	.fixup_init	= work_fixup_init,
	.fixup_free	= work_fixup_free,
};

static inline void debug_work_activate(struct work_struct *work)
{
	debug_object_activate(work, &work_debug_descr);
}

static inline void debug_work_deactivate(struct work_struct *work)
{
	debug_object_deactivate(work, &work_debug_descr);
}

/* register @work with debugobjects; @onstack selects the on-stack variant */
void __init_work(struct work_struct *work, int onstack)
{
	if (onstack)
		debug_object_init_on_stack(work, &work_debug_descr);
	else
		debug_object_init(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

void destroy_work_on_stack(struct work_struct *work)
{
	debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

void destroy_delayed_work_on_stack(struct delayed_work *work)
{
	timer_destroy_on_stack(&work->timer);
	debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);

#else
/* debugobjects support disabled: the hooks compile to nothing */
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif
704
705	/**
706	* worker_pool_assign_id - allocate ID and assign it to @pool
707	* @pool: the pool pointer of interest
708	*
709	* Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
710	* successfully, -errno on failure.
711	*/
712	static int worker_pool_assign_id(struct worker_pool *pool)
713	{
714	int ret;
715
716	lockdep_assert_held(&wq_pool_mutex);
717
718	ret = idr_alloc(&worker_pool_idr, ptr: pool, start: `0`, WORK_OFFQ_POOL_NONE,
719	GFP_KERNEL);
720	if (ret >= `0`) {
721	pool->id = ret;
722	return `0`;
723	}
724	return ret;
725	}
726
727	static struct pool_workqueue __rcu **
728	unbound_pwq_slot(struct workqueue_struct wq, int* cpu)
729	{
730	if (cpu >= `0`)
731	return per_cpu_ptr(wq->cpu_pwq, cpu);
732	else
733	return &wq->dfl_pwq;
734	}
735
736	/ @cpu < 0 for dfl_pwq /
737	static struct pool_workqueue unbound_pwq(struct* workqueue_struct wq, int* cpu)
738	{
739	return rcu_dereference_check(*unbound_pwq_slot(wq, cpu),
740	lockdep_is_held(&wq_pool_mutex) \|\|
741	lockdep_is_held(&wq->mutex));
742	}
743
744	/**
745	* unbound_effective_cpumask - effective cpumask of an unbound workqueue
746	* @wq: workqueue of interest
747	*
748	* @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
749	* is masked with wq_unbound_cpumask to determine the effective cpumask. The
750	* default pwq is always mapped to the pool with the current effective cpumask.
751	*/
752	static struct cpumask unbound_effective_cpumask(struct* workqueue_struct *wq)
753	{
754	return unbound_pwq(wq, cpu: -`1`)->pool->attrs->__pod_cpumask;
755	}
756
757	static unsigned int work_color_to_flags(int color)
758	{
759	return color << WORK_STRUCT_COLOR_SHIFT;
760	}
761
762	static int get_work_color(unsigned long work_data)
763	{
764	return (work_data >> WORK_STRUCT_COLOR_SHIFT) &
765	((`1` << WORK_STRUCT_COLOR_BITS) - `1`);
766	}
767
768	static int work_next_color(int color)
769	{
770	return (color + `1`) % WORK_NR_COLORS;
771	}
772
773	static unsigned long pool_offq_flags(struct worker_pool *pool)
774	{
775	return (pool->flags & POOL_BH) ? WORK_OFFQ_BH : `0`;
776	}
777
778	/*
779	* While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
780	* contain the pointer to the queued pwq. Once execution starts, the flag
781	* is cleared and the high bits contain OFFQ flags and pool ID.
782	*
783	* set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling()
784	* can be used to set the pwq, pool or clear work->data. These functions should
785	* only be called while the work is owned - ie. while the PENDING bit is set.
786	*
787	* get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
788	* corresponding to a work. Pool is available once the work has been
789	* queued anywhere after initialization until it is sync canceled. pwq is
790	* available only while the work item is queued.
791	*/
792	static inline void set_work_data(struct work_struct work, unsigned* long data)
793	{
794	WARN_ON_ONCE(!work_pending(work));
795	atomic_long_set(v: &work->data, i: data \| work_static(work));
796	}
797
798	static void set_work_pwq(struct work_struct work, struct* pool_workqueue *pwq,
799	unsigned long flags)
800	{
801	set_work_data(work, data: (unsigned long)pwq \| WORK_STRUCT_PENDING \|
802	WORK_STRUCT_PWQ \| flags);
803	}
804
805	static void set_work_pool_and_keep_pending(struct work_struct *work,
806	int pool_id, unsigned long flags)
807	{
808	set_work_data(work, data: ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) \|
809	WORK_STRUCT_PENDING \| flags);
810	}
811
812	static void set_work_pool_and_clear_pending(struct work_struct *work,
813	int pool_id, unsigned long flags)
814	{
815	/*
816	* The following wmb is paired with the implied mb in
817	* test_and_set_bit(PENDING) and ensures all updates to @work made
818	* here are visible to and precede any updates by the next PENDING
819	* owner.
820	*/
821	smp_wmb();
822	set_work_data(work, data: ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) \|
823	flags);
824	/*
825	* The following mb guarantees that previous clear of a PENDING bit
826	* will not be reordered with any speculative LOADS or STORES from
827	* work->current_func, which is executed afterwards. This possible
828	* reordering can lead to a missed execution on attempt to queue
829	* the same @work. E.g. consider this case:
830	*
831	* CPU#0 CPU#1
832	* ---------------------------- --------------------------------
833	*
834	* 1 STORE event_indicated
835	* 2 queue_work_on() {
836	* 3 test_and_set_bit(PENDING)
837	* 4 } set_..._and_clear_pending() {
838	* 5 set_work_data() # clear bit
839	* 6 smp_mb()
840	* 7 work->current_func() {
841	* 8 LOAD event_indicated
842	* }
843	*
844	* Without an explicit full barrier speculative LOAD on line 8 can
845	* be executed before CPU#0 does STORE on line 1. If that happens,
846	* CPU#0 observes the PENDING bit is still set and new execution of
847	* a @work is not queued in a hope, that CPU#1 will eventually
848	* finish the queued @work. Meanwhile CPU#1 does not see
849	* event_indicated is set, because speculative LOAD was executed
850	* before actual STORE.
851	*/
852	smp_mb();
853	}
854
855	static inline struct pool_workqueue work_struct_pwq(unsigned* long data)
856	{
857	return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK);
858	}
859
860	static struct pool_workqueue get_work_pwq(struct* work_struct *work)
861	{
862	unsigned long data = atomic_long_read(v: &work->data);
863
864	if (data & WORK_STRUCT_PWQ)
865	return work_struct_pwq(data);
866	else
867	return NULL;
868	}
869
870	/**
871	* get_work_pool - return the worker_pool a given work was associated with
872	* @work: the work item of interest
873	*
874	* Pools are created and destroyed under wq_pool_mutex, and allows read
875	* access under RCU read lock. As such, this function should be
876	* called under wq_pool_mutex or inside of a rcu_read_lock() region.
877	*
878	* All fields of the returned pool are accessible as long as the above
879	* mentioned locking is in effect. If the returned pool needs to be used
880	* beyond the critical section, the caller is responsible for ensuring the
881	* returned pool is and stays online.
882	*
883	* Return: The worker_pool @work was last associated with. %NULL if none.
884	*/
885	static struct worker_pool get_work_pool(struct* work_struct *work)
886	{
887	unsigned long data = atomic_long_read(v: &work->data);
888	int pool_id;
889
890	assert_rcu_or_pool_mutex();
891
892	if (data & WORK_STRUCT_PWQ)
893	return work_struct_pwq(data)->pool;
894
895	pool_id = data >> WORK_OFFQ_POOL_SHIFT;
896	if (pool_id == WORK_OFFQ_POOL_NONE)
897	return NULL;
898
899	return idr_find(&worker_pool_idr, id: pool_id);
900	}
901
902	static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits)
903	{
904	return (v >> shift) & ((`1U` << bits) - `1`);
905	}
906
907	static void work_offqd_unpack(struct work_offq_data offqd, unsigned* long data)
908	{
909	WARN_ON_ONCE(data & WORK_STRUCT_PWQ);
910
911	offqd->pool_id = shift_and_mask(v: data, shift: WORK_OFFQ_POOL_SHIFT,
912	bits: WORK_OFFQ_POOL_BITS);
913	offqd->disable = shift_and_mask(v: data, shift: WORK_OFFQ_DISABLE_SHIFT,
914	bits: WORK_OFFQ_DISABLE_BITS);
915	offqd->flags = data & WORK_OFFQ_FLAG_MASK;
916	}
917
918	static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd)
919	{
920	return ((unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT) \|
921	((unsigned long)offqd->flags);
922	}
923
/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with pool->lock held.
 */
929
930	/*
931	* Need to wake up a worker? Called from anything but currently
932	* running workers.
933	*
934	* Note that, because unbound workers never contribute to nr_running, this
935	* function will always return %true for unbound pools as long as the
936	* worklist isn't empty.
937	*/
938	static bool need_more_worker(struct worker_pool *pool)
939	{
940	return !list_empty(head: &pool->worklist) && !pool->nr_running;
941	}
942
943	/ Can I start working? Called from busy but !running workers. /
944	static bool may_start_working(struct worker_pool *pool)
945	{
946	return pool->nr_idle;
947	}
948
949	/ Do I need to keep working? Called from currently running workers. /
950	static bool keep_working(struct worker_pool *pool)
951	{
952	return !list_empty(head: &pool->worklist) && (pool->nr_running <= `1`);
953	}
954
955	/ Do we need a new worker? Called from manager. /
956	static bool need_to_create_worker(struct worker_pool *pool)
957	{
958	return need_more_worker(pool) && !may_start_working(pool);
959	}
960
961	/ Do we have too many workers and should some go away? /
962	static bool too_many_workers(struct worker_pool *pool)
963	{
964	bool managing = pool->flags & POOL_MANAGER_ACTIVE;
965	int nr_idle = pool->nr_idle + managing; / manager is considered idle /
966	int nr_busy = pool->nr_workers - nr_idle;
967
968	return nr_idle > `2` && (nr_idle - `2`) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
969	}
970
971	/**
972	* worker_set_flags - set worker flags and adjust nr_running accordingly
973	* @worker: self
974	* @flags: flags to set
975	*
976	* Set @flags in @worker->flags and adjust nr_running accordingly.
977	*/
978	static inline void worker_set_flags(struct worker worker, unsigned* int flags)
979	{
980	struct worker_pool *pool = worker->pool;
981
982	lockdep_assert_held(&pool->lock);
983
984	/ If transitioning into NOT_RUNNING, adjust nr_running. /
985	if ((flags & WORKER_NOT_RUNNING) &&
986	!(worker->flags & WORKER_NOT_RUNNING)) {
987	pool->nr_running--;
988	}
989
990	worker->flags \|= flags;
991	}
992
993	/**
994	* worker_clr_flags - clear worker flags and adjust nr_running accordingly
995	* @worker: self
996	* @flags: flags to clear
997	*
998	* Clear @flags in @worker->flags and adjust nr_running accordingly.
999	*/
1000	static inline void worker_clr_flags(struct worker worker, unsigned* int flags)
1001	{
1002	struct worker_pool *pool = worker->pool;
1003	unsigned int oflags = worker->flags;
1004
1005	lockdep_assert_held(&pool->lock);
1006
1007	worker->flags &= ~flags;
1008
1009	/*
1010	* If transitioning out of NOT_RUNNING, increment nr_running. Note
1011	* that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
1012	* of multiple flags, not a single flag.
1013	*/
1014	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
1015	if (!(worker->flags & WORKER_NOT_RUNNING))
1016	pool->nr_running++;
1017	}
1018
1019	/ Return the first idle worker. Called with pool->lock held. /
1020	static struct worker first_idle_worker(struct* worker_pool *pool)
1021	{
1022	if (unlikely(list_empty(&pool->idle_list)))
1023	return NULL;
1024
1025	return list_first_entry(&pool->idle_list, struct worker, entry);
1026	}
1027
1028	/**
1029	* worker_enter_idle - enter idle state
1030	* @worker: worker which is entering idle state
1031	*
1032	* @worker is entering idle state. Update stats and idle timer if
1033	* necessary.
1034	*
1035	* LOCKING:
1036	* raw_spin_lock_irq(pool->lock).
1037	*/
1038	static void worker_enter_idle(struct worker *worker)
1039	{
1040	struct worker_pool *pool = worker->pool;
1041
1042	if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) \|\|
1043	WARN_ON_ONCE(!list_empty(&worker->entry) &&
1044	(worker->hentry.next \|\| worker->hentry.pprev)))
1045	return;
1046
1047	/ can't use worker_set_flags(), also called from create_worker() /
1048	worker->flags \|= WORKER_IDLE;
1049	pool->nr_idle++;
1050	worker->last_active = jiffies;
1051
1052	/ idle_list is LIFO /
1053	list_add(new: &worker->entry, head: &pool->idle_list);
1054
1055	if (too_many_workers(pool) && !timer_pending(timer: &pool->idle_timer))
1056	mod_timer(timer: &pool->idle_timer, expires: jiffies + IDLE_WORKER_TIMEOUT);
1057
1058	/ Sanity check nr_running. /
1059	WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
1060	}
1061
1062	/**
1063	* worker_leave_idle - leave idle state
1064	* @worker: worker which is leaving idle state
1065	*
1066	* @worker is leaving idle state. Update stats.
1067	*
1068	* LOCKING:
1069	* raw_spin_lock_irq(pool->lock).
1070	*/
1071	static void worker_leave_idle(struct worker *worker)
1072	{
1073	struct worker_pool *pool = worker->pool;
1074
1075	if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
1076	return;
1077	worker_clr_flags(worker, flags: WORKER_IDLE);
1078	pool->nr_idle--;
1079	list_del_init(entry: &worker->entry);
1080	}
1081
1082	/**
1083	* find_worker_executing_work - find worker which is executing a work
1084	* @pool: pool of interest
1085	* @work: work to find worker for
1086	*
1087	* Find a worker which is executing @work on @pool by searching
1088	* @pool->busy_hash which is keyed by the address of @work. For a worker
1089	* to match, its current execution should match the address of @work and
1090	* its work function. This is to avoid unwanted dependency between
1091	* unrelated work executions through a work item being recycled while still
1092	* being executed.
1093	*
1094	* This is a bit tricky. A work item may be freed once its execution
1095	* starts and nothing prevents the freed area from being recycled for
1096	* another work item. If the same work item address ends up being reused
1097	* before the original execution finishes, workqueue will identify the
1098	* recycled work item as currently executing and make it wait until the
1099	* current execution finishes, introducing an unwanted dependency.
1100	*
1101	* This function checks the work item address and work function to avoid
1102	* false positives. Note that this isn't complete as one may construct a
1103	* work function which can introduce dependency onto itself through a
1104	* recycled work item. Well, if somebody wants to shoot oneself in the
1105	* foot that badly, there's only so much we can do, and if such deadlock
1106	* actually occurs, it should be easy to locate the culprit work function.
1107	*
1108	* CONTEXT:
1109	* raw_spin_lock_irq(pool->lock).
1110	*
1111	* Return:
1112	* Pointer to worker which is executing @work if found, %NULL
1113	* otherwise.
1114	*/
1115	static struct worker find_worker_executing_work(struct* worker_pool *pool,
1116	struct work_struct *work)
1117	{
1118	struct worker *worker;
1119
1120	hash_for_each_possible(pool->busy_hash, worker, hentry,
1121	(unsigned long)work)
1122	if (worker->current_work == work &&
1123	worker->current_func == work->func)
1124	return worker;
1125
1126	return NULL;
1127	}
1128
1129	/**
1130	* move_linked_works - move linked works to a list
1131	* @work: start of series of works to be scheduled
1132	* @head: target list to append @work to
1133	* @nextp: out parameter for nested worklist walking
1134	*
1135	* Schedule linked works starting from @work to @head. Work series to be
1136	* scheduled starts at @work and includes any consecutive work with
1137	* WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on
1138	* @nextp.
1139	*
1140	* CONTEXT:
1141	* raw_spin_lock_irq(pool->lock).
1142	*/
1143	static void move_linked_works(struct work_struct work, struct* list_head *head,
1144	struct work_struct **nextp)
1145	{
1146	struct work_struct *n;
1147
1148	/*
1149	* Linked worklist will always end before the end of the list,
1150	* use NULL for list head.
1151	*/
1152	list_for_each_entry_safe_from(work, n, NULL, entry) {
1153	list_move_tail(list: &work->entry, head);
1154	if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1155	break;
1156	}
1157
1158	/*
1159	* If we're already inside safe list traversal and have moved
1160	* multiple works to the scheduled queue, the next position
1161	* needs to be updated.
1162	*/
1163	if (nextp)
1164	*nextp = n;
1165	}
1166
1167	/**
1168	* assign_work - assign a work item and its linked work items to a worker
1169	* @work: work to assign
1170	* @worker: worker to assign to
1171	* @nextp: out parameter for nested worklist walking
1172	*
1173	* Assign @work and its linked work items to @worker. If @work is already being
1174	* executed by another worker in the same pool, it'll be punted there.
1175	*
1176	* If @nextp is not NULL, it's updated to point to the next work of the last
1177	* scheduled work. This allows assign_work() to be nested inside
1178	* list_for_each_entry_safe().
1179	*
1180	* Returns %true if @work was successfully assigned to @worker. %false if @work
1181	* was punted to another worker already executing it.
1182	*/
1183	static bool assign_work(struct work_struct work, struct* worker *worker,
1184	struct work_struct **nextp)
1185	{
1186	struct worker_pool *pool = worker->pool;
1187	struct worker *collision;
1188
1189	lockdep_assert_held(&pool->lock);
1190
1191	/*
1192	* A single work shouldn't be executed concurrently by multiple workers.
1193	* __queue_work() ensures that @work doesn't jump to a different pool
1194	* while still running in the previous pool. Here, we should ensure that
1195	* @work is not executed concurrently by multiple workers from the same
1196	* pool. Check whether anyone is already processing the work. If so,
1197	* defer the work to the currently executing one.
1198	*/
1199	collision = find_worker_executing_work(pool, work);
1200	if (unlikely(collision)) {
1201	move_linked_works(work, head: &collision->scheduled, nextp);
1202	return false;
1203	}
1204
1205	move_linked_works(work, head: &worker->scheduled, nextp);
1206	return true;
1207	}
1208
1209	static struct irq_work bh_pool_irq_work(struct* worker_pool *pool)
1210	{
1211	int high = pool->attrs->nice == HIGHPRI_NICE_LEVEL ? `1` : `0`;
1212
1213	return &per_cpu(bh_pool_irq_works, pool->cpu)[high];
1214	}
1215
1216	static void kick_bh_pool(struct worker_pool *pool)
1217	{
1218	#ifdef CONFIG_SMP
1219	/ see drain_dead_softirq_workfn() for BH_DRAINING /
1220	if (unlikely(pool->cpu != smp_processor_id() &&
1221	!(pool->flags & POOL_BH_DRAINING))) {
1222	irq_work_queue_on(work: bh_pool_irq_work(pool), cpu: pool->cpu);
1223	return;
1224	}
1225	#endif
1226	if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
1227	raise_softirq_irqoff(nr: HI_SOFTIRQ);
1228	else
1229	raise_softirq_irqoff(nr: TASKLET_SOFTIRQ);
1230	}
1231
1232	/**
1233	* kick_pool - wake up an idle worker if necessary
1234	* @pool: pool to kick
1235	*
1236	* @pool may have pending work items. Wake up worker if necessary. Returns
1237	* whether a worker was woken up.
1238	*/
1239	static bool kick_pool(struct worker_pool *pool)
1240	{
1241	struct worker *worker = first_idle_worker(pool);
1242	struct task_struct *p;
1243
1244	lockdep_assert_held(&pool->lock);
1245
1246	if (!need_more_worker(pool) \|\| !worker)
1247	return false;
1248
1249	if (pool->flags & POOL_BH) {
1250	kick_bh_pool(pool);
1251	return true;
1252	}
1253
1254	p = worker->task;
1255
1256	#ifdef CONFIG_SMP
1257	/*
1258	* Idle @worker is about to execute @work and waking up provides an
1259	* opportunity to migrate @worker at a lower cost by setting the task's
1260	* wake_cpu field. Let's see if we want to move @worker to improve
1261	* execution locality.
1262	*
1263	* We're waking the worker that went idle the latest and there's some
1264	* chance that @worker is marked idle but hasn't gone off CPU yet. If
1265	* so, setting the wake_cpu won't do anything. As this is a best-effort
1266	* optimization and the race window is narrow, let's leave as-is for
1267	* now. If this becomes pronounced, we can skip over workers which are
1268	* still on cpu when picking an idle worker.
1269	*
1270	* If @pool has non-strict affinity, @worker might have ended up outside
1271	* its affinity scope. Repatriate.
1272	*/
1273	if (!pool->attrs->affn_strict &&
1274	!cpumask_test_cpu(cpu: p->wake_cpu, cpumask: pool->attrs->__pod_cpumask)) {
1275	struct work_struct *work = list_first_entry(&pool->worklist,
1276	struct work_struct, entry);
1277	int wake_cpu = cpumask_any_and_distribute(src1p: pool->attrs->__pod_cpumask,
1278	cpu_online_mask);
1279	if (wake_cpu < nr_cpu_ids) {
1280	p->wake_cpu = wake_cpu;
1281	get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
1282	}
1283	}
1284	#endif
1285	wake_up_process(tsk: p);
1286	return true;
1287	}
1288
1289	#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
1290
1291	/*
1292	* Concurrency-managed per-cpu work items that hog CPU for longer than
1293	* wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
1294	* which prevents them from stalling other concurrency-managed work items. If a
1295	* work function keeps triggering this mechanism, it's likely that the work item
1296	* should be using an unbound workqueue instead.
1297	*
1298	* wq_cpu_intensive_report() tracks work functions which trigger such conditions
1299	* and report them so that they can be examined and converted to use unbound
1300	* workqueues as appropriate. To avoid flooding the console, each violating work
1301	* function is tracked and reported with exponential backoff.
1302	*/
1303	#define WCI_MAX_ENTS 128
1304
1305	struct wci_ent {
1306	work_func_t func;
1307	atomic64_t cnt;
1308	struct hlist_node hash_node;
1309	};
1310
1311	static struct wci_ent wci_ents[WCI_MAX_ENTS];
1312	static int wci_nr_ents;
1313	static DEFINE_RAW_SPINLOCK(wci_lock);
1314	static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));
1315
1316	static struct wci_ent *wci_find_ent(work_func_t func)
1317	{
1318	struct wci_ent *ent;
1319
1320	hash_for_each_possible_rcu(wci_hash, ent, hash_node,
1321	(unsigned long)func) {
1322	if (ent->func == func)
1323	return ent;
1324	}
1325	return NULL;
1326	}
1327
1328	static void wq_cpu_intensive_report(work_func_t func)
1329	{
1330	struct wci_ent *ent;
1331
1332	restart:
1333	ent = wci_find_ent(func);
1334	if (ent) {
1335	u64 cnt;
1336
1337	/*
1338	* Start reporting from the warning_thresh and back off
1339	* exponentially.
1340	*/
1341	cnt = atomic64_inc_return_relaxed(&ent->cnt);
1342	if (wq_cpu_intensive_warning_thresh &&
1343	cnt >= wq_cpu_intensive_warning_thresh &&
1344	is_power_of_2(cnt + `1` - wq_cpu_intensive_warning_thresh))
1345	printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
1346	ent->func, wq_cpu_intensive_thresh_us,
1347	atomic64_read(&ent->cnt));
1348	return;
1349	}
1350
1351	/*
1352	* @func is a new violation. Allocate a new entry for it. If wcn_ents[]
1353	* is exhausted, something went really wrong and we probably made enough
1354	* noise already.
1355	*/
1356	if (wci_nr_ents >= WCI_MAX_ENTS)
1357	return;
1358
1359	raw_spin_lock(&wci_lock);
1360
1361	if (wci_nr_ents >= WCI_MAX_ENTS) {
1362	raw_spin_unlock(&wci_lock);
1363	return;
1364	}
1365
1366	if (wci_find_ent(func)) {
1367	raw_spin_unlock(&wci_lock);
1368	goto restart;
1369	}
1370
1371	ent = &wci_ents[wci_nr_ents++];
1372	ent->func = func;
1373	atomic64_set(&ent->cnt, `0`);
1374	hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);
1375
1376	raw_spin_unlock(&wci_lock);
1377
1378	goto restart;
1379	}
1380
1381	#else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
1382	static void wq_cpu_intensive_report(work_func_t func) {}
1383	#endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
1384
1385	/**
1386	* wq_worker_running - a worker is running again
1387	* @task: task waking up
1388	*
1389	* This function is called when a worker returns from schedule()
1390	*/
1391	void wq_worker_running(struct task_struct *task)
1392	{
1393	struct worker *worker = kthread_data(k: task);
1394
1395	if (!READ_ONCE(worker->sleeping))
1396	return;
1397
1398	/*
1399	* If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
1400	* and the nr_running increment below, we may ruin the nr_running reset
1401	* and leave with an unexpected pool->nr_running == 1 on the newly unbound
1402	* pool. Protect against such race.
1403	*/
1404	preempt_disable();
1405	if (!(worker->flags & WORKER_NOT_RUNNING))
1406	worker->pool->nr_running++;
1407	preempt_enable();
1408
1409	/*
1410	* CPU intensive auto-detection cares about how long a work item hogged
1411	* CPU without sleeping. Reset the starting timestamp on wakeup.
1412	*/
1413	worker->current_at = worker->task->se.sum_exec_runtime;
1414
1415	WRITE_ONCE(worker->sleeping, `0`);
1416	}
1417
1418	/**
1419	* wq_worker_sleeping - a worker is going to sleep
1420	* @task: task going to sleep
1421	*
1422	* This function is called from schedule() when a busy worker is
1423	* going to sleep.
1424	*/
1425	void wq_worker_sleeping(struct task_struct *task)
1426	{
1427	struct worker *worker = kthread_data(k: task);
1428	struct worker_pool *pool;
1429
1430	/*
1431	* Rescuers, which may not have all the fields set up like normal
1432	* workers, also reach here, let's not access anything before
1433	* checking NOT_RUNNING.
1434	*/
1435	if (worker->flags & WORKER_NOT_RUNNING)
1436	return;
1437
1438	pool = worker->pool;
1439
1440	/ Return if preempted before wq_worker_running() was reached /
1441	if (READ_ONCE(worker->sleeping))
1442	return;
1443
1444	WRITE_ONCE(worker->sleeping, `1`);
1445	raw_spin_lock_irq(&pool->lock);
1446
1447	/*
1448	* Recheck in case unbind_workers() preempted us. We don't
1449	* want to decrement nr_running after the worker is unbound
1450	* and nr_running has been reset.
1451	*/
1452	if (worker->flags & WORKER_NOT_RUNNING) {
1453	raw_spin_unlock_irq(&pool->lock);
1454	return;
1455	}
1456
1457	pool->nr_running--;
1458	if (kick_pool(pool))
1459	worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
1460
1461	raw_spin_unlock_irq(&pool->lock);
1462	}
1463
1464	/**
1465	* wq_worker_tick - a scheduler tick occurred while a kworker is running
1466	* @task: task currently running
1467	*
1468	* Called from sched_tick(). We're in the IRQ context and the current
1469	* worker's fields which follow the 'K' locking rule can be accessed safely.
1470	*/
1471	void wq_worker_tick(struct task_struct *task)
1472	{
1473	struct worker *worker = kthread_data(k: task);
1474	struct pool_workqueue *pwq = worker->current_pwq;
1475	struct worker_pool *pool = worker->pool;
1476
1477	if (!pwq)
1478	return;
1479
1480	pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;
1481
1482	if (!wq_cpu_intensive_thresh_us)
1483	return;
1484
1485	/*
1486	* If the current worker is concurrency managed and hogged the CPU for
1487	* longer than wq_cpu_intensive_thresh_us, it's automatically marked
1488	* CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
1489	*
1490	* Set @worker->sleeping means that @worker is in the process of
1491	* switching out voluntarily and won't be contributing to
1492	* @pool->nr_running until it wakes up. As wq_worker_sleeping() also
1493	* decrements ->nr_running, setting CPU_INTENSIVE here can lead to
1494	* double decrements. The task is releasing the CPU anyway. Let's skip.
1495	* We probably want to make this prettier in the future.
1496	*/
1497	if ((worker->flags & WORKER_NOT_RUNNING) \|\| READ_ONCE(worker->sleeping) \|\|
1498	worker->task->se.sum_exec_runtime - worker->current_at <
1499	wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
1500	return;
1501
1502	raw_spin_lock(&pool->lock);
1503
1504	worker_set_flags(worker, flags: WORKER_CPU_INTENSIVE);
1505	wq_cpu_intensive_report(func: worker->current_func);
1506	pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
1507
1508	if (kick_pool(pool))
1509	pwq->stats[PWQ_STAT_CM_WAKEUP]++;
1510
1511	raw_spin_unlock(&pool->lock);
1512	}
1513
1514	/**
1515	* wq_worker_last_func - retrieve worker's last work function
1516	* @task: Task to retrieve last work function of.
1517	*
1518	* Determine the last function a worker executed. This is called from
1519	* the scheduler to get a worker's last known identity.
1520	*
1521	* CONTEXT:
1522	* raw_spin_lock_irq(rq->lock)
1523	*
1524	* This function is called during schedule() when a kworker is going
1525	* to sleep. It's used by psi to identify aggregation workers during
1526	* dequeuing, to allow periodic aggregation to shut-off when that
1527	* worker is the last task in the system or cgroup to go to sleep.
1528	*
1529	* As this function doesn't involve any workqueue-related locking, it
1530	* only returns stable values when called from inside the scheduler's
1531	* queuing and dequeuing paths, when @task, which must be a kworker,
1532	* is guaranteed to not be processing any works.
1533	*
1534	* Return:
1535	* The last work function %current executed as a worker, NULL if it
1536	* hasn't executed any work yet.
1537	*/
1538	work_func_t wq_worker_last_func(struct task_struct *task)
1539	{
1540	struct worker *worker = kthread_data(k: task);
1541
1542	return worker->last_func;
1543	}
1544
1545	/**
1546	* wq_node_nr_active - Determine wq_node_nr_active to use
1547	* @wq: workqueue of interest
1548	* @node: NUMA node, can be %NUMA_NO_NODE
1549	*
1550	* Determine wq_node_nr_active to use for @wq on @node. Returns:
1551	*
1552	* - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
1553	*
1554	* - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
1555	*
1556	* - Otherwise, node_nr_active[@node].
1557	*/
1558	static struct wq_node_nr_active wq_node_nr_active(struct* workqueue_struct *wq,
1559	int node)
1560	{
1561	if (!(wq->flags & WQ_UNBOUND))
1562	return NULL;
1563
1564	if (node == NUMA_NO_NODE)
1565	node = nr_node_ids;
1566
1567	return wq->node_nr_active[node];
1568	}
1569
1570	/**
1571	* wq_update_node_max_active - Update per-node max_actives to use
1572	* @wq: workqueue to update
1573	* @off_cpu: CPU that's going down, -1 if a CPU is not going down
1574	*
1575	* Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is
1576	* distributed among nodes according to the proportions of numbers of online
1577	* cpus. The result is always between @wq->min_active and max_active.
1578	*/
1579	static void wq_update_node_max_active(struct workqueue_struct wq, int* off_cpu)
1580	{
1581	struct cpumask *effective = unbound_effective_cpumask(wq);
1582	int min_active = READ_ONCE(wq->min_active);
1583	int max_active = READ_ONCE(wq->max_active);
1584	int total_cpus, node;
1585
1586	lockdep_assert_held(&wq->mutex);
1587
1588	if (!wq_topo_initialized)
1589	return;
1590
1591	if (off_cpu >= `0` && !cpumask_test_cpu(cpu: off_cpu, cpumask: effective))
1592	off_cpu = -`1`;
1593
1594	total_cpus = cpumask_weight_and(srcp1: effective, cpu_online_mask);
1595	if (off_cpu >= `0`)
1596	total_cpus--;
1597
1598	/ If all CPUs of the wq get offline, use the default values /
1599	if (unlikely(!total_cpus)) {
1600	for_each_node(node)
1601	wq_node_nr_active(wq, node)->max = min_active;
1602
1603	wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
1604	return;
1605	}
1606
1607	for_each_node(node) {
1608	int node_cpus;
1609
1610	node_cpus = cpumask_weight_and(srcp1: effective, srcp2: cpumask_of_node(node));
1611	if (off_cpu >= `0` && cpu_to_node(cpu: off_cpu) == node)
1612	node_cpus--;
1613
1614	wq_node_nr_active(wq, node)->max =
1615	clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus),
1616	min_active, max_active);
1617	}
1618
1619	wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
1620	}
1621
1622	/**
1623	* get_pwq - get an extra reference on the specified pool_workqueue
1624	* @pwq: pool_workqueue to get
1625	*
1626	* Obtain an extra reference on @pwq. The caller should guarantee that
1627	* @pwq has positive refcnt and be holding the matching pool->lock.
1628	*/
1629	static void get_pwq(struct pool_workqueue *pwq)
1630	{
1631	lockdep_assert_held(&pwq->pool->lock);
1632	WARN_ON_ONCE(pwq->refcnt <= `0`);
1633	pwq->refcnt++;
1634	}
1635
1636	/**
1637	* put_pwq - put a pool_workqueue reference
1638	* @pwq: pool_workqueue to put
1639	*
1640	* Drop a reference of @pwq. If its refcnt reaches zero, schedule its
1641	* destruction. The caller should be holding the matching pool->lock.
1642	*/
1643	static void put_pwq(struct pool_workqueue *pwq)
1644	{
1645	lockdep_assert_held(&pwq->pool->lock);
1646	if (likely(--pwq->refcnt))
1647	return;
1648	/*
1649	* @pwq can't be released under pool->lock, bounce to a dedicated
1650	* kthread_worker to avoid A-A deadlocks.
1651	*/
1652	kthread_queue_work(worker: pwq_release_worker, work: &pwq->release_work);
1653	}
1654
1655	/**
1656	* put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
1657	* @pwq: pool_workqueue to put (can be %NULL)
1658	*
1659	* put_pwq() with locking. This function also allows %NULL @pwq.
1660	*/
1661	static void put_pwq_unlocked(struct pool_workqueue *pwq)
1662	{
1663	if (pwq) {
1664	/*
1665	* As both pwqs and pools are RCU protected, the
1666	* following lock operations are safe.
1667	*/
1668	raw_spin_lock_irq(&pwq->pool->lock);
1669	put_pwq(pwq);
1670	raw_spin_unlock_irq(&pwq->pool->lock);
1671	}
1672	}
1673
1674	static bool pwq_is_empty(struct pool_workqueue *pwq)
1675	{
1676	return !pwq->nr_active && list_empty(head: &pwq->inactive_works);
1677	}
1678
1679	static void __pwq_activate_work(struct pool_workqueue *pwq,
1680	struct work_struct *work)
1681	{
1682	unsigned long *wdb = work_data_bits(work);
1683
1684	WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
1685	trace_workqueue_activate_work(work);
1686	if (list_empty(head: &pwq->pool->worklist))
1687	pwq->pool->watchdog_ts = jiffies;
1688	move_linked_works(work, head: &pwq->pool->worklist, NULL);
1689	__clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
1690	}
1691
1692	static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
1693	{
1694	int max = READ_ONCE(nna->max);
1695	int old = atomic_read(v: &nna->nr);
1696
1697	do {
1698	if (old >= max)
1699	return false;
1700	} while (!atomic_try_cmpxchg_relaxed(v: &nna->nr, old: &old, new: old + `1`));
1701
1702	return true;
1703	}
1704
1705	/**
1706	* pwq_tryinc_nr_active - Try to increment nr_active for a pwq
1707	* @pwq: pool_workqueue of interest
1708	* @fill: max_active may have increased, try to increase concurrency level
1709	*
1710	* Try to increment nr_active for @pwq. Returns %true if an nr_active count is
1711	* successfully obtained. %false otherwise.
1712	*/
1713	static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
1714	{
1715	struct workqueue_struct *wq = pwq->wq;
1716	struct worker_pool *pool = pwq->pool;
1717	struct wq_node_nr_active *nna = wq_node_nr_active(wq, node: pool->node);
1718	bool obtained = false;
1719
1720	lockdep_assert_held(&pool->lock);
1721
1722	if (!nna) {
1723	/ BH or per-cpu workqueue, pwq->nr_active is sufficient /
1724	obtained = pwq->nr_active < READ_ONCE(wq->max_active);
1725	goto out;
1726	}
1727
1728	if (unlikely(pwq->plugged))
1729	return false;
1730
1731	/*
1732	* Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
1733	* already waiting on $nna, pwq_dec_nr_active() will maintain the
1734	* concurrency level. Don't jump the line.
1735	*
1736	* We need to ignore the pending test after max_active has increased as
1737	* pwq_dec_nr_active() can only maintain the concurrency level but not
1738	* increase it. This is indicated by @fill.
1739	*/
1740	if (!list_empty(head: &pwq->pending_node) && likely(!fill))
1741	goto out;
1742
1743	obtained = tryinc_node_nr_active(nna);
1744	if (obtained)
1745	goto out;
1746
1747	/*
1748	* Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
1749	* and try again. The smp_mb() is paired with the implied memory barrier
1750	* of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
1751	* we see the decremented $nna->nr or they see non-empty
1752	* $nna->pending_pwqs.
1753	*/
1754	raw_spin_lock(&nna->lock);
1755
1756	if (list_empty(head: &pwq->pending_node))
1757	list_add_tail(new: &pwq->pending_node, head: &nna->pending_pwqs);
1758	else if (likely(!fill))
1759	goto out_unlock;
1760
1761	smp_mb();
1762
1763	obtained = tryinc_node_nr_active(nna);
1764
1765	/*
1766	* If @fill, @pwq might have already been pending. Being spuriously
1767	* pending in cold paths doesn't affect anything. Let's leave it be.
1768	*/
1769	if (obtained && likely(!fill))
1770	list_del_init(entry: &pwq->pending_node);
1771
1772	out_unlock:
1773	raw_spin_unlock(&nna->lock);
1774	out:
1775	if (obtained)
1776	pwq->nr_active++;
1777	return obtained;
1778	}
1779
1780	/**
1781	* pwq_activate_first_inactive - Activate the first inactive work item on a pwq
1782	* @pwq: pool_workqueue of interest
1783	* @fill: max_active may have increased, try to increase concurrency level
1784	*
1785	* Activate the first inactive work item of @pwq if available and allowed by
1786	* max_active limit.
1787	*
1788	* Returns %true if an inactive work item has been activated. %false if no
1789	* inactive work item is found or max_active limit is reached.
1790	*/
1791	static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
1792	{
1793	struct work_struct *work =
1794	list_first_entry_or_null(&pwq->inactive_works,
1795	struct work_struct, entry);
1796
1797	if (work && pwq_tryinc_nr_active(pwq, fill)) {
1798	__pwq_activate_work(pwq, work);
1799	return true;
1800	} else {
1801	return false;
1802	}
1803	}
1804
1805	/**
1806	* unplug_oldest_pwq - unplug the oldest pool_workqueue
1807	* @wq: workqueue_struct where its oldest pwq is to be unplugged
1808	*
1809	* This function should only be called for ordered workqueues where only the
1810	* oldest pwq is unplugged, the others are plugged to suspend execution to
1811	* ensure proper work item ordering::
1812	*
1813	* dfl_pwq --------------+ [P] - plugged
1814	* \|
1815	* v
1816	* pwqs -> A -> B [P] -> C [P] (newest)
1817	* \| \| \|
1818	* 1 3 5
1819	* \| \| \|
1820	* 2 4 6
1821	*
1822	* When the oldest pwq is drained and removed, this function should be called
1823	* to unplug the next oldest one to start its work item execution. Note that
1824	* pwq's are linked into wq->pwqs with the oldest first, so the first one in
1825	* the list is the oldest.
1826	*/
1827	static void unplug_oldest_pwq(struct workqueue_struct *wq)
1828	{
1829	struct pool_workqueue *pwq;
1830
1831	lockdep_assert_held(&wq->mutex);
1832
1833	/ Caller should make sure that pwqs isn't empty before calling /
1834	pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue,
1835	pwqs_node);
1836	raw_spin_lock_irq(&pwq->pool->lock);
1837	if (pwq->plugged) {
1838	pwq->plugged = false;
1839	if (pwq_activate_first_inactive(pwq, fill: true))
1840	kick_pool(pool: pwq->pool);
1841	}
1842	raw_spin_unlock_irq(&pwq->pool->lock);
1843	}
1844
1845	/**
1846	* node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
1847	* @nna: wq_node_nr_active to activate a pending pwq for
1848	* @caller_pool: worker_pool the caller is locking
1849	*
1850	* Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
1851	* @caller_pool may be unlocked and relocked to lock other worker_pools.
1852	*/
1853	static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
1854	struct worker_pool *caller_pool)
1855	{
1856	struct worker_pool *locked_pool = caller_pool;
1857	struct pool_workqueue *pwq;
1858	struct work_struct *work;
1859
1860	lockdep_assert_held(&caller_pool->lock);
1861
1862	raw_spin_lock(&nna->lock);
1863	retry:
1864	pwq = list_first_entry_or_null(&nna->pending_pwqs,
1865	struct pool_workqueue, pending_node);
1866	if (!pwq)
1867	goto out_unlock;
1868
1869	/*
1870	* If @pwq is for a different pool than @locked_pool, we need to lock
1871	* @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
1872	* / lock dance. For that, we also need to release @nna->lock as it's
1873	* nested inside pool locks.
1874	*/
1875	if (pwq->pool != locked_pool) {
1876	raw_spin_unlock(&locked_pool->lock);
1877	locked_pool = pwq->pool;
1878	if (!raw_spin_trylock(&locked_pool->lock)) {
1879	raw_spin_unlock(&nna->lock);
1880	raw_spin_lock(&locked_pool->lock);
1881	raw_spin_lock(&nna->lock);
1882	goto retry;
1883	}
1884	}
1885
1886	/*
1887	* $pwq may not have any inactive work items due to e.g. cancellations.
1888	* Drop it from pending_pwqs and see if there's another one.
1889	*/
1890	work = list_first_entry_or_null(&pwq->inactive_works,
1891	struct work_struct, entry);
1892	if (!work) {
1893	list_del_init(entry: &pwq->pending_node);
1894	goto retry;
1895	}
1896
1897	/*
1898	* Acquire an nr_active count and activate the inactive work item. If
1899	* $pwq still has inactive work items, rotate it to the end of the
1900	* pending_pwqs so that we round-robin through them. This means that
1901	* inactive work items are not activated in queueing order which is fine
1902	* given that there has never been any ordering across different pwqs.
1903	*/
1904	if (likely(tryinc_node_nr_active(nna))) {
1905	pwq->nr_active++;
1906	__pwq_activate_work(pwq, work);
1907
1908	if (list_empty(head: &pwq->inactive_works))
1909	list_del_init(entry: &pwq->pending_node);
1910	else
1911	list_move_tail(list: &pwq->pending_node, head: &nna->pending_pwqs);
1912
1913	/ if activating a foreign pool, make sure it's running /
1914	if (pwq->pool != caller_pool)
1915	kick_pool(pool: pwq->pool);
1916	}
1917
1918	out_unlock:
1919	raw_spin_unlock(&nna->lock);
1920	if (locked_pool != caller_pool) {
1921	raw_spin_unlock(&locked_pool->lock);
1922	raw_spin_lock(&caller_pool->lock);
1923	}
1924	}
1925
1926	/**
1927	* pwq_dec_nr_active - Retire an active count
1928	* @pwq: pool_workqueue of interest
1929	*
1930	* Decrement @pwq's nr_active and try to activate the first inactive work item.
1931	* For unbound workqueues, this function may temporarily drop @pwq->pool->lock.
1932	*/
1933	static void pwq_dec_nr_active(struct pool_workqueue *pwq)
1934	{
1935	struct worker_pool *pool = pwq->pool;
1936	struct wq_node_nr_active *nna = wq_node_nr_active(wq: pwq->wq, node: pool->node);
1937
1938	lockdep_assert_held(&pool->lock);
1939
1940	/*
1941	* @pwq->nr_active should be decremented for both percpu and unbound
1942	* workqueues.
1943	*/
1944	pwq->nr_active--;
1945
1946	/*
1947	* For a percpu workqueue, it's simple. Just need to kick the first
1948	* inactive work item on @pwq itself.
1949	*/
1950	if (!nna) {
1951	pwq_activate_first_inactive(pwq, fill: false);
1952	return;
1953	}
1954
1955	/*
1956	* If @pwq is for an unbound workqueue, it's more complicated because
1957	* multiple pwqs and pools may be sharing the nr_active count. When a
1958	* pwq needs to wait for an nr_active count, it puts itself on
1959	* $nna->pending_pwqs. The following atomic_dec_return()'s implied
1960	* memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to
1961	* guarantee that either we see non-empty pending_pwqs or they see
1962	* decremented $nna->nr.
1963	*
1964	* $nna->max may change as CPUs come online/offline and @pwq->wq's
1965	* max_active gets updated. However, it is guaranteed to be equal to or
1966	* larger than @pwq->wq->min_active which is above zero unless freezing.
1967	* This maintains the forward progress guarantee.
1968	*/
1969	if (atomic_dec_return(v: &nna->nr) >= READ_ONCE(nna->max))
1970	return;
1971
1972	if (!list_empty(head: &nna->pending_pwqs))
1973	node_activate_pending_pwq(nna, caller_pool: pool);
1974	}
1975
1976	/**
1977	* pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
1978	* @pwq: pwq of interest
1979	* @work_data: work_data of work which left the queue
1980	*
1981	* A work either has completed or is removed from pending queue,
1982	* decrement nr_in_flight of its pwq and handle workqueue flushing.
1983	*
1984	* NOTE:
1985	* For unbound workqueues, this function may temporarily drop @pwq->pool->lock
1986	* and thus should be called after all other state updates for the in-flight
1987	* work item is complete.
1988	*
1989	* CONTEXT:
1990	* raw_spin_lock_irq(pool->lock).
1991	*/
1992	static void pwq_dec_nr_in_flight(struct pool_workqueue pwq, unsigned* long work_data)
1993	{
1994	int color = get_work_color(work_data);
1995
1996	if (!(work_data & WORK_STRUCT_INACTIVE))
1997	pwq_dec_nr_active(pwq);
1998
1999	pwq->nr_in_flight[color]--;
2000
2001	/ is flush in progress and are we at the flushing tip? /
2002	if (likely(pwq->flush_color != color))
2003	goto out_put;
2004
2005	/ are there still in-flight works? /
2006	if (pwq->nr_in_flight[color])
2007	goto out_put;
2008
2009	/ this pwq is done, clear flush_color /
2010	pwq->flush_color = -`1`;
2011
2012	/*
2013	* If this was the last pwq, wake up the first flusher. It
2014	* will handle the rest.
2015	*/
2016	if (atomic_dec_and_test(v: &pwq->wq->nr_pwqs_to_flush))
2017	complete(&pwq->wq->first_flusher->done);
2018	out_put:
2019	put_pwq(pwq);
2020	}
2021
2022	/**
2023	* try_to_grab_pending - steal work item from worklist and disable irq
2024	* @work: work item to steal
2025	* @cflags: %WORK_CANCEL_ flags
2026	* @irq_flags: place to store irq state
2027	*
2028	* Try to grab PENDING bit of @work. This function can handle @work in any
2029	* stable state - idle, on timer or on worklist.
2030	*
2031	* Return:
2032	*
2033	* ======== ================================================================
2034	* 1 if @work was pending and we successfully stole PENDING
2035	* 0 if @work was idle and we claimed PENDING
2036	* -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
2037	* ======== ================================================================
2038	*
2039	* Note:
2040	* On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
2041	* interrupted while holding PENDING and @work off queue, irq must be
2042	* disabled on entry. This, combined with delayed_work->timer being
2043	* irqsafe, ensures that we return -EAGAIN for finite short period of time.
2044	*
2045	* On successful return, >= 0, irq is disabled and the caller is
2046	* responsible for releasing it using local_irq_restore(*@irq_flags).
2047	*
2048	* This function is safe to call from any context including IRQ handler.
2049	*/
2050	static int try_to_grab_pending(struct work_struct *work, u32 cflags,
2051	unsigned long *irq_flags)
2052	{
2053	struct worker_pool *pool;
2054	struct pool_workqueue *pwq;
2055
2056	local_irq_save(*irq_flags);
2057
2058	/ try to steal the timer if it exists /
2059	if (cflags & WORK_CANCEL_DELAYED) {
2060	struct delayed_work *dwork = to_delayed_work(work);
2061
2062	/*
2063	* dwork->timer is irqsafe. If timer_delete() fails, it's
2064	* guaranteed that the timer is not queued anywhere and not
2065	* running on the local CPU.
2066	*/
2067	if (likely(timer_delete(&dwork->timer)))
2068	return `1`;
2069	}
2070
2071	/ try to claim PENDING the normal way /
2072	if (!test_and_set_bit(nr: WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
2073	return `0`;
2074
2075	rcu_read_lock();
2076	/*
2077	* The queueing is in progress, or it is already queued. Try to
2078	* steal it from ->worklist without clearing WORK_STRUCT_PENDING.
2079	*/
2080	pool = get_work_pool(work);
2081	if (!pool)
2082	goto fail;
2083
2084	raw_spin_lock(&pool->lock);
2085	/*
2086	* work->data is guaranteed to point to pwq only while the work
2087	* item is queued on pwq->wq, and both updating work->data to point
2088	* to pwq on queueing and to pool on dequeueing are done under
2089	* pwq->pool->lock. This in turn guarantees that, if work->data
2090	* points to pwq which is associated with a locked pool, the work
2091	* item is currently queued on that pool.
2092	*/
2093	pwq = get_work_pwq(work);
2094	if (pwq && pwq->pool == pool) {
2095	unsigned long work_data = *work_data_bits(work);
2096
2097	debug_work_deactivate(work);
2098
2099	/*
2100	* A cancelable inactive work item must be in the
2101	* pwq->inactive_works since a queued barrier can't be
2102	* canceled (see the comments in insert_wq_barrier()).
2103	*
2104	* An inactive work item cannot be deleted directly because
2105	* it might have linked barrier work items which, if left
2106	* on the inactive_works list, will confuse pwq->nr_active
2107	* management later on and cause stall. Move the linked
2108	* barrier work items to the worklist when deleting the grabbed
2109	* item. Also keep WORK_STRUCT_INACTIVE in work_data, so that
2110	* it doesn't participate in nr_active management in later
2111	* pwq_dec_nr_in_flight().
2112	*/
2113	if (work_data & WORK_STRUCT_INACTIVE)
2114	move_linked_works(work, head: &pwq->pool->worklist, NULL);
2115
2116	list_del_init(entry: &work->entry);
2117
2118	/*
2119	* work->data points to pwq iff queued. Let's point to pool. As
2120	* this destroys work->data needed by the next step, stash it.
2121	*/
2122	set_work_pool_and_keep_pending(work, pool_id: pool->id,
2123	flags: pool_offq_flags(pool));
2124
2125	/ must be the last step, see the function comment /
2126	pwq_dec_nr_in_flight(pwq, work_data);
2127
2128	raw_spin_unlock(&pool->lock);
2129	rcu_read_unlock();
2130	return `1`;
2131	}
2132	raw_spin_unlock(&pool->lock);
2133	fail:
2134	rcu_read_unlock();
2135	local_irq_restore(*irq_flags);
2136	return -EAGAIN;
2137	}
2138
2139	/**
2140	* work_grab_pending - steal work item from worklist and disable irq
2141	* @work: work item to steal
2142	* @cflags: %WORK_CANCEL_ flags
2143	* @irq_flags: place to store IRQ state
2144	*
2145	* Grab PENDING bit of @work. @work can be in any stable state - idle, on timer
2146	* or on worklist.
2147	*
2148	* Can be called from any context. IRQ is disabled on return with IRQ state
2149	* stored in *@irq_flags. The caller is responsible for re-enabling it using
2150	* local_irq_restore().
2151	*
2152	* Returns %true if @work was pending. %false if idle.
2153	*/
2154	static bool work_grab_pending(struct work_struct *work, u32 cflags,
2155	unsigned long *irq_flags)
2156	{
2157	int ret;
2158
2159	while (true) {
2160	ret = try_to_grab_pending(work, cflags, irq_flags);
2161	if (ret >= `0`)
2162	return ret;
2163	cpu_relax();
2164	}
2165	}
2166
2167	/**
2168	* insert_work - insert a work into a pool
2169	* @pwq: pwq @work belongs to
2170	* @work: work to insert
2171	* @head: insertion point
2172	* @extra_flags: extra WORK_STRUCT_* flags to set
2173	*
2174	* Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
2175	* work_struct flags.
2176	*
2177	* CONTEXT:
2178	* raw_spin_lock_irq(pool->lock).
2179	*/
2180	static void insert_work(struct pool_workqueue pwq, struct* work_struct *work,
2181	struct list_head head, unsigned* int extra_flags)
2182	{
2183	debug_work_activate(work);
2184
2185	/ record the work call stack in order to print it in KASAN reports /
2186	kasan_record_aux_stack(ptr: work);
2187
2188	/ we own @work, set data and link /
2189	set_work_pwq(work, pwq, flags: extra_flags);
2190	list_add_tail(new: &work->entry, head);
2191	get_pwq(pwq);
2192	}
2193
2194	/*
2195	* Test whether @work is being queued from another work executing on the
2196	* same workqueue.
2197	*/
2198	static bool is_chained_work(struct workqueue_struct *wq)
2199	{
2200	struct worker *worker;
2201
2202	worker = current_wq_worker();
2203	/*
2204	* Return %true iff I'm a worker executing a work item on @wq. If
2205	* I'm @worker, it's safe to dereference it without locking.
2206	*/
2207	return worker && worker->current_pwq->wq == wq;
2208	}
2209
2210	/*
2211	* When queueing an unbound work item to a wq, prefer local CPU if allowed
2212	* by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to
2213	* avoid perturbing sensitive tasks.
2214	*/
2215	static int wq_select_unbound_cpu(int cpu)
2216	{
2217	int new_cpu;
2218
2219	if (likely(!wq_debug_force_rr_cpu)) {
2220	if (cpumask_test_cpu(cpu, cpumask: wq_unbound_cpumask))
2221	return cpu;
2222	} else {
2223	pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
2224	}
2225
2226	new_cpu = __this_cpu_read(wq_rr_cpu_last);
2227	new_cpu = cpumask_next_and_wrap(n: new_cpu, src1p: wq_unbound_cpumask, cpu_online_mask);
2228	if (unlikely(new_cpu >= nr_cpu_ids))
2229	return cpu;
2230	__this_cpu_write(wq_rr_cpu_last, new_cpu);
2231
2232	return new_cpu;
2233	}
2234
2235	static void __queue_work(int cpu, struct workqueue_struct *wq,
2236	struct work_struct *work)
2237	{
2238	struct pool_workqueue *pwq;
2239	struct worker_pool last_pool, pool;
2240	unsigned int work_flags;
2241	unsigned int req_cpu = cpu;
2242
2243	/*
2244	* While a work item is PENDING && off queue, a task trying to
2245	* steal the PENDING will busy-loop waiting for it to either get
2246	* queued or lose PENDING. Grabbing PENDING and queueing should
2247	* happen with IRQ disabled.
2248	*/
2249	lockdep_assert_irqs_disabled();
2250
2251	/*
2252	* For a draining wq, only works from the same workqueue are
2253	* allowed. The __WQ_DESTROYING helps to spot the issue that
2254	* queues a new work item to a wq after destroy_workqueue(wq).
2255	*/
2256	if (unlikely(wq->flags & (__WQ_DESTROYING \| __WQ_DRAINING) &&
2257	WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n",
2258	work->func, wq->name))) {
2259	return;
2260	}
2261	rcu_read_lock();
2262	retry:
2263	/ pwq which will be used unless @work is executing elsewhere /
2264	if (req_cpu == WORK_CPU_UNBOUND) {
2265	if (wq->flags & WQ_UNBOUND)
2266	cpu = wq_select_unbound_cpu(raw_smp_processor_id());
2267	else
2268	cpu = raw_smp_processor_id();
2269	}
2270
2271	pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));
2272	pool = pwq->pool;
2273
2274	/*
2275	* If @work was previously on a different pool, it might still be
2276	* running there, in which case the work needs to be queued on that
2277	* pool to guarantee non-reentrancy.
2278	*
2279	* For ordered workqueue, work items must be queued on the newest pwq
2280	* for accurate order management. Guaranteed order also guarantees
2281	* non-reentrancy. See the comments above unplug_oldest_pwq().
2282	*/
2283	last_pool = get_work_pool(work);
2284	if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) {
2285	struct worker *worker;
2286
2287	raw_spin_lock(&last_pool->lock);
2288
2289	worker = find_worker_executing_work(pool: last_pool, work);
2290
2291	if (worker && worker->current_pwq->wq == wq) {
2292	pwq = worker->current_pwq;
2293	pool = pwq->pool;
2294	WARN_ON_ONCE(pool != last_pool);
2295	} else {
2296	/ meh... not running there, queue here /
2297	raw_spin_unlock(&last_pool->lock);
2298	raw_spin_lock(&pool->lock);
2299	}
2300	} else {
2301	raw_spin_lock(&pool->lock);
2302	}
2303
2304	/*
2305	* pwq is determined and locked. For unbound pools, we could have raced
2306	* with pwq release and it could already be dead. If its refcnt is zero,
2307	* repeat pwq selection. Note that unbound pwqs never die without
2308	* another pwq replacing it in cpu_pwq or while work items are executing
2309	* on it, so the retrying is guaranteed to make forward-progress.
2310	*/
2311	if (unlikely(!pwq->refcnt)) {
2312	if (wq->flags & WQ_UNBOUND) {
2313	raw_spin_unlock(&pool->lock);
2314	cpu_relax();
2315	goto retry;
2316	}
2317	/ oops /
2318	WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
2319	wq->name, cpu);
2320	}
2321
2322	/ pwq determined, queue /
2323	trace_workqueue_queue_work(req_cpu, pwq, work);
2324
2325	if (WARN_ON(!list_empty(&work->entry)))
2326	goto out;
2327
2328	pwq->nr_in_flight[pwq->work_color]++;
2329	work_flags = work_color_to_flags(color: pwq->work_color);
2330
2331	/*
2332	* Limit the number of concurrently active work items to max_active.
2333	* @work must also queue behind existing inactive work items to maintain
2334	* ordering when max_active changes. See wq_adjust_max_active().
2335	*/
2336	if (list_empty(head: &pwq->inactive_works) && pwq_tryinc_nr_active(pwq, fill: false)) {
2337	if (list_empty(head: &pool->worklist))
2338	pool->watchdog_ts = jiffies;
2339
2340	trace_workqueue_activate_work(work);
2341	insert_work(pwq, work, head: &pool->worklist, extra_flags: work_flags);
2342	kick_pool(pool);
2343	} else {
2344	work_flags \|= WORK_STRUCT_INACTIVE;
2345	insert_work(pwq, work, head: &pwq->inactive_works, extra_flags: work_flags);
2346	}
2347
2348	out:
2349	raw_spin_unlock(&pool->lock);
2350	rcu_read_unlock();
2351	}
2352
2353	static bool clear_pending_if_disabled(struct work_struct *work)
2354	{
2355	unsigned long data = *work_data_bits(work);
2356	struct work_offq_data offqd;
2357
2358	if (likely((data & WORK_STRUCT_PWQ) \|\|
2359	!(data & WORK_OFFQ_DISABLE_MASK)))
2360	return false;
2361
2362	work_offqd_unpack(offqd: &offqd, data);
2363	set_work_pool_and_clear_pending(work, pool_id: offqd.pool_id,
2364	flags: work_offqd_pack_flags(offqd: &offqd));
2365	return true;
2366	}
2367
2368	/**
2369	* queue_work_on - queue work on specific cpu
2370	* @cpu: CPU number to execute work on
2371	* @wq: workqueue to use
2372	* @work: work to queue
2373	*
2374	* We queue the work to a specific CPU, the caller must ensure it
2375	* can't go away. Callers that fail to ensure that the specified
2376	* CPU cannot go away will execute on a randomly chosen CPU.
2377	* But note well that callers specifying a CPU that never has been
2378	* online will get a splat.
2379	*
2380	* Return: %false if @work was already on a queue, %true otherwise.
2381	*/
2382	bool queue_work_on(int cpu, struct workqueue_struct *wq,
2383	struct work_struct *work)
2384	{
2385	bool ret = false;
2386	unsigned long irq_flags;
2387
2388	local_irq_save(irq_flags);
2389
2390	if (!test_and_set_bit(nr: WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
2391	!clear_pending_if_disabled(work)) {
2392	__queue_work(cpu, wq, work);
2393	ret = true;
2394	}
2395
2396	local_irq_restore(irq_flags);
2397	return ret;
2398	}
2399	EXPORT_SYMBOL(queue_work_on);
2400
2401	/**
2402	* select_numa_node_cpu - Select a CPU based on NUMA node
2403	* @node: NUMA node ID that we want to select a CPU from
2404	*
2405	* This function will attempt to find a "random" cpu available on a given
2406	* node. If there are no CPUs available on the given node it will return
2407	* WORK_CPU_UNBOUND indicating that we should just schedule to any
2408	* available CPU if we need to schedule this work.
2409	*/
2410	static int select_numa_node_cpu(int node)
2411	{
2412	int cpu;
2413
2414	/ Delay binding to CPU if node is not valid or online /
2415	if (node < `0` \|\| node >= MAX_NUMNODES \|\| !node_online(node))
2416	return WORK_CPU_UNBOUND;
2417
2418	/ Use local node/cpu if we are already there /
2419	cpu = raw_smp_processor_id();
2420	if (node == cpu_to_node(cpu))
2421	return cpu;
2422
2423	/ Use "random" otherwise know as "first" online CPU of node /
2424	cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);
2425
2426	/ If CPU is valid return that, otherwise just defer /
2427	return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND;
2428	}
2429
2430	/**
2431	* queue_work_node - queue work on a "random" cpu for a given NUMA node
2432	* @node: NUMA node that we are targeting the work for
2433	* @wq: workqueue to use
2434	* @work: work to queue
2435	*
2436	* We queue the work to a "random" CPU within a given NUMA node. The basic
2437	* idea here is to provide a way to somehow associate work with a given
2438	* NUMA node.
2439	*
2440	* This function will only make a best effort attempt at getting this onto
2441	* the right NUMA node. If no node is requested or the requested node is
2442	* offline then we just fall back to standard queue_work behavior.
2443	*
2444	* Currently the "random" CPU ends up being the first available CPU in the
2445	* intersection of cpu_online_mask and the cpumask of the node, unless we
2446	* are running on the node. In that case we just use the current CPU.
2447	*
2448	* Return: %false if @work was already on a queue, %true otherwise.
2449	*/
2450	bool queue_work_node(int node, struct workqueue_struct *wq,
2451	struct work_struct *work)
2452	{
2453	unsigned long irq_flags;
2454	bool ret = false;
2455
2456	/*
2457	* This current implementation is specific to unbound workqueues.
2458	* Specifically we only return the first available CPU for a given
2459	* node instead of cycling through individual CPUs within the node.
2460	*
2461	* If this is used with a per-cpu workqueue then the logic in
2462	* workqueue_select_cpu_near would need to be updated to allow for
2463	* some round robin type logic.
2464	*/
2465	WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));
2466
2467	local_irq_save(irq_flags);
2468
2469	if (!test_and_set_bit(nr: WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
2470	!clear_pending_if_disabled(work)) {
2471	int cpu = select_numa_node_cpu(node);
2472
2473	__queue_work(cpu, wq, work);
2474	ret = true;
2475	}
2476
2477	local_irq_restore(irq_flags);
2478	return ret;
2479	}
2480	EXPORT_SYMBOL_GPL(queue_work_node);
2481
2482	void delayed_work_timer_fn(struct timer_list *t)
2483	{
2484	struct delayed_work *dwork = timer_container_of(dwork, t, timer);
2485
2486	/ should have been called from irqsafe timer with irq already off /
2487	__queue_work(cpu: dwork->cpu, wq: dwork->wq, work: &dwork->work);
2488	}
2489	EXPORT_SYMBOL(delayed_work_timer_fn);
2490
2491	static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
2492	struct delayed_work dwork, unsigned* long delay)
2493	{
2494	struct timer_list *timer = &dwork->timer;
2495	struct work_struct *work = &dwork->work;
2496
2497	WARN_ON_ONCE(!wq);
2498	WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
2499	WARN_ON_ONCE(timer_pending(timer));
2500	WARN_ON_ONCE(!list_empty(&work->entry));
2501
2502	/*
2503	* If @delay is 0, queue @dwork->work immediately. This is for
2504	* both optimization and correctness. The earliest @timer can
2505	* expire is on the closest next tick and delayed_work users depend
2506	* on that there's no such delay when @delay is 0.
2507	*/
2508	if (!delay) {
2509	__queue_work(cpu, wq, work: &dwork->work);
2510	return;
2511	}
2512
2513	WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu));
2514	dwork->wq = wq;
2515	dwork->cpu = cpu;
2516	timer->expires = jiffies + delay;
2517
2518	if (housekeeping_enabled(type: HK_TYPE_TIMER)) {
2519	/ If the current cpu is a housekeeping cpu, use it. /
2520	cpu = smp_processor_id();
2521	if (!housekeeping_test_cpu(cpu, type: HK_TYPE_TIMER))
2522	cpu = housekeeping_any_cpu(type: HK_TYPE_TIMER);
2523	add_timer_on(timer, cpu);
2524	} else {
2525	if (likely(cpu == WORK_CPU_UNBOUND))
2526	add_timer_global(timer);
2527	else
2528	add_timer_on(timer, cpu);
2529	}
2530	}
2531
2532	/**
2533	* queue_delayed_work_on - queue work on specific CPU after delay
2534	* @cpu: CPU number to execute work on
2535	* @wq: workqueue to use
2536	* @dwork: work to queue
2537	* @delay: number of jiffies to wait before queueing
2538	*
2539	* We queue the delayed_work to a specific CPU, for non-zero delays the
2540	* caller must ensure it is online and can't go away. Callers that fail
2541	* to ensure this, may get @dwork->timer queued to an offlined CPU and
2542	* this will prevent queueing of @dwork->work unless the offlined CPU
2543	* becomes online again.
2544	*
2545	* Return: %false if @work was already on a queue, %true otherwise. If
2546	* @delay is zero and @dwork is idle, it will be scheduled for immediate
2547	* execution.
2548	*/
2549	bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
2550	struct delayed_work dwork, unsigned* long delay)
2551	{
2552	struct work_struct *work = &dwork->work;
2553	bool ret = false;
2554	unsigned long irq_flags;
2555
2556	/ read the comment in __queue_work() /
2557	local_irq_save(irq_flags);
2558
2559	if (!test_and_set_bit(nr: WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
2560	!clear_pending_if_disabled(work)) {
2561	__queue_delayed_work(cpu, wq, dwork, delay);
2562	ret = true;
2563	}
2564
2565	local_irq_restore(irq_flags);
2566	return ret;
2567	}
2568	EXPORT_SYMBOL(queue_delayed_work_on);
2569
2570	/**
2571	* mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
2572	* @cpu: CPU number to execute work on
2573	* @wq: workqueue to use
2574	* @dwork: work to queue
2575	* @delay: number of jiffies to wait before queueing
2576	*
2577	* If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
2578	* modify @dwork's timer so that it expires after @delay. If @delay is
2579	* zero, @work is guaranteed to be scheduled immediately regardless of its
2580	* current state.
2581	*
2582	* Return: %false if @dwork was idle and queued, %true if @dwork was
2583	* pending and its timer was modified.
2584	*
2585	* This function is safe to call from any context including IRQ handler.
2586	* See try_to_grab_pending() for details.
2587	*/
2588	bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
2589	struct delayed_work dwork, unsigned* long delay)
2590	{
2591	unsigned long irq_flags;
2592	bool ret;
2593
2594	ret = work_grab_pending(work: &dwork->work, cflags: WORK_CANCEL_DELAYED, irq_flags: &irq_flags);
2595
2596	if (!clear_pending_if_disabled(work: &dwork->work))
2597	__queue_delayed_work(cpu, wq, dwork, delay);
2598
2599	local_irq_restore(irq_flags);
2600	return ret;
2601	}
2602	EXPORT_SYMBOL_GPL(mod_delayed_work_on);
2603
2604	static void rcu_work_rcufn(struct rcu_head *rcu)
2605	{
2606	struct rcu_work rwork = container_of(rcu, struct* rcu_work, rcu);
2607
2608	/ read the comment in __queue_work() /
2609	local_irq_disable();
2610	__queue_work(cpu: WORK_CPU_UNBOUND, wq: rwork->wq, work: &rwork->work);
2611	local_irq_enable();
2612	}
2613
2614	/**
2615	* queue_rcu_work - queue work after a RCU grace period
2616	* @wq: workqueue to use
2617	* @rwork: work to queue
2618	*
2619	* Return: %false if @rwork was already pending, %true otherwise. Note
2620	* that a full RCU grace period is guaranteed only after a %true return.
2621	* While @rwork is guaranteed to be executed after a %false return, the
2622	* execution may happen before a full RCU grace period has passed.
2623	*/
2624	bool queue_rcu_work(struct workqueue_struct wq, struct* rcu_work *rwork)
2625	{
2626	struct work_struct *work = &rwork->work;
2627
2628	/*
2629	* rcu_work can't be canceled or disabled. Warn if the user reached
2630	* inside @rwork and disabled the inner work.
2631	*/
2632	if (!test_and_set_bit(nr: WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
2633	!WARN_ON_ONCE(clear_pending_if_disabled(work))) {
2634	rwork->wq = wq;
2635	call_rcu_hurry(head: &rwork->rcu, func: rcu_work_rcufn);
2636	return true;
2637	}
2638
2639	return false;
2640	}
2641	EXPORT_SYMBOL(queue_rcu_work);
2642
2643	static struct worker alloc_worker(int* node)
2644	{
2645	struct worker *worker;
2646
2647	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
2648	if (worker) {
2649	INIT_LIST_HEAD(list: &worker->entry);
2650	INIT_LIST_HEAD(list: &worker->scheduled);
2651	INIT_LIST_HEAD(list: &worker->node);
2652	/ on creation a worker is in !idle && prep state /
2653	worker->flags = WORKER_PREP;
2654	}
2655	return worker;
2656	}
2657
2658	static cpumask_t pool_allowed_cpus(struct* worker_pool *pool)
2659	{
2660	if (pool->cpu < `0` && pool->attrs->affn_strict)
2661	return pool->attrs->__pod_cpumask;
2662	else
2663	return pool->attrs->cpumask;
2664	}
2665
2666	/**
2667	* worker_attach_to_pool() - attach a worker to a pool
2668	* @worker: worker to be attached
2669	* @pool: the target pool
2670	*
2671	* Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and
2672	* cpu-binding of @worker are kept coordinated with the pool across
2673	* cpu-[un]hotplugs.
2674	*/
2675	static void worker_attach_to_pool(struct worker *worker,
2676	struct worker_pool *pool)
2677	{
2678	mutex_lock(lock: &wq_pool_attach_mutex);
2679
2680	/*
2681	* The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable
2682	* across this function. See the comments above the flag definition for
2683	* details. BH workers are, while per-CPU, always DISASSOCIATED.
2684	*/
2685	if (pool->flags & POOL_DISASSOCIATED) {
2686	worker->flags \|= WORKER_UNBOUND;
2687	} else {
2688	WARN_ON_ONCE(pool->flags & POOL_BH);
2689	kthread_set_per_cpu(k: worker->task, cpu: pool->cpu);
2690	}
2691
2692	if (worker->rescue_wq)
2693	set_cpus_allowed_ptr(p: worker->task, new_mask: pool_allowed_cpus(pool));
2694
2695	list_add_tail(new: &worker->node, head: &pool->workers);
2696	worker->pool = pool;
2697
2698	mutex_unlock(lock: &wq_pool_attach_mutex);
2699	}
2700
2701	static void unbind_worker(struct worker *worker)
2702	{
2703	lockdep_assert_held(&wq_pool_attach_mutex);
2704
2705	kthread_set_per_cpu(k: worker->task, cpu: -`1`);
2706	if (cpumask_intersects(src1p: wq_unbound_cpumask, cpu_active_mask))
2707	WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < `0`);
2708	else
2709	WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < `0`);
2710	}
2711
2712
2713	static void detach_worker(struct worker *worker)
2714	{
2715	lockdep_assert_held(&wq_pool_attach_mutex);
2716
2717	unbind_worker(worker);
2718	list_del(entry: &worker->node);
2719	}
2720
2721	/**
2722	* worker_detach_from_pool() - detach a worker from its pool
2723	* @worker: worker which is attached to its pool
2724	*
2725	* Undo the attaching which had been done in worker_attach_to_pool(). The
2726	* caller worker shouldn't access to the pool after detached except it has
2727	* other reference to the pool.
2728	*/
2729	static void worker_detach_from_pool(struct worker *worker)
2730	{
2731	struct worker_pool *pool = worker->pool;
2732
2733	/ there is one permanent BH worker per CPU which should never detach /
2734	WARN_ON_ONCE(pool->flags & POOL_BH);
2735
2736	mutex_lock(lock: &wq_pool_attach_mutex);
2737	detach_worker(worker);
2738	worker->pool = NULL;
2739	mutex_unlock(lock: &wq_pool_attach_mutex);
2740
2741	/ clear leftover flags without pool->lock after it is detached /
2742	worker->flags &= ~(WORKER_UNBOUND \| WORKER_REBOUND);
2743	}
2744
2745	static int format_worker_id(char buf, size_t size, struct* worker *worker,
2746	struct worker_pool *pool)
2747	{
2748	if (worker->rescue_wq)
2749	return scnprintf(buf, size, fmt: "kworker/R-%s",
2750	worker->rescue_wq->name);
2751
2752	if (pool) {
2753	if (pool->cpu >= `0`)
2754	return scnprintf(buf, size, fmt: "kworker/%d:%d%s",
2755	pool->cpu, worker->id,
2756	pool->attrs->nice < `0` ? "H" : "");
2757	else
2758	return scnprintf(buf, size, fmt: "kworker/u%d:%d",
2759	pool->id, worker->id);
2760	} else {
2761	return scnprintf(buf, size, fmt: "kworker/dying");
2762	}
2763	}
2764
2765	/**
2766	* create_worker - create a new workqueue worker
2767	* @pool: pool the new worker will belong to
2768	*
2769	* Create and start a new worker which is attached to @pool.
2770	*
2771	* CONTEXT:
2772	* Might sleep. Does GFP_KERNEL allocations.
2773	*
2774	* Return:
2775	* Pointer to the newly created worker.
2776	*/
2777	static struct worker create_worker(struct* worker_pool *pool)
2778	{
2779	struct worker *worker;
2780	int id;
2781
2782	/ ID is needed to determine kthread name /
2783	id = ida_alloc(ida: &pool->worker_ida, GFP_KERNEL);
2784	if (id < `0`) {
2785	pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",
2786	ERR_PTR(id));
2787	return NULL;
2788	}
2789
2790	worker = alloc_worker(node: pool->node);
2791	if (!worker) {
2792	pr_err_once("workqueue: Failed to allocate a worker\n");
2793	goto fail;
2794	}
2795
2796	worker->id = id;
2797
2798	if (!(pool->flags & POOL_BH)) {
2799	char id_buf[WORKER_ID_LEN];
2800
2801	format_worker_id(buf: id_buf, size: sizeof(id_buf), worker, pool);
2802	worker->task = kthread_create_on_node(threadfn: worker_thread, data: worker,
2803	node: pool->node, namefmt: "%s", id_buf);
2804	if (IS_ERR(ptr: worker->task)) {
2805	if (PTR_ERR(ptr: worker->task) == -EINTR) {
2806	pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n",
2807	id_buf);
2808	} else {
2809	pr_err_once("workqueue: Failed to create a worker thread: %pe",
2810	worker->task);
2811	}
2812	goto fail;
2813	}
2814
2815	set_user_nice(p: worker->task, nice: pool->attrs->nice);
2816	kthread_bind_mask(k: worker->task, mask: pool_allowed_cpus(pool));
2817	}
2818
2819	/ successful, attach the worker to the pool /
2820	worker_attach_to_pool(worker, pool);
2821
2822	/ start the newly created worker /
2823	raw_spin_lock_irq(&pool->lock);
2824
2825	worker->pool->nr_workers++;
2826	worker_enter_idle(worker);
2827
2828	/*
2829	* @worker is waiting on a completion in kthread() and will trigger hung
2830	* check if not woken up soon. As kick_pool() is noop if @pool is empty,
2831	* wake it up explicitly.
2832	*/
2833	if (worker->task)
2834	wake_up_process(tsk: worker->task);
2835
2836	raw_spin_unlock_irq(&pool->lock);
2837
2838	return worker;
2839
2840	fail:
2841	ida_free(&pool->worker_ida, id);
2842	kfree(objp: worker);
2843	return NULL;
2844	}
2845
2846	static void detach_dying_workers(struct list_head *cull_list)
2847	{
2848	struct worker *worker;
2849
2850	list_for_each_entry(worker, cull_list, entry)
2851	detach_worker(worker);
2852	}
2853
2854	static void reap_dying_workers(struct list_head *cull_list)
2855	{
2856	struct worker worker, tmp;
2857
2858	list_for_each_entry_safe(worker, tmp, cull_list, entry) {
2859	list_del_init(entry: &worker->entry);
2860	kthread_stop_put(k: worker->task);
2861	kfree(objp: worker);
2862	}
2863	}
2864
2865	/**
2866	* set_worker_dying - Tag a worker for destruction
2867	* @worker: worker to be destroyed
2868	* @list: transfer worker away from its pool->idle_list and into list
2869	*
2870	* Tag @worker for destruction and adjust @pool stats accordingly. The worker
2871	* should be idle.
2872	*
2873	* CONTEXT:
2874	* raw_spin_lock_irq(pool->lock).
2875	*/
2876	static void set_worker_dying(struct worker worker, struct* list_head *list)
2877	{
2878	struct worker_pool *pool = worker->pool;
2879
2880	lockdep_assert_held(&pool->lock);
2881	lockdep_assert_held(&wq_pool_attach_mutex);
2882
2883	/ sanity check frenzy /
2884	if (WARN_ON(worker->current_work) \|\|
2885	WARN_ON(!list_empty(&worker->scheduled)) \|\|
2886	WARN_ON(!(worker->flags & WORKER_IDLE)))
2887	return;
2888
2889	pool->nr_workers--;
2890	pool->nr_idle--;
2891
2892	worker->flags \|= WORKER_DIE;
2893
2894	list_move(list: &worker->entry, head: list);
2895
2896	/ get an extra task struct reference for later kthread_stop_put() /
2897	get_task_struct(t: worker->task);
2898	}
2899
2900	/**
2901	* idle_worker_timeout - check if some idle workers can now be deleted.
2902	* @t: The pool's idle_timer that just expired
2903	*
2904	* The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
2905	* worker_leave_idle(), as a worker flicking between idle and active while its
2906	* pool is at the too_many_workers() tipping point would cause too much timer
2907	* housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
2908	* it expire and re-evaluate things from there.
2909	*/
2910	static void idle_worker_timeout(struct timer_list *t)
2911	{
2912	struct worker_pool *pool = timer_container_of(pool, t, idle_timer);
2913	bool do_cull = false;
2914
2915	if (work_pending(&pool->idle_cull_work))
2916	return;
2917
2918	raw_spin_lock_irq(&pool->lock);
2919
2920	if (too_many_workers(pool)) {
2921	struct worker *worker;
2922	unsigned long expires;
2923
2924	/ idle_list is kept in LIFO order, check the last one /
2925	worker = list_last_entry(&pool->idle_list, struct worker, entry);
2926	expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2927	do_cull = !time_before(jiffies, expires);
2928
2929	if (!do_cull)
2930	mod_timer(timer: &pool->idle_timer, expires);
2931	}
2932	raw_spin_unlock_irq(&pool->lock);
2933
2934	if (do_cull)
2935	queue_work(wq: system_dfl_wq, work: &pool->idle_cull_work);
2936	}
2937
2938	/**
2939	* idle_cull_fn - cull workers that have been idle for too long.
2940	* @work: the pool's work for handling these idle workers
2941	*
2942	* This goes through a pool's idle workers and gets rid of those that have been
2943	* idle for at least IDLE_WORKER_TIMEOUT seconds.
2944	*
2945	* We don't want to disturb isolated CPUs because of a pcpu kworker being
2946	* culled, so this also resets worker affinity. This requires a sleepable
2947	* context, hence the split between timer callback and work item.
2948	*/
2949	static void idle_cull_fn(struct work_struct *work)
2950	{
2951	struct worker_pool pool = container_of(work, struct* worker_pool, idle_cull_work);
2952	LIST_HEAD(cull_list);
2953
2954	/*
2955	* Grabbing wq_pool_attach_mutex here ensures an already-running worker
2956	* cannot proceed beyong set_pf_worker() in its self-destruct path.
2957	* This is required as a previously-preempted worker could run after
2958	* set_worker_dying() has happened but before detach_dying_workers() did.
2959	*/
2960	mutex_lock(lock: &wq_pool_attach_mutex);
2961	raw_spin_lock_irq(&pool->lock);
2962
2963	while (too_many_workers(pool)) {
2964	struct worker *worker;
2965	unsigned long expires;
2966
2967	worker = list_last_entry(&pool->idle_list, struct worker, entry);
2968	expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2969
2970	if (time_before(jiffies, expires)) {
2971	mod_timer(timer: &pool->idle_timer, expires);
2972	break;
2973	}
2974
2975	set_worker_dying(worker, list: &cull_list);
2976	}
2977
2978	raw_spin_unlock_irq(&pool->lock);
2979	detach_dying_workers(cull_list: &cull_list);
2980	mutex_unlock(lock: &wq_pool_attach_mutex);
2981
2982	reap_dying_workers(cull_list: &cull_list);
2983	}
2984
2985	static void send_mayday(struct work_struct *work)
2986	{
2987	struct pool_workqueue *pwq = get_work_pwq(work);
2988	struct workqueue_struct *wq = pwq->wq;
2989
2990	lockdep_assert_held(&wq_mayday_lock);
2991
2992	if (!wq->rescuer)
2993	return;
2994
2995	/ mayday mayday mayday /
2996	if (list_empty(head: &pwq->mayday_node)) {
2997	/*
2998	* If @pwq is for an unbound wq, its base ref may be put at
2999	* any time due to an attribute change. Pin @pwq until the
3000	* rescuer is done with it.
3001	*/
3002	get_pwq(pwq);
3003	list_add_tail(new: &pwq->mayday_node, head: &wq->maydays);
3004	wake_up_process(tsk: wq->rescuer->task);
3005	pwq->stats[PWQ_STAT_MAYDAY]++;
3006	}
3007	}
3008
3009	static void pool_mayday_timeout(struct timer_list *t)
3010	{
3011	struct worker_pool *pool = timer_container_of(pool, t, mayday_timer);
3012	struct work_struct *work;
3013
3014	raw_spin_lock_irq(&pool->lock);
3015	raw_spin_lock(&wq_mayday_lock); / for wq->maydays /
3016
3017	if (need_to_create_worker(pool)) {
3018	/*
3019	* We've been trying to create a new worker but
3020	* haven't been successful. We might be hitting an
3021	* allocation deadlock. Send distress signals to
3022	* rescuers.
3023	*/
3024	list_for_each_entry(work, &pool->worklist, entry)
3025	send_mayday(work);
3026	}
3027
3028	raw_spin_unlock(&wq_mayday_lock);
3029	raw_spin_unlock_irq(&pool->lock);
3030
3031	mod_timer(timer: &pool->mayday_timer, expires: jiffies + MAYDAY_INTERVAL);
3032	}
3033
3034	/**
3035	* maybe_create_worker - create a new worker if necessary
3036	* @pool: pool to create a new worker for
3037	*
3038	* Create a new worker for @pool if necessary. @pool is guaranteed to
3039	* have at least one idle worker on return from this function. If
3040	* creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
3041	* sent to all rescuers with works scheduled on @pool to resolve
3042	* possible allocation deadlock.
3043	*
3044	* On return, need_to_create_worker() is guaranteed to be %false and
3045	* may_start_working() %true.
3046	*
3047	* LOCKING:
3048	* raw_spin_lock_irq(pool->lock) which may be released and regrabbed
3049	* multiple times. Does GFP_KERNEL allocations. Called only from
3050	* manager.
3051	*/
3052	static void maybe_create_worker(struct worker_pool *pool)
3053	__releases(&pool->lock)
3054	__acquires(&pool->lock)
3055	{
3056	restart:
3057	raw_spin_unlock_irq(&pool->lock);
3058
3059	/ if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help /
3060	mod_timer(timer: &pool->mayday_timer, expires: jiffies + MAYDAY_INITIAL_TIMEOUT);
3061
3062	while (true) {
3063	if (create_worker(pool) \|\| !need_to_create_worker(pool))
3064	break;
3065
3066	schedule_timeout_interruptible(timeout: CREATE_COOLDOWN);
3067
3068	if (!need_to_create_worker(pool))
3069	break;
3070	}
3071
3072	timer_delete_sync(timer: &pool->mayday_timer);
3073	raw_spin_lock_irq(&pool->lock);
3074	/*
3075	* This is necessary even after a new worker was just successfully
3076	* created as @pool->lock was dropped and the new worker might have
3077	* already become busy.
3078	*/
3079	if (need_to_create_worker(pool))
3080	goto restart;
3081	}
3082
/*
 * pool->cb_lock helpers.  They only do something on CONFIG_PREEMPT_RT
 * kernels; everywhere else they are empty stubs.
 */
#ifndef CONFIG_PREEMPT_RT

static void worker_lock_callback(struct worker_pool *pool) { }
static void worker_unlock_callback(struct worker_pool *pool) { }
static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { }

#else	/* CONFIG_PREEMPT_RT */

/* Take @pool's callback lock. */
static void worker_lock_callback(struct worker_pool *pool)
{
	spin_lock(&pool->cb_lock);
}

/* Release @pool's callback lock. */
static void worker_unlock_callback(struct worker_pool *pool)
{
	spin_unlock(&pool->cb_lock);
}

/*
 * Acquire and immediately release @pool's callback lock, i.e. wait until
 * whoever currently holds it has let go.
 */
static void workqueue_callback_cancel_wait_running(struct worker_pool *pool)
{
	worker_lock_callback(pool);
	worker_unlock_callback(pool);
}

#endif	/* !CONFIG_PREEMPT_RT */
3107
3108	/**
3109	* manage_workers - manage worker pool
3110	* @worker: self
3111	*
3112	* Assume the manager role and manage the worker pool @worker belongs
3113	* to. At any given time, there can be only zero or one manager per
3114	* pool. The exclusion is handled automatically by this function.
3115	*
3116	* The caller can safely start processing works on false return. On
3117	* true return, it's guaranteed that need_to_create_worker() is false
3118	* and may_start_working() is true.
3119	*
3120	* CONTEXT:
3121	* raw_spin_lock_irq(pool->lock) which may be released and regrabbed
3122	* multiple times. Does GFP_KERNEL allocations.
3123	*
3124	* Return:
3125	* %false if the pool doesn't need management and the caller can safely
3126	* start processing works, %true if management function was performed and
3127	* the conditions that the caller verified before calling the function may
3128	* no longer be true.
3129	*/
3130	static bool manage_workers(struct worker *worker)
3131	{
3132	struct worker_pool *pool = worker->pool;
3133
3134	if (pool->flags & POOL_MANAGER_ACTIVE)
3135	return false;
3136
3137	pool->flags \|= POOL_MANAGER_ACTIVE;
3138	pool->manager = worker;
3139
3140	maybe_create_worker(pool);
3141
3142	pool->manager = NULL;
3143	pool->flags &= ~POOL_MANAGER_ACTIVE;
3144	rcuwait_wake_up(w: &manager_wait);
3145	return true;
3146	}
3147
3148	/**
3149	* process_one_work - process single work
3150	* @worker: self
3151	* @work: work to process
3152	*
3153	* Process @work. This function contains all the logics necessary to
3154	* process a single work including synchronization against and
3155	* interaction with other workers on the same cpu, queueing and
3156	* flushing. As long as context requirement is met, any worker can
3157	* call this function to process a work.
3158	*
3159	* CONTEXT:
3160	* raw_spin_lock_irq(pool->lock) which is released and regrabbed.
3161	*/
3162	static void process_one_work(struct worker worker, struct* work_struct *work)
3163	__releases(&pool->lock)
3164	__acquires(&pool->lock)
3165	{
3166	struct pool_workqueue *pwq = get_work_pwq(work);
3167	struct worker_pool *pool = worker->pool;
3168	unsigned long work_data;
3169	int lockdep_start_depth, rcu_start_depth;
3170	bool bh_draining = pool->flags & POOL_BH_DRAINING;
3171	#ifdef CONFIG_LOCKDEP
3172	/*
3173	* It is permissible to free the struct work_struct from
3174	* inside the function that is called from it, this we need to
3175	* take into account for lockdep too. To avoid bogus "held
3176	* lock freed" warnings as well as problems when looking into
3177	* work->lockdep_map, make a copy and use that here.
3178	*/
3179	struct lockdep_map lockdep_map;
3180
3181	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
3182	#endif
3183	/ ensure we're on the correct CPU /
3184	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
3185	raw_smp_processor_id() != pool->cpu);
3186
3187	/ claim and dequeue /
3188	debug_work_deactivate(work);
3189	hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
3190	worker->current_work = work;
3191	worker->current_func = work->func;
3192	worker->current_pwq = pwq;
3193	if (worker->task)
3194	worker->current_at = worker->task->se.sum_exec_runtime;
3195	work_data = *work_data_bits(work);
3196	worker->current_color = get_work_color(work_data);
3197
3198	/*
3199	* Record wq name for cmdline and debug reporting, may get
3200	* overridden through set_worker_desc().
3201	*/
3202	strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
3203
3204	list_del_init(entry: &work->entry);
3205
3206	/*
3207	* CPU intensive works don't participate in concurrency management.
3208	* They're the scheduler's responsibility. This takes @worker out
3209	* of concurrency management and the next code block will chain
3210	* execution of the pending work items.
3211	*/
3212	if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
3213	worker_set_flags(worker, flags: WORKER_CPU_INTENSIVE);
3214
3215	/*
3216	* Kick @pool if necessary. It's always noop for per-cpu worker pools
3217	* since nr_running would always be >= 1 at this point. This is used to
3218	* chain execution of the pending work items for WORKER_NOT_RUNNING
3219	* workers such as the UNBOUND and CPU_INTENSIVE ones.
3220	*/
3221	kick_pool(pool);
3222
3223	/*
3224	* Record the last pool and clear PENDING which should be the last
3225	* update to @work. Also, do this inside @pool->lock so that
3226	* PENDING and queued state changes happen together while IRQ is
3227	* disabled.
3228	*/
3229	set_work_pool_and_clear_pending(work, pool_id: pool->id, flags: pool_offq_flags(pool));
3230
3231	pwq->stats[PWQ_STAT_STARTED]++;
3232	raw_spin_unlock_irq(&pool->lock);
3233
3234	rcu_start_depth = rcu_preempt_depth();
3235	lockdep_start_depth = lockdep_depth(current);
3236	/ see drain_dead_softirq_workfn() /
3237	if (!bh_draining)
3238	lock_map_acquire(pwq->wq->lockdep_map);
3239	lock_map_acquire(&lockdep_map);
3240	/*
3241	* Strictly speaking we should mark the invariant state without holding
3242	* any locks, that is, before these two lock_map_acquire()'s.
3243	*
3244	* However, that would result in:
3245	*
3246	* A(W1)
3247	* WFC(C)
3248	* A(W1)
3249	* C(C)
3250	*
3251	* Which would create W1->C->W1 dependencies, even though there is no
3252	* actual deadlock possible. There are two solutions, using a
3253	* read-recursive acquire on the work(queue) 'locks', but this will then
3254	* hit the lockdep limitation on recursive locks, or simply discard
3255	* these locks.
3256	*
3257	* AFAICT there is no possible deadlock scenario between the
3258	* flush_work() and complete() primitives (except for single-threaded
3259	* workqueues), so hiding them isn't a problem.
3260	*/
3261	lockdep_invariant_state(force: true);
3262	trace_workqueue_execute_start(work);
3263	worker->current_func(work);
3264	/*
3265	* While we must be careful to not use "work" after this, the trace
3266	* point will only record its address.
3267	*/
3268	trace_workqueue_execute_end(work, function: worker->current_func);
3269
3270	lock_map_release(&lockdep_map);
3271	if (!bh_draining)
3272	lock_map_release(pwq->wq->lockdep_map);
3273
3274	if (unlikely((worker->task && in_atomic()) \|\|
3275	lockdep_depth(current) != lockdep_start_depth \|\|
3276	rcu_preempt_depth() != rcu_start_depth)) {
3277	pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n"
3278	" preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n",
3279	current->comm, task_pid_nr(current), preempt_count(),
3280	lockdep_start_depth, lockdep_depth(current),
3281	rcu_start_depth, rcu_preempt_depth(),
3282	worker->current_func);
3283	debug_show_held_locks(current);
3284	dump_stack();
3285	}
3286
3287	/*
3288	* The following prevents a kworker from hogging CPU on !PREEMPTION
3289	* kernels, where a requeueing work item waiting for something to
3290	* happen could deadlock with stop_machine as such work item could
3291	* indefinitely requeue itself while all other CPUs are trapped in
3292	* stop_machine. At the same time, report a quiescent RCU state so
3293	* the same condition doesn't freeze RCU.
3294	*/
3295	if (worker->task)
3296	cond_resched();
3297
3298	raw_spin_lock_irq(&pool->lock);
3299
3300	pwq->stats[PWQ_STAT_COMPLETED]++;
3301
3302	/*
3303	* In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
3304	* CPU intensive by wq_worker_tick() if @work hogged CPU longer than
3305	* wq_cpu_intensive_thresh_us. Clear it.
3306	*/
3307	worker_clr_flags(worker, flags: WORKER_CPU_INTENSIVE);
3308
3309	/ tag the worker for identification in schedule() /
3310	worker->last_func = worker->current_func;
3311
3312	/ we're done with it, release /
3313	hash_del(node: &worker->hentry);
3314	worker->current_work = NULL;
3315	worker->current_func = NULL;
3316	worker->current_pwq = NULL;
3317	worker->current_color = INT_MAX;
3318
3319	/ must be the last step, see the function comment /
3320	pwq_dec_nr_in_flight(pwq, work_data);
3321	}
3322
3323	/**
3324	* process_scheduled_works - process scheduled works
3325	* @worker: self
3326	*
3327	* Process all scheduled works. Please note that the scheduled list
3328	* may change while processing a work, so this function repeatedly
3329	* fetches a work from the top and executes it.
3330	*
3331	* CONTEXT:
3332	* raw_spin_lock_irq(pool->lock) which may be released and regrabbed
3333	* multiple times.
3334	*/
3335	static void process_scheduled_works(struct worker *worker)
3336	{
3337	struct work_struct *work;
3338	bool first = true;
3339
3340	while ((work = list_first_entry_or_null(&worker->scheduled,
3341	struct work_struct, entry))) {
3342	if (first) {
3343	worker->pool->watchdog_ts = jiffies;
3344	first = false;
3345	}
3346	process_one_work(worker, work);
3347	}
3348	}
3349
3350	static void set_pf_worker(bool val)
3351	{
3352	mutex_lock(lock: &wq_pool_attach_mutex);
3353	if (val)
3354	current->flags \|= PF_WQ_WORKER;
3355	else
3356	current->flags &= ~PF_WQ_WORKER;
3357	mutex_unlock(lock: &wq_pool_attach_mutex);
3358	}
3359
3360	/**
3361	* worker_thread - the worker thread function
3362	* @__worker: self
3363	*
3364	* The worker thread function. All workers belong to a worker_pool -
3365	* either a per-cpu one or dynamic unbound one. These workers process all
3366	* work items regardless of their specific target workqueue. The only
3367	* exception is work items which belong to workqueues with a rescuer which
3368	* will be explained in rescuer_thread().
3369	*
3370	* Return: 0
3371	*/
3372	static int worker_thread(void *__worker)
3373	{
3374	struct worker *worker = __worker;
3375	struct worker_pool *pool = worker->pool;
3376
3377	/ tell the scheduler that this is a workqueue worker /
3378	set_pf_worker(true);
3379	woke_up:
3380	raw_spin_lock_irq(&pool->lock);
3381
3382	/ am I supposed to die? /
3383	if (unlikely(worker->flags & WORKER_DIE)) {
3384	raw_spin_unlock_irq(&pool->lock);
3385	set_pf_worker(false);
3386	/*
3387	* The worker is dead and PF_WQ_WORKER is cleared, worker->pool
3388	* shouldn't be accessed, reset it to NULL in case otherwise.
3389	*/
3390	worker->pool = NULL;
3391	ida_free(&pool->worker_ida, id: worker->id);
3392	return `0`;
3393	}
3394
3395	worker_leave_idle(worker);
3396	recheck:
3397	/ no more worker necessary? /
3398	if (!need_more_worker(pool))
3399	goto sleep;
3400
3401	/ do we need to manage? /
3402	if (unlikely(!may_start_working(pool)) && manage_workers(worker))
3403	goto recheck;
3404
3405	/*
3406	* ->scheduled list can only be filled while a worker is
3407	* preparing to process a work or actually processing it.
3408	* Make sure nobody diddled with it while I was sleeping.
3409	*/
3410	WARN_ON_ONCE(!list_empty(&worker->scheduled));
3411
3412	/*
3413	* Finish PREP stage. We're guaranteed to have at least one idle
3414	* worker or that someone else has already assumed the manager
3415	* role. This is where @worker starts participating in concurrency
3416	* management if applicable and concurrency management is restored
3417	* after being rebound. See rebind_workers() for details.
3418	*/
3419	worker_clr_flags(worker, flags: WORKER_PREP \| WORKER_REBOUND);
3420
3421	do {
3422	struct work_struct *work =
3423	list_first_entry(&pool->worklist,
3424	struct work_struct, entry);
3425
3426	if (assign_work(work, worker, NULL))
3427	process_scheduled_works(worker);
3428	} while (keep_working(pool));
3429
3430	worker_set_flags(worker, flags: WORKER_PREP);
3431	sleep:
3432	/*
3433	* pool->lock is held and there's no work to process and no need to
3434	* manage, sleep. Workers are woken up only while holding
3435	* pool->lock or from local cpu, so setting the current state
3436	* before releasing pool->lock is enough to prevent losing any
3437	* event.
3438	*/
3439	worker_enter_idle(worker);
3440	__set_current_state(TASK_IDLE);
3441	raw_spin_unlock_irq(&pool->lock);
3442	schedule();
3443	goto woke_up;
3444	}
3445
3446	/**
3447	* rescuer_thread - the rescuer thread function
3448	* @__rescuer: self
3449	*
3450	* Workqueue rescuer thread function. There's one rescuer for each
3451	* workqueue which has WQ_MEM_RECLAIM set.
3452	*
3453	* Regular work processing on a pool may block trying to create a new
3454	* worker which uses GFP_KERNEL allocation which has slight chance of
3455	* developing into deadlock if some works currently on the same queue
3456	* need to be processed to satisfy the GFP_KERNEL allocation. This is
3457	* the problem rescuer solves.
3458	*
3459	* When such condition is possible, the pool summons rescuers of all
3460	* workqueues which have works queued on the pool and let them process
3461	* those works so that forward progress can be guaranteed.
3462	*
3463	* This should happen rarely.
3464	*
3465	* Return: 0
3466	*/
3467	static int rescuer_thread(void *__rescuer)
3468	{
3469	struct worker *rescuer = __rescuer;
3470	struct workqueue_struct *wq = rescuer->rescue_wq;
3471	bool should_stop;
3472
3473	set_user_nice(current, nice: RESCUER_NICE_LEVEL);
3474
3475	/*
3476	* Mark rescuer as worker too. As WORKER_PREP is never cleared, it
3477	* doesn't participate in concurrency management.
3478	*/
3479	set_pf_worker(true);
3480	repeat:
3481	set_current_state(TASK_IDLE);
3482
3483	/*
3484	* By the time the rescuer is requested to stop, the workqueue
3485	* shouldn't have any work pending, but @wq->maydays may still have
3486	* pwq(s) queued. This can happen by non-rescuer workers consuming
3487	* all the work items before the rescuer got to them. Go through
3488	* @wq->maydays processing before acting on should_stop so that the
3489	* list is always empty on exit.
3490	*/
3491	should_stop = kthread_should_stop();
3492
3493	/ see whether any pwq is asking for help /
3494	raw_spin_lock_irq(&wq_mayday_lock);
3495
3496	while (!list_empty(head: &wq->maydays)) {
3497	struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
3498	struct pool_workqueue, mayday_node);
3499	struct worker_pool *pool = pwq->pool;
3500	struct work_struct work, n;
3501
3502	__set_current_state(TASK_RUNNING);
3503	list_del_init(entry: &pwq->mayday_node);
3504
3505	raw_spin_unlock_irq(&wq_mayday_lock);
3506
3507	worker_attach_to_pool(worker: rescuer, pool);
3508
3509	raw_spin_lock_irq(&pool->lock);
3510
3511	/*
3512	* Slurp in all works issued via this workqueue and
3513	* process'em.
3514	*/
3515	WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
3516	list_for_each_entry_safe(work, n, &pool->worklist, entry) {
3517	if (get_work_pwq(work) == pwq &&
3518	assign_work(work, worker: rescuer, nextp: &n))
3519	pwq->stats[PWQ_STAT_RESCUED]++;
3520	}
3521
3522	if (!list_empty(head: &rescuer->scheduled)) {
3523	process_scheduled_works(worker: rescuer);
3524
3525	/*
3526	* The above execution of rescued work items could
3527	* have created more to rescue through
3528	* pwq_activate_first_inactive() or chained
3529	* queueing. Let's put @pwq back on mayday list so
3530	* that such back-to-back work items, which may be
3531	* being used to relieve memory pressure, don't
3532	* incur MAYDAY_INTERVAL delay inbetween.
3533	*/
3534	if (pwq->nr_active && need_to_create_worker(pool)) {
3535	raw_spin_lock(&wq_mayday_lock);
3536	/*
3537	* Queue iff we aren't racing destruction
3538	* and somebody else hasn't queued it already.
3539	*/
3540	if (wq->rescuer && list_empty(head: &pwq->mayday_node)) {
3541	get_pwq(pwq);
3542	list_add_tail(new: &pwq->mayday_node, head: &wq->maydays);
3543	}
3544	raw_spin_unlock(&wq_mayday_lock);
3545	}
3546	}
3547
3548	/*
3549	* Leave this pool. Notify regular workers; otherwise, we end up
3550	* with 0 concurrency and stalling the execution.
3551	*/
3552	kick_pool(pool);
3553
3554	raw_spin_unlock_irq(&pool->lock);
3555
3556	worker_detach_from_pool(worker: rescuer);
3557
3558	/*
3559	* Put the reference grabbed by send_mayday(). @pool might
3560	* go away any time after it.
3561	*/
3562	put_pwq_unlocked(pwq);
3563
3564	raw_spin_lock_irq(&wq_mayday_lock);
3565	}
3566
3567	raw_spin_unlock_irq(&wq_mayday_lock);
3568
3569	if (should_stop) {
3570	__set_current_state(TASK_RUNNING);
3571	set_pf_worker(false);
3572	return `0`;
3573	}
3574
3575	/ rescuers should never participate in concurrency management /
3576	WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
3577	schedule();
3578	goto repeat;
3579	}
3580
3581	static void bh_worker(struct worker *worker)
3582	{
3583	struct worker_pool *pool = worker->pool;
3584	int nr_restarts = BH_WORKER_RESTARTS;
3585	unsigned long end = jiffies + BH_WORKER_JIFFIES;
3586
3587	worker_lock_callback(pool);
3588	raw_spin_lock_irq(&pool->lock);
3589	worker_leave_idle(worker);
3590
3591	/*
3592	* This function follows the structure of worker_thread(). See there for
3593	* explanations on each step.
3594	*/
3595	if (!need_more_worker(pool))
3596	goto done;
3597
3598	WARN_ON_ONCE(!list_empty(&worker->scheduled));
3599	worker_clr_flags(worker, flags: WORKER_PREP \| WORKER_REBOUND);
3600
3601	do {
3602	struct work_struct *work =
3603	list_first_entry(&pool->worklist,
3604	struct work_struct, entry);
3605
3606	if (assign_work(work, worker, NULL))
3607	process_scheduled_works(worker);
3608	} while (keep_working(pool) &&
3609	--nr_restarts && time_before(jiffies, end));
3610
3611	worker_set_flags(worker, flags: WORKER_PREP);
3612	done:
3613	worker_enter_idle(worker);
3614	kick_pool(pool);
3615	raw_spin_unlock_irq(&pool->lock);
3616	worker_unlock_callback(pool);
3617	}
3618
3619	/*
3620	* TODO: Convert all tasklet users to workqueue and use softirq directly.
3621	*
3622	* This is currently called from tasklet[_hi]action() and thus is also called
3623	* whenever there are tasklets to run. Let's do an early exit if there's nothing
3624	* queued. Once conversion from tasklet is complete, the need_more_worker() test
3625	* can be dropped.
3626	*
3627	* After full conversion, we'll add worker->softirq_action, directly use the
3628	* softirq action and obtain the worker pointer from the softirq_action pointer.
3629	*/
3630	void workqueue_softirq_action(bool highpri)
3631	{
3632	struct worker_pool *pool =
3633	&per_cpu(bh_worker_pools, smp_processor_id())[highpri];
3634	if (need_more_worker(pool))
3635	bh_worker(list_first_entry(&pool->workers, struct worker, node));
3636	}
3637
/*
 * Request set up by workqueue_softirq_dead() and handled by
 * drain_dead_softirq_workfn() to drain the BH pool of a dead CPU.
 */
struct wq_drain_dead_softirq_work {
	/* work item whose function is drain_dead_softirq_workfn() */
	struct work_struct work;
	/* the BH pool to drain */
	struct worker_pool *pool;
	/* completed by the work function once @pool needs no more draining */
	struct completion done;
};
3643
3644	static void drain_dead_softirq_workfn(struct work_struct *work)
3645	{
3646	struct wq_drain_dead_softirq_work *dead_work =
3647	container_of(work, struct wq_drain_dead_softirq_work, work);
3648	struct worker_pool *pool = dead_work->pool;
3649	bool repeat;
3650
3651	/*
3652	* @pool's CPU is dead and we want to execute its still pending work
3653	* items from this BH work item which is running on a different CPU. As
3654	* its CPU is dead, @pool can't be kicked and, as work execution path
3655	* will be nested, a lockdep annotation needs to be suppressed. Mark
3656	* @pool with %POOL_BH_DRAINING for the special treatments.
3657	*/
3658	raw_spin_lock_irq(&pool->lock);
3659	pool->flags \|= POOL_BH_DRAINING;
3660	raw_spin_unlock_irq(&pool->lock);
3661
3662	bh_worker(list_first_entry(&pool->workers, struct worker, node));
3663
3664	raw_spin_lock_irq(&pool->lock);
3665	pool->flags &= ~POOL_BH_DRAINING;
3666	repeat = need_more_worker(pool);
3667	raw_spin_unlock_irq(&pool->lock);
3668
3669	/*
3670	* bh_worker() might hit consecutive execution limit and bail. If there
3671	* still are pending work items, reschedule self and return so that we
3672	* don't hog this CPU's BH.
3673	*/
3674	if (repeat) {
3675	if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
3676	queue_work(wq: system_bh_highpri_wq, work);
3677	else
3678	queue_work(wq: system_bh_wq, work);
3679	} else {
3680	complete(&dead_work->done);
3681	}
3682	}
3683
3684	/*
3685	* @cpu is dead. Drain the remaining BH work items on the current CPU. It's
3686	* possible to allocate dead_work per CPU and avoid flushing. However, then we
3687	* have to worry about draining overlapping with CPU coming back online or
3688	* nesting (one CPU's dead_work queued on another CPU which is also dead and so
3689	* on). Let's keep it simple and drain them synchronously. These are BH work
3690	* items which shouldn't be requeued on the same pool. Shouldn't take long.
3691	*/
3692	void workqueue_softirq_dead(unsigned int cpu)
3693	{
3694	int i;
3695
3696	for (i = `0`; i < NR_STD_WORKER_POOLS; i++) {
3697	struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i];
3698	struct wq_drain_dead_softirq_work dead_work;
3699
3700	if (!need_more_worker(pool))
3701	continue;
3702
3703	INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn);
3704	dead_work.pool = pool;
3705	init_completion(x: &dead_work.done);
3706
3707	if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
3708	queue_work(wq: system_bh_highpri_wq, work: &dead_work.work);
3709	else
3710	queue_work(wq: system_bh_wq, work: &dead_work.work);
3711
3712	wait_for_completion(&dead_work.done);
3713	destroy_work_on_stack(work: &dead_work.work);
3714	}
3715	}
3716
3717	/**
3718	* check_flush_dependency - check for flush dependency sanity
3719	* @target_wq: workqueue being flushed
3720	* @target_work: work item being flushed (NULL for workqueue flushes)
3721	* @from_cancel: are we called from the work cancel path
3722	*
3723	* %current is trying to flush the whole @target_wq or @target_work on it.
3724	* If this is not the cancel path (which implies work being flushed is either
3725	* already running, or will not be at all), check if @target_wq doesn't have
3726	* %WQ_MEM_RECLAIM and verify that %current is not reclaiming memory or running
3727	* on a workqueue which doesn't have %WQ_MEM_RECLAIM as that can break forward-
3728	* progress guarantee leading to a deadlock.
3729	*/
3730	static void check_flush_dependency(struct workqueue_struct *target_wq,
3731	struct work_struct *target_work,
3732	bool from_cancel)
3733	{
3734	work_func_t target_func;
3735	struct worker *worker;
3736
3737	if (from_cancel \|\| target_wq->flags & WQ_MEM_RECLAIM)
3738	return;
3739
3740	worker = current_wq_worker();
3741	target_func = target_work ? target_work->func : NULL;
3742
3743	WARN_ONCE(current->flags & PF_MEMALLOC,
3744	"workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
3745	current->pid, current->comm, target_wq->name, target_func);
3746	WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
3747	(WQ_MEM_RECLAIM \| __WQ_LEGACY)) == WQ_MEM_RECLAIM),
3748	"workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
3749	worker->current_pwq->wq->name, worker->current_func,
3750	target_wq->name, target_func);
3751	}
3752
3753	struct wq_barrier {
3754	struct work_struct work;
3755	struct completion done;
3756	struct task_struct task; /* purely informational /
3757	};
3758
3759	static void wq_barrier_func(struct work_struct *work)
3760	{
3761	struct wq_barrier barr = container_of(work, struct* wq_barrier, work);
3762	complete(&barr->done);
3763	}
3764
3765	/**
3766	* insert_wq_barrier - insert a barrier work
3767	* @pwq: pwq to insert barrier into
3768	* @barr: wq_barrier to insert
3769	* @target: target work to attach @barr to
3770	* @worker: worker currently executing @target, NULL if @target is not executing
3771	*
3772	* @barr is linked to @target such that @barr is completed only after
3773	* @target finishes execution. Please note that the ordering
3774	* guarantee is observed only with respect to @target and on the local
3775	* cpu.
3776	*
3777	* Currently, a queued barrier can't be canceled. This is because
3778	* try_to_grab_pending() can't determine whether the work to be
3779	* grabbed is at the head of the queue and thus can't clear LINKED
3780	* flag of the previous work while there must be a valid next work
3781	* after a work with LINKED flag set.
3782	*
3783	* Note that when @worker is non-NULL, @target may be modified
3784	* underneath us, so we can't reliably determine pwq from @target.
3785	*
3786	* CONTEXT:
3787	* raw_spin_lock_irq(pool->lock).
3788	*/
3789	static void insert_wq_barrier(struct pool_workqueue *pwq,
3790	struct wq_barrier *barr,
3791	struct work_struct target, struct* worker *worker)
3792	{
3793	static __maybe_unused struct lock_class_key bh_key, thr_key;
3794	unsigned int work_flags = `0`;
3795	unsigned int work_color;
3796	struct list_head *head;
3797
3798	/*
3799	* debugobject calls are safe here even with pool->lock locked
3800	* as we know for sure that this will not trigger any of the
3801	* checks and call back into the fixup functions where we
3802	* might deadlock.
3803	*
3804	* BH and threaded workqueues need separate lockdep keys to avoid
3805	* spuriously triggering "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W}
3806	* usage".
3807	*/
3808	INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func,
3809	(pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key);
3810	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
3811
3812	init_completion_map(&barr->done, &target->lockdep_map);
3813
3814	barr->task = current;
3815
3816	/ The barrier work item does not participate in nr_active. /
3817	work_flags \|= WORK_STRUCT_INACTIVE;
3818
3819	/*
3820	* If @target is currently being executed, schedule the
3821	* barrier to the worker; otherwise, put it after @target.
3822	*/
3823	if (worker) {
3824	head = worker->scheduled.next;
3825	work_color = worker->current_color;
3826	} else {
3827	unsigned long *bits = work_data_bits(target);
3828
3829	head = target->entry.next;
3830	/ there can already be other linked works, inherit and set /
3831	work_flags \|= *bits & WORK_STRUCT_LINKED;
3832	work_color = get_work_color(work_data: *bits);
3833	__set_bit(WORK_STRUCT_LINKED_BIT, bits);
3834	}
3835
3836	pwq->nr_in_flight[work_color]++;
3837	work_flags \|= work_color_to_flags(color: work_color);
3838
3839	insert_work(pwq, work: &barr->work, head, extra_flags: work_flags);
3840	}
3841
3842	/**
3843	* flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
3844	* @wq: workqueue being flushed
3845	* @flush_color: new flush color, < 0 for no-op
3846	* @work_color: new work color, < 0 for no-op
3847	*
3848	* Prepare pwqs for workqueue flushing.
3849	*
3850	* If @flush_color is non-negative, flush_color on all pwqs should be
3851	* -1. If no pwq has in-flight commands at the specified color, all
3852	* pwq->flush_color's stay at -1 and %false is returned. If any pwq
3853	* has in flight commands, its pwq->flush_color is set to
3854	* @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
3855	* wakeup logic is armed and %true is returned.
3856	*
3857	* The caller should have initialized @wq->first_flusher prior to
3858	* calling this function with non-negative @flush_color. If
3859	* @flush_color is negative, no flush color update is done and %false
3860	* is returned.
3861	*
3862	* If @work_color is non-negative, all pwqs should have the same
3863	* work_color which is previous to @work_color and all will be
3864	* advanced to @work_color.
3865	*
3866	* CONTEXT:
3867	* mutex_lock(wq->mutex).
3868	*
3869	* Return:
3870	* %true if @flush_color >= 0 and there's something to flush. %false
3871	* otherwise.
3872	*/
3873	static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
3874	int flush_color, int work_color)
3875	{
3876	bool wait = false;
3877	struct pool_workqueue *pwq;
3878	struct worker_pool *current_pool = NULL;
3879
3880	if (flush_color >= `0`) {
3881	WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
3882	atomic_set(v: &wq->nr_pwqs_to_flush, i: `1`);
3883	}
3884
3885	/*
3886	* For unbound workqueue, pwqs will map to only a few pools.
3887	* Most of the time, pwqs within the same pool will be linked
3888	* sequentially to wq->pwqs by cpu index. So in the majority
3889	* of pwq iters, the pool is the same, only doing lock/unlock
3890	* if the pool has changed. This can largely reduce expensive
3891	* lock operations.
3892	*/
3893	for_each_pwq(pwq, wq) {
3894	if (current_pool != pwq->pool) {
3895	if (likely(current_pool))
3896	raw_spin_unlock_irq(&current_pool->lock);
3897	current_pool = pwq->pool;
3898	raw_spin_lock_irq(&current_pool->lock);
3899	}
3900
3901	if (flush_color >= `0`) {
3902	WARN_ON_ONCE(pwq->flush_color != -`1`);
3903
3904	if (pwq->nr_in_flight[flush_color]) {
3905	pwq->flush_color = flush_color;
3906	atomic_inc(v: &wq->nr_pwqs_to_flush);
3907	wait = true;
3908	}
3909	}
3910
3911	if (work_color >= `0`) {
3912	WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
3913	pwq->work_color = work_color;
3914	}
3915
3916	}
3917
3918	if (current_pool)
3919	raw_spin_unlock_irq(&current_pool->lock);
3920
3921	if (flush_color >= `0` && atomic_dec_and_test(v: &wq->nr_pwqs_to_flush))
3922	complete(&wq->first_flusher->done);
3923
3924	return wait;
3925	}
3926
/*
 * Acquire and immediately release @wq's lockdep map. Does nothing if @wq has
 * no lockdep map or if CONFIG_LOCKDEP is disabled. For BH workqueues the map
 * is touched with BH disabled.
 */
static void touch_wq_lockdep_map(struct workqueue_struct *wq)
{
#ifdef CONFIG_LOCKDEP
	bool is_bh;

	if (unlikely(!wq->lockdep_map))
		return;

	is_bh = wq->flags & WQ_BH;

	if (is_bh)
		local_bh_disable();

	lock_map_acquire(wq->lockdep_map);
	lock_map_release(wq->lockdep_map);

	if (is_bh)
		local_bh_enable();
#endif
}
3943
/*
 * Acquire and immediately release @work's lockdep map. @wq is only consulted
 * to decide whether the map is touched with BH disabled (BH workqueues).
 * Compiles to nothing without CONFIG_LOCKDEP.
 */
static void touch_work_lockdep_map(struct work_struct *work,
				   struct workqueue_struct *wq)
{
#ifdef CONFIG_LOCKDEP
	const bool is_bh = wq->flags & WQ_BH;

	if (is_bh)
		local_bh_disable();

	lock_map_acquire(&work->lockdep_map);
	lock_map_release(&work->lockdep_map);

	if (is_bh)
		local_bh_enable();
#endif
}
3958
3959	/**
3960	* __flush_workqueue - ensure that any scheduled work has run to completion.
3961	* @wq: workqueue to flush
3962	*
3963	* This function sleeps until all work items which were queued on entry
3964	* have finished execution, but it is not livelocked by new incoming ones.
3965	*/
3966	void __flush_workqueue(struct workqueue_struct *wq)
3967	{
3968	struct wq_flusher this_flusher = {
3969	.list = LIST_HEAD_INIT(this_flusher.list),
3970	.flush_color = -`1`,
3971	.done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)),
3972	};
3973	int next_color;
3974
3975	if (WARN_ON(!wq_online))
3976	return;
3977
3978	touch_wq_lockdep_map(wq);
3979
3980	mutex_lock(lock: &wq->mutex);
3981
3982	/*
3983	* Start-to-wait phase
3984	*/
3985	next_color = work_next_color(color: wq->work_color);
3986
3987	if (next_color != wq->flush_color) {
3988	/*
3989	* Color space is not full. The current work_color
3990	* becomes our flush_color and work_color is advanced
3991	* by one.
3992	*/
3993	WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
3994	this_flusher.flush_color = wq->work_color;
3995	wq->work_color = next_color;
3996
3997	if (!wq->first_flusher) {
3998	/ no flush in progress, become the first flusher /
3999	WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
4000
4001	wq->first_flusher = &this_flusher;
4002
4003	if (!flush_workqueue_prep_pwqs(wq, flush_color: wq->flush_color,
4004	work_color: wq->work_color)) {
4005	/ nothing to flush, done /
4006	wq->flush_color = next_color;
4007	wq->first_flusher = NULL;
4008	goto out_unlock;
4009	}
4010	} else {
4011	/ wait in queue /
4012	WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
4013	list_add_tail(new: &this_flusher.list, head: &wq->flusher_queue);
4014	flush_workqueue_prep_pwqs(wq, flush_color: -`1`, work_color: wq->work_color);
4015	}
4016	} else {
4017	/*
4018	* Oops, color space is full, wait on overflow queue.
4019	* The next flush completion will assign us
4020	* flush_color and transfer to flusher_queue.
4021	*/
4022	list_add_tail(new: &this_flusher.list, head: &wq->flusher_overflow);
4023	}
4024
4025	check_flush_dependency(target_wq: wq, NULL, from_cancel: false);
4026
4027	mutex_unlock(lock: &wq->mutex);
4028
4029	wait_for_completion(&this_flusher.done);
4030
4031	/*
4032	* Wake-up-and-cascade phase
4033	*
4034	* First flushers are responsible for cascading flushes and
4035	* handling overflow. Non-first flushers can simply return.
4036	*/
4037	if (READ_ONCE(wq->first_flusher) != &this_flusher)
4038	return;
4039
4040	mutex_lock(lock: &wq->mutex);
4041
4042	/ we might have raced, check again with mutex held /
4043	if (wq->first_flusher != &this_flusher)
4044	goto out_unlock;
4045
4046	WRITE_ONCE(wq->first_flusher, NULL);
4047
4048	WARN_ON_ONCE(!list_empty(&this_flusher.list));
4049	WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
4050
4051	while (true) {
4052	struct wq_flusher next, tmp;
4053
4054	/ complete all the flushers sharing the current flush color /
4055	list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
4056	if (next->flush_color != wq->flush_color)
4057	break;
4058	list_del_init(entry: &next->list);
4059	complete(&next->done);
4060	}
4061
4062	WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
4063	wq->flush_color != work_next_color(wq->work_color));
4064
4065	/ this flush_color is finished, advance by one /
4066	wq->flush_color = work_next_color(color: wq->flush_color);
4067
4068	/ one color has been freed, handle overflow queue /
4069	if (!list_empty(head: &wq->flusher_overflow)) {
4070	/*
4071	* Assign the same color to all overflowed
4072	* flushers, advance work_color and append to
4073	* flusher_queue. This is the start-to-wait
4074	* phase for these overflowed flushers.
4075	*/
4076	list_for_each_entry(tmp, &wq->flusher_overflow, list)
4077	tmp->flush_color = wq->work_color;
4078
4079	wq->work_color = work_next_color(color: wq->work_color);
4080
4081	list_splice_tail_init(list: &wq->flusher_overflow,
4082	head: &wq->flusher_queue);
4083	flush_workqueue_prep_pwqs(wq, flush_color: -`1`, work_color: wq->work_color);
4084	}
4085
4086	if (list_empty(head: &wq->flusher_queue)) {
4087	WARN_ON_ONCE(wq->flush_color != wq->work_color);
4088	break;
4089	}
4090
4091	/*
4092	* Need to flush more colors. Make the next flusher
4093	* the new first flusher and arm pwqs.
4094	*/
4095	WARN_ON_ONCE(wq->flush_color == wq->work_color);
4096	WARN_ON_ONCE(wq->flush_color != next->flush_color);
4097
4098	list_del_init(entry: &next->list);
4099	wq->first_flusher = next;
4100
4101	if (flush_workqueue_prep_pwqs(wq, flush_color: wq->flush_color, work_color: -`1`))
4102	break;
4103
4104	/*
4105	* Meh... this color is already done, clear first
4106	* flusher and repeat cascading.
4107	*/
4108	wq->first_flusher = NULL;
4109	}
4110
4111	out_unlock:
4112	mutex_unlock(lock: &wq->mutex);
4113	}
4114	EXPORT_SYMBOL(__flush_workqueue);
4115
4116	/**
4117	* drain_workqueue - drain a workqueue
4118	* @wq: workqueue to drain
4119	*
4120	* Wait until the workqueue becomes empty. While draining is in progress,
4121	* only chain queueing is allowed. IOW, only currently pending or running
4122	* work items on @wq can queue further work items on it. @wq is flushed
4123	* repeatedly until it becomes empty. The number of flushing is determined
4124	* by the depth of chaining and should be relatively short. Whine if it
4125	* takes too long.
4126	*/
4127	void drain_workqueue(struct workqueue_struct *wq)
4128	{
4129	unsigned int flush_cnt = `0`;
4130	struct pool_workqueue *pwq;
4131
4132	/*
4133	* __queue_work() needs to test whether there are drainers, is much
4134	* hotter than drain_workqueue() and already looks at @wq->flags.
4135	* Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
4136	*/
4137	mutex_lock(lock: &wq->mutex);
4138	if (!wq->nr_drainers++)
4139	wq->flags \|= __WQ_DRAINING;
4140	mutex_unlock(lock: &wq->mutex);
4141	reflush:
4142	__flush_workqueue(wq);
4143
4144	mutex_lock(lock: &wq->mutex);
4145
4146	for_each_pwq(pwq, wq) {
4147	bool drained;
4148
4149	raw_spin_lock_irq(&pwq->pool->lock);
4150	drained = pwq_is_empty(pwq);
4151	raw_spin_unlock_irq(&pwq->pool->lock);
4152
4153	if (drained)
4154	continue;
4155
4156	if (++flush_cnt == `10` \|\|
4157	(flush_cnt % `100` == `0` && flush_cnt <= `1000`))
4158	pr_warn("workqueue %s: %s() isn't complete after %u tries\n",
4159	wq->name, __func__, flush_cnt);
4160
4161	mutex_unlock(lock: &wq->mutex);
4162	goto reflush;
4163	}
4164
4165	if (!--wq->nr_drainers)
4166	wq->flags &= ~__WQ_DRAINING;
4167	mutex_unlock(lock: &wq->mutex);
4168	}
4169	EXPORT_SYMBOL_GPL(drain_workqueue);
4170
4171	static bool start_flush_work(struct work_struct work, struct* wq_barrier *barr,
4172	bool from_cancel)
4173	{
4174	struct worker *worker = NULL;
4175	struct worker_pool *pool;
4176	struct pool_workqueue *pwq;
4177	struct workqueue_struct *wq;
4178
4179	rcu_read_lock();
4180	pool = get_work_pool(work);
4181	if (!pool) {
4182	rcu_read_unlock();
4183	return false;
4184	}
4185
4186	raw_spin_lock_irq(&pool->lock);
4187	/ see the comment in try_to_grab_pending() with the same code /
4188	pwq = get_work_pwq(work);
4189	if (pwq) {
4190	if (unlikely(pwq->pool != pool))
4191	goto already_gone;
4192	} else {
4193	worker = find_worker_executing_work(pool, work);
4194	if (!worker)
4195	goto already_gone;
4196	pwq = worker->current_pwq;
4197	}
4198
4199	wq = pwq->wq;
4200	check_flush_dependency(target_wq: wq, target_work: work, from_cancel);
4201
4202	insert_wq_barrier(pwq, barr, target: work, worker);
4203	raw_spin_unlock_irq(&pool->lock);
4204
4205	touch_work_lockdep_map(work, wq);
4206
4207	/*
4208	* Force a lock recursion deadlock when using flush_work() inside a
4209	* single-threaded or rescuer equipped workqueue.
4210	*
4211	* For single threaded workqueues the deadlock happens when the work
4212	* is after the work issuing the flush_work(). For rescuer equipped
4213	* workqueues the deadlock happens when the rescuer stalls, blocking
4214	* forward progress.
4215	*/
4216	if (!from_cancel && (wq->saved_max_active == `1` \|\| wq->rescuer))
4217	touch_wq_lockdep_map(wq);
4218
4219	rcu_read_unlock();
4220	return true;
4221	already_gone:
4222	raw_spin_unlock_irq(&pool->lock);
4223	rcu_read_unlock();
4224	return false;
4225	}
4226
4227	static bool __flush_work(struct work_struct *work, bool from_cancel)
4228	{
4229	struct wq_barrier barr;
4230
4231	if (WARN_ON(!wq_online))
4232	return false;
4233
4234	if (WARN_ON(!work->func))
4235	return false;
4236
4237	if (!start_flush_work(work, barr: &barr, from_cancel))
4238	return false;
4239
4240	/*
4241	* start_flush_work() returned %true. If @from_cancel is set, we know
4242	* that @work must have been executing during start_flush_work() and
4243	* can't currently be queued. Its data must contain OFFQ bits. If @work
4244	* was queued on a BH workqueue, we also know that it was running in the
4245	* BH context and thus can be busy-waited.
4246	*/
4247	if (from_cancel) {
4248	unsigned long data = *work_data_bits(work);
4249
4250	if (!WARN_ON_ONCE(data & WORK_STRUCT_PWQ) &&
4251	(data & WORK_OFFQ_BH)) {
4252	/*
4253	* On RT, prevent a live lock when %current preempted
4254	* soft interrupt processing by blocking on lock which
4255	* is owned by the thread invoking the callback.
4256	*/
4257	while (!try_wait_for_completion(x: &barr.done)) {
4258	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
4259	struct worker_pool *pool;
4260
4261	guard(rcu)();
4262	pool = get_work_pool(work);
4263	if (pool)
4264	workqueue_callback_cancel_wait_running(pool);
4265	} else {
4266	cpu_relax();
4267	}
4268	}
4269	goto out_destroy;
4270	}
4271	}
4272
4273	wait_for_completion(&barr.done);
4274
4275	out_destroy:
4276	destroy_work_on_stack(work: &barr.work);
4277	return true;
4278	}
4279
4280	/**
4281	* flush_work - wait for a work to finish executing the last queueing instance
4282	* @work: the work to flush
4283	*
4284	* Wait until @work has finished execution. @work is guaranteed to be idle
4285	* on return if it hasn't been requeued since flush started.
4286	*
4287	* Return:
4288	* %true if flush_work() waited for the work to finish execution,
4289	* %false if it was already idle.
4290	*/
4291	bool flush_work(struct work_struct *work)
4292	{
4293	might_sleep();
4294	return __flush_work(work, from_cancel: false);
4295	}
4296	EXPORT_SYMBOL_GPL(flush_work);
4297
4298	/**
4299	* flush_delayed_work - wait for a dwork to finish executing the last queueing
4300	* @dwork: the delayed work to flush
4301	*
4302	* Delayed timer is cancelled and the pending work is queued for
4303	* immediate execution. Like flush_work(), this function only
4304	* considers the last queueing instance of @dwork.
4305	*
4306	* Return:
4307	* %true if flush_work() waited for the work to finish execution,
4308	* %false if it was already idle.
4309	*/
4310	bool flush_delayed_work(struct delayed_work *dwork)
4311	{
4312	local_irq_disable();
4313	if (timer_delete_sync(timer: &dwork->timer))
4314	__queue_work(cpu: dwork->cpu, wq: dwork->wq, work: &dwork->work);
4315	local_irq_enable();
4316	return flush_work(&dwork->work);
4317	}
4318	EXPORT_SYMBOL(flush_delayed_work);
4319
4320	/**
4321	* flush_rcu_work - wait for a rwork to finish executing the last queueing
4322	* @rwork: the rcu work to flush
4323	*
4324	* Return:
4325	* %true if flush_rcu_work() waited for the work to finish execution,
4326	* %false if it was already idle.
4327	*/
4328	bool flush_rcu_work(struct rcu_work *rwork)
4329	{
4330	if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) {
4331	rcu_barrier();
4332	flush_work(&rwork->work);
4333	return true;
4334	} else {
4335	return flush_work(&rwork->work);
4336	}
4337	}
4338	EXPORT_SYMBOL(flush_rcu_work);
4339
4340	static void work_offqd_disable(struct work_offq_data *offqd)
4341	{
4342	const unsigned long max = (`1lu` << WORK_OFFQ_DISABLE_BITS) - `1`;
4343
4344	if (likely(offqd->disable < max))
4345	offqd->disable++;
4346	else
4347	WARN_ONCE(true, "workqueue: work disable count overflowed\n");
4348	}
4349
4350	static void work_offqd_enable(struct work_offq_data *offqd)
4351	{
4352	if (likely(offqd->disable > `0`))
4353	offqd->disable--;
4354	else
4355	WARN_ONCE(true, "workqueue: work disable count underflowed\n");
4356	}
4357
4358	static bool __cancel_work(struct work_struct *work, u32 cflags)
4359	{
4360	struct work_offq_data offqd;
4361	unsigned long irq_flags;
4362	int ret;
4363
4364	ret = work_grab_pending(work, cflags, irq_flags: &irq_flags);
4365
4366	work_offqd_unpack(offqd: &offqd, data: *work_data_bits(work));
4367
4368	if (cflags & WORK_CANCEL_DISABLE)
4369	work_offqd_disable(offqd: &offqd);
4370
4371	set_work_pool_and_clear_pending(work, pool_id: offqd.pool_id,
4372	flags: work_offqd_pack_flags(offqd: &offqd));
4373	local_irq_restore(irq_flags);
4374	return ret;
4375	}
4376
4377	static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
4378	{
4379	bool ret;
4380
4381	ret = __cancel_work(work, cflags: cflags \| WORK_CANCEL_DISABLE);
4382
4383	if (*work_data_bits(work) & WORK_OFFQ_BH)
4384	WARN_ON_ONCE(in_hardirq());
4385	else
4386	might_sleep();
4387
4388	/*
4389	* Skip __flush_work() during early boot when we know that @work isn't
4390	* executing. This allows canceling during early boot.
4391	*/
4392	if (wq_online)
4393	__flush_work(work, from_cancel: true);
4394
4395	if (!(cflags & WORK_CANCEL_DISABLE))
4396	enable_work(work);
4397
4398	return ret;
4399	}
4400
4401	/*
4402	* See cancel_delayed_work()
4403	*/
4404	bool cancel_work(struct work_struct *work)
4405	{
4406	return __cancel_work(work, cflags: `0`);
4407	}
4408	EXPORT_SYMBOL(cancel_work);
4409
4410	/**
4411	* cancel_work_sync - cancel a work and wait for it to finish
4412	* @work: the work to cancel
4413	*
4414	* Cancel @work and wait for its execution to finish. This function can be used
4415	* even if the work re-queues itself or migrates to another workqueue. On return
4416	* from this function, @work is guaranteed to be not pending or executing on any
4417	* CPU as long as there aren't racing enqueues.
4418	*
4419	* cancel_work_sync(&delayed_work->work) must not be used for delayed_work's.
4420	* Use cancel_delayed_work_sync() instead.
4421	*
4422	* Must be called from a sleepable context if @work was last queued on a non-BH
4423	* workqueue. Can also be called from non-hardirq atomic contexts including BH
4424	* if @work was last queued on a BH workqueue.
4425	*
4426	* Returns %true if @work was pending, %false otherwise.
4427	*/
4428	bool cancel_work_sync(struct work_struct *work)
4429	{
4430	return __cancel_work_sync(work, cflags: `0`);
4431	}
4432	EXPORT_SYMBOL_GPL(cancel_work_sync);
4433
4434	/**
4435	* cancel_delayed_work - cancel a delayed work
4436	* @dwork: delayed_work to cancel
4437	*
4438	* Kill off a pending delayed_work.
4439	*
4440	* Return: %true if @dwork was pending and canceled; %false if it wasn't
4441	* pending.
4442	*
4443	* Note:
4444	* The work callback function may still be running on return, unless
4445	* it returns %true and the work doesn't re-arm itself. Explicitly flush or
4446	* use cancel_delayed_work_sync() to wait on it.
4447	*
4448	* This function is safe to call from any context including IRQ handler.
4449	*/
4450	bool cancel_delayed_work(struct delayed_work *dwork)
4451	{
4452	return __cancel_work(work: &dwork->work, cflags: WORK_CANCEL_DELAYED);
4453	}
4454	EXPORT_SYMBOL(cancel_delayed_work);
4455
4456	/**
4457	* cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
4458	* @dwork: the delayed work cancel
4459	*
4460	* This is cancel_work_sync() for delayed works.
4461	*
4462	* Return:
4463	* %true if @dwork was pending, %false otherwise.
4464	*/
4465	bool cancel_delayed_work_sync(struct delayed_work *dwork)
4466	{
4467	return __cancel_work_sync(work: &dwork->work, cflags: WORK_CANCEL_DELAYED);
4468	}
4469	EXPORT_SYMBOL(cancel_delayed_work_sync);
4470
4471	/**
4472	* disable_work - Disable and cancel a work item
4473	* @work: work item to disable
4474	*
4475	* Disable @work by incrementing its disable count and cancel it if currently
4476	* pending. As long as the disable count is non-zero, any attempt to queue @work
4477	* will fail and return %false. The maximum supported disable depth is 2 to the
4478	* power of %WORK_OFFQ_DISABLE_BITS, currently 65536.
4479	*
4480	* Can be called from any context. Returns %true if @work was pending, %false
4481	* otherwise.
4482	*/
4483	bool disable_work(struct work_struct *work)
4484	{
4485	return __cancel_work(work, cflags: WORK_CANCEL_DISABLE);
4486	}
4487	EXPORT_SYMBOL_GPL(disable_work);
4488
4489	/**
4490	* disable_work_sync - Disable, cancel and drain a work item
4491	* @work: work item to disable
4492	*
4493	* Similar to disable_work() but also wait for @work to finish if currently
4494	* executing.
4495	*
4496	* Must be called from a sleepable context if @work was last queued on a non-BH
4497	* workqueue. Can also be called from non-hardirq atomic contexts including BH
4498	* if @work was last queued on a BH workqueue.
4499	*
4500	* Returns %true if @work was pending, %false otherwise.
4501	*/
4502	bool disable_work_sync(struct work_struct *work)
4503	{
4504	return __cancel_work_sync(work, cflags: WORK_CANCEL_DISABLE);
4505	}
4506	EXPORT_SYMBOL_GPL(disable_work_sync);
4507
4508	/**
4509	* enable_work - Enable a work item
4510	* @work: work item to enable
4511	*
4512	* Undo disable_work[_sync]() by decrementing @work's disable count. @work can
4513	* only be queued if its disable count is 0.
4514	*
4515	* Can be called from any context. Returns %true if the disable count reached 0.
4516	* Otherwise, %false.
4517	*/
4518	bool enable_work(struct work_struct *work)
4519	{
4520	struct work_offq_data offqd;
4521	unsigned long irq_flags;
4522
4523	work_grab_pending(work, cflags: `0`, irq_flags: &irq_flags);
4524
4525	work_offqd_unpack(offqd: &offqd, data: *work_data_bits(work));
4526	work_offqd_enable(offqd: &offqd);
4527	set_work_pool_and_clear_pending(work, pool_id: offqd.pool_id,
4528	flags: work_offqd_pack_flags(offqd: &offqd));
4529	local_irq_restore(irq_flags);
4530
4531	return !offqd.disable;
4532	}
4533	EXPORT_SYMBOL_GPL(enable_work);
4534
4535	/**
4536	* disable_delayed_work - Disable and cancel a delayed work item
4537	* @dwork: delayed work item to disable
4538	*
4539	* disable_work() for delayed work items.
4540	*/
4541	bool disable_delayed_work(struct delayed_work *dwork)
4542	{
4543	return __cancel_work(work: &dwork->work,
4544	cflags: WORK_CANCEL_DELAYED \| WORK_CANCEL_DISABLE);
4545	}
4546	EXPORT_SYMBOL_GPL(disable_delayed_work);
4547
4548	/**
4549	* disable_delayed_work_sync - Disable, cancel and drain a delayed work item
4550	* @dwork: delayed work item to disable
4551	*
4552	* disable_work_sync() for delayed work items.
4553	*/
4554	bool disable_delayed_work_sync(struct delayed_work *dwork)
4555	{
4556	return __cancel_work_sync(work: &dwork->work,
4557	cflags: WORK_CANCEL_DELAYED \| WORK_CANCEL_DISABLE);
4558	}
4559	EXPORT_SYMBOL_GPL(disable_delayed_work_sync);
4560
4561	/**
4562	* enable_delayed_work - Enable a delayed work item
4563	* @dwork: delayed work item to enable
4564	*
4565	* enable_work() for delayed work items.
4566	*/
4567	bool enable_delayed_work(struct delayed_work *dwork)
4568	{
4569	return enable_work(&dwork->work);
4570	}
4571	EXPORT_SYMBOL_GPL(enable_delayed_work);
4572
4573	/**
4574	* schedule_on_each_cpu - execute a function synchronously on each online CPU
4575	* @func: the function to call
4576	*
4577	* schedule_on_each_cpu() executes @func on each online CPU using the
4578	* system workqueue and blocks until all CPUs have completed.
4579	* schedule_on_each_cpu() is very slow.
4580	*
4581	* Return:
4582	* 0 on success, -errno on failure.
4583	*/
4584	int schedule_on_each_cpu(work_func_t func)
4585	{
4586	int cpu;
4587	struct work_struct __percpu *works;
4588
4589	works = alloc_percpu(struct work_struct);
4590	if (!works)
4591	return -ENOMEM;
4592
4593	cpus_read_lock();
4594
4595	for_each_online_cpu(cpu) {
4596	struct work_struct *work = per_cpu_ptr(works, cpu);
4597
4598	INIT_WORK(work, func);
4599	schedule_work_on(cpu, work);
4600	}
4601
4602	for_each_online_cpu(cpu)
4603	flush_work(per_cpu_ptr(works, cpu));
4604
4605	cpus_read_unlock();
4606	free_percpu(pdata: works);
4607	return `0`;
4608	}
4609
4610	/**
4611	* execute_in_process_context - reliably execute the routine with user context
4612	* @fn: the function to execute
4613	* @ew: guaranteed storage for the execute work structure (must
4614	* be available when the work executes)
4615	*
4616	* Executes the function immediately if process context is available,
4617	* otherwise schedules the function for delayed execution.
4618	*
4619	* Return: 0 - function was executed
4620	* 1 - function was scheduled for execution
4621	*/
4622	int execute_in_process_context(work_func_t fn, struct execute_work *ew)
4623	{
4624	if (!in_interrupt()) {
4625	fn(&ew->work);
4626	return `0`;
4627	}
4628
4629	INIT_WORK(&ew->work, fn);
4630	schedule_work(work: &ew->work);
4631
4632	return `1`;
4633	}
4634	EXPORT_SYMBOL_GPL(execute_in_process_context);
4635
4636	/**
4637	* free_workqueue_attrs - free a workqueue_attrs
4638	* @attrs: workqueue_attrs to free
4639	*
4640	* Undo alloc_workqueue_attrs().
4641	*/
4642	void free_workqueue_attrs(struct workqueue_attrs *attrs)
4643	{
4644	if (attrs) {
4645	free_cpumask_var(mask: attrs->cpumask);
4646	free_cpumask_var(mask: attrs->__pod_cpumask);
4647	kfree(objp: attrs);
4648	}
4649	}
4650
4651	/**
4652	* alloc_workqueue_attrs - allocate a workqueue_attrs
4653	*
4654	* Allocate a new workqueue_attrs, initialize with default settings and
4655	* return it.
4656	*
4657	* Return: The allocated new workqueue_attr on success. %NULL on failure.
4658	*/
4659	struct workqueue_attrs alloc_workqueue_attrs_noprof(void*)
4660	{
4661	struct workqueue_attrs *attrs;
4662
4663	attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
4664	if (!attrs)
4665	goto fail;
4666	if (!alloc_cpumask_var(mask: &attrs->cpumask, GFP_KERNEL))
4667	goto fail;
4668	if (!alloc_cpumask_var(mask: &attrs->__pod_cpumask, GFP_KERNEL))
4669	goto fail;
4670
4671	cpumask_copy(dstp: attrs->cpumask, cpu_possible_mask);
4672	attrs->affn_scope = WQ_AFFN_DFL;
4673	return attrs;
4674	fail:
4675	free_workqueue_attrs(attrs);
4676	return NULL;
4677	}
4678
4679	static void copy_workqueue_attrs(struct workqueue_attrs *to,
4680	const struct workqueue_attrs *from)
4681	{
4682	to->nice = from->nice;
4683	cpumask_copy(dstp: to->cpumask, srcp: from->cpumask);
4684	cpumask_copy(dstp: to->__pod_cpumask, srcp: from->__pod_cpumask);
4685	to->affn_strict = from->affn_strict;
4686
4687	/*
4688	* Unlike hash and equality test, copying shouldn't ignore wq-only
4689	* fields as copying is used for both pool and wq attrs. Instead,
4690	* get_unbound_pool() explicitly clears the fields.
4691	*/
4692	to->affn_scope = from->affn_scope;
4693	to->ordered = from->ordered;
4694	}
4695
4696	/*
4697	* Some attrs fields are workqueue-only. Clear them for worker_pool's. See the
4698	* comments in 'struct workqueue_attrs' definition.
4699	*/
4700	static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs)
4701	{
4702	attrs->affn_scope = WQ_AFFN_NR_TYPES;
4703	attrs->ordered = false;
4704	if (attrs->affn_strict)
4705	cpumask_copy(dstp: attrs->cpumask, cpu_possible_mask);
4706	}
4707
4708	/ hash value of the content of @attr /
4709	static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
4710	{
4711	u32 hash = `0`;
4712
4713	hash = jhash_1word(a: attrs->nice, initval: hash);
4714	hash = jhash_1word(a: attrs->affn_strict, initval: hash);
4715	hash = jhash(cpumask_bits(attrs->__pod_cpumask),
4716	BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), initval: hash);
4717	if (!attrs->affn_strict)
4718	hash = jhash(cpumask_bits(attrs->cpumask),
4719	BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), initval: hash);
4720	return hash;
4721	}
4722
4723	/ content equality test /
4724	static bool wqattrs_equal(const struct workqueue_attrs *a,
4725	const struct workqueue_attrs *b)
4726	{
4727	if (a->nice != b->nice)
4728	return false;
4729	if (a->affn_strict != b->affn_strict)
4730	return false;
4731	if (!cpumask_equal(src1p: a->__pod_cpumask, src2p: b->__pod_cpumask))
4732	return false;
4733	if (!a->affn_strict && !cpumask_equal(src1p: a->cpumask, src2p: b->cpumask))
4734	return false;
4735	return true;
4736	}
4737
4738	/ Update @attrs with actually available CPUs /
4739	static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
4740	const cpumask_t *unbound_cpumask)
4741	{
4742	/*
4743	* Calculate the effective CPU mask of @attrs given @unbound_cpumask. If
4744	* @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to
4745	* @unbound_cpumask.
4746	*/
4747	cpumask_and(dstp: attrs->cpumask, src1p: attrs->cpumask, src2p: unbound_cpumask);
4748	if (unlikely(cpumask_empty(attrs->cpumask)))
4749	cpumask_copy(dstp: attrs->cpumask, srcp: unbound_cpumask);
4750	}
4751
4752	/ find wq_pod_type to use for @attrs /
4753	static const struct wq_pod_type *
4754	wqattrs_pod_type(const struct workqueue_attrs *attrs)
4755	{
4756	enum wq_affn_scope scope;
4757	struct wq_pod_type *pt;
4758
4759	/ to synchronize access to wq_affn_dfl /
4760	lockdep_assert_held(&wq_pool_mutex);
4761
4762	if (attrs->affn_scope == WQ_AFFN_DFL)
4763	scope = wq_affn_dfl;
4764	else
4765	scope = attrs->affn_scope;
4766
4767	pt = &wq_pod_types[scope];
4768
4769	if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) &&
4770	likely(pt->nr_pods))
4771	return pt;
4772
4773	/*
4774	* Before workqueue_init_topology(), only SYSTEM is available which is
4775	* initialized in workqueue_init_early().
4776	*/
4777	pt = &wq_pod_types[WQ_AFFN_SYSTEM];
4778	BUG_ON(!pt->nr_pods);
4779	return pt;
4780	}
4781
4782	/**
4783	* init_worker_pool - initialize a newly zalloc'd worker_pool
4784	* @pool: worker_pool to initialize
4785	*
4786	* Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.
4787	*
4788	* Return: 0 on success, -errno on failure. Even on failure, all fields
4789	* inside @pool proper are initialized and put_unbound_pool() can be called
4790	* on @pool safely to release it.
4791	*/
4792	static int init_worker_pool(struct worker_pool *pool)
4793	{
4794	raw_spin_lock_init(&pool->lock);
4795	pool->id = -`1`;
4796	pool->cpu = -`1`;
4797	pool->node = NUMA_NO_NODE;
4798	pool->flags \|= POOL_DISASSOCIATED;
4799	pool->watchdog_ts = jiffies;
4800	INIT_LIST_HEAD(list: &pool->worklist);
4801	INIT_LIST_HEAD(list: &pool->idle_list);
4802	hash_init(pool->busy_hash);
4803
4804	timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
4805	INIT_WORK(&pool->idle_cull_work, idle_cull_fn);
4806
4807	timer_setup(&pool->mayday_timer, pool_mayday_timeout, `0`);
4808
4809	INIT_LIST_HEAD(list: &pool->workers);
4810
4811	ida_init(ida: &pool->worker_ida);
4812	INIT_HLIST_NODE(h: &pool->hash_node);
4813	pool->refcnt = `1`;
4814	#ifdef CONFIG_PREEMPT_RT
4815	spin_lock_init(&pool->cb_lock);
4816	#endif
4817
4818	/ shouldn't fail above this point /
4819	pool->attrs = alloc_workqueue_attrs();
4820	if (!pool->attrs)
4821	return -ENOMEM;
4822
4823	wqattrs_clear_for_pool(attrs: pool->attrs);
4824
4825	return `0`;
4826	}
4827
#ifdef CONFIG_LOCKDEP
/*
 * Register a lockdep key for @wq and initialize its embedded lockdep map.
 * The map is named "(wq_completion)<wq name>"; if that string can't be
 * allocated, @wq->name itself is used as the lock name.
 */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
	char *lock_name;

	lockdep_register_key(&wq->key);
	lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
	if (!lock_name)
		lock_name = wq->name;

	wq->lock_name = lock_name;
	wq->lockdep_map = &wq->__lockdep_map;
	lockdep_init_map(wq->lockdep_map, lock_name, &wq->key, 0);
}

/*
 * Unregister @wq's lockdep key. Nothing is done unless @wq->lockdep_map
 * points at the embedded map set up by wq_init_lockdep().
 */
static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
	if (wq->lockdep_map != &wq->__lockdep_map)
		return;

	lockdep_unregister_key(&wq->key);
}

/*
 * Free the lock name allocated by wq_init_lockdep(). The name is left alone
 * when it aliases @wq->name (the allocation-failure fallback) or when @wq
 * isn't using its embedded lockdep map.
 */
static void wq_free_lockdep(struct workqueue_struct *wq)
{
	if (wq->lockdep_map != &wq->__lockdep_map)
		return;

	if (wq->lock_name != wq->name)
		kfree(wq->lock_name);
}
#else
/* !CONFIG_LOCKDEP: the lockdep helpers are no-ops */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
}

static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
}

static void wq_free_lockdep(struct workqueue_struct *wq)
{
}
#endif
4872
4873	static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
4874	{
4875	int node;
4876
4877	for_each_node(node) {
4878	kfree(objp: nna_ar[node]);
4879	nna_ar[node] = NULL;
4880	}
4881
4882	kfree(objp: nna_ar[nr_node_ids]);
4883	nna_ar[nr_node_ids] = NULL;
4884	}
4885
4886	static void init_node_nr_active(struct wq_node_nr_active *nna)
4887	{
4888	nna->max = WQ_DFL_MIN_ACTIVE;
4889	atomic_set(v: &nna->nr, i: `0`);
4890	raw_spin_lock_init(&nna->lock);
4891	INIT_LIST_HEAD(list: &nna->pending_pwqs);
4892	}
4893
4894	/*
4895	* Each node's nr_active counter will be accessed mostly from its own node and
4896	* should be allocated in the node.
4897	*/
4898	static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
4899	{
4900	struct wq_node_nr_active *nna;
4901	int node;
4902
4903	for_each_node(node) {
4904	nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
4905	if (!nna)
4906	goto err_free;
4907	init_node_nr_active(nna);
4908	nna_ar[node] = nna;
4909	}
4910
4911	/ [nr_node_ids] is used as the fallback /
4912	nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
4913	if (!nna)
4914	goto err_free;
4915	init_node_nr_active(nna);
4916	nna_ar[nr_node_ids] = nna;
4917
4918	return `0`;
4919
4920	err_free:
4921	free_node_nr_active(nna_ar);
4922	return -ENOMEM;
4923	}
4924
4925	static void rcu_free_wq(struct rcu_head *rcu)
4926	{
4927	struct workqueue_struct *wq =
4928	container_of(rcu, struct workqueue_struct, rcu);
4929
4930	if (wq->flags & WQ_UNBOUND)
4931	free_node_nr_active(nna_ar: wq->node_nr_active);
4932
4933	wq_free_lockdep(wq);
4934	free_percpu(pdata: wq->cpu_pwq);
4935	free_workqueue_attrs(attrs: wq->unbound_attrs);
4936	kfree(objp: wq);
4937	}
4938
4939	static void rcu_free_pool(struct rcu_head *rcu)
4940	{
4941	struct worker_pool pool = container_of(rcu, struct* worker_pool, rcu);
4942
4943	ida_destroy(ida: &pool->worker_ida);
4944	free_workqueue_attrs(attrs: pool->attrs);
4945	kfree(objp: pool);
4946	}
4947
4948	/**
4949	* put_unbound_pool - put a worker_pool
4950	* @pool: worker_pool to put
4951	*
4952	* Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
4953	* safe manner. get_unbound_pool() calls this function on its failure path
4954	* and this function should be able to release pools which went through,
4955	* successfully or not, init_worker_pool().
4956	*
4957	* Should be called with wq_pool_mutex held.
4958	*/
4959	static void put_unbound_pool(struct worker_pool *pool)
4960	{
4961	struct worker *worker;
4962	LIST_HEAD(cull_list);
4963
4964	lockdep_assert_held(&wq_pool_mutex);
4965
4966	if (--pool->refcnt)
4967	return;
4968
4969	/ sanity checks /
4970	if (WARN_ON(!(pool->cpu < `0`)) \|\|
4971	WARN_ON(!list_empty(&pool->worklist)))
4972	return;
4973
4974	/ release id and unhash /
4975	if (pool->id >= `0`)
4976	idr_remove(&worker_pool_idr, id: pool->id);
4977	hash_del(node: &pool->hash_node);
4978
4979	/*
4980	* Become the manager and destroy all workers. This prevents
4981	* @pool's workers from blocking on attach_mutex. We're the last
4982	* manager and @pool gets freed with the flag set.
4983	*
4984	* Having a concurrent manager is quite unlikely to happen as we can
4985	* only get here with
4986	* pwq->refcnt == pool->refcnt == 0
4987	* which implies no work queued to the pool, which implies no worker can
4988	* become the manager. However a worker could have taken the role of
4989	* manager before the refcnts dropped to 0, since maybe_create_worker()
4990	* drops pool->lock
4991	*/
4992	while (true) {
4993	rcuwait_wait_event(&manager_wait,
4994	!(pool->flags & POOL_MANAGER_ACTIVE),
4995	TASK_UNINTERRUPTIBLE);
4996
4997	mutex_lock(lock: &wq_pool_attach_mutex);
4998	raw_spin_lock_irq(&pool->lock);
4999	if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
5000	pool->flags \|= POOL_MANAGER_ACTIVE;
5001	break;
5002	}
5003	raw_spin_unlock_irq(&pool->lock);
5004	mutex_unlock(lock: &wq_pool_attach_mutex);
5005	}
5006
5007	while ((worker = first_idle_worker(pool)))
5008	set_worker_dying(worker, list: &cull_list);
5009	WARN_ON(pool->nr_workers \|\| pool->nr_idle);
5010	raw_spin_unlock_irq(&pool->lock);
5011
5012	detach_dying_workers(cull_list: &cull_list);
5013
5014	mutex_unlock(lock: &wq_pool_attach_mutex);
5015
5016	reap_dying_workers(cull_list: &cull_list);
5017
5018	/ shut down the timers /
5019	timer_delete_sync(timer: &pool->idle_timer);
5020	cancel_work_sync(&pool->idle_cull_work);
5021	timer_delete_sync(timer: &pool->mayday_timer);
5022
5023	/ RCU protected to allow dereferences from get_work_pool() /
5024	call_rcu(head: &pool->rcu, func: rcu_free_pool);
5025	}
5026
5027	/**
5028	* get_unbound_pool - get a worker_pool with the specified attributes
5029	* @attrs: the attributes of the worker_pool to get
5030	*
5031	* Obtain a worker_pool which has the same attributes as @attrs, bump the
5032	* reference count and return it. If there already is a matching
5033	* worker_pool, it will be used; otherwise, this function attempts to
5034	* create a new one.
5035	*
5036	* Should be called with wq_pool_mutex held.
5037	*
5038	* Return: On success, a worker_pool with the same attributes as @attrs.
5039	* On failure, %NULL.
5040	*/
5041	static struct worker_pool get_unbound_pool(const* struct workqueue_attrs *attrs)
5042	{
5043	struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA];
5044	u32 hash = wqattrs_hash(attrs);
5045	struct worker_pool *pool;
5046	int pod, node = NUMA_NO_NODE;
5047
5048	lockdep_assert_held(&wq_pool_mutex);
5049
5050	/ do we already have a matching pool? /
5051	hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
5052	if (wqattrs_equal(a: pool->attrs, b: attrs)) {
5053	pool->refcnt++;
5054	return pool;
5055	}
5056	}
5057
5058	/ If __pod_cpumask is contained inside a NUMA pod, that's our node /
5059	for (pod = `0`; pod < pt->nr_pods; pod++) {
5060	if (cpumask_subset(src1p: attrs->__pod_cpumask, src2p: pt->pod_cpus[pod])) {
5061	node = pt->pod_node[pod];
5062	break;
5063	}
5064	}
5065
5066	/ nope, create a new one /
5067	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node);
5068	if (!pool \|\| init_worker_pool(pool) < `0`)
5069	goto fail;
5070
5071	pool->node = node;
5072	copy_workqueue_attrs(to: pool->attrs, from: attrs);
5073	wqattrs_clear_for_pool(attrs: pool->attrs);
5074
5075	if (worker_pool_assign_id(pool) < `0`)
5076	goto fail;
5077
5078	/ create and start the initial worker /
5079	if (wq_online && !create_worker(pool))
5080	goto fail;
5081
5082	/ install /
5083	hash_add(unbound_pool_hash, &pool->hash_node, hash);
5084
5085	return pool;
5086	fail:
5087	if (pool)
5088	put_unbound_pool(pool);
5089	return NULL;
5090	}
5091
5092	/*
5093	* Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
5094	* refcnt and needs to be destroyed.
5095	*/
5096	static void pwq_release_workfn(struct kthread_work *work)
5097	{
5098	struct pool_workqueue pwq = container_of(work, struct* pool_workqueue,
5099	release_work);
5100	struct workqueue_struct *wq = pwq->wq;
5101	struct worker_pool *pool = pwq->pool;
5102	bool is_last = false;
5103
5104	/*
5105	* When @pwq is not linked, it doesn't hold any reference to the
5106	* @wq, and @wq is invalid to access.
5107	*/
5108	if (!list_empty(head: &pwq->pwqs_node)) {
5109	mutex_lock(lock: &wq->mutex);
5110	list_del_rcu(entry: &pwq->pwqs_node);
5111	is_last = list_empty(head: &wq->pwqs);
5112
5113	/*
5114	* For ordered workqueue with a plugged dfl_pwq, restart it now.
5115	*/
5116	if (!is_last && (wq->flags & __WQ_ORDERED))
5117	unplug_oldest_pwq(wq);
5118
5119	mutex_unlock(lock: &wq->mutex);
5120	}
5121
5122	if (wq->flags & WQ_UNBOUND) {
5123	mutex_lock(lock: &wq_pool_mutex);
5124	put_unbound_pool(pool);
5125	mutex_unlock(lock: &wq_pool_mutex);
5126	}
5127
5128	if (!list_empty(head: &pwq->pending_node)) {
5129	struct wq_node_nr_active *nna =
5130	wq_node_nr_active(wq: pwq->wq, node: pwq->pool->node);
5131
5132	raw_spin_lock_irq(&nna->lock);
5133	list_del_init(entry: &pwq->pending_node);
5134	raw_spin_unlock_irq(&nna->lock);
5135	}
5136
5137	kfree_rcu(pwq, rcu);
5138
5139	/*
5140	* If we're the last pwq going away, @wq is already dead and no one
5141	* is gonna access it anymore. Schedule RCU free.
5142	*/
5143	if (is_last) {
5144	wq_unregister_lockdep(wq);
5145	call_rcu(head: &wq->rcu, func: rcu_free_wq);
5146	}
5147	}
5148
5149	/ initialize newly allocated @pwq which is associated with @wq and @pool /
5150	static void init_pwq(struct pool_workqueue pwq, struct* workqueue_struct *wq,
5151	struct worker_pool *pool)
5152	{
5153	BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK);
5154
5155	memset(s: pwq, c: `0`, n: sizeof(*pwq));
5156
5157	pwq->pool = pool;
5158	pwq->wq = wq;
5159	pwq->flush_color = -`1`;
5160	pwq->refcnt = `1`;
5161	INIT_LIST_HEAD(list: &pwq->inactive_works);
5162	INIT_LIST_HEAD(list: &pwq->pending_node);
5163	INIT_LIST_HEAD(list: &pwq->pwqs_node);
5164	INIT_LIST_HEAD(list: &pwq->mayday_node);
5165	kthread_init_work(&pwq->release_work, pwq_release_workfn);
5166	}
5167
5168	/ sync @pwq with the current state of its associated wq and link it /
5169	static void link_pwq(struct pool_workqueue *pwq)
5170	{
5171	struct workqueue_struct *wq = pwq->wq;
5172
5173	lockdep_assert_held(&wq->mutex);
5174
5175	/ may be called multiple times, ignore if already linked /
5176	if (!list_empty(head: &pwq->pwqs_node))
5177	return;
5178
5179	/ set the matching work_color /
5180	pwq->work_color = wq->work_color;
5181
5182	/ link in @pwq /
5183	list_add_tail_rcu(new: &pwq->pwqs_node, head: &wq->pwqs);
5184	}
5185
5186	/ obtain a pool matching @attr and create a pwq associating the pool and @wq /
5187	static struct pool_workqueue alloc_unbound_pwq(struct* workqueue_struct *wq,
5188	const struct workqueue_attrs *attrs)
5189	{
5190	struct worker_pool *pool;
5191	struct pool_workqueue *pwq;
5192
5193	lockdep_assert_held(&wq_pool_mutex);
5194
5195	pool = get_unbound_pool(attrs);
5196	if (!pool)
5197	return NULL;
5198
5199	pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
5200	if (!pwq) {
5201	put_unbound_pool(pool);
5202	return NULL;
5203	}
5204
5205	init_pwq(pwq, wq, pool);
5206	return pwq;
5207	}
5208
5209	static void apply_wqattrs_lock(void)
5210	{
5211	mutex_lock(lock: &wq_pool_mutex);
5212	}
5213
5214	static void apply_wqattrs_unlock(void)
5215	{
5216	mutex_unlock(lock: &wq_pool_mutex);
5217	}
5218
5219	/**
5220	* wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
5221	* @attrs: the wq_attrs of the default pwq of the target workqueue
5222	* @cpu: the target CPU
5223	*
5224	* Calculate the cpumask a workqueue with @attrs should use on @pod.
5225	* The result is stored in @attrs->__pod_cpumask.
5226	*
5227	* If pod affinity is not enabled, @attrs->cpumask is always used. If enabled
5228	* and @pod has online CPUs requested by @attrs, the returned cpumask is the
5229	* intersection of the possible CPUs of @pod and @attrs->cpumask.
5230	*
5231	* The caller is responsible for ensuring that the cpumask of @pod stays stable.
5232	*/
5233	static void wq_calc_pod_cpumask(struct workqueue_attrs attrs, int* cpu)
5234	{
5235	const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
5236	int pod = pt->cpu_pod[cpu];
5237
5238	/ calculate possible CPUs in @pod that @attrs wants /
5239	cpumask_and(dstp: attrs->__pod_cpumask, src1p: pt->pod_cpus[pod], src2p: attrs->cpumask);
5240	/ does @pod have any online CPUs @attrs wants? /
5241	if (!cpumask_intersects(src1p: attrs->__pod_cpumask, src2p: wq_online_cpumask)) {
5242	cpumask_copy(dstp: attrs->__pod_cpumask, srcp: attrs->cpumask);
5243	return;
5244	}
5245	}
5246
5247	/ install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq /
5248	static struct pool_workqueue install_unbound_pwq(struct* workqueue_struct *wq,
5249	int cpu, struct pool_workqueue *pwq)
5250	{
5251	struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu);
5252	struct pool_workqueue *old_pwq;
5253
5254	lockdep_assert_held(&wq_pool_mutex);
5255	lockdep_assert_held(&wq->mutex);
5256
5257	/ link_pwq() can handle duplicate calls /
5258	link_pwq(pwq);
5259
5260	old_pwq = rcu_access_pointer(*slot);
5261	rcu_assign_pointer(*slot, pwq);
5262	return old_pwq;
5263	}
5264
5265	/ context to store the prepared attrs & pwqs before applying /
5266	struct apply_wqattrs_ctx {
5267	struct workqueue_struct wq; /* target workqueue /
5268	struct workqueue_attrs attrs; /* attrs to apply /
5269	struct list_head list; / queued for batching commit /
5270	struct pool_workqueue *dfl_pwq;
5271	struct pool_workqueue *pwq_tbl[];
5272	};
5273
5274	/ free the resources after success or abort /
5275	static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
5276	{
5277	if (ctx) {
5278	int cpu;
5279
5280	for_each_possible_cpu(cpu)
5281	put_pwq_unlocked(pwq: ctx->pwq_tbl[cpu]);
5282	put_pwq_unlocked(pwq: ctx->dfl_pwq);
5283
5284	free_workqueue_attrs(attrs: ctx->attrs);
5285
5286	kfree(objp: ctx);
5287	}
5288	}
5289
5290	/ allocate the attrs and pwqs for later installation /
5291	static struct apply_wqattrs_ctx *
5292	apply_wqattrs_prepare(struct workqueue_struct *wq,
5293	const struct workqueue_attrs *attrs,
5294	const cpumask_var_t unbound_cpumask)
5295	{
5296	struct apply_wqattrs_ctx *ctx;
5297	struct workqueue_attrs *new_attrs;
5298	int cpu;
5299
5300	lockdep_assert_held(&wq_pool_mutex);
5301
5302	if (WARN_ON(attrs->affn_scope < `0` \|\|
5303	attrs->affn_scope >= WQ_AFFN_NR_TYPES))
5304	return ERR_PTR(error: -EINVAL);
5305
5306	ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);
5307
5308	new_attrs = alloc_workqueue_attrs();
5309	if (!ctx \|\| !new_attrs)
5310	goto out_free;
5311
5312	/*
5313	* If something goes wrong during CPU up/down, we'll fall back to
5314	* the default pwq covering whole @attrs->cpumask. Always create
5315	* it even if we don't use it immediately.
5316	*/
5317	copy_workqueue_attrs(to: new_attrs, from: attrs);
5318	wqattrs_actualize_cpumask(attrs: new_attrs, unbound_cpumask);
5319	cpumask_copy(dstp: new_attrs->__pod_cpumask, srcp: new_attrs->cpumask);
5320	ctx->dfl_pwq = alloc_unbound_pwq(wq, attrs: new_attrs);
5321	if (!ctx->dfl_pwq)
5322	goto out_free;
5323
5324	for_each_possible_cpu(cpu) {
5325	if (new_attrs->ordered) {
5326	ctx->dfl_pwq->refcnt++;
5327	ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
5328	} else {
5329	wq_calc_pod_cpumask(attrs: new_attrs, cpu);
5330	ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, attrs: new_attrs);
5331	if (!ctx->pwq_tbl[cpu])
5332	goto out_free;
5333	}
5334	}
5335
5336	/ save the user configured attrs and sanitize it. /
5337	copy_workqueue_attrs(to: new_attrs, from: attrs);
5338	cpumask_and(dstp: new_attrs->cpumask, src1p: new_attrs->cpumask, cpu_possible_mask);
5339	cpumask_copy(dstp: new_attrs->__pod_cpumask, srcp: new_attrs->cpumask);
5340	ctx->attrs = new_attrs;
5341
5342	/*
5343	* For initialized ordered workqueues, there should only be one pwq
5344	* (dfl_pwq). Set the plugged flag of ctx->dfl_pwq to suspend execution
5345	* of newly queued work items until execution of older work items in
5346	* the old pwq's have completed.
5347	*/
5348	if ((wq->flags & __WQ_ORDERED) && !list_empty(head: &wq->pwqs))
5349	ctx->dfl_pwq->plugged = true;
5350
5351	ctx->wq = wq;
5352	return ctx;
5353
5354	out_free:
5355	free_workqueue_attrs(attrs: new_attrs);
5356	apply_wqattrs_cleanup(ctx);
5357	return ERR_PTR(error: -ENOMEM);
5358	}
5359
5360	/ set attrs and install prepared pwqs, @ctx points to old pwqs on return /
5361	static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
5362	{
5363	int cpu;
5364
5365	/ all pwqs have been created successfully, let's install'em /
5366	mutex_lock(lock: &ctx->wq->mutex);
5367
5368	copy_workqueue_attrs(to: ctx->wq->unbound_attrs, from: ctx->attrs);
5369
5370	/ save the previous pwqs and install the new ones /
5371	for_each_possible_cpu(cpu)
5372	ctx->pwq_tbl[cpu] = install_unbound_pwq(wq: ctx->wq, cpu,
5373	pwq: ctx->pwq_tbl[cpu]);
5374	ctx->dfl_pwq = install_unbound_pwq(wq: ctx->wq, cpu: -`1`, pwq: ctx->dfl_pwq);
5375
5376	/ update node_nr_active->max /
5377	wq_update_node_max_active(wq: ctx->wq, off_cpu: -`1`);
5378
5379	/ rescuer needs to respect wq cpumask changes /
5380	if (ctx->wq->rescuer)
5381	set_cpus_allowed_ptr(p: ctx->wq->rescuer->task,
5382	new_mask: unbound_effective_cpumask(wq: ctx->wq));
5383
5384	mutex_unlock(lock: &ctx->wq->mutex);
5385	}
5386
5387	static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
5388	const struct workqueue_attrs *attrs)
5389	{
5390	struct apply_wqattrs_ctx *ctx;
5391
5392	/ only unbound workqueues can change attributes /
5393	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
5394	return -EINVAL;
5395
5396	ctx = apply_wqattrs_prepare(wq, attrs, unbound_cpumask: wq_unbound_cpumask);
5397	if (IS_ERR(ptr: ctx))
5398	return PTR_ERR(ptr: ctx);
5399
5400	/ the ctx has been prepared successfully, let's commit it /
5401	apply_wqattrs_commit(ctx);
5402	apply_wqattrs_cleanup(ctx);
5403
5404	return `0`;
5405	}
5406
5407	/**
5408	* apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
5409	* @wq: the target workqueue
5410	* @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
5411	*
5412	* Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps
5413	* a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that
5414	* work items are affine to the pod it was issued on. Older pwqs are released as
5415	* in-flight work items finish. Note that a work item which repeatedly requeues
5416	* itself back-to-back will stay on its current pwq.
5417	*
5418	* Performs GFP_KERNEL allocations.
5419	*
5420	* Return: 0 on success and -errno on failure.
5421	*/
5422	int apply_workqueue_attrs(struct workqueue_struct *wq,
5423	const struct workqueue_attrs *attrs)
5424	{
5425	int ret;
5426
5427	mutex_lock(lock: &wq_pool_mutex);
5428	ret = apply_workqueue_attrs_locked(wq, attrs);
5429	mutex_unlock(lock: &wq_pool_mutex);
5430
5431	return ret;
5432	}
5433
5434	/**
5435	* unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug
5436	* @wq: the target workqueue
5437	* @cpu: the CPU to update the pwq slot for
5438	*
5439	* This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
5440	* %CPU_DOWN_FAILED. @cpu is in the same pod of the CPU being hot[un]plugged.
5441	*
5442	*
5443	* If pod affinity can't be adjusted due to memory allocation failure, it falls
5444	* back to @wq->dfl_pwq which may not be optimal but is always correct.
5445	*
5446	* Note that when the last allowed CPU of a pod goes offline for a workqueue
5447	* with a cpumask spanning multiple pods, the workers which were already
5448	* executing the work items for the workqueue will lose their CPU affinity and
5449	* may execute on any CPU. This is similar to how per-cpu workqueues behave on
5450	* CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
5451	* responsibility to flush the work item from CPU_DOWN_PREPARE.
5452	*/
5453	static void unbound_wq_update_pwq(struct workqueue_struct wq, int* cpu)
5454	{
5455	struct pool_workqueue old_pwq = NULL, pwq;
5456	struct workqueue_attrs *target_attrs;
5457
5458	lockdep_assert_held(&wq_pool_mutex);
5459
5460	if (!(wq->flags & WQ_UNBOUND) \|\| wq->unbound_attrs->ordered)
5461	return;
5462
5463	/*
5464	* We don't wanna alloc/free wq_attrs for each wq for each CPU.
5465	* Let's use a preallocated one. The following buf is protected by
5466	* CPU hotplug exclusion.
5467	*/
5468	target_attrs = unbound_wq_update_pwq_attrs_buf;
5469
5470	copy_workqueue_attrs(to: target_attrs, from: wq->unbound_attrs);
5471	wqattrs_actualize_cpumask(attrs: target_attrs, unbound_cpumask: wq_unbound_cpumask);
5472
5473	/ nothing to do if the target cpumask matches the current pwq /
5474	wq_calc_pod_cpumask(attrs: target_attrs, cpu);
5475	if (wqattrs_equal(a: target_attrs, b: unbound_pwq(wq, cpu)->pool->attrs))
5476	return;
5477
5478	/ create a new pwq /
5479	pwq = alloc_unbound_pwq(wq, attrs: target_attrs);
5480	if (!pwq) {
5481	pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",
5482	wq->name);
5483	goto use_dfl_pwq;
5484	}
5485
5486	/ Install the new pwq. /
5487	mutex_lock(lock: &wq->mutex);
5488	old_pwq = install_unbound_pwq(wq, cpu, pwq);
5489	goto out_unlock;
5490
5491	use_dfl_pwq:
5492	mutex_lock(lock: &wq->mutex);
5493	pwq = unbound_pwq(wq, cpu: -`1`);
5494	raw_spin_lock_irq(&pwq->pool->lock);
5495	get_pwq(pwq);
5496	raw_spin_unlock_irq(&pwq->pool->lock);
5497	old_pwq = install_unbound_pwq(wq, cpu, pwq);
5498	out_unlock:
5499	mutex_unlock(lock: &wq->mutex);
5500	put_pwq_unlocked(pwq: old_pwq);
5501	}
5502
5503	static int alloc_and_link_pwqs(struct workqueue_struct *wq)
5504	{
5505	bool highpri = wq->flags & WQ_HIGHPRI;
5506	int cpu, ret;
5507
5508	lockdep_assert_held(&wq_pool_mutex);
5509
5510	wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
5511	if (!wq->cpu_pwq)
5512	goto enomem;
5513
5514	if (!(wq->flags & WQ_UNBOUND)) {
5515	struct worker_pool __percpu *pools;
5516
5517	if (wq->flags & WQ_BH)
5518	pools = bh_worker_pools;
5519	else
5520	pools = cpu_worker_pools;
5521
5522	for_each_possible_cpu(cpu) {
5523	struct pool_workqueue **pwq_p;
5524	struct worker_pool *pool;
5525
5526	pool = &(per_cpu_ptr(pools, cpu)[highpri]);
5527	pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);
5528
5529	*pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
5530	pool->node);
5531	if (!*pwq_p)
5532	goto enomem;
5533
5534	init_pwq(pwq: *pwq_p, wq, pool);
5535
5536	mutex_lock(lock: &wq->mutex);
5537	link_pwq(pwq: *pwq_p);
5538	mutex_unlock(lock: &wq->mutex);
5539	}
5540	return `0`;
5541	}
5542
5543	if (wq->flags & __WQ_ORDERED) {
5544	struct pool_workqueue *dfl_pwq;
5545
5546	ret = apply_workqueue_attrs_locked(wq, attrs: ordered_wq_attrs[highpri]);
5547	/ there should only be single pwq for ordering guarantee /
5548	dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
5549	WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node \|\|
5550	wq->pwqs.prev != &dfl_pwq->pwqs_node),
5551	"ordering guarantee broken for workqueue %s\n", wq->name);
5552	} else {
5553	ret = apply_workqueue_attrs_locked(wq, attrs: unbound_std_wq_attrs[highpri]);
5554	}
5555
5556	return ret;
5557
5558	enomem:
5559	if (wq->cpu_pwq) {
5560	for_each_possible_cpu(cpu) {
5561	struct pool_workqueue pwq = per_cpu_ptr(wq->cpu_pwq, cpu);
5562
5563	if (pwq)
5564	kmem_cache_free(s: pwq_cache, objp: pwq);
5565	}
5566	free_percpu(pdata: wq->cpu_pwq);
5567	wq->cpu_pwq = NULL;
5568	}
5569	return -ENOMEM;
5570	}
5571
5572	static int wq_clamp_max_active(int max_active, unsigned int flags,
5573	const char *name)
5574	{
5575	if (max_active < `1` \|\| max_active > WQ_MAX_ACTIVE)
5576	pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
5577	max_active, name, `1`, WQ_MAX_ACTIVE);
5578
5579	return clamp_val(max_active, `1`, WQ_MAX_ACTIVE);
5580	}
5581
5582	/*
5583	* Workqueues which may be used during memory reclaim should have a rescuer
5584	* to guarantee forward progress.
5585	*/
5586	static int init_rescuer(struct workqueue_struct *wq)
5587	{
5588	struct worker *rescuer;
5589	char id_buf[WORKER_ID_LEN];
5590	int ret;
5591
5592	lockdep_assert_held(&wq_pool_mutex);
5593
5594	if (!(wq->flags & WQ_MEM_RECLAIM))
5595	return `0`;
5596
5597	rescuer = alloc_worker(NUMA_NO_NODE);
5598	if (!rescuer) {
5599	pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",
5600	wq->name);
5601	return -ENOMEM;
5602	}
5603
5604	rescuer->rescue_wq = wq;
5605	format_worker_id(buf: id_buf, size: sizeof(id_buf), worker: rescuer, NULL);
5606
5607	rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", id_buf);
5608	if (IS_ERR(ptr: rescuer->task)) {
5609	ret = PTR_ERR(ptr: rescuer->task);
5610	pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe",
5611	wq->name, ERR_PTR(ret));
5612	kfree(objp: rescuer);
5613	return ret;
5614	}
5615
5616	wq->rescuer = rescuer;
5617	if (wq->flags & WQ_UNBOUND)
5618	kthread_bind_mask(k: rescuer->task, mask: unbound_effective_cpumask(wq));
5619	else
5620	kthread_bind_mask(k: rescuer->task, cpu_possible_mask);
5621	wake_up_process(tsk: rescuer->task);
5622
5623	return `0`;
5624	}
5625
5626	/**
5627	* wq_adjust_max_active - update a wq's max_active to the current setting
5628	* @wq: target workqueue
5629	*
5630	* If @wq isn't freezing, set @wq->max_active to the saved_max_active and
5631	* activate inactive work items accordingly. If @wq is freezing, clear
5632	* @wq->max_active to zero.
5633	*/
5634	static void wq_adjust_max_active(struct workqueue_struct *wq)
5635	{
5636	bool activated;
5637	int new_max, new_min;
5638
5639	lockdep_assert_held(&wq->mutex);
5640
5641	if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {
5642	new_max = `0`;
5643	new_min = `0`;
5644	} else {
5645	new_max = wq->saved_max_active;
5646	new_min = wq->saved_min_active;
5647	}
5648
5649	if (wq->max_active == new_max && wq->min_active == new_min)
5650	return;
5651
5652	/*
5653	* Update @wq->max/min_active and then kick inactive work items if more
5654	* active work items are allowed. This doesn't break work item ordering
5655	* because new work items are always queued behind existing inactive
5656	* work items if there are any.
5657	*/
5658	WRITE_ONCE(wq->max_active, new_max);
5659	WRITE_ONCE(wq->min_active, new_min);
5660
5661	if (wq->flags & WQ_UNBOUND)
5662	wq_update_node_max_active(wq, off_cpu: -`1`);
5663
5664	if (new_max == `0`)
5665	return;
5666
5667	/*
5668	* Round-robin through pwq's activating the first inactive work item
5669	* until max_active is filled.
5670	*/
5671	do {
5672	struct pool_workqueue *pwq;
5673
5674	activated = false;
5675	for_each_pwq(pwq, wq) {
5676	unsigned long irq_flags;
5677
5678	/ can be called during early boot w/ irq disabled /
5679	raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
5680	if (pwq_activate_first_inactive(pwq, fill: true)) {
5681	activated = true;
5682	kick_pool(pool: pwq->pool);
5683	}
5684	raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
5685	}
5686	} while (activated);
5687	}
5688
5689	__printf(`1`, `0`)
5690	static struct workqueue_struct __alloc_workqueue(const* char *fmt,
5691	unsigned int flags,
5692	int max_active, va_list args)
5693	{
5694	struct workqueue_struct *wq;
5695	size_t wq_size;
5696	int name_len;
5697
5698	if (flags & WQ_BH) {
5699	if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS))
5700	return NULL;
5701	if (WARN_ON_ONCE(max_active))
5702	return NULL;
5703	}
5704
5705	/ see the comment above the definition of WQ_POWER_EFFICIENT /
5706	if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
5707	flags \|= WQ_UNBOUND;
5708
5709	/ allocate wq and format name /
5710	if (flags & WQ_UNBOUND)
5711	wq_size = struct_size(wq, node_nr_active, nr_node_ids + `1`);
5712	else
5713	wq_size = sizeof(*wq);
5714
5715	wq = kzalloc_noprof(size: wq_size, GFP_KERNEL);
5716	if (!wq)
5717	return NULL;
5718
5719	if (flags & WQ_UNBOUND) {
5720	wq->unbound_attrs = alloc_workqueue_attrs_noprof();
5721	if (!wq->unbound_attrs)
5722	goto err_free_wq;
5723	}
5724
5725	name_len = vsnprintf(buf: wq->name, size: sizeof(wq->name), fmt, args);
5726
5727	if (name_len >= WQ_NAME_LEN)
5728	pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
5729	wq->name);
5730
5731	if (flags & WQ_BH) {
5732	/*
5733	* BH workqueues always share a single execution context per CPU
5734	* and don't impose any max_active limit.
5735	*/
5736	max_active = INT_MAX;
5737	} else {
5738	max_active = max_active ?: WQ_DFL_ACTIVE;
5739	max_active = wq_clamp_max_active(max_active, flags, name: wq->name);
5740	}
5741
5742	/ init wq /
5743	wq->flags = flags;
5744	wq->max_active = max_active;
5745	wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
5746	wq->saved_max_active = wq->max_active;
5747	wq->saved_min_active = wq->min_active;
5748	mutex_init(&wq->mutex);
5749	atomic_set(v: &wq->nr_pwqs_to_flush, i: `0`);
5750	INIT_LIST_HEAD(list: &wq->pwqs);
5751	INIT_LIST_HEAD(list: &wq->flusher_queue);
5752	INIT_LIST_HEAD(list: &wq->flusher_overflow);
5753	INIT_LIST_HEAD(list: &wq->maydays);
5754
5755	INIT_LIST_HEAD(list: &wq->list);
5756
5757	if (flags & WQ_UNBOUND) {
5758	if (alloc_node_nr_active(nna_ar: wq->node_nr_active) < `0`)
5759	goto err_free_wq;
5760	}
5761
5762	/*
5763	* wq_pool_mutex protects the workqueues list, allocations of PWQs,
5764	* and the global freeze state.
5765	*/
5766	apply_wqattrs_lock();
5767
5768	if (alloc_and_link_pwqs(wq) < `0`)
5769	goto err_unlock_free_node_nr_active;
5770
5771	mutex_lock(lock: &wq->mutex);
5772	wq_adjust_max_active(wq);
5773	mutex_unlock(lock: &wq->mutex);
5774
5775	list_add_tail_rcu(new: &wq->list, head: &workqueues);
5776
5777	if (wq_online && init_rescuer(wq) < `0`)
5778	goto err_unlock_destroy;
5779
5780	apply_wqattrs_unlock();
5781
5782	if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
5783	goto err_destroy;
5784
5785	return wq;
5786
5787	err_unlock_free_node_nr_active:
5788	apply_wqattrs_unlock();
5789	/*
5790	* Failed alloc_and_link_pwqs() may leave pending pwq->release_work,
5791	* flushing the pwq_release_worker ensures that the pwq_release_workfn()
5792	* completes before calling kfree(wq).
5793	*/
5794	if (wq->flags & WQ_UNBOUND) {
5795	kthread_flush_worker(worker: pwq_release_worker);
5796	free_node_nr_active(nna_ar: wq->node_nr_active);
5797	}
5798	err_free_wq:
5799	free_workqueue_attrs(attrs: wq->unbound_attrs);
5800	kfree(objp: wq);
5801	return NULL;
5802	err_unlock_destroy:
5803	apply_wqattrs_unlock();
5804	err_destroy:
5805	destroy_workqueue(wq);
5806	return NULL;
5807	}
5808
5809	__printf(`1`, `4`)
5810	struct workqueue_struct alloc_workqueue_noprof(const* char *fmt,
5811	unsigned int flags,
5812	int max_active, ...)
5813	{
5814	struct workqueue_struct *wq;
5815	va_list args;
5816
5817	va_start(args, max_active);
5818	wq = __alloc_workqueue(fmt, flags, max_active, args);
5819	va_end(args);
5820	if (!wq)
5821	return NULL;
5822
5823	wq_init_lockdep(wq);
5824
5825	return wq;
5826	}
5827	EXPORT_SYMBOL_GPL(alloc_workqueue_noprof);
5828
5829	#ifdef CONFIG_LOCKDEP
5830	__printf(`1`, `5`)
5831	struct workqueue_struct *
5832	alloc_workqueue_lockdep_map(const char fmt, unsigned* int flags,
5833	int max_active, struct lockdep_map *lockdep_map, ...)
5834	{
5835	struct workqueue_struct *wq;
5836	va_list args;
5837
5838	va_start(args, lockdep_map);
5839	wq = __alloc_workqueue(fmt, flags, max_active, args);
5840	va_end(args);
5841	if (!wq)
5842	return NULL;
5843
5844	wq->lockdep_map = lockdep_map;
5845
5846	return wq;
5847	}
5848	EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map);
5849	#endif
5850
5851	static bool pwq_busy(struct pool_workqueue *pwq)
5852	{
5853	int i;
5854
5855	for (i = `0`; i < WORK_NR_COLORS; i++)
5856	if (pwq->nr_in_flight[i])
5857	return true;
5858
5859	if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > `1`))
5860	return true;
5861	if (!pwq_is_empty(pwq))
5862	return true;
5863
5864	return false;
5865	}
5866
5867	/**
5868	* destroy_workqueue - safely terminate a workqueue
5869	* @wq: target workqueue
5870	*
5871	* Safely destroy a workqueue. All work currently pending will be done first.
5872	*
5873	* This function does NOT guarantee that non-pending work that has been
5874	* submitted with queue_delayed_work() and similar functions will be done
5875	* before destroying the workqueue. The fundamental problem is that, currently,
5876	* the workqueue has no way of accessing non-pending delayed_work. delayed_work
5877	* is only linked on the timer-side. All delayed_work must, therefore, be
5878	* canceled before calling this function.
5879	*
5880	* TODO: It would be better if the problem described above wouldn't exist and
5881	* destroy_workqueue() would cleanly cancel all pending and non-pending
5882	* delayed_work.
5883	*/
5884	void destroy_workqueue(struct workqueue_struct *wq)
5885	{
5886	struct pool_workqueue *pwq;
5887	int cpu;
5888
5889	/*
5890	* Remove it from sysfs first so that sanity check failure doesn't
5891	* lead to sysfs name conflicts.
5892	*/
5893	workqueue_sysfs_unregister(wq);
5894
5895	/ mark the workqueue destruction is in progress /
5896	mutex_lock(lock: &wq->mutex);
5897	wq->flags \|= __WQ_DESTROYING;
5898	mutex_unlock(lock: &wq->mutex);
5899
5900	/ drain it before proceeding with destruction /
5901	drain_workqueue(wq);
5902
5903	/ kill rescuer, if sanity checks fail, leave it w/o rescuer /
5904	if (wq->rescuer) {
5905	struct worker *rescuer = wq->rescuer;
5906
5907	/ this prevents new queueing /
5908	raw_spin_lock_irq(&wq_mayday_lock);
5909	wq->rescuer = NULL;
5910	raw_spin_unlock_irq(&wq_mayday_lock);
5911
5912	/ rescuer will empty maydays list before exiting /
5913	kthread_stop(k: rescuer->task);
5914	kfree(objp: rescuer);
5915	}
5916
5917	/*
5918	* Sanity checks - grab all the locks so that we wait for all
5919	* in-flight operations which may do put_pwq().
5920	*/
5921	mutex_lock(lock: &wq_pool_mutex);
5922	mutex_lock(lock: &wq->mutex);
5923	for_each_pwq(pwq, wq) {
5924	raw_spin_lock_irq(&pwq->pool->lock);
5925	if (WARN_ON(pwq_busy(pwq))) {
5926	pr_warn("%s: %s has the following busy pwq\n",
5927	__func__, wq->name);
5928	show_pwq(pwq);
5929	raw_spin_unlock_irq(&pwq->pool->lock);
5930	mutex_unlock(lock: &wq->mutex);
5931	mutex_unlock(lock: &wq_pool_mutex);
5932	show_one_workqueue(wq);
5933	return;
5934	}
5935	raw_spin_unlock_irq(&pwq->pool->lock);
5936	}
5937	mutex_unlock(lock: &wq->mutex);
5938
5939	/*
5940	* wq list is used to freeze wq, remove from list after
5941	* flushing is complete in case freeze races us.
5942	*/
5943	list_del_rcu(entry: &wq->list);
5944	mutex_unlock(lock: &wq_pool_mutex);
5945
5946	/*
5947	* We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq
5948	* to put the base refs. @wq will be auto-destroyed from the last
5949	* pwq_put. RCU read lock prevents @wq from going away from under us.
5950	*/
5951	rcu_read_lock();
5952
5953	for_each_possible_cpu(cpu) {
5954	put_pwq_unlocked(pwq: unbound_pwq(wq, cpu));
5955	RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL);
5956	}
5957
5958	put_pwq_unlocked(pwq: unbound_pwq(wq, cpu: -`1`));
5959	RCU_INIT_POINTER(*unbound_pwq_slot(wq, -`1`), NULL);
5960
5961	rcu_read_unlock();
5962	}
5963	EXPORT_SYMBOL_GPL(destroy_workqueue);
5964
5965	/**
5966	* workqueue_set_max_active - adjust max_active of a workqueue
5967	* @wq: target workqueue
5968	* @max_active: new max_active value.
5969	*
5970	* Set max_active of @wq to @max_active. See the alloc_workqueue() function
5971	* comment.
5972	*
5973	* CONTEXT:
5974	* Don't call from IRQ context.
5975	*/
5976	void workqueue_set_max_active(struct workqueue_struct wq, int* max_active)
5977	{
5978	/ max_active doesn't mean anything for BH workqueues /
5979	if (WARN_ON(wq->flags & WQ_BH))
5980	return;
5981	/ disallow meddling with max_active for ordered workqueues /
5982	if (WARN_ON(wq->flags & __WQ_ORDERED))
5983	return;
5984
5985	max_active = wq_clamp_max_active(max_active, flags: wq->flags, name: wq->name);
5986
5987	mutex_lock(lock: &wq->mutex);
5988
5989	wq->saved_max_active = max_active;
5990	if (wq->flags & WQ_UNBOUND)
5991	wq->saved_min_active = min(wq->saved_min_active, max_active);
5992
5993	wq_adjust_max_active(wq);
5994
5995	mutex_unlock(lock: &wq->mutex);
5996	}
5997	EXPORT_SYMBOL_GPL(workqueue_set_max_active);
5998
5999	/**
6000	* workqueue_set_min_active - adjust min_active of an unbound workqueue
6001	* @wq: target unbound workqueue
6002	* @min_active: new min_active value
6003	*
6004	* Set min_active of an unbound workqueue. Unlike other types of workqueues, an
6005	* unbound workqueue is not guaranteed to be able to process max_active
6006	* interdependent work items. Instead, an unbound workqueue is guaranteed to be
6007	* able to process min_active number of interdependent work items which is
6008	* %WQ_DFL_MIN_ACTIVE by default.
6009	*
6010	* Use this function to adjust the min_active value between 0 and the current
6011	* max_active.
6012	*/
6013	void workqueue_set_min_active(struct workqueue_struct wq, int* min_active)
6014	{
6015	/ min_active is only meaningful for non-ordered unbound workqueues /
6016	if (WARN_ON((wq->flags & (WQ_BH \| WQ_UNBOUND \| __WQ_ORDERED)) !=
6017	WQ_UNBOUND))
6018	return;
6019
6020	mutex_lock(lock: &wq->mutex);
6021	wq->saved_min_active = clamp(min_active, `0`, wq->saved_max_active);
6022	wq_adjust_max_active(wq);
6023	mutex_unlock(lock: &wq->mutex);
6024	}
6025
6026	/**
6027	* current_work - retrieve %current task's work struct
6028	*
6029	* Determine if %current task is a workqueue worker and what it's working on.
6030	* Useful to find out the context that the %current task is running in.
6031	*
6032	* Return: work struct if %current task is a workqueue worker, %NULL otherwise.
6033	*/
6034	struct work_struct current_work(void*)
6035	{
6036	struct worker *worker = current_wq_worker();
6037
6038	return worker ? worker->current_work : NULL;
6039	}
6040	EXPORT_SYMBOL(current_work);
6041
6042	/**
6043	* current_is_workqueue_rescuer - is %current workqueue rescuer?
6044	*
6045	* Determine whether %current is a workqueue rescuer. Can be used from
6046	* work functions to determine whether it's being run off the rescuer task.
6047	*
6048	* Return: %true if %current is a workqueue rescuer. %false otherwise.
6049	*/
6050	bool current_is_workqueue_rescuer(void)
6051	{
6052	struct worker *worker = current_wq_worker();
6053
6054	return worker && worker->rescue_wq;
6055	}
6056
6057	/**
6058	* workqueue_congested - test whether a workqueue is congested
6059	* @cpu: CPU in question
6060	* @wq: target workqueue
6061	*
6062	* Test whether @wq's cpu workqueue for @cpu is congested. There is
6063	* no synchronization around this function and the test result is
6064	* unreliable and only useful as advisory hints or for debugging.
6065	*
6066	* If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
6067	*
6068	* With the exception of ordered workqueues, all workqueues have per-cpu
6069	* pool_workqueues, each with its own congested state. A workqueue being
 * congested on one CPU doesn't mean that the workqueue is congested on any
 * other CPUs.
6072	*
6073	* Return:
6074	* %true if congested, %false otherwise.
6075	*/
6076	bool workqueue_congested(int cpu, struct workqueue_struct *wq)
6077	{
6078	struct pool_workqueue *pwq;
6079	bool ret;
6080
6081	preempt_disable();
6082
6083	if (cpu == WORK_CPU_UNBOUND)
6084	cpu = smp_processor_id();
6085
6086	pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
6087	ret = !list_empty(head: &pwq->inactive_works);
6088
6089	preempt_enable();
6090
6091	return ret;
6092	}
6093	EXPORT_SYMBOL_GPL(workqueue_congested);
6094
6095	/**
6096	* work_busy - test whether a work is currently pending or running
6097	* @work: the work to be tested
6098	*
6099	* Test whether @work is currently pending or running. There is no
6100	* synchronization around this function and the test result is
6101	* unreliable and only useful as advisory hints or for debugging.
6102	*
6103	* Return:
6104	* OR'd bitmask of WORK_BUSY_* bits.
6105	*/
6106	unsigned int work_busy(struct work_struct *work)
6107	{
6108	struct worker_pool *pool;
6109	unsigned long irq_flags;
6110	unsigned int ret = `0`;
6111
6112	if (work_pending(work))
6113	ret \|= WORK_BUSY_PENDING;
6114
6115	rcu_read_lock();
6116	pool = get_work_pool(work);
6117	if (pool) {
6118	raw_spin_lock_irqsave(&pool->lock, irq_flags);
6119	if (find_worker_executing_work(pool, work))
6120	ret \|= WORK_BUSY_RUNNING;
6121	raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
6122	}
6123	rcu_read_unlock();
6124
6125	return ret;
6126	}
6127	EXPORT_SYMBOL_GPL(work_busy);
6128
6129	/**
6130	* set_worker_desc - set description for the current work item
6131	* @fmt: printf-style format string
6132	* @...: arguments for the format string
6133	*
6134	* This function can be called by a running work function to describe what
6135	* the work item is about. If the worker task gets dumped, this
6136	* information will be printed out together to help debugging. The
6137	* description can be at most WORKER_DESC_LEN including the trailing '\0'.
6138	*/
6139	void set_worker_desc(const char *fmt, ...)
6140	{
6141	struct worker *worker = current_wq_worker();
6142	va_list args;
6143
6144	if (worker) {
6145	va_start(args, fmt);
6146	vsnprintf(buf: worker->desc, size: sizeof(worker->desc), fmt, args);
6147	va_end(args);
6148	}
6149	}
6150	EXPORT_SYMBOL_GPL(set_worker_desc);
6151
6152	/**
6153	* print_worker_info - print out worker information and description
6154	* @log_lvl: the log level to use when printing
6155	* @task: target task
6156	*
6157	* If @task is a worker and currently executing a work item, print out the
6158	* name of the workqueue being serviced and worker description set with
6159	* set_worker_desc() by the currently executing work item.
6160	*
6161	* This function can be safely called on any task as long as the
6162	* task_struct itself is accessible. While safe, this function isn't
6163	* synchronized and may print out mixups or garbages of limited length.
6164	*/
6165	void print_worker_info(const char log_lvl, struct* task_struct *task)
6166	{
6167	work_func_t *fn = NULL;
6168	char name[WQ_NAME_LEN] = { };
6169	char desc[WORKER_DESC_LEN] = { };
6170	struct pool_workqueue *pwq = NULL;
6171	struct workqueue_struct *wq = NULL;
6172	struct worker *worker;
6173
6174	if (!(task->flags & PF_WQ_WORKER))
6175	return;
6176
6177	/*
6178	* This function is called without any synchronization and @task
6179	* could be in any state. Be careful with dereferences.
6180	*/
6181	worker = kthread_probe_data(k: task);
6182
6183	/*
6184	* Carefully copy the associated workqueue's workfn, name and desc.
6185	* Keep the original last '\0' in case the original is garbage.
6186	*/
6187	copy_from_kernel_nofault(dst: &fn, src: &worker->current_func, size: sizeof(fn));
6188	copy_from_kernel_nofault(dst: &pwq, src: &worker->current_pwq, size: sizeof(pwq));
6189	copy_from_kernel_nofault(dst: &wq, src: &pwq->wq, size: sizeof(wq));
6190	copy_from_kernel_nofault(dst: name, src: wq->name, size: sizeof(name) - `1`);
6191	copy_from_kernel_nofault(dst: desc, src: worker->desc, size: sizeof(desc) - `1`);
6192
6193	if (fn \|\| name[`0`] \|\| desc[`0`]) {
6194	printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
6195	if (strcmp(name, desc))
6196	pr_cont(" (%s)", desc);
6197	pr_cont("\n");
6198	}
6199	}
6200
6201	static void pr_cont_pool_info(struct worker_pool *pool)
6202	{
6203	pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
6204	if (pool->node != NUMA_NO_NODE)
6205	pr_cont(" node=%d", pool->node);
6206	pr_cont(" flags=0x%x", pool->flags);
6207	if (pool->flags & POOL_BH)
6208	pr_cont(" bh%s",
6209	pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
6210	else
6211	pr_cont(" nice=%d", pool->attrs->nice);
6212	}
6213
6214	static void pr_cont_worker_id(struct worker *worker)
6215	{
6216	struct worker_pool *pool = worker->pool;
6217
6218	if (pool->flags & WQ_BH)
6219	pr_cont("bh%s",
6220	pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
6221	else
6222	pr_cont("%d%s", task_pid_nr(worker->task),
6223	worker->rescue_wq ? "(RESCUER)" : "");
6224	}
6225
/*
 * Accumulator used by pr_cont_work_flush() to collapse a run of work items
 * sharing the same function into a single "N*func" entry.
 */
struct pr_cont_work_struct {
	bool comma;		/* whether to print "," before the recorded entry */
	work_func_t func;	/* function of the run currently being counted */
	long ctr;		/* length of the run; 0 means nothing recorded */
};
6231
6232	static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
6233	{
6234	if (!pcwsp->ctr)
6235	goto out_record;
6236	if (func == pcwsp->func) {
6237	pcwsp->ctr++;
6238	return;
6239	}
6240	if (pcwsp->ctr == `1`)
6241	pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);
6242	else
6243	pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);
6244	pcwsp->ctr = `0`;
6245	out_record:
6246	if ((long)func == -`1L`)
6247	return;
6248	pcwsp->comma = comma;
6249	pcwsp->func = func;
6250	pcwsp->ctr = `1`;
6251	}
6252
6253	static void pr_cont_work(bool comma, struct work_struct work, struct* pr_cont_work_struct *pcwsp)
6254	{
6255	if (work->func == wq_barrier_func) {
6256	struct wq_barrier *barr;
6257
6258	barr = container_of(work, struct wq_barrier, work);
6259
6260	pr_cont_work_flush(comma, func: (work_func_t)-`1`, pcwsp);
6261	pr_cont("%s BAR(%d)", comma ? "," : "",
6262	task_pid_nr(barr->task));
6263	} else {
6264	if (!comma)
6265	pr_cont_work_flush(comma, func: (work_func_t)-`1`, pcwsp);
6266	pr_cont_work_flush(comma, func: work->func, pcwsp);
6267	}
6268	}
6269
6270	static void show_pwq(struct pool_workqueue *pwq)
6271	{
6272	struct pr_cont_work_struct pcws = { .ctr = `0`, };
6273	struct worker_pool *pool = pwq->pool;
6274	struct work_struct *work;
6275	struct worker *worker;
6276	bool has_in_flight = false, has_pending = false;
6277	int bkt;
6278
6279	pr_info(" pwq %d:", pool->id);
6280	pr_cont_pool_info(pool);
6281
6282	pr_cont(" active=%d refcnt=%d%s\n",
6283	pwq->nr_active, pwq->refcnt,
6284	!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
6285
6286	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
6287	if (worker->current_pwq == pwq) {
6288	has_in_flight = true;
6289	break;
6290	}
6291	}
6292	if (has_in_flight) {
6293	bool comma = false;
6294
6295	pr_info(" in-flight:");
6296	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
6297	if (worker->current_pwq != pwq)
6298	continue;
6299
6300	pr_cont(" %s", comma ? "," : "");
6301	pr_cont_worker_id(worker);
6302	pr_cont(":%ps", worker->current_func);
6303	list_for_each_entry(work, &worker->scheduled, entry)
6304	pr_cont_work(comma: false, work, pcwsp: &pcws);
6305	pr_cont_work_flush(comma, func: (work_func_t)-`1L`, pcwsp: &pcws);
6306	comma = true;
6307	}
6308	pr_cont("\n");
6309	}
6310
6311	list_for_each_entry(work, &pool->worklist, entry) {
6312	if (get_work_pwq(work) == pwq) {
6313	has_pending = true;
6314	break;
6315	}
6316	}
6317	if (has_pending) {
6318	bool comma = false;
6319
6320	pr_info(" pending:");
6321	list_for_each_entry(work, &pool->worklist, entry) {
6322	if (get_work_pwq(work) != pwq)
6323	continue;
6324
6325	pr_cont_work(comma, work, pcwsp: &pcws);
6326	comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
6327	}
6328	pr_cont_work_flush(comma, func: (work_func_t)-`1L`, pcwsp: &pcws);
6329	pr_cont("\n");
6330	}
6331
6332	if (!list_empty(head: &pwq->inactive_works)) {
6333	bool comma = false;
6334
6335	pr_info(" inactive:");
6336	list_for_each_entry(work, &pwq->inactive_works, entry) {
6337	pr_cont_work(comma, work, pcwsp: &pcws);
6338	comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
6339	}
6340	pr_cont_work_flush(comma, func: (work_func_t)-`1L`, pcwsp: &pcws);
6341	pr_cont("\n");
6342	}
6343	}
6344
6345	/**
6346	* show_one_workqueue - dump state of specified workqueue
6347	* @wq: workqueue whose state will be printed
6348	*/
6349	void show_one_workqueue(struct workqueue_struct *wq)
6350	{
6351	struct pool_workqueue *pwq;
6352	bool idle = true;
6353	unsigned long irq_flags;
6354
6355	for_each_pwq(pwq, wq) {
6356	if (!pwq_is_empty(pwq)) {
6357	idle = false;
6358	break;
6359	}
6360	}
6361	if (idle) / Nothing to print for idle workqueue /
6362	return;
6363
6364	pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
6365
6366	for_each_pwq(pwq, wq) {
6367	raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
6368	if (!pwq_is_empty(pwq)) {
6369	/*
6370	* Defer printing to avoid deadlocks in console
6371	* drivers that queue work while holding locks
6372	* also taken in their write paths.
6373	*/
6374	printk_deferred_enter();
6375	show_pwq(pwq);
6376	printk_deferred_exit();
6377	}
6378	raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
6379	/*
6380	* We could be printing a lot from atomic context, e.g.
6381	* sysrq-t -> show_all_workqueues(). Avoid triggering
6382	* hard lockup.
6383	*/
6384	touch_nmi_watchdog();
6385	}
6386
6387	}
6388
6389	/**
6390	* show_one_worker_pool - dump state of specified worker pool
6391	* @pool: worker pool whose state will be printed
6392	*/
6393	static void show_one_worker_pool(struct worker_pool *pool)
6394	{
6395	struct worker *worker;
6396	bool first = true;
6397	unsigned long irq_flags;
6398	unsigned long hung = `0`;
6399
6400	raw_spin_lock_irqsave(&pool->lock, irq_flags);
6401	if (pool->nr_workers == pool->nr_idle)
6402	goto next_pool;
6403
6404	/ How long the first pending work is waiting for a worker. /
6405	if (!list_empty(head: &pool->worklist))
6406	hung = jiffies_to_msecs(j: jiffies - pool->watchdog_ts) / `1000`;
6407
6408	/*
6409	* Defer printing to avoid deadlocks in console drivers that
6410	* queue work while holding locks also taken in their write
6411	* paths.
6412	*/
6413	printk_deferred_enter();
6414	pr_info("pool %d:", pool->id);
6415	pr_cont_pool_info(pool);
6416	pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers);
6417	if (pool->manager)
6418	pr_cont(" manager: %d",
6419	task_pid_nr(pool->manager->task));
6420	list_for_each_entry(worker, &pool->idle_list, entry) {
6421	pr_cont(" %s", first ? "idle: " : "");
6422	pr_cont_worker_id(worker);
6423	first = false;
6424	}
6425	pr_cont("\n");
6426	printk_deferred_exit();
6427	next_pool:
6428	raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
6429	/*
6430	* We could be printing a lot from atomic context, e.g.
6431	* sysrq-t -> show_all_workqueues(). Avoid triggering
6432	* hard lockup.
6433	*/
6434	touch_nmi_watchdog();
6435
6436	}
6437
6438	/**
6439	* show_all_workqueues - dump workqueue state
6440	*
6441	* Called from a sysrq handler and prints out all busy workqueues and pools.
6442	*/
6443	void show_all_workqueues(void)
6444	{
6445	struct workqueue_struct *wq;
6446	struct worker_pool *pool;
6447	int pi;
6448
6449	rcu_read_lock();
6450
6451	pr_info("Showing busy workqueues and worker pools:\n");
6452
6453	list_for_each_entry_rcu(wq, &workqueues, list)
6454	show_one_workqueue(wq);
6455
6456	for_each_pool(pool, pi)
6457	show_one_worker_pool(pool);
6458
6459	rcu_read_unlock();
6460	}
6461
6462	/**
6463	* show_freezable_workqueues - dump freezable workqueue state
6464	*
6465	* Called from try_to_freeze_tasks() and prints out all freezable workqueues
6466	* still busy.
6467	*/
6468	void show_freezable_workqueues(void)
6469	{
6470	struct workqueue_struct *wq;
6471
6472	rcu_read_lock();
6473
6474	pr_info("Showing freezable workqueues that are still busy:\n");
6475
6476	list_for_each_entry_rcu(wq, &workqueues, list) {
6477	if (!(wq->flags & WQ_FREEZABLE))
6478	continue;
6479	show_one_workqueue(wq);
6480	}
6481
6482	rcu_read_unlock();
6483	}
6484
6485	/ used to show worker information through /proc/PID/{comm,stat,status} /
6486	void wq_worker_comm(char buf, size_t size, struct* task_struct *task)
6487	{
6488	/ stabilize PF_WQ_WORKER and worker pool association /
6489	mutex_lock(lock: &wq_pool_attach_mutex);
6490
6491	if (task->flags & PF_WQ_WORKER) {
6492	struct worker *worker = kthread_data(k: task);
6493	struct worker_pool *pool = worker->pool;
6494	int off;
6495
6496	off = format_worker_id(buf, size, worker, pool);
6497
6498	if (pool) {
6499	raw_spin_lock_irq(&pool->lock);
6500	/*
6501	* ->desc tracks information (wq name or
6502	* set_worker_desc()) for the latest execution. If
6503	* current, prepend '+', otherwise '-'.
6504	*/
6505	if (worker->desc[`0`] != `'\0'`) {
6506	if (worker->current_work)
6507	scnprintf(buf: buf + off, size: size - off, fmt: "+%s",
6508	worker->desc);
6509	else
6510	scnprintf(buf: buf + off, size: size - off, fmt: "-%s",
6511	worker->desc);
6512	}
6513	raw_spin_unlock_irq(&pool->lock);
6514	}
6515	} else {
6516	strscpy(buf, task->comm, size);
6517	}
6518
6519	mutex_unlock(lock: &wq_pool_attach_mutex);
6520	}
6521
6522	#ifdef CONFIG_SMP
6523
6524	/*
6525	* CPU hotplug.
6526	*
6527	* There are two challenges in supporting CPU hotplug. Firstly, there
6528	* are a lot of assumptions on strong associations among work, pwq and
6529	* pool which make migrating pending and scheduled works very
6530	* difficult to implement without impacting hot paths. Secondly,
6531	* worker pools serve mix of short, long and very long running works making
6532	* blocked draining impractical.
6533	*
6534	* This is solved by allowing the pools to be disassociated from the CPU
6535	* running as an unbound one and allowing it to be reattached later if the
6536	* cpu comes back online.
6537	*/
6538
6539	static void unbind_workers(int cpu)
6540	{
6541	struct worker_pool *pool;
6542	struct worker *worker;
6543
6544	for_each_cpu_worker_pool(pool, cpu) {
6545	mutex_lock(lock: &wq_pool_attach_mutex);
6546	raw_spin_lock_irq(&pool->lock);
6547
6548	/*
6549	* We've blocked all attach/detach operations. Make all workers
6550	* unbound and set DISASSOCIATED. Before this, all workers
6551	* must be on the cpu. After this, they may become diasporas.
6552	* And the preemption disabled section in their sched callbacks
6553	* are guaranteed to see WORKER_UNBOUND since the code here
6554	* is on the same cpu.
6555	*/
6556	for_each_pool_worker(worker, pool)
6557	worker->flags \|= WORKER_UNBOUND;
6558
6559	pool->flags \|= POOL_DISASSOCIATED;
6560
6561	/*
6562	* The handling of nr_running in sched callbacks are disabled
6563	* now. Zap nr_running. After this, nr_running stays zero and
6564	* need_more_worker() and keep_working() are always true as
6565	* long as the worklist is not empty. This pool now behaves as
6566	* an unbound (in terms of concurrency management) pool which
6567	* are served by workers tied to the pool.
6568	*/
6569	pool->nr_running = `0`;
6570
6571	/*
6572	* With concurrency management just turned off, a busy
6573	* worker blocking could lead to lengthy stalls. Kick off
6574	* unbound chain execution of currently pending work items.
6575	*/
6576	kick_pool(pool);
6577
6578	raw_spin_unlock_irq(&pool->lock);
6579
6580	for_each_pool_worker(worker, pool)
6581	unbind_worker(worker);
6582
6583	mutex_unlock(lock: &wq_pool_attach_mutex);
6584	}
6585	}
6586
6587	/**
6588	* rebind_workers - rebind all workers of a pool to the associated CPU
6589	* @pool: pool of interest
6590	*
6591	* @pool->cpu is coming online. Rebind all workers to the CPU.
6592	*/
6593	static void rebind_workers(struct worker_pool *pool)
6594	{
6595	struct worker *worker;
6596
6597	lockdep_assert_held(&wq_pool_attach_mutex);
6598
6599	/*
6600	* Restore CPU affinity of all workers. As all idle workers should
6601	* be on the run-queue of the associated CPU before any local
6602	* wake-ups for concurrency management happen, restore CPU affinity
6603	* of all workers first and then clear UNBOUND. As we're called
6604	* from CPU_ONLINE, the following shouldn't fail.
6605	*/
6606	for_each_pool_worker(worker, pool) {
6607	kthread_set_per_cpu(k: worker->task, cpu: pool->cpu);
6608	WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
6609	pool_allowed_cpus(pool)) < `0`);
6610	}
6611
6612	raw_spin_lock_irq(&pool->lock);
6613
6614	pool->flags &= ~POOL_DISASSOCIATED;
6615
6616	for_each_pool_worker(worker, pool) {
6617	unsigned int worker_flags = worker->flags;
6618
6619	/*
6620	* We want to clear UNBOUND but can't directly call
6621	* worker_clr_flags() or adjust nr_running. Atomically
6622	* replace UNBOUND with another NOT_RUNNING flag REBOUND.
6623	* @worker will clear REBOUND using worker_clr_flags() when
6624	* it initiates the next execution cycle thus restoring
6625	* concurrency management. Note that when or whether
6626	* @worker clears REBOUND doesn't affect correctness.
6627	*
6628	* WRITE_ONCE() is necessary because @worker->flags may be
6629	* tested without holding any lock in
6630	* wq_worker_running(). Without it, NOT_RUNNING test may
6631	* fail incorrectly leading to premature concurrency
6632	* management operations.
6633	*/
6634	WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
6635	worker_flags \|= WORKER_REBOUND;
6636	worker_flags &= ~WORKER_UNBOUND;
6637	WRITE_ONCE(worker->flags, worker_flags);
6638	}
6639
6640	raw_spin_unlock_irq(&pool->lock);
6641	}
6642
6643	/**
6644	* restore_unbound_workers_cpumask - restore cpumask of unbound workers
6645	* @pool: unbound pool of interest
6646	* @cpu: the CPU which is coming up
6647	*
6648	* An unbound pool may end up with a cpumask which doesn't have any online
6649	* CPUs. When a worker of such pool get scheduled, the scheduler resets
6650	* its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any
6651	* online CPU before, cpus_allowed of all its workers should be restored.
6652	*/
6653	static void restore_unbound_workers_cpumask(struct worker_pool pool, int* cpu)
6654	{
6655	static cpumask_t cpumask;
6656	struct worker *worker;
6657
6658	lockdep_assert_held(&wq_pool_attach_mutex);
6659
6660	/ is @cpu allowed for @pool? /
6661	if (!cpumask_test_cpu(cpu, cpumask: pool->attrs->cpumask))
6662	return;
6663
6664	cpumask_and(dstp: &cpumask, src1p: pool->attrs->cpumask, cpu_online_mask);
6665
6666	/ as we're called from CPU_ONLINE, the following shouldn't fail /
6667	for_each_pool_worker(worker, pool)
6668	WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < `0`);
6669	}
6670
6671	int workqueue_prepare_cpu(unsigned int cpu)
6672	{
6673	struct worker_pool *pool;
6674
6675	for_each_cpu_worker_pool(pool, cpu) {
6676	if (pool->nr_workers)
6677	continue;
6678	if (!create_worker(pool))
6679	return -ENOMEM;
6680	}
6681	return `0`;
6682	}
6683
6684	int workqueue_online_cpu(unsigned int cpu)
6685	{
6686	struct worker_pool *pool;
6687	struct workqueue_struct *wq;
6688	int pi;
6689
6690	mutex_lock(lock: &wq_pool_mutex);
6691
6692	cpumask_set_cpu(cpu, dstp: wq_online_cpumask);
6693
6694	for_each_pool(pool, pi) {
6695	/ BH pools aren't affected by hotplug /
6696	if (pool->flags & POOL_BH)
6697	continue;
6698
6699	mutex_lock(lock: &wq_pool_attach_mutex);
6700	if (pool->cpu == cpu)
6701	rebind_workers(pool);
6702	else if (pool->cpu < `0`)
6703	restore_unbound_workers_cpumask(pool, cpu);
6704	mutex_unlock(lock: &wq_pool_attach_mutex);
6705	}
6706
6707	/ update pod affinity of unbound workqueues /
6708	list_for_each_entry(wq, &workqueues, list) {
6709	struct workqueue_attrs *attrs = wq->unbound_attrs;
6710
6711	if (attrs) {
6712	const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
6713	int tcpu;
6714
6715	for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
6716	unbound_wq_update_pwq(wq, cpu: tcpu);
6717
6718	mutex_lock(lock: &wq->mutex);
6719	wq_update_node_max_active(wq, off_cpu: -`1`);
6720	mutex_unlock(lock: &wq->mutex);
6721	}
6722	}
6723
6724	mutex_unlock(lock: &wq_pool_mutex);
6725	return `0`;
6726	}
6727
6728	int workqueue_offline_cpu(unsigned int cpu)
6729	{
6730	struct workqueue_struct *wq;
6731
6732	/ unbinding per-cpu workers should happen on the local CPU /
6733	if (WARN_ON(cpu != smp_processor_id()))
6734	return -`1`;
6735
6736	unbind_workers(cpu);
6737
6738	/ update pod affinity of unbound workqueues /
6739	mutex_lock(lock: &wq_pool_mutex);
6740
6741	cpumask_clear_cpu(cpu, dstp: wq_online_cpumask);
6742
6743	list_for_each_entry(wq, &workqueues, list) {
6744	struct workqueue_attrs *attrs = wq->unbound_attrs;
6745
6746	if (attrs) {
6747	const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
6748	int tcpu;
6749
6750	for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
6751	unbound_wq_update_pwq(wq, cpu: tcpu);
6752
6753	mutex_lock(lock: &wq->mutex);
6754	wq_update_node_max_active(wq, off_cpu: cpu);
6755	mutex_unlock(lock: &wq->mutex);
6756	}
6757	}
6758	mutex_unlock(lock: &wq_pool_mutex);
6759
6760	return `0`;
6761	}
6762
6763	struct work_for_cpu {
6764	struct work_struct work;
6765	long (fn)(void* *);
6766	void *arg;
6767	long ret;
6768	};
6769
6770	static void work_for_cpu_fn(struct work_struct *work)
6771	{
6772	struct work_for_cpu wfc = container_of(work, struct* work_for_cpu, work);
6773
6774	wfc->ret = wfc->fn(wfc->arg);
6775	}
6776
6777	/**
6778	* work_on_cpu_key - run a function in thread context on a particular cpu
6779	* @cpu: the cpu to run on
6780	* @fn: the function to run
6781	* @arg: the function arg
6782	* @key: The lock class key for lock debugging purposes
6783	*
6784	* It is up to the caller to ensure that the cpu doesn't go offline.
6785	* The caller must not hold any locks which would prevent @fn from completing.
6786	*
6787	* Return: The value @fn returns.
6788	*/
6789	long work_on_cpu_key(int cpu, long (fn)(void* *),
6790	void arg, struct* lock_class_key *key)
6791	{
6792	struct work_for_cpu wfc = { .fn = fn, .arg = arg };
6793
6794	INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
6795	schedule_work_on(cpu, work: &wfc.work);
6796	flush_work(&wfc.work);
6797	destroy_work_on_stack(work: &wfc.work);
6798	return wfc.ret;
6799	}
6800	EXPORT_SYMBOL_GPL(work_on_cpu_key);
6801	#endif /* CONFIG_SMP */
6802
6803	#ifdef CONFIG_FREEZER
6804
6805	/**
6806	* freeze_workqueues_begin - begin freezing workqueues
6807	*
6808	* Start freezing workqueues. After this function returns, all freezable
6809	* workqueues will queue new works to their inactive_works list instead of
6810	* pool->worklist.
6811	*
6812	* CONTEXT:
6813	* Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
6814	*/
6815	void freeze_workqueues_begin(void)
6816	{
6817	struct workqueue_struct *wq;
6818
6819	mutex_lock(lock: &wq_pool_mutex);
6820
6821	WARN_ON_ONCE(workqueue_freezing);
6822	workqueue_freezing = true;
6823
6824	list_for_each_entry(wq, &workqueues, list) {
6825	mutex_lock(lock: &wq->mutex);
6826	wq_adjust_max_active(wq);
6827	mutex_unlock(lock: &wq->mutex);
6828	}
6829
6830	mutex_unlock(lock: &wq_pool_mutex);
6831	}
6832
6833	/**
6834	* freeze_workqueues_busy - are freezable workqueues still busy?
6835	*
6836	* Check whether freezing is complete. This function must be called
6837	* between freeze_workqueues_begin() and thaw_workqueues().
6838	*
6839	* CONTEXT:
6840	* Grabs and releases wq_pool_mutex.
6841	*
6842	* Return:
6843	* %true if some freezable workqueues are still busy. %false if freezing
6844	* is complete.
6845	*/
6846	bool freeze_workqueues_busy(void)
6847	{
6848	bool busy = false;
6849	struct workqueue_struct *wq;
6850	struct pool_workqueue *pwq;
6851
6852	mutex_lock(lock: &wq_pool_mutex);
6853
6854	WARN_ON_ONCE(!workqueue_freezing);
6855
6856	list_for_each_entry(wq, &workqueues, list) {
6857	if (!(wq->flags & WQ_FREEZABLE))
6858	continue;
6859	/*
6860	* nr_active is monotonically decreasing. It's safe
6861	* to peek without lock.
6862	*/
6863	rcu_read_lock();
6864	for_each_pwq(pwq, wq) {
6865	WARN_ON_ONCE(pwq->nr_active < `0`);
6866	if (pwq->nr_active) {
6867	busy = true;
6868	rcu_read_unlock();
6869	goto out_unlock;
6870	}
6871	}
6872	rcu_read_unlock();
6873	}
6874	out_unlock:
6875	mutex_unlock(lock: &wq_pool_mutex);
6876	return busy;
6877	}
6878
6879	/**
6880	* thaw_workqueues - thaw workqueues
6881	*
6882	* Thaw workqueues. Normal queueing is restored and all collected
6883	* frozen works are transferred to their respective pool worklists.
6884	*
6885	* CONTEXT:
6886	* Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
6887	*/
6888	void thaw_workqueues(void)
6889	{
6890	struct workqueue_struct *wq;
6891
6892	mutex_lock(lock: &wq_pool_mutex);
6893
6894	if (!workqueue_freezing)
6895	goto out_unlock;
6896
6897	workqueue_freezing = false;
6898
6899	/ restore max_active and repopulate worklist /
6900	list_for_each_entry(wq, &workqueues, list) {
6901	mutex_lock(lock: &wq->mutex);
6902	wq_adjust_max_active(wq);
6903	mutex_unlock(lock: &wq->mutex);
6904	}
6905
6906	out_unlock:
6907	mutex_unlock(lock: &wq_pool_mutex);
6908	}
6909	#endif /* CONFIG_FREEZER */
6910
6911	static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
6912	{
6913	LIST_HEAD(ctxs);
6914	int ret = `0`;
6915	struct workqueue_struct *wq;
6916	struct apply_wqattrs_ctx ctx, n;
6917
6918	lockdep_assert_held(&wq_pool_mutex);
6919
6920	list_for_each_entry(wq, &workqueues, list) {
6921	if (!(wq->flags & WQ_UNBOUND) \|\| (wq->flags & __WQ_DESTROYING))
6922	continue;
6923
6924	ctx = apply_wqattrs_prepare(wq, attrs: wq->unbound_attrs, unbound_cpumask);
6925	if (IS_ERR(ptr: ctx)) {
6926	ret = PTR_ERR(ptr: ctx);
6927	break;
6928	}
6929
6930	list_add_tail(new: &ctx->list, head: &ctxs);
6931	}
6932
6933	list_for_each_entry_safe(ctx, n, &ctxs, list) {
6934	if (!ret)
6935	apply_wqattrs_commit(ctx);
6936	apply_wqattrs_cleanup(ctx);
6937	}
6938
6939	if (!ret) {
6940	mutex_lock(lock: &wq_pool_attach_mutex);
6941	cpumask_copy(dstp: wq_unbound_cpumask, srcp: unbound_cpumask);
6942	mutex_unlock(lock: &wq_pool_attach_mutex);
6943	}
6944	return ret;
6945	}
6946
6947	/**
6948	* workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
6949	* @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
6950	*
6951	* This function can be called from cpuset code to provide a set of isolated
6952	* CPUs that should be excluded from wq_unbound_cpumask.
6953	*/
6954	int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
6955	{
6956	cpumask_var_t cpumask;
6957	int ret = `0`;
6958
6959	if (!zalloc_cpumask_var(mask: &cpumask, GFP_KERNEL))
6960	return -ENOMEM;
6961
6962	mutex_lock(lock: &wq_pool_mutex);
6963
6964	/*
6965	* If the operation fails, it will fall back to
6966	* wq_requested_unbound_cpumask which is initially set to
6967	* (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten
6968	* by any subsequent write to workqueue/cpumask sysfs file.
6969	*/
6970	if (!cpumask_andnot(dstp: cpumask, src1p: wq_requested_unbound_cpumask, src2p: exclude_cpumask))
6971	cpumask_copy(dstp: cpumask, srcp: wq_requested_unbound_cpumask);
6972	if (!cpumask_equal(src1p: cpumask, src2p: wq_unbound_cpumask))
6973	ret = workqueue_apply_unbound_cpumask(unbound_cpumask: cpumask);
6974
6975	/ Save the current isolated cpumask & export it via sysfs /
6976	if (!ret)
6977	cpumask_copy(dstp: wq_isolated_cpumask, srcp: exclude_cpumask);
6978
6979	mutex_unlock(lock: &wq_pool_mutex);
6980	free_cpumask_var(mask: cpumask);
6981	return ret;
6982	}
6983
6984	static int parse_affn_scope(const char *val)
6985	{
6986	int i;
6987
6988	for (i = `0`; i < ARRAY_SIZE(wq_affn_names); i++) {
6989	if (!strncasecmp(s1: val, s2: wq_affn_names[i], n: strlen(wq_affn_names[i])))
6990	return i;
6991	}
6992	return -EINVAL;
6993	}
6994
6995	static int wq_affn_dfl_set(const char val, const* struct kernel_param *kp)
6996	{
6997	struct workqueue_struct *wq;
6998	int affn, cpu;
6999
7000	affn = parse_affn_scope(val);
7001	if (affn < `0`)
7002	return affn;
7003	if (affn == WQ_AFFN_DFL)
7004	return -EINVAL;
7005
7006	cpus_read_lock();
7007	mutex_lock(lock: &wq_pool_mutex);
7008
7009	wq_affn_dfl = affn;
7010
7011	list_for_each_entry(wq, &workqueues, list) {
7012	for_each_online_cpu(cpu)
7013	unbound_wq_update_pwq(wq, cpu);
7014	}
7015
7016	mutex_unlock(lock: &wq_pool_mutex);
7017	cpus_read_unlock();
7018
7019	return `0`;
7020	}
7021
7022	static int wq_affn_dfl_get(char buffer, const* struct kernel_param *kp)
7023	{
7024	return scnprintf(buf: buffer, PAGE_SIZE, fmt: "%s\n", wq_affn_names[wq_affn_dfl]);
7025	}
7026
7027	static const struct kernel_param_ops wq_affn_dfl_ops = {
7028	.set = wq_affn_dfl_set,
7029	.get = wq_affn_dfl_get,
7030	};
7031
7032	module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, `0644`);
7033
7034	#ifdef CONFIG_SYSFS
7035	/*
 * Workqueues with the WQ_SYSFS flag set are visible to userland via
7037	* /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
7038	* following attributes.
7039	*
7040	* per_cpu RO bool : whether the workqueue is per-cpu or unbound
7041	* max_active RW int : maximum number of in-flight work items
7042	*
7043	* Unbound workqueues have the following extra attributes.
7044	*
7045	* nice RW int : nice value of the workers
7046	* cpumask RW mask : bitmask of allowed CPUs for the workers
7047	* affinity_scope RW str : worker CPU affinity scope (cache, numa, none)
7048	* affinity_strict RW bool : worker CPU affinity is strict
7049	*/
/* Pairs a workqueue with the struct device that represents it in sysfs. */
struct wq_device {
	struct workqueue_struct *wq;	/* workqueue this device stands for */
	struct device dev;		/* embedded device, recovered via container_of() */
};
7054
7055	static struct workqueue_struct dev_to_wq(struct* device *dev)
7056	{
7057	struct wq_device wq_dev = container_of(dev, struct* wq_device, dev);
7058
7059	return wq_dev->wq;
7060	}
7061
7062	static ssize_t per_cpu_show(struct device dev, struct* device_attribute *attr,
7063	char *buf)
7064	{
7065	struct workqueue_struct *wq = dev_to_wq(dev);
7066
7067	return scnprintf(buf, PAGE_SIZE, fmt: "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
7068	}
7069	static DEVICE_ATTR_RO(per_cpu);
7070
7071	static ssize_t max_active_show(struct device *dev,
7072	struct device_attribute attr, char* *buf)
7073	{
7074	struct workqueue_struct *wq = dev_to_wq(dev);
7075
7076	return scnprintf(buf, PAGE_SIZE, fmt: "%d\n", wq->saved_max_active);
7077	}
7078
7079	static ssize_t max_active_store(struct device *dev,
7080	struct device_attribute attr, const* char *buf,
7081	size_t count)
7082	{
7083	struct workqueue_struct *wq = dev_to_wq(dev);
7084	int val;
7085
7086	if (sscanf(buf, "%d", &val) != `1` \|\| val <= `0`)
7087	return -EINVAL;
7088
7089	workqueue_set_max_active(wq, val);
7090	return count;
7091	}
7092	static DEVICE_ATTR_RW(max_active);
7093
/* Attributes in the wq_sysfs group (per_cpu and max_active). */
static struct attribute *wq_sysfs_attrs[] = {
	&dev_attr_per_cpu.attr,
	&dev_attr_max_active.attr,
	NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs);
7100
7101	static ssize_t wq_nice_show(struct device dev, struct* device_attribute *attr,
7102	char *buf)
7103	{
7104	struct workqueue_struct *wq = dev_to_wq(dev);
7105	int written;
7106
7107	mutex_lock(lock: &wq->mutex);
7108	written = scnprintf(buf, PAGE_SIZE, fmt: "%d\n", wq->unbound_attrs->nice);
7109	mutex_unlock(lock: &wq->mutex);
7110
7111	return written;
7112	}
7113
7114	/ prepare workqueue_attrs for sysfs store operations /
7115	static struct workqueue_attrs wq_sysfs_prep_attrs(struct* workqueue_struct *wq)
7116	{
7117	struct workqueue_attrs *attrs;
7118
7119	lockdep_assert_held(&wq_pool_mutex);
7120
7121	attrs = alloc_workqueue_attrs();
7122	if (!attrs)
7123	return NULL;
7124
7125	copy_workqueue_attrs(to: attrs, from: wq->unbound_attrs);
7126	return attrs;
7127	}
7128
7129	static ssize_t wq_nice_store(struct device dev, struct* device_attribute *attr,
7130	const char *buf, size_t count)
7131	{
7132	struct workqueue_struct *wq = dev_to_wq(dev);
7133	struct workqueue_attrs *attrs;
7134	int ret = -ENOMEM;
7135
7136	apply_wqattrs_lock();
7137
7138	attrs = wq_sysfs_prep_attrs(wq);
7139	if (!attrs)
7140	goto out_unlock;
7141
7142	if (sscanf(buf, "%d", &attrs->nice) == `1` &&
7143	attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
7144	ret = apply_workqueue_attrs_locked(wq, attrs);
7145	else
7146	ret = -EINVAL;
7147
7148	out_unlock:
7149	apply_wqattrs_unlock();
7150	free_workqueue_attrs(attrs);
7151	return ret ?: count;
7152	}
7153
7154	static ssize_t wq_cpumask_show(struct device *dev,
7155	struct device_attribute attr, char* *buf)
7156	{
7157	struct workqueue_struct *wq = dev_to_wq(dev);
7158	int written;
7159
7160	mutex_lock(lock: &wq->mutex);
7161	written = scnprintf(buf, PAGE_SIZE, fmt: "%*pb\n",
7162	cpumask_pr_args(wq->unbound_attrs->cpumask));
7163	mutex_unlock(lock: &wq->mutex);
7164	return written;
7165	}
7166
7167	static ssize_t wq_cpumask_store(struct device *dev,
7168	struct device_attribute *attr,
7169	const char *buf, size_t count)
7170	{
7171	struct workqueue_struct *wq = dev_to_wq(dev);
7172	struct workqueue_attrs *attrs;
7173	int ret = -ENOMEM;
7174
7175	apply_wqattrs_lock();
7176
7177	attrs = wq_sysfs_prep_attrs(wq);
7178	if (!attrs)
7179	goto out_unlock;
7180
7181	ret = cpumask_parse(buf, dstp: attrs->cpumask);
7182	if (!ret)
7183	ret = apply_workqueue_attrs_locked(wq, attrs);
7184
7185	out_unlock:
7186	apply_wqattrs_unlock();
7187	free_workqueue_attrs(attrs);
7188	return ret ?: count;
7189	}
7190
7191	static ssize_t wq_affn_scope_show(struct device *dev,
7192	struct device_attribute attr, char* *buf)
7193	{
7194	struct workqueue_struct *wq = dev_to_wq(dev);
7195	int written;
7196
7197	mutex_lock(lock: &wq->mutex);
7198	if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL)
7199	written = scnprintf(buf, PAGE_SIZE, fmt: "%s (%s)\n",
7200	wq_affn_names[WQ_AFFN_DFL],
7201	wq_affn_names[wq_affn_dfl]);
7202	else
7203	written = scnprintf(buf, PAGE_SIZE, fmt: "%s\n",
7204	wq_affn_names[wq->unbound_attrs->affn_scope]);
7205	mutex_unlock(lock: &wq->mutex);
7206
7207	return written;
7208	}
7209
7210	static ssize_t wq_affn_scope_store(struct device *dev,
7211	struct device_attribute *attr,
7212	const char *buf, size_t count)
7213	{
7214	struct workqueue_struct *wq = dev_to_wq(dev);
7215	struct workqueue_attrs *attrs;
7216	int affn, ret = -ENOMEM;
7217
7218	affn = parse_affn_scope(val: buf);
7219	if (affn < `0`)
7220	return affn;
7221
7222	apply_wqattrs_lock();
7223	attrs = wq_sysfs_prep_attrs(wq);
7224	if (attrs) {
7225	attrs->affn_scope = affn;
7226	ret = apply_workqueue_attrs_locked(wq, attrs);
7227	}
7228	apply_wqattrs_unlock();
7229	free_workqueue_attrs(attrs);
7230	return ret ?: count;
7231	}
7232
7233	static ssize_t wq_affinity_strict_show(struct device *dev,
7234	struct device_attribute attr, char* *buf)
7235	{
7236	struct workqueue_struct *wq = dev_to_wq(dev);
7237
7238	return scnprintf(buf, PAGE_SIZE, fmt: "%d\n",
7239	wq->unbound_attrs->affn_strict);
7240	}
7241
7242	static ssize_t wq_affinity_strict_store(struct device *dev,
7243	struct device_attribute *attr,
7244	const char *buf, size_t count)
7245	{
7246	struct workqueue_struct *wq = dev_to_wq(dev);
7247	struct workqueue_attrs *attrs;
7248	int v, ret = -ENOMEM;
7249
7250	if (sscanf(buf, "%d", &v) != `1`)
7251	return -EINVAL;
7252
7253	apply_wqattrs_lock();
7254	attrs = wq_sysfs_prep_attrs(wq);
7255	if (attrs) {
7256	attrs->affn_strict = (bool)v;
7257	ret = apply_workqueue_attrs_locked(wq, attrs);
7258	}
7259	apply_wqattrs_unlock();
7260	free_workqueue_attrs(attrs);
7261	return ret ?: count;
7262	}
7263
7264	static struct device_attribute wq_sysfs_unbound_attrs[] = {
7265	__ATTR(nice, `0644`, wq_nice_show, wq_nice_store),
7266	__ATTR(cpumask, `0644`, wq_cpumask_show, wq_cpumask_store),
7267	__ATTR(affinity_scope, `0644`, wq_affn_scope_show, wq_affn_scope_store),
7268	__ATTR(affinity_strict, `0644`, wq_affinity_strict_show, wq_affinity_strict_store),
7269	__ATTR_NULL,
7270	};
7271
/* The "workqueue" bus; its devices get the wq_sysfs attribute group. */
static const struct bus_type wq_subsys = {
	.name = "workqueue",
	.dev_groups = wq_sysfs_groups,
};
7276
7277	/**
7278	* workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
7279	* @cpumask: the cpumask to set
7280	*
7281	* The low-level workqueues cpumask is a global cpumask that limits
7282	* the affinity of all unbound workqueues. This function check the @cpumask
7283	* and apply it to all unbound workqueues and updates all pwqs of them.
7284	*
7285	* Return: 0 - Success
7286	* -EINVAL - Invalid @cpumask
7287	* -ENOMEM - Failed to allocate memory for attrs or pwqs.
7288	*/
7289	static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
7290	{
7291	int ret = -EINVAL;
7292
7293	/*
7294	* Not excluding isolated cpus on purpose.
7295	* If the user wishes to include them, we allow that.
7296	*/
7297	cpumask_and(dstp: cpumask, src1p: cpumask, cpu_possible_mask);
7298	if (!cpumask_empty(srcp: cpumask)) {
7299	ret = `0`;
7300	apply_wqattrs_lock();
7301	if (!cpumask_equal(src1p: cpumask, src2p: wq_unbound_cpumask))
7302	ret = workqueue_apply_unbound_cpumask(unbound_cpumask: cpumask);
7303	if (!ret)
7304	cpumask_copy(dstp: wq_requested_unbound_cpumask, srcp: cpumask);
7305	apply_wqattrs_unlock();
7306	}
7307
7308	return ret;
7309	}
7310
7311	static ssize_t __wq_cpumask_show(struct device *dev,
7312	struct device_attribute attr, char* *buf, cpumask_var_t mask)
7313	{
7314	int written;
7315
7316	mutex_lock(lock: &wq_pool_mutex);
7317	written = scnprintf(buf, PAGE_SIZE, fmt: "%*pb\n", cpumask_pr_args(mask));
7318	mutex_unlock(lock: &wq_pool_mutex);
7319
7320	return written;
7321	}
7322
7323	static ssize_t cpumask_requested_show(struct device *dev,
7324	struct device_attribute attr, char* *buf)
7325	{
7326	return __wq_cpumask_show(dev, attr, buf, mask: wq_requested_unbound_cpumask);
7327	}
7328	static DEVICE_ATTR_RO(cpumask_requested);
7329
7330	static ssize_t cpumask_isolated_show(struct device *dev,
7331	struct device_attribute attr, char* *buf)
7332	{
7333	return __wq_cpumask_show(dev, attr, buf, mask: wq_isolated_cpumask);
7334	}
7335	static DEVICE_ATTR_RO(cpumask_isolated);
7336
7337	static ssize_t cpumask_show(struct device *dev,
7338	struct device_attribute attr, char* *buf)
7339	{
7340	return __wq_cpumask_show(dev, attr, buf, mask: wq_unbound_cpumask);
7341	}
7342
7343	static ssize_t cpumask_store(struct device *dev,
7344	struct device_attribute attr, const* char *buf, size_t count)
7345	{
7346	cpumask_var_t cpumask;
7347	int ret;
7348
7349	if (!zalloc_cpumask_var(mask: &cpumask, GFP_KERNEL))
7350	return -ENOMEM;
7351
7352	ret = cpumask_parse(buf, dstp: cpumask);
7353	if (!ret)
7354	ret = workqueue_set_unbound_cpumask(cpumask);
7355
7356	free_cpumask_var(mask: cpumask);
7357	return ret ? ret : count;
7358	}
7359	static DEVICE_ATTR_RW(cpumask);
7360
/* Subsystem-wide cpumask attributes, passed to subsys_virtual_register(). */
static struct attribute *wq_sysfs_cpumask_attrs[] = {
	&dev_attr_cpumask.attr,
	&dev_attr_cpumask_requested.attr,
	&dev_attr_cpumask_isolated.attr,
	NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs_cpumask);
7368
7369	static int __init wq_sysfs_init(void)
7370	{
7371	return subsys_virtual_register(subsys: &wq_subsys, groups: wq_sysfs_cpumask_groups);
7372	}
7373	core_initcall(wq_sysfs_init);
7374
7375	static void wq_device_release(struct device *dev)
7376	{
7377	struct wq_device wq_dev = container_of(dev, struct* wq_device, dev);
7378
7379	kfree(objp: wq_dev);
7380	}
7381
7382	/**
7383	* workqueue_sysfs_register - make a workqueue visible in sysfs
7384	* @wq: the workqueue to register
7385	*
7386	* Expose @wq in sysfs under /sys/bus/workqueue/devices.
7387	* alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
7388	* which is the preferred method.
7389	*
7390	* Workqueue user should use this function directly iff it wants to apply
7391	* workqueue_attrs before making the workqueue visible in sysfs; otherwise,
7392	* apply_workqueue_attrs() may race against userland updating the
7393	* attributes.
7394	*
7395	* Return: 0 on success, -errno on failure.
7396	*/
7397	int workqueue_sysfs_register(struct workqueue_struct *wq)
7398	{
7399	struct wq_device *wq_dev;
7400	int ret;
7401
7402	/*
7403	* Adjusting max_active breaks ordering guarantee. Disallow exposing
7404	* ordered workqueues.
7405	*/
7406	if (WARN_ON(wq->flags & __WQ_ORDERED))
7407	return -EINVAL;
7408
7409	wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
7410	if (!wq_dev)
7411	return -ENOMEM;
7412
7413	wq_dev->wq = wq;
7414	wq_dev->dev.bus = &wq_subsys;
7415	wq_dev->dev.release = wq_device_release;
7416	dev_set_name(dev: &wq_dev->dev, name: "%s", wq->name);
7417
7418	/*
7419	* unbound_attrs are created separately. Suppress uevent until
7420	* everything is ready.
7421	*/
7422	dev_set_uevent_suppress(dev: &wq_dev->dev, val: true);
7423
7424	ret = device_register(dev: &wq_dev->dev);
7425	if (ret) {
7426	put_device(dev: &wq_dev->dev);
7427	wq->wq_dev = NULL;
7428	return ret;
7429	}
7430
7431	if (wq->flags & WQ_UNBOUND) {
7432	struct device_attribute *attr;
7433
7434	for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
7435	ret = device_create_file(device: &wq_dev->dev, entry: attr);
7436	if (ret) {
7437	device_unregister(dev: &wq_dev->dev);
7438	wq->wq_dev = NULL;
7439	return ret;
7440	}
7441	}
7442	}
7443
7444	dev_set_uevent_suppress(dev: &wq_dev->dev, val: false);
7445	kobject_uevent(kobj: &wq_dev->dev.kobj, action: KOBJ_ADD);
7446	return `0`;
7447	}
7448
7449	/**
7450	* workqueue_sysfs_unregister - undo workqueue_sysfs_register()
7451	* @wq: the workqueue to unregister
7452	*
7453	* If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
7454	*/
7455	static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
7456	{
7457	struct wq_device *wq_dev = wq->wq_dev;
7458
7459	if (!wq->wq_dev)
7460	return;
7461
7462	wq->wq_dev = NULL;
7463	device_unregister(dev: &wq_dev->dev);
7464	}
7465	#else /* CONFIG_SYSFS */
/* Without CONFIG_SYSFS nothing is ever registered, so this does nothing. */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
7467	#endif /* CONFIG_SYSFS */
7468
7469	/*
7470	* Workqueue watchdog.
7471	*
7472	* Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
7473	* flush dependency, a concurrency managed work item which stays RUNNING
7474	* indefinitely. Workqueue stalls can be very difficult to debug as the
7475	* usual warning mechanisms don't trigger and internal workqueue state is
7476	* largely opaque.
7477	*
7478	* Workqueue watchdog monitors all worker pools periodically and dumps
7479	* state if some pools failed to make forward progress for a while where
7480	* forward progress is defined as the first item on ->worklist changing.
7481	*
7482	* This mechanism is controlled through the kernel parameter
7483	* "workqueue.watchdog_thresh" which can be updated at runtime through the
7484	* corresponding sysfs parameter file.
7485	*/
7486	#ifdef CONFIG_WQ_WATCHDOG
7487
7488	static unsigned long wq_watchdog_thresh = `30`;
7489	static struct timer_list wq_watchdog_timer;
7490
7491	static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
7492	static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
7493
7494	static unsigned int wq_panic_on_stall;
7495	module_param_named(panic_on_stall, wq_panic_on_stall, uint, `0644`);
7496
7497	/*
7498	* Show workers that might prevent the processing of pending work items.
7499	* The only candidates are CPU-bound workers in the running state.
7500	* Pending work items should be handled by another idle worker
7501	* in all other situations.
7502	*/
7503	static void show_cpu_pool_hog(struct worker_pool *pool)
7504	{
7505	struct worker *worker;
7506	unsigned long irq_flags;
7507	int bkt;
7508
7509	raw_spin_lock_irqsave(&pool->lock, irq_flags);
7510
7511	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
7512	if (task_is_running(worker->task)) {
7513	/*
7514	* Defer printing to avoid deadlocks in console
7515	* drivers that queue work while holding locks
7516	* also taken in their write paths.
7517	*/
7518	printk_deferred_enter();
7519
7520	pr_info("pool %d:\n", pool->id);
7521	sched_show_task(worker->task);
7522
7523	printk_deferred_exit();
7524	}
7525	}
7526
7527	raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
7528	}
7529
7530	static void show_cpu_pools_hogs(void)
7531	{
7532	struct worker_pool *pool;
7533	int pi;
7534
7535	pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
7536
7537	rcu_read_lock();
7538
7539	for_each_pool(pool, pi) {
7540	if (pool->cpu_stall)
7541	show_cpu_pool_hog(pool);
7542
7543	}
7544
7545	rcu_read_unlock();
7546	}
7547
7548	static void panic_on_wq_watchdog(void)
7549	{
7550	static unsigned int wq_stall;
7551
7552	if (wq_panic_on_stall) {
7553	wq_stall++;
7554	BUG_ON(wq_stall >= wq_panic_on_stall);
7555	}
7556	}
7557
7558	static void wq_watchdog_reset_touched(void)
7559	{
7560	int cpu;
7561
7562	wq_watchdog_touched = jiffies;
7563	for_each_possible_cpu(cpu)
7564	per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
7565	}
7566
7567	static void wq_watchdog_timer_fn(struct timer_list *unused)
7568	{
7569	unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
7570	bool lockup_detected = false;
7571	bool cpu_pool_stall = false;
7572	unsigned long now = jiffies;
7573	struct worker_pool *pool;
7574	int pi;
7575
7576	if (!thresh)
7577	return;
7578
7579	for_each_pool(pool, pi) {
7580	unsigned long pool_ts, touched, ts;
7581
7582	pool->cpu_stall = false;
7583	if (list_empty(&pool->worklist))
7584	continue;
7585
7586	/*
7587	* If a virtual machine is stopped by the host it can look to
7588	* the watchdog like a stall.
7589	*/
7590	kvm_check_and_clear_guest_paused();
7591
7592	/ get the latest of pool and touched timestamps /
7593	if (pool->cpu >= `0`)
7594	touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
7595	else
7596	touched = READ_ONCE(wq_watchdog_touched);
7597	pool_ts = READ_ONCE(pool->watchdog_ts);
7598
7599	if (time_after(pool_ts, touched))
7600	ts = pool_ts;
7601	else
7602	ts = touched;
7603
7604	/ did we stall? /
7605	if (time_after(now, ts + thresh)) {
7606	lockup_detected = true;
7607	if (pool->cpu >= `0` && !(pool->flags & POOL_BH)) {
7608	pool->cpu_stall = true;
7609	cpu_pool_stall = true;
7610	}
7611	pr_emerg("BUG: workqueue lockup - pool");
7612	pr_cont_pool_info(pool);
7613	pr_cont(" stuck for %us!\n",
7614	jiffies_to_msecs(now - pool_ts) / `1000`);
7615	}
7616
7617
7618	}
7619
7620	if (lockup_detected)
7621	show_all_workqueues();
7622
7623	if (cpu_pool_stall)
7624	show_cpu_pools_hogs();
7625
7626	if (lockup_detected)
7627	panic_on_wq_watchdog();
7628
7629	wq_watchdog_reset_touched();
7630	mod_timer(&wq_watchdog_timer, jiffies + thresh);
7631	}
7632
7633	notrace void wq_watchdog_touch(int cpu)
7634	{
7635	unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
7636	unsigned long touch_ts = READ_ONCE(wq_watchdog_touched);
7637	unsigned long now = jiffies;
7638
7639	if (cpu >= `0`)
7640	per_cpu(wq_watchdog_touched_cpu, cpu) = now;
7641	else
7642	WARN_ONCE(`1`, "%s should be called with valid CPU", __func__);
7643
7644	/ Don't unnecessarily store to global cacheline /
7645	if (time_after(now, touch_ts + thresh / `4`))
7646	WRITE_ONCE(wq_watchdog_touched, jiffies);
7647	}
7648
7649	static void wq_watchdog_set_thresh(unsigned long thresh)
7650	{
7651	wq_watchdog_thresh = `0`;
7652	timer_delete_sync(&wq_watchdog_timer);
7653
7654	if (thresh) {
7655	wq_watchdog_thresh = thresh;
7656	wq_watchdog_reset_touched();
7657	mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
7658	}
7659	}
7660
7661	static int wq_watchdog_param_set_thresh(const char *val,
7662	const struct kernel_param *kp)
7663	{
7664	unsigned long thresh;
7665	int ret;
7666
7667	ret = kstrtoul(val, `0`, &thresh);
7668	if (ret)
7669	return ret;
7670
7671	if (system_percpu_wq)
7672	wq_watchdog_set_thresh(thresh);
7673	else
7674	wq_watchdog_thresh = thresh;
7675
7676	return `0`;
7677	}
7678
7679	static const struct kernel_param_ops wq_watchdog_thresh_ops = {
7680	.set = wq_watchdog_param_set_thresh,
7681	.get = param_get_ulong,
7682	};
7683
7684	module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
7685	`0644`);
7686
/*
 * Set up the deferrable watchdog timer, then apply the current threshold,
 * which arms the timer when the threshold is non-zero.
 */
static void wq_watchdog_init(void)
{
	timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
	wq_watchdog_set_thresh(wq_watchdog_thresh);
}
7692
7693	#else /* CONFIG_WQ_WATCHDOG */
7694
/* No watchdog without CONFIG_WQ_WATCHDOG; nothing to initialize. */
static inline void wq_watchdog_init(void) { }
7696
7697	#endif /* CONFIG_WQ_WATCHDOG */
7698
7699	static void bh_pool_kick_normal(struct irq_work *irq_work)
7700	{
7701	raise_softirq_irqoff(nr: TASKLET_SOFTIRQ);
7702	}
7703
7704	static void bh_pool_kick_highpri(struct irq_work *irq_work)
7705	{
7706	raise_softirq_irqoff(nr: HI_SOFTIRQ);
7707	}
7708
7709	static void __init restrict_unbound_cpumask(const char name, const* struct cpumask *mask)
7710	{
7711	if (!cpumask_intersects(src1p: wq_unbound_cpumask, src2p: mask)) {
7712	pr_warn("workqueue: Restricting unbound_cpumask (%pb) with %s (%pb) leaves no CPU, ignoring\n",
7713	cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
7714	return;
7715	}
7716
7717	cpumask_and(dstp: wq_unbound_cpumask, src1p: wq_unbound_cpumask, src2p: mask);
7718	}
7719
7720	static void __init init_cpu_worker_pool(struct worker_pool pool, int* cpu, int nice)
7721	{
7722	BUG_ON(init_worker_pool(pool));
7723	pool->cpu = cpu;
7724	cpumask_copy(dstp: pool->attrs->cpumask, cpumask_of(cpu));
7725	cpumask_copy(dstp: pool->attrs->__pod_cpumask, cpumask_of(cpu));
7726	pool->attrs->nice = nice;
7727	pool->attrs->affn_strict = true;
7728	pool->node = cpu_to_node(cpu);
7729
7730	/ alloc pool ID /
7731	mutex_lock(lock: &wq_pool_mutex);
7732	BUG_ON(worker_pool_assign_id(pool));
7733	mutex_unlock(lock: &wq_pool_mutex);
7734	}
7735
7736	/**
7737	* workqueue_init_early - early init for workqueue subsystem
7738	*
7739	* This is the first step of three-staged workqueue subsystem initialization and
7740	* invoked as soon as the bare basics - memory allocation, cpumasks and idr are
7741	* up. It sets up all the data structures and system workqueues and allows early
7742	* boot code to create workqueues and queue/cancel work items. Actual work item
7743	* execution starts only after kthreads can be created and scheduled right
7744	* before early initcalls.
7745	*/
7746	void __init workqueue_init_early(void)
7747	{
7748	struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
7749	int std_nice[NR_STD_WORKER_POOLS] = { `0`, HIGHPRI_NICE_LEVEL };
7750	void (irq_work_fns[`2`])(struct* irq_work *) = { bh_pool_kick_normal,
7751	bh_pool_kick_highpri };
7752	int i, cpu;
7753
7754	BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
7755
7756	BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL));
7757	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
7758	BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
7759	BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
7760
7761	cpumask_copy(dstp: wq_online_cpumask, cpu_online_mask);
7762	cpumask_copy(dstp: wq_unbound_cpumask, cpu_possible_mask);
7763	restrict_unbound_cpumask(name: "HK_TYPE_WQ", mask: housekeeping_cpumask(type: HK_TYPE_WQ));
7764	restrict_unbound_cpumask(name: "HK_TYPE_DOMAIN", mask: housekeeping_cpumask(type: HK_TYPE_DOMAIN));
7765	if (!cpumask_empty(srcp: &wq_cmdline_cpumask))
7766	restrict_unbound_cpumask(name: "workqueue.unbound_cpus", mask: &wq_cmdline_cpumask);
7767
7768	cpumask_copy(dstp: wq_requested_unbound_cpumask, srcp: wq_unbound_cpumask);
7769	cpumask_andnot(dstp: wq_isolated_cpumask, cpu_possible_mask,
7770	src2p: housekeeping_cpumask(type: HK_TYPE_DOMAIN));
7771	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
7772
7773	unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();
7774	BUG_ON(!unbound_wq_update_pwq_attrs_buf);
7775
7776	/*
7777	* If nohz_full is enabled, set power efficient workqueue as unbound.
7778	* This allows workqueue items to be moved to HK CPUs.
7779	*/
7780	if (housekeeping_enabled(type: HK_TYPE_TICK))
7781	wq_power_efficient = true;
7782
7783	/ initialize WQ_AFFN_SYSTEM pods /
7784	pt->pod_cpus = kcalloc(`1`, sizeof(pt->pod_cpus[`0`]), GFP_KERNEL);
7785	pt->pod_node = kcalloc(`1`, sizeof(pt->pod_node[`0`]), GFP_KERNEL);
7786	pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[`0`]), GFP_KERNEL);
7787	BUG_ON(!pt->pod_cpus \|\| !pt->pod_node \|\| !pt->cpu_pod);
7788
7789	BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[`0`], GFP_KERNEL, NUMA_NO_NODE));
7790
7791	pt->nr_pods = `1`;
7792	cpumask_copy(dstp: pt->pod_cpus[`0`], cpu_possible_mask);
7793	pt->pod_node[`0`] = NUMA_NO_NODE;
7794	pt->cpu_pod[`0`] = `0`;
7795
7796	/ initialize BH and CPU pools /
7797	for_each_possible_cpu(cpu) {
7798	struct worker_pool *pool;
7799
7800	i = `0`;
7801	for_each_bh_worker_pool(pool, cpu) {
7802	init_cpu_worker_pool(pool, cpu, nice: std_nice[i]);
7803	pool->flags \|= POOL_BH;
7804	init_irq_work(work: bh_pool_irq_work(pool), func: irq_work_fns[i]);
7805	i++;
7806	}
7807
7808	i = `0`;
7809	for_each_cpu_worker_pool(pool, cpu)
7810	init_cpu_worker_pool(pool, cpu, nice: std_nice[i++]);
7811	}
7812
7813	/ create default unbound and ordered wq attrs /
7814	for (i = `0`; i < NR_STD_WORKER_POOLS; i++) {
7815	struct workqueue_attrs *attrs;
7816
7817	BUG_ON(!(attrs = alloc_workqueue_attrs()));
7818	attrs->nice = std_nice[i];
7819	unbound_std_wq_attrs[i] = attrs;
7820
7821	/*
7822	* An ordered wq should have only one pwq as ordering is
7823	* guaranteed by max_active which is enforced by pwqs.
7824	*/
7825	BUG_ON(!(attrs = alloc_workqueue_attrs()));
7826	attrs->nice = std_nice[i];
7827	attrs->ordered = true;
7828	ordered_wq_attrs[i] = attrs;
7829	}
7830
7831	system_wq = alloc_workqueue("events", WQ_PERCPU, `0`);
7832	system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, `0`);
7833	system_highpri_wq = alloc_workqueue("events_highpri",
7834	WQ_HIGHPRI \| WQ_PERCPU, `0`);
7835	system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, `0`);
7836	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
7837	system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
7838	system_freezable_wq = alloc_workqueue("events_freezable",
7839	WQ_FREEZABLE \| WQ_PERCPU, `0`);
7840	system_power_efficient_wq = alloc_workqueue("events_power_efficient",
7841	WQ_POWER_EFFICIENT \| WQ_PERCPU, `0`);
7842	system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
7843	WQ_FREEZABLE \| WQ_POWER_EFFICIENT \| WQ_PERCPU, `0`);
7844	system_bh_wq = alloc_workqueue("events_bh", WQ_BH \| WQ_PERCPU, `0`);
7845	system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
7846	WQ_BH \| WQ_HIGHPRI \| WQ_PERCPU, `0`);
7847	BUG_ON(!system_wq \|\| !system_percpu_wq\|\| !system_highpri_wq \|\| !system_long_wq \|\|
7848	!system_unbound_wq \|\| !system_freezable_wq \|\| !system_dfl_wq \|\|
7849	!system_power_efficient_wq \|\|
7850	!system_freezable_power_efficient_wq \|\|
7851	!system_bh_wq \|\| !system_bh_highpri_wq);
7852	}
7853
7854	static void __init wq_cpu_intensive_thresh_init(void)
7855	{
7856	unsigned long thresh;
7857	unsigned long bogo;
7858
7859	pwq_release_worker = kthread_run_worker(`0`, "pool_workqueue_release");
7860	BUG_ON(IS_ERR(pwq_release_worker));
7861
7862	/ if the user set it to a specific value, keep it /
7863	if (wq_cpu_intensive_thresh_us != ULONG_MAX)
7864	return;
7865
7866	/*
7867	* The default of 10ms is derived from the fact that most modern (as of
7868	* 2023) processors can do a lot in 10ms and that it's just below what
7869	* most consider human-perceivable. However, the kernel also runs on a
7870	* lot slower CPUs including microcontrollers where the threshold is way
7871	* too low.
7872	*
7873	* Let's scale up the threshold upto 1 second if BogoMips is below 4000.
7874	* This is by no means accurate but it doesn't have to be. The mechanism
7875	* is still useful even when the threshold is fully scaled up. Also, as
7876	* the reports would usually be applicable to everyone, some machines
7877	* operating on longer thresholds won't significantly diminish their
7878	* usefulness.
7879	*/
7880	thresh = `10` * USEC_PER_MSEC;
7881
7882	/ see init/calibrate.c for lpj -> BogoMIPS calculation /
7883	bogo = max_t(unsigned long, loops_per_jiffy / `500000` * HZ, `1`);
7884	if (bogo < `4000`)
7885	thresh = min_t(unsigned long, thresh * `4000` / bogo, USEC_PER_SEC);
7886
7887	pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
7888	loops_per_jiffy, bogo, thresh);
7889
7890	wq_cpu_intensive_thresh_us = thresh;
7891	}
7892
7893	/**
7894	* workqueue_init - bring workqueue subsystem fully online
7895	*
7896	* This is the second step of three-staged workqueue subsystem initialization
7897	* and invoked as soon as kthreads can be created and scheduled. Workqueues have
7898	* been created and work items queued on them, but there are no kworkers
7899	* executing the work items yet. Populate the worker pools with the initial
7900	* workers and enable future kworker creations.
7901	*/
7902	void __init workqueue_init(void)
7903	{
7904	struct workqueue_struct *wq;
7905	struct worker_pool *pool;
7906	int cpu, bkt;
7907
7908	wq_cpu_intensive_thresh_init();
7909
7910	mutex_lock(lock: &wq_pool_mutex);
7911
7912	/*
7913	* Per-cpu pools created earlier could be missing node hint. Fix them
7914	* up. Also, create a rescuer for workqueues that requested it.
7915	*/
7916	for_each_possible_cpu(cpu) {
7917	for_each_bh_worker_pool(pool, cpu)
7918	pool->node = cpu_to_node(cpu);
7919	for_each_cpu_worker_pool(pool, cpu)
7920	pool->node = cpu_to_node(cpu);
7921	}
7922
7923	list_for_each_entry(wq, &workqueues, list) {
7924	WARN(init_rescuer(wq),
7925	"workqueue: failed to create early rescuer for %s",
7926	wq->name);
7927	}
7928
7929	mutex_unlock(lock: &wq_pool_mutex);
7930
7931	/*
7932	* Create the initial workers. A BH pool has one pseudo worker that
7933	* represents the shared BH execution context and thus doesn't get
7934	* affected by hotplug events. Create the BH pseudo workers for all
7935	* possible CPUs here.
7936	*/
7937	for_each_possible_cpu(cpu)
7938	for_each_bh_worker_pool(pool, cpu)
7939	BUG_ON(!create_worker(pool));
7940
7941	for_each_online_cpu(cpu) {
7942	for_each_cpu_worker_pool(pool, cpu) {
7943	pool->flags &= ~POOL_DISASSOCIATED;
7944	BUG_ON(!create_worker(pool));
7945	}
7946	}
7947
7948	hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
7949	BUG_ON(!create_worker(pool));
7950
7951	wq_online = true;
7952	wq_watchdog_init();
7953	}
7954
7955	/*
7956	* Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to
7957	* @cpu_shares_pod(). Each subset of CPUs that share a pod is assigned a unique
7958	* and consecutive pod ID. The rest of @pt is initialized accordingly.
7959	*/
7960	static void __init init_pod_type(struct wq_pod_type *pt,
7961	bool (cpus_share_pod)(int, int*))
7962	{
7963	int cur, pre, cpu, pod;
7964
7965	pt->nr_pods = `0`;
7966
7967	/ init @pt->cpu_pod[] according to @cpus_share_pod() /
7968	pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[`0`]), GFP_KERNEL);
7969	BUG_ON(!pt->cpu_pod);
7970
7971	for_each_possible_cpu(cur) {
7972	for_each_possible_cpu(pre) {
7973	if (pre >= cur) {
7974	pt->cpu_pod[cur] = pt->nr_pods++;
7975	break;
7976	}
7977	if (cpus_share_pod(cur, pre)) {
7978	pt->cpu_pod[cur] = pt->cpu_pod[pre];
7979	break;
7980	}
7981	}
7982	}
7983
7984	/ init the rest to match @pt->cpu_pod[] /
7985	pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[`0`]), GFP_KERNEL);
7986	pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[`0`]), GFP_KERNEL);
7987	BUG_ON(!pt->pod_cpus \|\| !pt->pod_node);
7988
7989	for (pod = `0`; pod < pt->nr_pods; pod++)
7990	BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL));
7991
7992	for_each_possible_cpu(cpu) {
7993	cpumask_set_cpu(cpu, dstp: pt->pod_cpus[pt->cpu_pod[cpu]]);
7994	pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu);
7995	}
7996	}
7997
7998	static bool __init cpus_dont_share(int cpu0, int cpu1)
7999	{
8000	return false;
8001	}
8002
8003	static bool __init cpus_share_smt(int cpu0, int cpu1)
8004	{
8005	#ifdef CONFIG_SCHED_SMT
8006	return cpumask_test_cpu(cpu: cpu0, cpumask: cpu_smt_mask(cpu: cpu1));
8007	#else
8008	return false;
8009	#endif
8010	}
8011
8012	static bool __init cpus_share_numa(int cpu0, int cpu1)
8013	{
8014	return cpu_to_node(cpu: cpu0) == cpu_to_node(cpu: cpu1);
8015	}
8016
8017	/**
8018	* workqueue_init_topology - initialize CPU pods for unbound workqueues
8019	*
8020	* This is the third step of three-staged workqueue subsystem initialization and
8021	* invoked after SMP and topology information are fully initialized. It
8022	* initializes the unbound CPU pods accordingly.
8023	*/
8024	void __init workqueue_init_topology(void)
8025	{
8026	struct workqueue_struct *wq;
8027	int cpu;
8028
8029	init_pod_type(pt: &wq_pod_types[WQ_AFFN_CPU], cpus_share_pod: cpus_dont_share);
8030	init_pod_type(pt: &wq_pod_types[WQ_AFFN_SMT], cpus_share_pod: cpus_share_smt);
8031	init_pod_type(pt: &wq_pod_types[WQ_AFFN_CACHE], cpus_share_pod: cpus_share_cache);
8032	init_pod_type(pt: &wq_pod_types[WQ_AFFN_NUMA], cpus_share_pod: cpus_share_numa);
8033
8034	wq_topo_initialized = true;
8035
8036	mutex_lock(lock: &wq_pool_mutex);
8037
8038	/*
8039	* Workqueues allocated earlier would have all CPUs sharing the default
8040	* worker pool. Explicitly call unbound_wq_update_pwq() on all workqueue
8041	* and CPU combinations to apply per-pod sharing.
8042	*/
8043	list_for_each_entry(wq, &workqueues, list) {
8044	for_each_online_cpu(cpu)
8045	unbound_wq_update_pwq(wq, cpu);
8046	if (wq->flags & WQ_UNBOUND) {
8047	mutex_lock(lock: &wq->mutex);
8048	wq_update_node_max_active(wq, off_cpu: -`1`);
8049	mutex_unlock(lock: &wq->mutex);
8050	}
8051	}
8052
8053	mutex_unlock(lock: &wq_pool_mutex);
8054	}
8055
/*
 * Log a warning that flushing system-wide workqueues is going to be
 * prohibited, followed by a stack dump of the current context.
 */
void __warn_flushing_systemwide_wq(void)
{
	pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n");
	dump_stack();
}
EXPORT_SYMBOL(__warn_flushing_systemwide_wq);
8062
8063	static int __init workqueue_unbound_cpus_setup(char *str)
8064	{
8065	if (cpulist_parse(buf: str, dstp: &wq_cmdline_cpumask) < `0`) {
8066	cpumask_clear(dstp: &wq_cmdline_cpumask);
8067	pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n");
8068	}
8069
8070	return `1`;
8071	}
8072	__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup);
8073

Browse the source code of Linux/kernel/workqueue.c