syscalls.c source code [Linux/kernel/sched/syscalls.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* kernel/sched/syscalls.c
4	*
5	* Core kernel scheduler syscalls related code
6	*
7	* Copyright (C) 1991-2002 Linus Torvalds
8	* Copyright (C) 1998-2024 Ingo Molnar, Red Hat
9	*/
10	#include <linux/sched.h>
11	#include <linux/cpuset.h>
12	#include <linux/sched/debug.h>
13
14	#include <uapi/linux/sched/types.h>
15
16	#include "sched.h"
17	#include "autogroup.h"
18
19	static inline int __normal_prio(int policy, int rt_prio, int nice)
20	{
21	int prio;
22
23	if (dl_policy(policy))
24	prio = MAX_DL_PRIO - `1`;
25	else if (rt_policy(policy))
26	prio = MAX_RT_PRIO - `1` - rt_prio;
27	else
28	prio = NICE_TO_PRIO(nice);
29
30	return prio;
31	}
32
33	/*
34	* Calculate the expected normal priority: i.e. priority
35	* without taking RT-inheritance into account. Might be
36	* boosted by interactivity modifiers. Changes upon fork,
37	* setprio syscalls, and whenever the interactivity
38	* estimator recalculates.
39	*/
40	static inline int normal_prio(struct task_struct *p)
41	{
42	return __normal_prio(policy: p->policy, rt_prio: p->rt_priority, PRIO_TO_NICE(p->static_prio));
43	}
44
45	/*
46	* Calculate the current priority, i.e. the priority
47	* taken into account by the scheduler. This value might
48	* be boosted by RT tasks, or might be boosted by
49	* interactivity modifiers. Will be RT if the task got
50	* RT-boosted. If not then it returns p->normal_prio.
51	*/
52	static int effective_prio(struct task_struct *p)
53	{
54	p->normal_prio = normal_prio(p);
55	/*
56	* If we are RT tasks or we were boosted to RT priority,
57	* keep the priority unchanged. Otherwise, update priority
58	* to the normal priority:
59	*/
60	if (!rt_or_dl_prio(prio: p->prio))
61	return p->normal_prio;
62	return p->prio;
63	}
64
65	void set_user_nice(struct task_struct p, long* nice)
66	{
67	bool queued, running;
68	struct rq *rq;
69	int old_prio;
70
71	if (task_nice(p) == nice \|\| nice < MIN_NICE \|\| nice > MAX_NICE)
72	return;
73	/*
74	* We have to be careful, if called from sys_setpriority(),
75	* the task might be in the middle of scheduling on another CPU.
76	*/
77	CLASS(task_rq_lock, rq_guard)(l: p);
78	rq = rq_guard.rq;
79
80	update_rq_clock(rq);
81
82	/*
83	* The RT priorities are set via sched_setscheduler(), but we still
84	* allow the 'normal' nice value to be set - but as expected
85	* it won't have any effect on scheduling until the task is
86	* SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
87	*/
88	if (task_has_dl_policy(p) \|\| task_has_rt_policy(p)) {
89	p->static_prio = NICE_TO_PRIO(nice);
90	return;
91	}
92
93	queued = task_on_rq_queued(p);
94	running = task_current_donor(rq, p);
95	if (queued)
96	dequeue_task(rq, p, DEQUEUE_SAVE \| DEQUEUE_NOCLOCK);
97	if (running)
98	put_prev_task(rq, prev: p);
99
100	p->static_prio = NICE_TO_PRIO(nice);
101	set_load_weight(p, update_load: true);
102	old_prio = p->prio;
103	p->prio = effective_prio(p);
104
105	if (queued)
106	enqueue_task(rq, p, ENQUEUE_RESTORE \| ENQUEUE_NOCLOCK);
107	if (running)
108	set_next_task(rq, next: p);
109
110	/*
111	* If the task increased its priority or is running and
112	* lowered its priority, then reschedule its CPU:
113	*/
114	p->sched_class->prio_changed(rq, p, old_prio);
115	}
116	EXPORT_SYMBOL(set_user_nice);
117
118	/*
119	* is_nice_reduction - check if nice value is an actual reduction
120	*
121	* Similar to can_nice() but does not perform a capability check.
122	*
123	* @p: task
124	* @nice: nice value
125	*/
126	static bool is_nice_reduction(const struct task_struct p, const* int nice)
127	{
128	/ Convert nice value [19,-20] to rlimit style value [1,40]: /
129	int nice_rlim = nice_to_rlimit(nice);
130
131	return (nice_rlim <= task_rlimit(task: p, RLIMIT_NICE));
132	}
133
134	/*
135	* can_nice - check if a task can reduce its nice value
136	* @p: task
137	* @nice: nice value
138	*/
139	int can_nice(const struct task_struct p, const* int nice)
140	{
141	return is_nice_reduction(p, nice) \|\| capable(CAP_SYS_NICE);
142	}
143
144	#ifdef __ARCH_WANT_SYS_NICE
145
146	/*
147	* sys_nice - change the priority of the current process.
148	* @increment: priority increment
149	*
150	* sys_setpriority is a more generic, but much slower function that
151	* does similar things.
152	*/
153	SYSCALL_DEFINE1(nice, int, increment)
154	{
155	long nice, retval;
156
157	/*
158	* Setpriority might change our priority at the same moment.
159	* We don't have to worry. Conceptually one call occurs first
160	* and we have a single winner.
161	*/
162	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
163	nice = task_nice(current) + increment;
164
165	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
166	if (increment < `0` && !can_nice(current, nice))
167	return -EPERM;
168
169	retval = security_task_setnice(current, nice);
170	if (retval)
171	return retval;
172
173	set_user_nice(current, nice);
174	return `0`;
175	}
176
177	#endif /* __ARCH_WANT_SYS_NICE */
178
179	/**
180	* task_prio - return the priority value of a given task.
181	* @p: the task in question.
182	*
183	* Return: The priority value as seen by users in /proc.
184	*
185	* sched policy return value kernel prio user prio/nice
186	*
187	* normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19]
188	* fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99]
189	* deadline -101 -1 0
190	*/
191	int task_prio(const struct task_struct *p)
192	{
193	return p->prio - MAX_RT_PRIO;
194	}
195
196	/**
197	* idle_cpu - is a given CPU idle currently?
198	* @cpu: the processor in question.
199	*
200	* Return: 1 if the CPU is currently idle. 0 otherwise.
201	*/
202	int idle_cpu(int cpu)
203	{
204	struct rq *rq = cpu_rq(cpu);
205
206	if (rq->curr != rq->idle)
207	return `0`;
208
209	if (rq->nr_running)
210	return `0`;
211
212	if (rq->ttwu_pending)
213	return `0`;
214
215	return `1`;
216	}
217
/**
 * available_idle_cpu - is a given CPU idle for enqueuing work.
 * @cpu: the CPU in question.
 *
 * In addition to idle_cpu(), the CPU must not be reported as preempted
 * by vcpu_is_preempted().
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */
int available_idle_cpu(int cpu)
{
	if (!idle_cpu(cpu))
		return 0;

	if (vcpu_is_preempted(cpu))
		return 0;

	return 1;
}
234
235	/**
236	* idle_task - return the idle task for a given CPU.
237	* @cpu: the processor in question.
238	*
239	* Return: The idle task for the CPU @cpu.
240	*/
241	struct task_struct idle_task(int* cpu)
242	{
243	return cpu_rq(cpu)->idle;
244	}
245
246	#ifdef CONFIG_SCHED_CORE
247	int sched_core_idle_cpu(int cpu)
248	{
249	struct rq *rq = cpu_rq(cpu);
250
251	if (sched_core_enabled(rq) && rq->curr == rq->idle)
252	return `1`;
253
254	return idle_cpu(cpu);
255	}
256	#endif /* CONFIG_SCHED_CORE */
257
258	/**
259	* find_process_by_pid - find a process with a matching PID value.
260	* @pid: the pid in question.
261	*
262	* The task of @pid, if found. %NULL otherwise.
263	*/
264	static struct task_struct *find_process_by_pid(pid_t pid)
265	{
266	return pid ? find_task_by_vpid(nr: pid) : current;
267	}
268
269	static struct task_struct *find_get_task(pid_t pid)
270	{
271	struct task_struct *p;
272	guard(rcu)();
273
274	p = find_process_by_pid(pid);
275	if (likely(p))
276	get_task_struct(t: p);
277
278	return p;
279	}
280
281	DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
282	find_get_task(pid), pid_t pid)
283
/*
 * sched_setparam() passes in -1 for its policy, to let the functions
 * it calls know not to change it.
 */
#define SETPARAM_POLICY	-1
289
290	static void __setscheduler_params(struct task_struct *p,
291	const struct sched_attr *attr)
292	{
293	int policy = attr->sched_policy;
294
295	if (policy == SETPARAM_POLICY)
296	policy = p->policy;
297
298	p->policy = policy;
299
300	if (dl_policy(policy))
301	__setparam_dl(p, attr);
302	else if (fair_policy(policy))
303	__setparam_fair(p, attr);
304
305	/ rt-policy tasks do not have a timerslack /
306	if (rt_or_dl_task_policy(tsk: p)) {
307	p->timer_slack_ns = `0`;
308	} else if (p->timer_slack_ns == `0`) {
309	/ when switching back to non-rt policy, restore timerslack /
310	p->timer_slack_ns = p->default_timer_slack_ns;
311	}
312
313	/*
314	* __sched_setscheduler() ensures attr->sched_priority == 0 when
315	* !rt_policy. Always setting this ensures that things like
316	* getparam()/getattr() don't report silly values for !rt tasks.
317	*/
318	p->rt_priority = attr->sched_priority;
319	p->normal_prio = normal_prio(p);
320	set_load_weight(p, update_load: true);
321	}
322
323	/*
324	* Check the target process has a UID that matches the current process's:
325	*/
326	static bool check_same_owner(struct task_struct *p)
327	{
328	const struct cred cred = current_cred(), pcred;
329	guard(rcu)();
330
331	pcred = __task_cred(p);
332	return (uid_eq(left: cred->euid, right: pcred->euid) \|\|
333	uid_eq(left: cred->euid, right: pcred->uid));
334	}
335
336	#ifdef CONFIG_UCLAMP_TASK
337
338	static int uclamp_validate(struct task_struct *p,
339	const struct sched_attr *attr)
340	{
341	int util_min = p->uclamp_req[UCLAMP_MIN].value;
342	int util_max = p->uclamp_req[UCLAMP_MAX].value;
343
344	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
345	util_min = attr->sched_util_min;
346
347	if (util_min + `1` > SCHED_CAPACITY_SCALE + `1`)
348	return -EINVAL;
349	}
350
351	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
352	util_max = attr->sched_util_max;
353
354	if (util_max + `1` > SCHED_CAPACITY_SCALE + `1`)
355	return -EINVAL;
356	}
357
358	if (util_min != -`1` && util_max != -`1` && util_min > util_max)
359	return -EINVAL;
360
361	/*
362	* We have valid uclamp attributes; make sure uclamp is enabled.
363	*
364	* We need to do that here, because enabling static branches is a
365	* blocking operation which obviously cannot be done while holding
366	* scheduler locks.
367	*/
368	sched_uclamp_enable();
369
370	return `0`;
371	}
372
373	static bool uclamp_reset(const struct sched_attr *attr,
374	enum uclamp_id clamp_id,
375	struct uclamp_se *uc_se)
376	{
377	/ Reset on sched class change for a non user-defined clamp value. /
378	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
379	!uc_se->user_defined)
380	return true;
381
382	/ Reset on sched_util_{min,max} == -1. /
383	if (clamp_id == UCLAMP_MIN &&
384	attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
385	attr->sched_util_min == -`1`) {
386	return true;
387	}
388
389	if (clamp_id == UCLAMP_MAX &&
390	attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
391	attr->sched_util_max == -`1`) {
392	return true;
393	}
394
395	return false;
396	}
397
398	static void __setscheduler_uclamp(struct task_struct *p,
399	const struct sched_attr *attr)
400	{
401	enum uclamp_id clamp_id;
402
403	for_each_clamp_id(clamp_id) {
404	struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
405	unsigned int value;
406
407	if (!uclamp_reset(attr, clamp_id, uc_se))
408	continue;
409
410	/*
411	* RT by default have a 100% boost value that could be modified
412	* at runtime.
413	*/
414	if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
415	value = sysctl_sched_uclamp_util_min_rt_default;
416	else
417	value = uclamp_none(clamp_id);
418
419	uclamp_se_set(uc_se, value, false);
420
421	}
422
423	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
424	return;
425
426	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
427	attr->sched_util_min != -`1`) {
428	uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
429	attr->sched_util_min, true);
430	}
431
432	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
433	attr->sched_util_max != -`1`) {
434	uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
435	attr->sched_util_max, true);
436	}
437	}
438
439	#else /* !CONFIG_UCLAMP_TASK: */
440
441	static inline int uclamp_validate(struct task_struct *p,
442	const struct sched_attr *attr)
443	{
444	return -EOPNOTSUPP;
445	}
/* Without CONFIG_UCLAMP_TASK there are no clamps to update: no-op. */
static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr) { }
448	#endif /* !CONFIG_UCLAMP_TASK */
449
450	/*
451	* Allow unprivileged RT tasks to decrease priority.
452	* Only issue a capable test if needed and only once to avoid an audit
453	* event on permitted non-privileged operations:
454	*/
455	static int user_check_sched_setscheduler(struct task_struct *p,
456	const struct sched_attr *attr,
457	int policy, int reset_on_fork)
458	{
459	if (fair_policy(policy)) {
460	if (attr->sched_nice < task_nice(p) &&
461	!is_nice_reduction(p, nice: attr->sched_nice))
462	goto req_priv;
463	}
464
465	if (rt_policy(policy)) {
466	unsigned long rlim_rtprio = task_rlimit(task: p, RLIMIT_RTPRIO);
467
468	/ Can't set/change the rt policy: /
469	if (policy != p->policy && !rlim_rtprio)
470	goto req_priv;
471
472	/ Can't increase priority: /
473	if (attr->sched_priority > p->rt_priority &&
474	attr->sched_priority > rlim_rtprio)
475	goto req_priv;
476	}
477
478	/*
479	* Can't set/change SCHED_DEADLINE policy at all for now
480	* (safest behavior); in the future we would like to allow
481	* unprivileged DL tasks to increase their relative deadline
482	* or reduce their runtime (both ways reducing utilization)
483	*/
484	if (dl_policy(policy))
485	goto req_priv;
486
487	/*
488	* Treat SCHED_IDLE as nice 20. Only allow a switch to
489	* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
490	*/
491	if (task_has_idle_policy(p) && !idle_policy(policy)) {
492	if (!is_nice_reduction(p, nice: task_nice(p)))
493	goto req_priv;
494	}
495
496	/ Can't change other user's priorities: /
497	if (!check_same_owner(p))
498	goto req_priv;
499
500	/ Normal users shall not reset the sched_reset_on_fork flag: /
501	if (p->sched_reset_on_fork && !reset_on_fork)
502	goto req_priv;
503
504	return `0`;
505
506	req_priv:
507	if (!capable(CAP_SYS_NICE))
508	return -EPERM;
509
510	return `0`;
511	}
512
513	int __sched_setscheduler(struct task_struct *p,
514	const struct sched_attr *attr,
515	bool user, bool pi)
516	{
517	int oldpolicy = -`1`, policy = attr->sched_policy;
518	int retval, oldprio, newprio, queued, running;
519	const struct sched_class prev_class, next_class;
520	struct balance_callback *head;
521	struct rq_flags rf;
522	int reset_on_fork;
523	int queue_flags = DEQUEUE_SAVE \| DEQUEUE_MOVE \| DEQUEUE_NOCLOCK;
524	struct rq *rq;
525	bool cpuset_locked = false;
526
527	/ The pi code expects interrupts enabled /
528	BUG_ON(pi && in_interrupt());
529	recheck:
530	/ Double check policy once rq lock held: /
531	if (policy < `0`) {
532	reset_on_fork = p->sched_reset_on_fork;
533	policy = oldpolicy = p->policy;
534	} else {
535	reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
536
537	if (!valid_policy(policy))
538	return -EINVAL;
539	}
540
541	if (attr->sched_flags & ~(SCHED_FLAG_ALL \| SCHED_FLAG_SUGOV))
542	return -EINVAL;
543
544	/*
545	* Valid priorities for SCHED_FIFO and SCHED_RR are
546	* 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
547	* SCHED_BATCH and SCHED_IDLE is 0.
548	*/
549	if (attr->sched_priority > MAX_RT_PRIO-`1`)
550	return -EINVAL;
551	if ((dl_policy(policy) && !__checkparam_dl(attr)) \|\|
552	(rt_policy(policy) != (attr->sched_priority != `0`)))
553	return -EINVAL;
554
555	if (user) {
556	retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
557	if (retval)
558	return retval;
559
560	if (attr->sched_flags & SCHED_FLAG_SUGOV)
561	return -EINVAL;
562
563	retval = security_task_setscheduler(p);
564	if (retval)
565	return retval;
566	}
567
568	/ Update task specific "requested" clamps /
569	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
570	retval = uclamp_validate(p, attr);
571	if (retval)
572	return retval;
573	}
574
575	/*
576	* SCHED_DEADLINE bandwidth accounting relies on stable cpusets
577	* information.
578	*/
579	if (dl_policy(policy) \|\| dl_policy(policy: p->policy)) {
580	cpuset_locked = true;
581	cpuset_lock();
582	}
583
584	/*
585	* Make sure no PI-waiters arrive (or leave) while we are
586	* changing the priority of the task:
587	*
588	* To be able to change p->policy safely, the appropriate
589	* runqueue lock must be held.
590	*/
591	rq = task_rq_lock(p, rf: &rf);
592	update_rq_clock(rq);
593
594	/*
595	* Changing the policy of the stop threads its a very bad idea:
596	*/
597	if (p == rq->stop) {
598	retval = -EINVAL;
599	goto unlock;
600	}
601
602	retval = scx_check_setscheduler(p, policy);
603	if (retval)
604	goto unlock;
605
606	/*
607	* If not changing anything there's no need to proceed further,
608	* but store a possible modification of reset_on_fork.
609	*/
610	if (unlikely(policy == p->policy)) {
611	if (fair_policy(policy) &&
612	(attr->sched_nice != task_nice(p) \|\|
613	(attr->sched_runtime != p->se.slice)))
614	goto change;
615	if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
616	goto change;
617	if (dl_policy(policy) && dl_param_changed(p, attr))
618	goto change;
619	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
620	goto change;
621
622	p->sched_reset_on_fork = reset_on_fork;
623	retval = `0`;
624	goto unlock;
625	}
626	change:
627
628	if (user) {
629	#ifdef CONFIG_RT_GROUP_SCHED
630	/*
631	* Do not allow real-time tasks into groups that have no runtime
632	* assigned.
633	*/
634	if (rt_group_sched_enabled() &&
635	rt_bandwidth_enabled() && rt_policy(policy) &&
636	task_group(p)->rt_bandwidth.rt_runtime == `0` &&
637	!task_group_is_autogroup(task_group(p))) {
638	retval = -EPERM;
639	goto unlock;
640	}
641	#endif /* CONFIG_RT_GROUP_SCHED */
642	if (dl_bandwidth_enabled() && dl_policy(policy) &&
643	!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
644	cpumask_t *span = rq->rd->span;
645
646	/*
647	* Don't allow tasks with an affinity mask smaller than
648	* the entire root_domain to become SCHED_DEADLINE. We
649	* will also fail if there's no bandwidth available.
650	*/
651	if (!cpumask_subset(src1p: span, src2p: p->cpus_ptr) \|\|
652	rq->rd->dl_bw.bw == `0`) {
653	retval = -EPERM;
654	goto unlock;
655	}
656	}
657	}
658
659	/ Re-check policy now with rq lock held: /
660	if (unlikely(oldpolicy != -`1` && oldpolicy != p->policy)) {
661	policy = oldpolicy = -`1`;
662	task_rq_unlock(rq, p, rf: &rf);
663	if (cpuset_locked)
664	cpuset_unlock();
665	goto recheck;
666	}
667
668	/*
669	* If setscheduling to SCHED_DEADLINE (or changing the parameters
670	* of a SCHED_DEADLINE task) we need to check if enough bandwidth
671	* is available.
672	*/
673	if ((dl_policy(policy) \|\| dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
674	retval = -EBUSY;
675	goto unlock;
676	}
677
678	p->sched_reset_on_fork = reset_on_fork;
679	oldprio = p->prio;
680
681	newprio = __normal_prio(policy, rt_prio: attr->sched_priority, nice: attr->sched_nice);
682	if (pi) {
683	/*
684	* Take priority boosted tasks into account. If the new
685	* effective priority is unchanged, we just store the new
686	* normal parameters and do not touch the scheduler class and
687	* the runqueue. This will be done when the task deboost
688	* itself.
689	*/
690	newprio = rt_effective_prio(p, prio: newprio);
691	if (newprio == oldprio)
692	queue_flags &= ~DEQUEUE_MOVE;
693	}
694
695	prev_class = p->sched_class;
696	next_class = __setscheduler_class(policy, prio: newprio);
697
698	if (prev_class != next_class && p->se.sched_delayed)
699	dequeue_task(rq, p, DEQUEUE_SLEEP \| DEQUEUE_DELAYED \| DEQUEUE_NOCLOCK);
700
701	queued = task_on_rq_queued(p);
702	running = task_current_donor(rq, p);
703	if (queued)
704	dequeue_task(rq, p, flags: queue_flags);
705	if (running)
706	put_prev_task(rq, prev: p);
707
708	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
709	__setscheduler_params(p, attr);
710	p->sched_class = next_class;
711	p->prio = newprio;
712	}
713	__setscheduler_uclamp(p, attr);
714	check_class_changing(rq, p, prev_class);
715
716	if (queued) {
717	/*
718	* We enqueue to tail when the priority of a task is
719	* increased (user space view).
720	*/
721	if (oldprio < p->prio)
722	queue_flags \|= ENQUEUE_HEAD;
723
724	enqueue_task(rq, p, flags: queue_flags);
725	}
726	if (running)
727	set_next_task(rq, next: p);
728
729	check_class_changed(rq, p, prev_class, oldprio);
730
731	/ Avoid rq from going away on us: /
732	preempt_disable();
733	head = splice_balance_callbacks(rq);
734	task_rq_unlock(rq, p, rf: &rf);
735
736	if (pi) {
737	if (cpuset_locked)
738	cpuset_unlock();
739	rt_mutex_adjust_pi(p);
740	}
741
742	/ Run balance callbacks after we've adjusted the PI chain: /
743	balance_callbacks(rq, head);
744	preempt_enable();
745
746	return `0`;
747
748	unlock:
749	task_rq_unlock(rq, p, rf: &rf);
750	if (cpuset_locked)
751	cpuset_unlock();
752	return retval;
753	}
754
755	static int _sched_setscheduler(struct task_struct p, int* policy,
756	const struct sched_param *param, bool check)
757	{
758	struct sched_attr attr = {
759	.sched_policy = policy,
760	.sched_priority = param->sched_priority,
761	.sched_nice = PRIO_TO_NICE(p->static_prio),
762	};
763
764	if (p->se.custom_slice)
765	attr.sched_runtime = p->se.slice;
766
767	/ Fixup the legacy SCHED_RESET_ON_FORK hack. /
768	if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
769	attr.sched_flags \|= SCHED_FLAG_RESET_ON_FORK;
770	policy &= ~SCHED_RESET_ON_FORK;
771	attr.sched_policy = policy;
772	}
773
774	return __sched_setscheduler(p, attr: &attr, user: check, pi: true);
775	}
776	/**
777	* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
778	* @p: the task in question.
779	* @policy: new policy.
780	* @param: structure containing the new RT priority.
781	*
782	* Use sched_set_fifo(), read its comment.
783	*
784	* Return: 0 on success. An error code otherwise.
785	*
786	* NOTE that the task may be already dead.
787	*/
788	int sched_setscheduler(struct task_struct p, int* policy,
789	const struct sched_param *param)
790	{
791	return _sched_setscheduler(p, policy, param, check: true);
792	}
793
794	int sched_setattr(struct task_struct p, const* struct sched_attr *attr)
795	{
796	return __sched_setscheduler(p, attr, user: true, pi: true);
797	}
798
799	int sched_setattr_nocheck(struct task_struct p, const* struct sched_attr *attr)
800	{
801	return __sched_setscheduler(p, attr, user: false, pi: true);
802	}
803	EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
804
805	/**
806	* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernel-space.
807	* @p: the task in question.
808	* @policy: new policy.
809	* @param: structure containing the new RT priority.
810	*
811	* Just like sched_setscheduler, only don't bother checking if the
812	* current context has permission. For example, this is needed in
813	* stop_machine(): we create temporary high priority worker threads,
814	* but our caller might not have that capability.
815	*
816	* Return: 0 on success. An error code otherwise.
817	*/
818	int sched_setscheduler_nocheck(struct task_struct p, int* policy,
819	const struct sched_param *param)
820	{
821	return _sched_setscheduler(p, policy, param, check: false);
822	}
823
824	/*
825	* SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
826	* incapable of resource management, which is the one thing an OS really should
827	* be doing.
828	*
829	* This is of course the reason it is limited to privileged users only.
830	*
831	* Worse still; it is fundamentally impossible to compose static priority
832	* workloads. You cannot take two correctly working static prio workloads
833	* and smash them together and still expect them to work.
834	*
835	* For this reason 'all' FIFO tasks the kernel creates are basically at:
836	*
837	* MAX_RT_PRIO / 2
838	*
839	* The administrator _MUST_ configure the system, the kernel simply doesn't
840	* know enough information to make a sensible choice.
841	*/
842	void sched_set_fifo(struct task_struct *p)
843	{
844	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / `2` };
845	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != `0`);
846	}
847	EXPORT_SYMBOL_GPL(sched_set_fifo);
848
849	/*
850	* For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
851	*/
852	void sched_set_fifo_low(struct task_struct *p)
853	{
854	struct sched_param sp = { .sched_priority = `1` };
855	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != `0`);
856	}
857	EXPORT_SYMBOL_GPL(sched_set_fifo_low);
858
859	void sched_set_normal(struct task_struct p, int* nice)
860	{
861	struct sched_attr attr = {
862	.sched_policy = SCHED_NORMAL,
863	.sched_nice = nice,
864	};
865	WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != `0`);
866	}
867	EXPORT_SYMBOL_GPL(sched_set_normal);
868
869	static int
870	do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
871	{
872	struct sched_param lparam;
873
874	if (unlikely(!param \|\| pid < `0`))
875	return -EINVAL;
876	if (copy_from_user(to: &lparam, from: param, n: sizeof(struct sched_param)))
877	return -EFAULT;
878
879	CLASS(find_get_task, p)(pid);
880	if (!p)
881	return -ESRCH;
882
883	return sched_setscheduler(p, policy, param: &lparam);
884	}
885
886	/*
887	* Mimics kernel/events/core.c perf_copy_attr().
888	*/
889	static int sched_copy_attr(struct sched_attr __user uattr, struct* sched_attr *attr)
890	{
891	u32 size;
892	int ret;
893
894	/ Zero the full structure, so that a short copy will be nice: /
895	memset(s: attr, c: `0`, n: sizeof(*attr));
896
897	ret = get_user(size, &uattr->size);
898	if (ret)
899	return ret;
900
901	/ ABI compatibility quirk: /
902	if (!size)
903	size = SCHED_ATTR_SIZE_VER0;
904	if (size < SCHED_ATTR_SIZE_VER0 \|\| size > PAGE_SIZE)
905	goto err_size;
906
907	ret = copy_struct_from_user(dst: attr, ksize: sizeof(*attr), src: uattr, usize: size);
908	if (ret) {
909	if (ret == -E2BIG)
910	goto err_size;
911	return ret;
912	}
913
914	if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
915	size < SCHED_ATTR_SIZE_VER1)
916	return -EINVAL;
917
918	/*
919	* XXX: Do we want to be lenient like existing syscalls; or do we want
920	* to be strict and return an error on out-of-bounds values?
921	*/
922	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
923
924	return `0`;
925
926	err_size:
927	put_user(sizeof(*attr), &uattr->size);
928	return -E2BIG;
929	}
930
931	static void get_params(struct task_struct p, struct* sched_attr *attr)
932	{
933	if (task_has_dl_policy(p)) {
934	__getparam_dl(p, attr);
935	} else if (task_has_rt_policy(p)) {
936	attr->sched_priority = p->rt_priority;
937	} else {
938	attr->sched_nice = task_nice(p);
939	attr->sched_runtime = p->se.slice;
940	}
941	}
942
943	/**
944	* sys_sched_setscheduler - set/change the scheduler policy and RT priority
945	* @pid: the pid in question.
946	* @policy: new policy.
947	* @param: structure containing the new RT priority.
948	*
949	* Return: 0 on success. An error code otherwise.
950	*/
951	SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
952	{
953	if (policy < `0`)
954	return -EINVAL;
955
956	return do_sched_setscheduler(pid, policy, param);
957	}
958
959	/**
960	* sys_sched_setparam - set/change the RT priority of a thread
961	* @pid: the pid in question.
962	* @param: structure containing the new RT priority.
963	*
964	* Return: 0 on success. An error code otherwise.
965	*/
966	SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
967	{
968	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
969	}
970
971	/**
972	* sys_sched_setattr - same as above, but with extended sched_attr
973	* @pid: the pid in question.
974	* @uattr: structure containing the extended parameters.
975	* @flags: for future extension.
976	*/
977	SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
978	unsigned int, flags)
979	{
980	struct sched_attr attr;
981	int retval;
982
983	if (unlikely(!uattr \|\| pid < `0` \|\| flags))
984	return -EINVAL;
985
986	retval = sched_copy_attr(uattr, attr: &attr);
987	if (retval)
988	return retval;
989
990	if ((int)attr.sched_policy < `0`)
991	return -EINVAL;
992	if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
993	attr.sched_policy = SETPARAM_POLICY;
994
995	CLASS(find_get_task, p)(pid);
996	if (!p)
997	return -ESRCH;
998
999	if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
1000	get_params(p, attr: &attr);
1001
1002	return sched_setattr(p, attr: &attr);
1003	}
1004
1005	/**
1006	* sys_sched_getscheduler - get the policy (scheduling class) of a thread
1007	* @pid: the pid in question.
1008	*
1009	* Return: On success, the policy of the thread. Otherwise, a negative error
1010	* code.
1011	*/
1012	SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
1013	{
1014	struct task_struct *p;
1015	int retval;
1016
1017	if (pid < `0`)
1018	return -EINVAL;
1019
1020	guard(rcu)();
1021	p = find_process_by_pid(pid);
1022	if (!p)
1023	return -ESRCH;
1024
1025	retval = security_task_getscheduler(p);
1026	if (!retval) {
1027	retval = p->policy;
1028	if (p->sched_reset_on_fork)
1029	retval \|= SCHED_RESET_ON_FORK;
1030	}
1031	return retval;
1032	}
1033
1034	/**
1035	* sys_sched_getparam - get the RT priority of a thread
1036	* @pid: the pid in question.
1037	* @param: structure containing the RT priority.
1038	*
1039	* Return: On success, 0 and the RT priority is in @param. Otherwise, an error
1040	* code.
1041	*/
1042	SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
1043	{
1044	struct sched_param lp = { .sched_priority = `0` };
1045	struct task_struct *p;
1046	int retval;
1047
1048	if (unlikely(!param \|\| pid < `0`))
1049	return -EINVAL;
1050
1051	scoped_guard (rcu) {
1052	p = find_process_by_pid(pid);
1053	if (!p)
1054	return -ESRCH;
1055
1056	retval = security_task_getscheduler(p);
1057	if (retval)
1058	return retval;
1059
1060	if (task_has_rt_policy(p))
1061	lp.sched_priority = p->rt_priority;
1062	}
1063
1064	/*
1065	* This one might sleep, we cannot do it with a spinlock held ...
1066	*/
1067	return copy_to_user(to: param, from: &lp, n: sizeof(*param)) ? -EFAULT : `0`;
1068	}
1069
1070	/**
1071	* sys_sched_getattr - similar to sched_getparam, but with sched_attr
1072	* @pid: the pid in question.
1073	* @uattr: structure containing the extended parameters.
1074	* @usize: sizeof(attr) for fwd/bwd comp.
1075	* @flags: for future extension.
1076	*/
1077	SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
1078	unsigned int, usize, unsigned int, flags)
1079	{
1080	struct sched_attr kattr = { };
1081	struct task_struct *p;
1082	int retval;
1083
1084	if (unlikely(!uattr \|\| pid < `0` \|\| usize > PAGE_SIZE \|\|
1085	usize < SCHED_ATTR_SIZE_VER0 \|\| flags))
1086	return -EINVAL;
1087
1088	scoped_guard (rcu) {
1089	p = find_process_by_pid(pid);
1090	if (!p)
1091	return -ESRCH;
1092
1093	retval = security_task_getscheduler(p);
1094	if (retval)
1095	return retval;
1096
1097	kattr.sched_policy = p->policy;
1098	if (p->sched_reset_on_fork)
1099	kattr.sched_flags \|= SCHED_FLAG_RESET_ON_FORK;
1100	get_params(p, attr: &kattr);
1101	kattr.sched_flags &= SCHED_FLAG_ALL;
1102
1103	#ifdef CONFIG_UCLAMP_TASK
1104	/*
1105	* This could race with another potential updater, but this is fine
1106	* because it'll correctly read the old or the new value. We don't need
1107	* to guarantee who wins the race as long as it doesn't return garbage.
1108	*/
1109	kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
1110	kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
1111	#endif
1112	}
1113
1114	kattr.size = min(usize, sizeof(kattr));
1115	return copy_struct_to_user(dst: uattr, usize, src: &kattr, ksize: sizeof(kattr), NULL);
1116	}
1117
1118	int dl_task_check_affinity(struct task_struct p, const* struct cpumask *mask)
1119	{
1120	/*
1121	* If the task isn't a deadline task or admission control is
1122	* disabled then we don't care about affinity changes.
1123	*/
1124	if (!task_has_dl_policy(p) \|\| !dl_bandwidth_enabled())
1125	return `0`;
1126
1127	/*
1128	* The special/sugov task isn't part of regular bandwidth/admission
1129	* control so let userspace change affinities.
1130	*/
1131	if (dl_entity_is_special(dl_se: &p->dl))
1132	return `0`;
1133
1134	/*
1135	* Since bandwidth control happens on root_domain basis,
1136	* if admission test is enabled, we only admit -deadline
1137	* tasks allowed to run on all the CPUs in the task's
1138	* root_domain.
1139	*/
1140	guard(rcu)();
1141	if (!cpumask_subset(task_rq(p)->rd->span, src2p: mask))
1142	return -EBUSY;
1143
1144	return `0`;
1145	}
1146
1147	int __sched_setaffinity(struct task_struct p, struct* affinity_context *ctx)
1148	{
1149	int retval;
1150	cpumask_var_t cpus_allowed, new_mask;
1151
1152	if (!alloc_cpumask_var(mask: &cpus_allowed, GFP_KERNEL))
1153	return -ENOMEM;
1154
1155	if (!alloc_cpumask_var(mask: &new_mask, GFP_KERNEL)) {
1156	retval = -ENOMEM;
1157	goto out_free_cpus_allowed;
1158	}
1159
1160	cpuset_cpus_allowed(p, mask: cpus_allowed);
1161	cpumask_and(dstp: new_mask, src1p: ctx->new_mask, src2p: cpus_allowed);
1162
1163	ctx->new_mask = new_mask;
1164	ctx->flags \|= SCA_CHECK;
1165
1166	retval = dl_task_check_affinity(p, mask: new_mask);
1167	if (retval)
1168	goto out_free_new_mask;
1169
1170	retval = __set_cpus_allowed_ptr(p, ctx);
1171	if (retval)
1172	goto out_free_new_mask;
1173
1174	cpuset_cpus_allowed(p, mask: cpus_allowed);
1175	if (!cpumask_subset(src1p: new_mask, src2p: cpus_allowed)) {
1176	/*
1177	* We must have raced with a concurrent cpuset update.
1178	* Just reset the cpumask to the cpuset's cpus_allowed.
1179	*/
1180	cpumask_copy(dstp: new_mask, srcp: cpus_allowed);
1181
1182	/*
1183	* If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
1184	* will restore the previous user_cpus_ptr value.
1185	*
1186	* In the unlikely event a previous user_cpus_ptr exists,
1187	* we need to further restrict the mask to what is allowed
1188	* by that old user_cpus_ptr.
1189	*/
1190	if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
1191	bool empty = !cpumask_and(dstp: new_mask, src1p: new_mask,
1192	src2p: ctx->user_mask);
1193
1194	if (empty)
1195	cpumask_copy(dstp: new_mask, srcp: cpus_allowed);
1196	}
1197	__set_cpus_allowed_ptr(p, ctx);
1198	retval = -EINVAL;
1199	}
1200
1201	out_free_new_mask:
1202	free_cpumask_var(mask: new_mask);
1203	out_free_cpus_allowed:
1204	free_cpumask_var(mask: cpus_allowed);
1205	return retval;
1206	}
1207
1208	long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
1209	{
1210	struct affinity_context ac;
1211	struct cpumask *user_mask;
1212	int retval;
1213
1214	CLASS(find_get_task, p)(pid);
1215	if (!p)
1216	return -ESRCH;
1217
1218	if (p->flags & PF_NO_SETAFFINITY)
1219	return -EINVAL;
1220
1221	if (!check_same_owner(p)) {
1222	guard(rcu)();
1223	if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
1224	return -EPERM;
1225	}
1226
1227	retval = security_task_setscheduler(p);
1228	if (retval)
1229	return retval;
1230
1231	/*
1232	* With non-SMP configs, user_cpus_ptr/user_mask isn't used and
1233	* alloc_user_cpus_ptr() returns NULL.
1234	*/
1235	user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
1236	if (user_mask) {
1237	cpumask_copy(dstp: user_mask, srcp: in_mask);
1238	} else {
1239	return -ENOMEM;
1240	}
1241
1242	ac = (struct affinity_context){
1243	.new_mask = in_mask,
1244	.user_mask = user_mask,
1245	.flags = SCA_USER,
1246	};
1247
1248	retval = __sched_setaffinity(p, ctx: &ac);
1249	kfree(objp: ac.user_mask);
1250
1251	return retval;
1252	}
1253
1254	static int get_user_cpu_mask(unsigned long __user user_mask_ptr, unsigned* len,
1255	struct cpumask *new_mask)
1256	{
1257	if (len < cpumask_size())
1258	cpumask_clear(dstp: new_mask);
1259	else if (len > cpumask_size())
1260	len = cpumask_size();
1261
1262	return copy_from_user(to: new_mask, from: user_mask_ptr, n: len) ? -EFAULT : `0`;
1263	}
1264
1265	/**
1266	* sys_sched_setaffinity - set the CPU affinity of a process
1267	* @pid: pid of the process
1268	* @len: length in bytes of the bitmask pointed to by user_mask_ptr
1269	* @user_mask_ptr: user-space pointer to the new CPU mask
1270	*
1271	* Return: 0 on success. An error code otherwise.
1272	*/
1273	SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
1274	unsigned long __user *, user_mask_ptr)
1275	{
1276	cpumask_var_t new_mask;
1277	int retval;
1278
1279	if (!alloc_cpumask_var(mask: &new_mask, GFP_KERNEL))
1280	return -ENOMEM;
1281
1282	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
1283	if (retval == `0`)
1284	retval = sched_setaffinity(pid, in_mask: new_mask);
1285	free_cpumask_var(mask: new_mask);
1286	return retval;
1287	}
1288
1289	long sched_getaffinity(pid_t pid, struct cpumask *mask)
1290	{
1291	struct task_struct *p;
1292	int retval;
1293
1294	guard(rcu)();
1295	p = find_process_by_pid(pid);
1296	if (!p)
1297	return -ESRCH;
1298
1299	retval = security_task_getscheduler(p);
1300	if (retval)
1301	return retval;
1302
1303	guard(raw_spinlock_irqsave)(l: &p->pi_lock);
1304	cpumask_and(dstp: mask, src1p: &p->cpus_mask, cpu_active_mask);
1305
1306	return `0`;
1307	}
1308
1309	/**
1310	* sys_sched_getaffinity - get the CPU affinity of a process
1311	* @pid: pid of the process
1312	* @len: length in bytes of the bitmask pointed to by user_mask_ptr
1313	* @user_mask_ptr: user-space pointer to hold the current CPU mask
1314	*
1315	* Return: size of CPU mask copied to user_mask_ptr on success. An
1316	* error code otherwise.
1317	*/
1318	SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
1319	unsigned long __user *, user_mask_ptr)
1320	{
1321	int ret;
1322	cpumask_var_t mask;
1323
1324	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
1325	return -EINVAL;
1326	if (len & (sizeof(unsigned long)-`1`))
1327	return -EINVAL;
1328
1329	if (!zalloc_cpumask_var(mask: &mask, GFP_KERNEL))
1330	return -ENOMEM;
1331
1332	ret = sched_getaffinity(pid, mask);
1333	if (ret == `0`) {
1334	unsigned int retlen = min(len, cpumask_size());
1335
1336	if (copy_to_user(to: user_mask_ptr, cpumask_bits(mask), n: retlen))
1337	ret = -EFAULT;
1338	else
1339	ret = retlen;
1340	}
1341	free_cpumask_var(mask);
1342
1343	return ret;
1344	}
1345
1346	static void do_sched_yield(void)
1347	{
1348	struct rq_flags rf;
1349	struct rq *rq;
1350
1351	rq = this_rq_lock_irq(rf: &rf);
1352
1353	schedstat_inc(rq->yld_count);
1354	current->sched_class->yield_task(rq);
1355
1356	preempt_disable();
1357	rq_unlock_irq(rq, rf: &rf);
1358	sched_preempt_enable_no_resched();
1359
1360	schedule();
1361	}
1362
/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
	do_sched_yield();
	return 0;
}
1376
1377	/**
1378	* yield - yield the current processor to other threads.
1379	*
1380	* Do not ever use this function, there's a 99% chance you're doing it wrong.
1381	*
1382	* The scheduler is at all times free to pick the calling task as the most
1383	* eligible task to run, if removing the yield() call from your code breaks
1384	* it, it's already broken.
1385	*
1386	* Typical broken usage is:
1387	*
1388	* while (!event)
1389	* yield();
1390	*
1391	* where one assumes that yield() will let 'the other' process run that will
1392	* make event true. If the current task is a SCHED_FIFO task that will never
1393	* happen. Never use yield() as a progress guarantee!!
1394	*
1395	* If you want to use yield() to wait for something, use wait_event().
1396	* If you want to use yield() to be 'nice' for others, use cond_resched().
1397	* If you still want to use yield(), do not!
1398	*/
1399	void __sched yield(void)
1400	{
1401	set_current_state(TASK_RUNNING);
1402	do_sched_yield();
1403	}
1404	EXPORT_SYMBOL(yield);
1405
1406	/**
1407	* yield_to - yield the current processor to another thread in
1408	* your thread group, or accelerate that thread toward the
1409	* processor it's on.
1410	* @p: target task
1411	* @preempt: whether task preemption is allowed or not
1412	*
1413	* It's the caller's job to ensure that the target task struct
1414	* can't go away on us before we can do any checks.
1415	*
1416	* Return:
1417	* true (>0) if we indeed boosted the target task.
1418	* false (0) if we failed to boost the target.
1419	* -ESRCH if there's no task to yield to.
1420	*/
1421	int __sched yield_to(struct task_struct *p, bool preempt)
1422	{
1423	struct task_struct *curr = current;
1424	struct rq rq, p_rq;
1425	int yielded = `0`;
1426
1427	scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
1428	rq = this_rq();
1429
1430	again:
1431	p_rq = task_rq(p);
1432	/*
1433	* If we're the only runnable task on the rq and target rq also
1434	* has only one task, there's absolutely no point in yielding.
1435	*/
1436	if (rq->nr_running == `1` && p_rq->nr_running == `1`)
1437	return -ESRCH;
1438
1439	guard(double_rq_lock)(lock: rq, lock2: p_rq);
1440	if (task_rq(p) != p_rq)
1441	goto again;
1442
1443	if (!curr->sched_class->yield_to_task)
1444	return `0`;
1445
1446	if (curr->sched_class != p->sched_class)
1447	return `0`;
1448
1449	if (task_on_cpu(rq: p_rq, p) \|\| !task_is_running(p))
1450	return `0`;
1451
1452	yielded = curr->sched_class->yield_to_task(rq, p);
1453	if (yielded) {
1454	schedstat_inc(rq->yld_count);
1455	/*
1456	* Make p's CPU reschedule; pick_next_entity
1457	* takes care of fairness.
1458	*/
1459	if (preempt && rq != p_rq)
1460	resched_curr(rq: p_rq);
1461	}
1462	}
1463
1464	if (yielded)
1465	schedule();
1466
1467	return yielded;
1468	}
1469	EXPORT_SYMBOL_GPL(yield_to);
1470
1471	/**
1472	* sys_sched_get_priority_max - return maximum RT priority.
1473	* @policy: scheduling class.
1474	*
1475	* Return: On success, this syscall returns the maximum
1476	* rt_priority that can be used by a given scheduling class.
1477	* On failure, a negative error code is returned.
1478	*/
1479	SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
1480	{
1481	int ret = -EINVAL;
1482
1483	switch (policy) {
1484	case SCHED_FIFO:
1485	case SCHED_RR:
1486	ret = MAX_RT_PRIO-`1`;
1487	break;
1488	case SCHED_DEADLINE:
1489	case SCHED_NORMAL:
1490	case SCHED_BATCH:
1491	case SCHED_IDLE:
1492	case SCHED_EXT:
1493	ret = `0`;
1494	break;
1495	}
1496	return ret;
1497	}
1498
1499	/**
1500	* sys_sched_get_priority_min - return minimum RT priority.
1501	* @policy: scheduling class.
1502	*
1503	* Return: On success, this syscall returns the minimum
1504	* rt_priority that can be used by a given scheduling class.
1505	* On failure, a negative error code is returned.
1506	*/
1507	SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
1508	{
1509	int ret = -EINVAL;
1510
1511	switch (policy) {
1512	case SCHED_FIFO:
1513	case SCHED_RR:
1514	ret = `1`;
1515	break;
1516	case SCHED_DEADLINE:
1517	case SCHED_NORMAL:
1518	case SCHED_BATCH:
1519	case SCHED_IDLE:
1520	case SCHED_EXT:
1521	ret = `0`;
1522	}
1523	return ret;
1524	}
1525
1526	static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
1527	{
1528	unsigned int time_slice = `0`;
1529	int retval;
1530
1531	if (pid < `0`)
1532	return -EINVAL;
1533
1534	scoped_guard (rcu) {
1535	struct task_struct *p = find_process_by_pid(pid);
1536	if (!p)
1537	return -ESRCH;
1538
1539	retval = security_task_getscheduler(p);
1540	if (retval)
1541	return retval;
1542
1543	scoped_guard (task_rq_lock, p) {
1544	struct rq *rq = scope.rq;
1545	if (p->sched_class->get_rr_interval)
1546	time_slice = p->sched_class->get_rr_interval(rq, p);
1547	}
1548	}
1549
1550	jiffies_to_timespec64(jiffies: time_slice, value: t);
1551	return `0`;
1552	}
1553
1554	/**
1555	* sys_sched_rr_get_interval - return the default time-slice of a process.
1556	* @pid: pid of the process.
1557	* @interval: userspace pointer to the time-slice value.
1558	*
1559	* this syscall writes the default time-slice value of a given process
1560	* into the user-space timespec buffer. A value of '0' means infinity.
1561	*
1562	* Return: On success, 0 and the time-slice is in @interval. Otherwise,
1563	* an error code.
1564	*/
1565	SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
1566	struct __kernel_timespec __user *, interval)
1567	{
1568	struct timespec64 t;
1569	int retval = sched_rr_get_interval(pid, t: &t);
1570
1571	if (retval == `0`)
1572	retval = put_timespec64(ts: &t, uts: interval);
1573
1574	return retval;
1575	}
1576
#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Variant of sys_sched_rr_get_interval() that writes the time slice to user
 * space as a struct old_timespec32.
 */
SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
		struct old_timespec32 __user *, interval)
{
	struct timespec64 t;
	int retval = sched_rr_get_interval(pid, &t);

	/* Only touch the user buffer when the lookup succeeded. */
	if (retval == 0)
		retval = put_old_timespec32(&t, interval);
	return retval;
}
#endif
1589

Browse the source code of Linux/kernel/sched/syscalls.c