| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | 
|---|
| 2 | /* | 
|---|
| 3 | * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst | 
|---|
| 4 | * | 
|---|
| 5 | * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. | 
|---|
| 6 | * Copyright (c) 2022 Tejun Heo <tj@kernel.org> | 
|---|
| 7 | * Copyright (c) 2022 David Vernet <dvernet@meta.com> | 
|---|
| 8 | */ | 
|---|
| 9 | #ifndef _LINUX_SCHED_EXT_H | 
|---|
| 10 | #define _LINUX_SCHED_EXT_H | 
|---|
| 11 |  | 
|---|
| 12 | #ifdef CONFIG_SCHED_CLASS_EXT | 
|---|
| 13 |  | 
|---|
| 14 | #include <linux/llist.h> | 
|---|
| 15 | #include <linux/rhashtable-types.h> | 
|---|
| 16 |  | 
|---|
/* constants shared with BPF schedulers and userspace tooling */
enum scx_public_consts {
	SCX_OPS_NAME_LEN	= 128,	/* max length of a sched_ext_ops name */

	SCX_SLICE_DFL		= 20 * 1000000,	/* default slice: 20ms in nsecs */
	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
};
|---|
| 23 |  | 
|---|
/*
 * DSQ (dispatch queue) IDs are 64bit of the format:
 *
 *   Bits: [63] [62 ..  0]
 *         [ B] [   ID   ]
 *
 *    B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
 *   ID: 63 bit ID
 *
 * Built-in IDs:
 *
 *   Bits: [63] [62] [61..32] [31 ..  0]
 *         [ 1] [ L] [   R  ] [    V   ]
 *
 *    1: 1 for built-in DSQs.
 *    L: 1 for LOCAL_ON DSQ IDs, 0 for others
 *    R: NOTE(review): not interpreted by these definitions — presumably
 *       reserved; confirm against the DSQ ID decoding in kernel/sched/ext.c.
 *    V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
 */
enum scx_dsq_id_flags {
	SCX_DSQ_FLAG_BUILTIN	= 1LLU << 63,	/* B bit above */
	SCX_DSQ_FLAG_LOCAL_ON	= 1LLU << 62,	/* L bit above */

	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,	/* catch-all invalid ID */
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,	/* system-wide fallback DSQ */
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,	/* local DSQ of the current CPU */
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,	/* extracts V (CPU number) from a LOCAL_ON ID */
};
|---|
| 52 |  | 
|---|
/*
 * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
 * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
 * buffer between the scheduler core and the BPF scheduler. See the
 * documentation for more details.
 */
struct scx_dispatch_q {
	raw_spinlock_t		lock;		/* protects the queue and counters below */
	struct list_head	list;		/* tasks in dispatch order */
	struct rb_root		priq;		/* used to order by p->scx.dsq_vtime */
	u32			nr;		/* number of queued tasks */
	u32			seq;		/* used by BPF iter */
	u64			id;		/* DSQ ID, see enum scx_dsq_id_flags */
	struct rhash_head	hash_node;	/* entry in the ID -> DSQ rhashtable */
	struct llist_node	free_node;	/* deferred-free list linkage */
	struct rcu_head		rcu;		/* RCU-delayed destruction */
};
|---|
| 70 |  | 
|---|
/* scx_entity.flags */
enum scx_ent_flags {
	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
	/* bit 1 is currently unused */
	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */

	SCX_TASK_STATE_SHIFT	= 8,	  /* bit 8 and 9 are used to carry scx_task_state */
	SCX_TASK_STATE_BITS	= 2,
	SCX_TASK_STATE_MASK	= ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,

	SCX_TASK_CURSOR		= 1 << 31, /* iteration cursor, not a task */
};
|---|
| 83 |  | 
|---|
/*
 * Task initialization state machine, stored in the SCX_TASK_STATE_MASK bits
 * of scx_entity.flags (scx_entity.flags & SCX_TASK_STATE_MASK).
 */
enum scx_task_state {
	SCX_TASK_NONE,		/* ops.init_task() not called yet */
	SCX_TASK_INIT,		/* ops.init_task() succeeded, but task can be cancelled */
	SCX_TASK_READY,		/* fully initialized, but not in sched_ext */
	SCX_TASK_ENABLED,	/* fully initialized and in sched_ext */

	SCX_TASK_NR_STATES,	/* must fit in SCX_TASK_STATE_BITS */
};
|---|
| 93 |  | 
|---|
/* scx_entity.dsq_flags, protected by the owning DSQ's lock */
enum scx_ent_dsq_flags {
	SCX_TASK_DSQ_ON_PRIQ	= 1 << 0, /* task is queued on the priority queue of a dsq */
};
|---|
| 98 |  | 
|---|
/*
 * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
 * everywhere and the following bits track which kfunc sets are currently
 * allowed for %current. This simple per-task tracking works because SCX ops
 * nest in a limited way. BPF will likely implement a way to allow and disallow
 * kfuncs depending on the calling context which will replace this manual
 * mechanism. See scx_kf_allow().
 */
enum scx_kf_mask {
	SCX_KF_UNLOCKED		= 0,	  /* sleepable and not rq locked */
	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
	SCX_KF_CPU_RELEASE	= 1 << 0, /* ops.cpu_release() */
	/*
	 * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and
	 * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be
	 * nested inside DISPATCH.
	 */
	SCX_KF_DISPATCH		= 1 << 1, /* ops.dispatch() */
	SCX_KF_ENQUEUE		= 1 << 2, /* ops.enqueue() and ops.select_cpu() */
	SCX_KF_SELECT_CPU	= 1 << 3, /* ops.select_cpu() */
	SCX_KF_REST		= 1 << 4, /* other rq-locked operations */

	/* all contexts which hold an rq lock */
	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
	/* contexts which cannot have further ops nested inside them */
	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
};
|---|
| 125 |  | 
|---|
/* scx_dsq_list_node.flags */
enum scx_dsq_lnode_flags {
	SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,	/* node is an iterator cursor, not a task */

	/* high 16 bits can be for iter cursor flags */
	__SCX_DSQ_LNODE_PRIV_SHIFT = 16,
};
|---|
| 132 |  | 
|---|
/* linkage on a DSQ's FIFO list; doubles as the BPF iterator cursor */
struct scx_dsq_list_node {
	struct list_head	node;		/* entry in scx_dispatch_q.list */
	u32			flags;		/* SCX_DSQ_LNODE_* */
	u32			priv;		/* can be used by iter cursor */
};
|---|
| 138 |  | 
|---|
/*
 * The following is embedded in task_struct and contains all fields necessary
 * for a task to be scheduled by SCX.
 */
struct sched_ext_entity {
	struct scx_dispatch_q	*dsq;		/* DSQ currently holding the task, if any */
	struct scx_dsq_list_node dsq_list;	/* dispatch order */
	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
	u32			dsq_seq;	/* snapshot of dsq->seq, used by BPF iter */
	u32			dsq_flags;	/* protected by DSQ lock */
	u32			flags;		/* SCX_TASK_*, protected by rq lock */
	u32			weight;		/* load weight seen by the BPF scheduler */
	s32			sticky_cpu;	/* NOTE(review): CPU pinning during transitions — confirm in ext.c */
	s32			holding_cpu;	/* NOTE(review): CPU holding the task mid-dispatch — confirm in ext.c */
	s32			selected_cpu;	/* CPU picked by ops.select_cpu() */
	u32			kf_mask;	/* see scx_kf_mask above */
	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
	atomic_long_t		ops_state;	/* ops invocation state machine */

	struct list_head	runnable_node;	/* rq->scx.runnable_list */
	unsigned long		runnable_at;	/* when the task became runnable (jiffies) */

#ifdef CONFIG_SCHED_CORE
	u64			core_sched_at;	/* see scx_prio_less() */
#endif
	u64			ddsp_dsq_id;	/* direct-dispatch target DSQ ID */
	u64			ddsp_enq_flags;	/* enqueue flags for direct dispatch */

	/* BPF scheduler modifiable fields */

	/*
	 * Runtime budget in nsecs. This is usually set through
	 * scx_bpf_dsq_insert() but can also be modified directly by the BPF
	 * scheduler. Automatically decreased by SCX as the task executes. On
	 * depletion, a scheduling event is triggered.
	 *
	 * This value is cleared to zero if the task is preempted by
	 * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
	 * task ran. Use p->se.sum_exec_runtime instead.
	 */
	u64			slice;

	/*
	 * Used to order tasks when dispatching to the vtime-ordered priority
	 * queue of a dsq. This is usually set through
	 * scx_bpf_dsq_insert_vtime() but can also be modified directly by the
	 * BPF scheduler. Modifying it while a task is queued on a dsq may
	 * mangle the ordering and is not recommended.
	 */
	u64			dsq_vtime;

	/*
	 * If set, reject future sched_setscheduler(2) calls updating the policy
	 * to %SCHED_EXT with -%EACCES.
	 *
	 * Can be set from ops.init_task() while the BPF scheduler is being
	 * loaded (!scx_init_task_args->fork). If set and the task's policy is
	 * already %SCHED_EXT, the task's policy is rejected and forcefully
	 * reverted to %SCHED_NORMAL. The number of such events are reported
	 * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
	 * during fork is not allowed.
	 */
	bool			disallow;	/* reject switching into SCX */

	/* cold fields */
#ifdef CONFIG_EXT_GROUP_SCHED
	struct cgroup		*cgrp_moving_from;	/* source cgroup of an in-flight migration */
#endif
	struct list_head	tasks_node;	/* membership in the global SCX task list */
};
|---|
| 209 |  | 
|---|
/*
 * Entry points called from the core kernel when CONFIG_SCHED_CLASS_EXT is
 * enabled; no-op stubs are provided below for the disabled case.
 */
void sched_ext_free(struct task_struct *p);		/* release @p's SCX state on task teardown */
void print_scx_info(const char *log_lvl, struct task_struct *p);	/* dump SCX state of @p at @log_lvl */
void scx_softlockup(u32 dur_s);				/* notify SCX of a soft lockup lasting @dur_s seconds */
bool scx_rcu_cpu_stall(void);				/* notify SCX of an RCU CPU stall; see disabled stub for default */
|---|
| 214 |  | 
|---|
#else	/* !CONFIG_SCHED_CLASS_EXT */

/* no-op stubs so callers don't need #ifdefs when sched_ext is disabled */
static inline void sched_ext_free(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
static inline void scx_softlockup(u32 dur_s) {}
static inline bool scx_rcu_cpu_stall(void) { return false; }

#endif	/* CONFIG_SCHED_CLASS_EXT */
|---|
| 223 |  | 
|---|
/*
 * Per-task_group SCX state. Defined outside CONFIG_SCHED_CLASS_EXT so the
 * embedding struct compiles unconditionally; empty (zero-sized contents)
 * unless CONFIG_EXT_GROUP_SCHED is enabled.
 */
struct scx_task_group {
#ifdef CONFIG_EXT_GROUP_SCHED
	u32			flags;		/* SCX_TG_* */
	u32			weight;		/* group weight exposed to the BPF scheduler */
	u64			bw_period_us;	/* NOTE(review): bandwidth period — presumably mirrors cpu.max; confirm */
	u64			bw_quota_us;	/* NOTE(review): bandwidth quota per period — confirm */
	u64			bw_burst_us;	/* NOTE(review): allowed burst beyond quota — confirm */
#endif
};
|---|
| 233 |  | 
|---|
| 234 | #endif	/* _LINUX_SCHED_EXT_H */ | 
|---|
| 235 |  | 
|---|