// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

#include <trace/events/cgroup.h>

static DEFINE_SPINLOCK(rstat_base_lock);
static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

/*
 * Determines whether a given css can participate in rstat.
 * css's that are cgroup::self use rstat for base stats.
 * Other css's associated with a subsystem use rstat only when
 * they define the ss->css_rstat_flush callback.
 */
static inline bool css_uses_rstat(struct cgroup_subsys_state *css)
{
	return css_is_self(css) || css->ss->css_rstat_flush != NULL;
}

static struct css_rstat_cpu *css_rstat_cpu(
		struct cgroup_subsys_state *css, int cpu)
{
	return per_cpu_ptr(css->rstat_cpu, cpu);
}

static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
		struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
}

static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
{
	if (ss)
		return &ss->rstat_ss_lock;

	return &rstat_base_lock;
}

static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
{
	if (ss)
		return per_cpu_ptr(ss->lhead, cpu);
	return per_cpu_ptr(&rstat_backlog_list, cpu);
}

/**
 * css_rstat_updated - keep track of updated rstat_cpu
 * @css: target cgroup subsystem state
 * @cpu: cpu on which rstat_cpu was updated
 *
 * Atomically inserts the css in the ss's llist for the given cpu. This is
 * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
 * will be processed at flush time to create the update tree.
 *
 * NOTE: if the user needs the guarantee that the updater either adds itself
 * to the lockless list or the concurrent flusher flushes its updated stats, a
 * memory barrier is needed before the call to css_rstat_updated() i.e. a
 * barrier after updating the per-cpu stats and before calling
 * css_rstat_updated().
 */
__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
	struct llist_head *lhead;
	struct css_rstat_cpu *rstatc;
	struct css_rstat_cpu __percpu *rstatc_pcpu;
	struct llist_node *self;

	/*
	 * Since bpf programs can call this function, prevent access to
	 * uninitialized rstat pointers.
	 */
	if (!css_uses_rstat(css))
		return;

	lockdep_assert_preemption_disabled();

	/*
	 * For archs without nmi-safe cmpxchg or percpu ops support, ignore
	 * requests from nmi context.
	 */
	if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
	     !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
		return;

	rstatc = css_rstat_cpu(css, cpu);
	/*
	 * If already on the list, return. This check is racy and smp_mb() is
	 * needed to pair it with the smp_mb() in css_process_update_tree() if
	 * the guarantee that the updated stats are visible to a concurrent
	 * flusher is needed.
	 */
	if (llist_on_list(&rstatc->lnode))
		return;

	/*
	 * This function can be reentered by irqs and nmis for the same cgroup
	 * and may try to insert the same per-cpu lnode into the llist. Note
	 * that llist_add() does not protect against such scenarios.
	 *
	 * To protect against such stacked contexts of irqs/nmis, we use the
	 * fact that lnode points to itself when not on a list and then use
	 * this_cpu_cmpxchg() to atomically set it to NULL to select the winner
	 * which will call llist_add(). The losers can assume the insertion is
	 * successful and the winner will eventually add the per-cpu lnode to
	 * the llist.
	 */
	self = &rstatc->lnode;
	rstatc_pcpu = css->rstat_cpu;
	if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
		return;

	lhead = ss_lhead_cpu(css->ss, cpu);
	llist_add(&rstatc->lnode, lhead);
}
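
/*
 * Illustrative only: a hedged sketch of the updater-side ordering described
 * in the NOTE above. The per-cpu counter name is hypothetical; the point is
 * that the stat update must be ordered before css_rstat_updated() when the
 * "either enqueued or flushed" guarantee is needed.
 *
 *	this_cpu_add(stats->nr_events, 1);	// hypothetical per-cpu stat
 *	smp_mb();	// pairs with the smp_mb() described in
 *			// css_process_update_tree()
 *	css_rstat_updated(css, smp_processor_id());
 */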

static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
{
	/* put @css and all ancestors on the corresponding updated lists */
	while (true) {
		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
		struct cgroup_subsys_state *parent = css->parent;
		struct css_rstat_cpu *prstatc;

		/*
		 * Both additions and removals are bottom-up.  If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		/* Root has no parent to link it to, but mark it busy */
		if (!parent) {
			rstatc->updated_next = css;
			break;
		}

		prstatc = css_rstat_cpu(parent, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = css;

		css = parent;
	}
}

static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
{
	struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
	struct llist_node *lnode;

	while ((lnode = llist_del_first_init(lhead))) {
		struct css_rstat_cpu *rstatc;

		/*
		 * smp_mb() is needed here (more specifically in between
		 * init_llist_node() and per-cpu stats flushing) if the
		 * guarantee is required by an rstat user where either the
		 * updater should add itself on the lockless list or the
		 * flusher should flush the stats of an updater that has
		 * observed it is already on the list. The corresponding
		 * barrier pair for this one should be placed before
		 * css_rstat_updated() by the user.
		 *
		 * For now, there aren't any such users, so the barrier is
		 * not added here, but if such a use-case arises, please add
		 * smp_mb() here.
		 */

		rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
		__css_process_update_tree(rstatc->owner, cpu);
	}
}

/**
 * css_rstat_push_children - push children css's into the given list
 * @head: current head of the list (= subtree root)
 * @child: first child of the root
 * @cpu: target cpu
 * Return: A new singly linked list of css's to be flushed
 *
 * Iteratively traverse down the css_rstat_cpu updated tree level by
 * level and push all the parents first before their next level children
 * into a singly linked list via the rstat_flush_next pointer built from the
 * tail backward like "pushing" css's into a stack. The root is pushed by
 * the caller.
 */
static struct cgroup_subsys_state *css_rstat_push_children(
		struct cgroup_subsys_state *head,
		struct cgroup_subsys_state *child, int cpu)
{
	struct cgroup_subsys_state *cnext = child;	/* Next head of child css level */
	struct cgroup_subsys_state *ghead = NULL;	/* Head of grandchild css level */
	struct cgroup_subsys_state *parent, *grandchild;
	struct css_rstat_cpu *crstatc;

	child->rstat_flush_next = NULL;

	/*
	 * The subsystem rstat lock must be held for the whole duration from
	 * here as the rstat_flush_next list is being constructed to when
	 * it is consumed later in css_rstat_flush().
	 */
	lockdep_assert_held(ss_rstat_lock(head->ss));

	/*
	 * Notation: -> updated_next pointer
	 *	     => rstat_flush_next pointer
	 *
	 * Assuming the following sample updated_children lists:
	 *  P: C1 -> C2 -> P
	 *  C1: G11 -> G12 -> C1
	 *  C2: G21 -> G22 -> C2
	 *
	 * After 1st iteration:
	 *  head => C2 => C1 => NULL
	 *  ghead => G21 => G11 => NULL
	 *
	 * After 2nd iteration:
	 *  head => G12 => G11 => G22 => G21 => C2 => C1 => NULL
	 */
next_level:
	while (cnext) {
		child = cnext;
		cnext = child->rstat_flush_next;
		parent = child->parent;

		/* updated_next is parent cgroup terminated if !NULL */
		while (child != parent) {
			child->rstat_flush_next = head;
			head = child;
			crstatc = css_rstat_cpu(child, cpu);
			grandchild = crstatc->updated_children;
			if (grandchild != child) {
				/* Push the grand child to the next level */
				crstatc->updated_children = child;
				grandchild->rstat_flush_next = ghead;
				ghead = grandchild;
			}
			child = crstatc->updated_next;
			crstatc->updated_next = NULL;
		}
	}

	if (ghead) {
		cnext = ghead;
		ghead = NULL;
		goto next_level;
	}
	return head;
}

/**
 * css_rstat_updated_list - build a list of updated css's to be flushed
 * @root: root of the css subtree to traverse
 * @cpu: target cpu
 * Return: A singly linked list of css's to be flushed
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  During traversal,
 * each returned css is unlinked from the updated tree.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, the child is before its parent in
 * the list.
 *
 * Note that updated_children is self terminated and points to a list of
 * child css's if not empty. Whereas updated_next is like a sibling link
 * within the children list and terminated by the parent css. An exception
 * here is the css root whose updated_next can be self terminated.
 */
static struct cgroup_subsys_state *css_rstat_updated_list(
		struct cgroup_subsys_state *root, int cpu)
{
	struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
	struct cgroup_subsys_state *head = NULL, *parent, *child;

	css_process_update_tree(root->ss, cpu);

	/* Return NULL if this subtree is not on-list */
	if (!rstatc->updated_next)
		return NULL;

	/*
	 * Unlink @root from its parent. As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 */
	parent = root->parent;
	if (parent) {
		struct css_rstat_cpu *prstatc;
		struct cgroup_subsys_state **nextp;

		prstatc = css_rstat_cpu(parent, cpu);
		nextp = &prstatc->updated_children;
		while (*nextp != root) {
			struct css_rstat_cpu *nrstatc;

			nrstatc = css_rstat_cpu(*nextp, cpu);
			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}
		*nextp = rstatc->updated_next;
	}

	rstatc->updated_next = NULL;

	/* Push @root to the list first before pushing the children */
	head = root;
	root->rstat_flush_next = NULL;
	child = rstatc->updated_children;
	rstatc->updated_children = root;
	if (child != root)
		head = css_rstat_push_children(head, child, cpu);

	return head;
}

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for css_rstat_updated() and
 * css_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 */

__bpf_hook_start();

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
				     struct cgroup *parent, int cpu)
{
}

__bpf_hook_end();
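
/*
 * Illustrative only: a hedged sketch (not part of this file) of how a bpf
 * stat collector could attach to the hook above from BPF program C. The
 * program name and map details are hypothetical; the attach point follows
 * from the comment above.
 *
 *	SEC("fentry/bpf_rstat_flush")
 *	int BPF_PROG(my_flusher, struct cgroup *cgrp, struct cgroup *parent, int cpu)
 *	{
 *		// fold this cgroup's per-cpu deltas (kept in a hypothetical
 *		// percpu map) into its aggregate and propagate them to
 *		// @parent so hierarchical totals stay consistent
 *		return 0;
 *	}
 */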

/*
 * Helper functions for locking.
 *
 * This makes it easier to diagnose locking issues and contention in
 * production environments.  The parameter @cpu_in_loop indicates that the
 * lock was released and re-taken when collecting data from the CPUs. The
 * value -1 is used when obtaining the main lock; otherwise this is the CPU
 * number processed last.
 */
static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
				    int cpu_in_loop)
	__acquires(ss_rstat_lock(css->ss))
{
	struct cgroup *cgrp = css->cgroup;
	spinlock_t *lock;
	bool contended;

	lock = ss_rstat_lock(css->ss);
	contended = !spin_trylock_irq(lock);
	if (contended) {
		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
		spin_lock_irq(lock);
	}
	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
}

static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
				      int cpu_in_loop)
	__releases(ss_rstat_lock(css->ss))
{
	struct cgroup *cgrp = css->cgroup;
	spinlock_t *lock;

	lock = ss_rstat_lock(css->ss);
	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
	spin_unlock_irq(lock);
}

/**
 * css_rstat_flush - flush stats in @css's rstat subtree
 * @css: target cgroup subsystem state
 *
 * Collect all per-cpu stats in @css's subtree into the global counters
 * and propagate them upwards. After this function returns, all rstat
 * nodes in the subtree have up-to-date ->stat.
 *
 * This also gets all rstat nodes in the subtree including @css off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
{
	int cpu;
	bool is_self = css_is_self(css);

	/*
	 * Since bpf programs can call this function, prevent access to
	 * uninitialized rstat pointers.
	 */
	if (!css_uses_rstat(css))
		return;

	might_sleep();
	for_each_possible_cpu(cpu) {
		struct cgroup_subsys_state *pos;

		/* Reacquire for each CPU to avoid disabling IRQs too long */
		__css_rstat_lock(css, cpu);
		pos = css_rstat_updated_list(css, cpu);
		for (; pos; pos = pos->rstat_flush_next) {
			if (is_self) {
				cgroup_base_stat_flush(pos->cgroup, cpu);
				bpf_rstat_flush(pos->cgroup,
						cgroup_parent(pos->cgroup), cpu);
			} else
				pos->ss->css_rstat_flush(pos, cpu);
		}
		__css_rstat_unlock(css, cpu);
		if (!cond_resched())
			cpu_relax();
	}
}

int css_rstat_init(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;
	int cpu;
	bool is_self = css_is_self(css);

	if (is_self) {
		/* the root cgrp has rstat_base_cpu preallocated */
		if (!cgrp->rstat_base_cpu) {
			cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu);
			if (!cgrp->rstat_base_cpu)
				return -ENOMEM;
		}
	} else if (css->ss->css_rstat_flush == NULL)
		return 0;

	/* the root cgrp's self css has rstat_cpu preallocated */
	if (!css->rstat_cpu) {
		css->rstat_cpu = alloc_percpu(struct css_rstat_cpu);
		if (!css->rstat_cpu) {
			if (is_self)
				free_percpu(cgrp->rstat_base_cpu);

			return -ENOMEM;
		}
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);

		rstatc->owner = rstatc->updated_children = css;
		init_llist_node(&rstatc->lnode);

		if (is_self) {
			struct cgroup_rstat_base_cpu *rstatbc;

			rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
			u64_stats_init(&rstatbc->bsync);
		}
	}

	return 0;
}

void css_rstat_exit(struct cgroup_subsys_state *css)
{
	int cpu;

	if (!css_uses_rstat(css))
		return;

	if (!css->rstat_cpu)
		return;

	css_rstat_flush(css);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != css) ||
		    WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	if (css_is_self(css)) {
		struct cgroup *cgrp = css->cgroup;

		free_percpu(cgrp->rstat_base_cpu);
		cgrp->rstat_base_cpu = NULL;
	}

	free_percpu(css->rstat_cpu);
	css->rstat_cpu = NULL;
}

/**
 * ss_rstat_init - subsystem-specific rstat initialization
 * @ss: target subsystem
 *
 * If @ss is NULL, the static locks associated with the base stats
 * are initialized. If @ss is non-NULL, the subsystem-specific locks
 * are initialized.
 */
int __init ss_rstat_init(struct cgroup_subsys *ss)
{
	int cpu;

	if (ss) {
		ss->lhead = alloc_percpu(struct llist_head);
		if (!ss->lhead)
			return -ENOMEM;
	}

	spin_lock_init(ss_rstat_lock(ss));
	for_each_possible_cpu(cpu)
		init_llist_head(ss_lhead_cpu(ss, cpu));

	return 0;
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
	dst_bstat->ntime += src_bstat->ntime;
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
	dst_bstat->ntime -= src_bstat->ntime;
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_base_cpu *prstatbc;
	struct cgroup_base_stat delta;
	unsigned seq;

	/* Root-level stats are sourced from system-wide CPU stats */
	if (!parent)
		return;

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatbc->bsync);
		delta = rstatbc->bstat;
	} while (__u64_stats_fetch_retry(&rstatbc->bsync, seq));

	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
	cgroup_base_stat_sub(&delta, &rstatbc->last_bstat);
	cgroup_base_stat_add(&cgrp->bstat, &delta);
	cgroup_base_stat_add(&rstatbc->last_bstat, &delta);
	cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta);

	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
	if (cgroup_parent(parent)) {
		delta = cgrp->bstat;
		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
		cgroup_base_stat_add(&parent->bstat, &delta);
		cgroup_base_stat_add(&cgrp->last_bstat, &delta);

		delta = rstatbc->subtree_bstat;
		prstatbc = cgroup_rstat_base_cpu(parent, cpu);
		cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat);
		cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta);
		cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta);
	}
}
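
/*
 * Worked example of the snapshot/delta scheme above (numbers are made up):
 * if rstatbc->bstat.cputime.utime reads 130 on this cpu and
 * rstatbc->last_bstat.cputime.utime was 100 at the previous flush, a delta
 * of 30 is added to cgrp->bstat and last_bstat advances to 130, so the next
 * flush only propagates whatever accrues afterwards. The same pattern
 * repeats one level up via cgrp->last_bstat and the per-cpu
 * subtree_bstat/last_subtree_bstat pair.
 */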

static struct cgroup_rstat_base_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
	struct cgroup_rstat_base_cpu *rstatbc;

	rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu);
	*flags = u64_stats_update_begin_irqsave(&rstatbc->bsync);
	return rstatbc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_base_cpu *rstatbc,
						 unsigned long flags)
{
	u64_stats_update_end_irqrestore(&rstatbc->bsync, flags);
	css_rstat_updated(&cgrp->self, smp_processor_id());
	put_cpu_ptr(rstatbc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_base_cpu *rstatbc;
	unsigned long flags;

	rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
	rstatbc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_base_cpu *rstatbc;
	unsigned long flags;

	rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	switch (index) {
	case CPUTIME_NICE:
		rstatbc->bstat.ntime += delta_exec;
		fallthrough;
	case CPUTIME_USER:
		rstatbc->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		rstatbc->bstat.cputime.stime += delta_exec;
		break;
#ifdef CONFIG_SCHED_CORE
	case CPUTIME_FORCEIDLE:
		rstatbc->bstat.forceidle_sum += delta_exec;
		break;
#endif
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
}

/*
 * compute the cputime for the root cgroup by getting the per cpu data
 * at a global level, then categorizing the fields in a manner consistent
 * with how it is done by __cgroup_account_cputime_field for each bit of
 * cpu time attributed to a cgroup.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
	struct task_cputime *cputime = &bstat->cputime;
	int i;

	memset(bstat, 0, sizeof(*bstat));
	for_each_possible_cpu(i) {
		struct kernel_cpustat kcpustat;
		u64 *cpustat = kcpustat.cpustat;
		u64 user = 0;
		u64 sys = 0;

		kcpustat_cpu_fetch(&kcpustat, i);

		user += cpustat[CPUTIME_USER];
		user += cpustat[CPUTIME_NICE];
		cputime->utime += user;

		sys += cpustat[CPUTIME_SYSTEM];
		sys += cpustat[CPUTIME_IRQ];
		sys += cpustat[CPUTIME_SOFTIRQ];
		cputime->stime += sys;

		cputime->sum_exec_runtime += user;
		cputime->sum_exec_runtime += sys;

#ifdef CONFIG_SCHED_CORE
		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
		bstat->ntime += cpustat[CPUTIME_NICE];
	}
}


static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
{
#ifdef CONFIG_SCHED_CORE
	u64 forceidle_time = bstat->forceidle_sum;

	do_div(forceidle_time, NSEC_PER_USEC);
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	struct cgroup_base_stat bstat;

	if (cgroup_parent(cgrp)) {
		css_rstat_flush(&cgrp->self);
		__css_rstat_lock(&cgrp->self, -1);
		bstat = cgrp->bstat;
		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
			       &bstat.cputime.utime, &bstat.cputime.stime);
		__css_rstat_unlock(&cgrp->self, -1);
	} else {
		root_cgroup_cputime(&bstat);
	}

	do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
	do_div(bstat.cputime.utime, NSEC_PER_USEC);
	do_div(bstat.cputime.stime, NSEC_PER_USEC);
	do_div(bstat.ntime, NSEC_PER_USEC);

	seq_printf(seq, "usage_usec %llu\n"
			"user_usec %llu\n"
			"system_usec %llu\n"
			"nice_usec %llu\n",
		   bstat.cputime.sum_exec_runtime,
		   bstat.cputime.utime,
		   bstat.cputime.stime,
		   bstat.ntime);

	cgroup_force_idle_show(seq, &bstat);
}

/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */
BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, css_rstat_updated)
BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE)
BTF_KFUNCS_END(bpf_rstat_kfunc_ids)

static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	.owner          = THIS_MODULE,
	.set            = &bpf_rstat_kfunc_ids,
};

static int __init bpf_rstat_kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					 &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);
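
/*
 * Illustrative only: a hedged sketch of the bpf side of the kfuncs registered
 * above, written as BPF program C (not kernel code). Everything other than
 * the kfunc names themselves is hypothetical.
 *
 *	extern void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) __ksym;
 *	extern void css_rstat_flush(struct cgroup_subsys_state *css) __ksym;
 *
 *	// updater prog: bump a per-cpu counter, then mark the css updated
 *	//	css_rstat_updated(css, bpf_get_smp_processor_id());
 *	// reader prog (sleepable, per KF_SLEEPABLE above): flush before
 *	// reporting aggregated stats
 *	//	css_rstat_flush(css);
 */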