1// SPDX-License-Identifier: GPL-2.0-or-later
2/* delayacct.c - per-task delay accounting
3 *
4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
5 */
6
7#include <linux/sched.h>
8#include <linux/sched/task.h>
9#include <linux/sched/cputime.h>
10#include <linux/sched/clock.h>
11#include <linux/slab.h>
12#include <linux/taskstats.h>
13#include <linux/sysctl.h>
14#include <linux/delayacct.h>
15#include <linux/module.h>
16
/*
 * Fold one delay category of @tsk into the taskstats record @d.
 *
 * Relies on three names in the expanding scope: `d` (destination record),
 * `tsk` (task whose ->delays is read) and `tmp` (scratch integer).
 *
 * The total is accumulated with a wrap check: if the sum ends up smaller
 * than the previous total, the total is reset to 0.  The count is added
 * unconditionally.
 *
 * NOTE(review): max/min are overwritten rather than merged; if a caller
 * feeds several tasks into the same @d, only the last task's extremes
 * survive - confirm that is intended.
 */
#define UPDATE_DELAY(type)						\
do {									\
	tmp = d->type##_delay_total + tsk->delays->type##_delay;	\
	if (tmp < d->type##_delay_total)				\
		d->type##_delay_total = 0;				\
	else								\
		d->type##_delay_total = tmp;				\
	d->type##_count += tsk->delays->type##_count;			\
	d->type##_delay_max = tsk->delays->type##_delay_max;		\
	d->type##_delay_min = tsk->delays->type##_delay_min;		\
} while (0)
25
26DEFINE_STATIC_KEY_FALSE(delayacct_key);
27int delayacct_on __read_mostly; /* Delay accounting turned on/off */
28struct kmem_cache *delayacct_cache;
29
30static void set_delayacct(bool enabled)
31{
32 if (enabled) {
33 static_branch_enable(&delayacct_key);
34 delayacct_on = 1;
35 } else {
36 delayacct_on = 0;
37 static_branch_disable(&delayacct_key);
38 }
39}
40
41static int __init delayacct_setup_enable(char *str)
42{
43 delayacct_on = 1;
44 return 1;
45}
46__setup("delayacct", delayacct_setup_enable);
47
48void delayacct_init(void)
49{
50 delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
51 delayacct_tsk_init(tsk: &init_task);
52 set_delayacct(delayacct_on);
53}
54
55#ifdef CONFIG_PROC_SYSCTL
56static int sysctl_delayacct(const struct ctl_table *table, int write, void *buffer,
57 size_t *lenp, loff_t *ppos)
58{
59 int state = delayacct_on;
60 struct ctl_table t;
61 int err;
62
63 if (write && !capable(CAP_SYS_ADMIN))
64 return -EPERM;
65
66 t = *table;
67 t.data = &state;
68 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
69 if (err < 0)
70 return err;
71 if (write)
72 set_delayacct(state);
73 return err;
74}
75
76static const struct ctl_table kern_delayacct_table[] = {
77 {
78 .procname = "task_delayacct",
79 .data = NULL,
80 .maxlen = sizeof(unsigned int),
81 .mode = 0644,
82 .proc_handler = sysctl_delayacct,
83 .extra1 = SYSCTL_ZERO,
84 .extra2 = SYSCTL_ONE,
85 },
86};
87
88static __init int kernel_delayacct_sysctls_init(void)
89{
90 register_sysctl_init("kernel", kern_delayacct_table);
91 return 0;
92}
93late_initcall(kernel_delayacct_sysctls_init);
94#endif
95
96void __delayacct_tsk_init(struct task_struct *tsk)
97{
98 tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
99 if (tsk->delays)
100 raw_spin_lock_init(&tsk->delays->lock);
101}
102
103/*
104 * Finish delay accounting for a statistic using its timestamps (@start),
105 * accumulator (@total) and @count
106 */
107static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min)
108{
109 s64 ns = local_clock() - *start;
110 unsigned long flags;
111
112 if (ns > 0) {
113 raw_spin_lock_irqsave(lock, flags);
114 *total += ns;
115 (*count)++;
116 if (ns > *max)
117 *max = ns;
118 if (*min == 0 || ns < *min)
119 *min = ns;
120 raw_spin_unlock_irqrestore(lock, flags);
121 }
122}
123
124void __delayacct_blkio_start(void)
125{
126 current->delays->blkio_start = local_clock();
127}
128
129/*
130 * We cannot rely on the `current` macro, as we haven't yet switched back to
131 * the process being woken.
132 */
133void __delayacct_blkio_end(struct task_struct *p)
134{
135 delayacct_end(lock: &p->delays->lock,
136 start: &p->delays->blkio_start,
137 total: &p->delays->blkio_delay,
138 count: &p->delays->blkio_count,
139 max: &p->delays->blkio_delay_max,
140 min: &p->delays->blkio_delay_min);
141}
142
143int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
144{
145 u64 utime, stime, stimescaled, utimescaled;
146 unsigned long long t2, t3;
147 unsigned long flags, t1;
148 s64 tmp;
149
150 task_cputime(t: tsk, utime: &utime, stime: &stime);
151 tmp = (s64)d->cpu_run_real_total;
152 tmp += utime + stime;
153 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
154
155 task_cputime_scaled(t: tsk, utimescaled: &utimescaled, stimescaled: &stimescaled);
156 tmp = (s64)d->cpu_scaled_run_real_total;
157 tmp += utimescaled + stimescaled;
158 d->cpu_scaled_run_real_total =
159 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
160
161 /*
162 * No locking available for sched_info (and too expensive to add one)
163 * Mitigate by taking snapshot of values
164 */
165 t1 = tsk->sched_info.pcount;
166 t2 = tsk->sched_info.run_delay;
167 t3 = tsk->se.sum_exec_runtime;
168
169 d->cpu_count += t1;
170
171 d->cpu_delay_max = tsk->sched_info.max_run_delay;
172 d->cpu_delay_min = tsk->sched_info.min_run_delay;
173 tmp = (s64)d->cpu_delay_total + t2;
174 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
175 tmp = (s64)d->cpu_run_virtual_total + t3;
176
177 d->cpu_run_virtual_total =
178 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
179
180 if (!tsk->delays)
181 return 0;
182
183 /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
184 raw_spin_lock_irqsave(&tsk->delays->lock, flags);
185 UPDATE_DELAY(blkio);
186 UPDATE_DELAY(swapin);
187 UPDATE_DELAY(freepages);
188 UPDATE_DELAY(thrashing);
189 UPDATE_DELAY(compact);
190 UPDATE_DELAY(wpcopy);
191 UPDATE_DELAY(irq);
192 raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
193
194 return 0;
195}
196
197__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
198{
199 __u64 ret;
200 unsigned long flags;
201
202 raw_spin_lock_irqsave(&tsk->delays->lock, flags);
203 ret = nsec_to_clock_t(x: tsk->delays->blkio_delay);
204 raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
205 return ret;
206}
207
208void __delayacct_freepages_start(void)
209{
210 current->delays->freepages_start = local_clock();
211}
212
213void __delayacct_freepages_end(void)
214{
215 delayacct_end(lock: &current->delays->lock,
216 start: &current->delays->freepages_start,
217 total: &current->delays->freepages_delay,
218 count: &current->delays->freepages_count,
219 max: &current->delays->freepages_delay_max,
220 min: &current->delays->freepages_delay_min);
221}
222
223void __delayacct_thrashing_start(bool *in_thrashing)
224{
225 *in_thrashing = !!current->in_thrashing;
226 if (*in_thrashing)
227 return;
228
229 current->in_thrashing = 1;
230 current->delays->thrashing_start = local_clock();
231}
232
233void __delayacct_thrashing_end(bool *in_thrashing)
234{
235 if (*in_thrashing)
236 return;
237
238 current->in_thrashing = 0;
239 delayacct_end(lock: &current->delays->lock,
240 start: &current->delays->thrashing_start,
241 total: &current->delays->thrashing_delay,
242 count: &current->delays->thrashing_count,
243 max: &current->delays->thrashing_delay_max,
244 min: &current->delays->thrashing_delay_min);
245}
246
247void __delayacct_swapin_start(void)
248{
249 current->delays->swapin_start = local_clock();
250}
251
252void __delayacct_swapin_end(void)
253{
254 delayacct_end(lock: &current->delays->lock,
255 start: &current->delays->swapin_start,
256 total: &current->delays->swapin_delay,
257 count: &current->delays->swapin_count,
258 max: &current->delays->swapin_delay_max,
259 min: &current->delays->swapin_delay_min);
260}
261
262void __delayacct_compact_start(void)
263{
264 current->delays->compact_start = local_clock();
265}
266
267void __delayacct_compact_end(void)
268{
269 delayacct_end(lock: &current->delays->lock,
270 start: &current->delays->compact_start,
271 total: &current->delays->compact_delay,
272 count: &current->delays->compact_count,
273 max: &current->delays->compact_delay_max,
274 min: &current->delays->compact_delay_min);
275}
276
277void __delayacct_wpcopy_start(void)
278{
279 current->delays->wpcopy_start = local_clock();
280}
281
282void __delayacct_wpcopy_end(void)
283{
284 delayacct_end(lock: &current->delays->lock,
285 start: &current->delays->wpcopy_start,
286 total: &current->delays->wpcopy_delay,
287 count: &current->delays->wpcopy_count,
288 max: &current->delays->wpcopy_delay_max,
289 min: &current->delays->wpcopy_delay_min);
290}
291
292void __delayacct_irq(struct task_struct *task, u32 delta)
293{
294 unsigned long flags;
295
296 raw_spin_lock_irqsave(&task->delays->lock, flags);
297 task->delays->irq_delay += delta;
298 task->delays->irq_count++;
299 if (delta > task->delays->irq_delay_max)
300 task->delays->irq_delay_max = delta;
301 if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min))
302 task->delays->irq_delay_min = delta;
303 raw_spin_unlock_irqrestore(&task->delays->lock, flags);
304}
305
306