clock.c source code [Linux/kernel/sched/clock.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* sched_clock() for unstable CPU clocks
4	*
5	* Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
6	*
7	* Updates and enhancements:
8	* Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
9	*
10	* Based on code by:
11	* Ingo Molnar <mingo@redhat.com>
12	* Guillaume Chazarain <guichaz@gmail.com>
13	*
14	*
15	* What this file implements:
16	*
17	* cpu_clock(i) provides a fast (execution time) high resolution
18	* clock with bounded drift between CPUs. The value of cpu_clock(i)
19	* is monotonic for constant i. The timestamp returned is in nanoseconds.
20	*
21	* ######################### BIG FAT WARNING ##########################
22	* # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
23	* # go backwards !! #
24	* ####################################################################
25	*
26	* There is no strict promise about the base, although it tends to start
27	* at 0 on boot (but people really shouldn't rely on that).
28	*
29	* cpu_clock(i) -- can be used from any context, including NMI.
30	* local_clock() -- is cpu_clock() on the current CPU.
31	*
32	* sched_clock_cpu(i)
33	*
34	* How it is implemented:
35	*
36	* The implementation either uses sched_clock() when
37	* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
38	* sched_clock() is assumed to provide these properties (mostly it means
39	* the architecture provides a globally synchronized highres time source).
40	*
41	* Otherwise it tries to create a semi stable clock from a mixture of other
42	* clocks, including:
43	*
44	* - GTOD (clock monotonic)
45	* - sched_clock()
46	* - explicit idle events
47	*
48	* We use GTOD as base and use sched_clock() deltas to improve resolution. The
49	* deltas are filtered to provide monotonicity and keeping it within an
50	* expected window.
51	*
52	* Furthermore, explicit sleep and wakeup hooks allow us to account for time
53	* that is otherwise invisible (TSC gets stopped).
54	*
55	*/
56
57	#include <linux/sched/clock.h>
58	#include "sched.h"
59
60	/*
61	* Scheduler clock - returns current time in nanosec units.
62	* This is default implementation.
63	* Architectures and sub-architectures can override this.
64	*/
65	notrace unsigned long long __weak sched_clock(void)
66	{
67	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
68	* (NSEC_PER_SEC / HZ);
69	}
70	EXPORT_SYMBOL_GPL(sched_clock);
71
72	static DEFINE_STATIC_KEY_FALSE(sched_clock_running);
73
74	#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
75	/*
76	* We must start with !__sched_clock_stable because the unstable -> stable
77	* transition is accurate, while the stable -> unstable transition is not.
78	*
79	* Similarly we start with __sched_clock_stable_early, thereby assuming we
80	* will become stable, such that there's only a single 1 -> 0 transition.
81	*/
82	static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
83	static int __sched_clock_stable_early = `1`;
84
85	/*
86	* We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
87	*/
88	__read_mostly u64 __sched_clock_offset;
89	static __read_mostly u64 __gtod_offset;
90
91	struct sched_clock_data {
92	u64 tick_raw;
93	u64 tick_gtod;
94	u64 clock;
95	};
96
97	static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
98
99	static __always_inline struct sched_clock_data this_scd(void*)
100	{
101	return this_cpu_ptr(&sched_clock_data);
102	}
103
104	notrace static inline struct sched_clock_data cpu_sdc(int* cpu)
105	{
106	return &per_cpu(sched_clock_data, cpu);
107	}
108
109	notrace int sched_clock_stable(void)
110	{
111	return static_branch_likely(&__sched_clock_stable);
112	}
113
114	notrace static void __scd_stamp(struct sched_clock_data *scd)
115	{
116	scd->tick_gtod = ktime_get_ns();
117	scd->tick_raw = sched_clock();
118	}
119
120	notrace static void __set_sched_clock_stable(void)
121	{
122	struct sched_clock_data *scd;
123
124	/*
125	* Since we're still unstable and the tick is already running, we have
126	* to disable IRQs in order to get a consistent scd->tick* reading.
127	*/
128	local_irq_disable();
129	scd = this_scd();
130	/*
131	* Attempt to make the (initial) unstable->stable transition continuous.
132	*/
133	__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
134	local_irq_enable();
135
136	printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
137	scd->tick_gtod, __gtod_offset,
138	scd->tick_raw, __sched_clock_offset);
139
140	static_branch_enable(&__sched_clock_stable);
141	tick_dep_clear(bit: TICK_DEP_BIT_CLOCK_UNSTABLE);
142	}
143
144	/*
145	* If we ever get here, we're screwed, because we found out -- typically after
146	* the fact -- that TSC wasn't good. This means all our clocksources (including
147	* ktime) could have reported wrong values.
148	*
149	* What we do here is an attempt to fix up and continue sort of where we left
150	* off in a coherent manner.
151	*
152	* The only way to fully avoid random clock jumps is to boot with:
153	* "tsc=unstable".
154	*/
155	notrace static void __sched_clock_work(struct work_struct *work)
156	{
157	struct sched_clock_data *scd;
158	int cpu;
159
160	/ take a current timestamp and set 'now' /
161	preempt_disable();
162	scd = this_scd();
163	__scd_stamp(scd);
164	scd->clock = scd->tick_gtod + __gtod_offset;
165	preempt_enable();
166
167	/ clone to all CPUs /
168	for_each_possible_cpu(cpu)
169	per_cpu(sched_clock_data, cpu) = *scd;
170
171	printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
172	printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
173	scd->tick_gtod, __gtod_offset,
174	scd->tick_raw, __sched_clock_offset);
175
176	static_branch_disable(&__sched_clock_stable);
177	}
178
179	static DECLARE_WORK(sched_clock_work, __sched_clock_work);
180
181	notrace static void __clear_sched_clock_stable(void)
182	{
183	if (!sched_clock_stable())
184	return;
185
186	tick_dep_set(bit: TICK_DEP_BIT_CLOCK_UNSTABLE);
187	schedule_work(work: &sched_clock_work);
188	}
189
190	notrace void clear_sched_clock_stable(void)
191	{
192	__sched_clock_stable_early = `0`;
193
194	smp_mb(); / matches sched_clock_init_late() /
195
196	if (static_key_count(key: &sched_clock_running.key) == `2`)
197	__clear_sched_clock_stable();
198	}
199
200	notrace static void __sched_clock_gtod_offset(void)
201	{
202	struct sched_clock_data *scd = this_scd();
203
204	__scd_stamp(scd);
205	__gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod;
206	}
207
208	void __init sched_clock_init(void)
209	{
210	/*
211	* Set __gtod_offset such that once we mark sched_clock_running,
212	* sched_clock_tick() continues where sched_clock() left off.
213	*
214	* Even if TSC is buggered, we're still UP at this point so it
215	* can't really be out of sync.
216	*/
217	local_irq_disable();
218	__sched_clock_gtod_offset();
219	local_irq_enable();
220
221	static_branch_inc(&sched_clock_running);
222	}
223	/*
224	* We run this as late_initcall() such that it runs after all built-in drivers,
225	* notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
226	*/
227	static int __init sched_clock_init_late(void)
228	{
229	static_branch_inc(&sched_clock_running);
230	/*
231	* Ensure that it is impossible to not do a static_key update.
232	*
233	* Either {set,clear}_sched_clock_stable() must see sched_clock_running
234	* and do the update, or we must see their __sched_clock_stable_early
235	* and do the update, or both.
236	*/
237	smp_mb(); / matches {set,clear}_sched_clock_stable() /
238
239	if (__sched_clock_stable_early)
240	__set_sched_clock_stable();
241
242	return `0`;
243	}
244	late_initcall(sched_clock_init_late);
245
246	/*
247	* min, max except they take wrapping into account
248	*/
249
/*
 * Wrap-aware minimum: returns @x when the signed difference (x - y) is
 * negative, i.e. @x is "before" @y modulo 2^64; otherwise returns @y.
 */
static __always_inline u64 wrap_min(u64 x, u64 y)
{
	return (s64)(x - y) < 0 ? x : y;
}
254
/*
 * Wrap-aware maximum: returns @x when the signed difference (x - y) is
 * positive, i.e. @x is "after" @y modulo 2^64; otherwise returns @y.
 */
static __always_inline u64 wrap_max(u64 x, u64 y)
{
	return (s64)(x - y) > 0 ? x : y;
}
259
260	/*
261	* update the percpu scd from the raw @now value
262	*
263	* - filter out backward motion
264	* - use the GTOD tick value to create a window to filter crazy TSC values
265	*/
266	static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
267	{
268	u64 now, clock, old_clock, min_clock, max_clock, gtod;
269	s64 delta;
270
271	again:
272	now = sched_clock_noinstr();
273	delta = now - scd->tick_raw;
274	if (unlikely(delta < `0`))
275	delta = `0`;
276
277	old_clock = scd->clock;
278
279	/*
280	* scd->clock = clamp(scd->tick_gtod + delta,
281	* max(scd->tick_gtod, scd->clock),
282	* scd->tick_gtod + TICK_NSEC);
283	*/
284
285	gtod = scd->tick_gtod + __gtod_offset;
286	clock = gtod + delta;
287	min_clock = wrap_max(x: gtod, y: old_clock);
288	max_clock = wrap_max(x: old_clock, y: gtod + TICK_NSEC);
289
290	clock = wrap_max(x: clock, y: min_clock);
291	clock = wrap_min(x: clock, y: max_clock);
292
293	if (!raw_try_cmpxchg64(&scd->clock, &old_clock, clock))
294	goto again;
295
296	return clock;
297	}
298
299	noinstr u64 local_clock_noinstr(void)
300	{
301	u64 clock;
302
303	if (static_branch_likely(&__sched_clock_stable))
304	return sched_clock_noinstr() + __sched_clock_offset;
305
306	if (!static_branch_likely(&sched_clock_running))
307	return sched_clock_noinstr();
308
309	clock = sched_clock_local(scd: this_scd());
310
311	return clock;
312	}
313
314	u64 local_clock(void)
315	{
316	u64 now;
317	preempt_disable_notrace();
318	now = local_clock_noinstr();
319	preempt_enable_notrace();
320	return now;
321	}
322	EXPORT_SYMBOL_GPL(local_clock);
323
324	static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
325	{
326	struct sched_clock_data *my_scd = this_scd();
327	u64 this_clock, remote_clock;
328	u64 *ptr, old_val, val;
329
330	#if BITS_PER_LONG != 64
331	again:
332	/*
333	* Careful here: The local and the remote clock values need to
334	* be read out atomic as we need to compare the values and
335	* then update either the local or the remote side. So the
336	* cmpxchg64 below only protects one readout.
337	*
338	* We must reread via sched_clock_local() in the retry case on
339	* 32-bit kernels as an NMI could use sched_clock_local() via the
340	* tracer and hit between the readout of
341	* the low 32-bit and the high 32-bit portion.
342	*/
343	this_clock = sched_clock_local(my_scd);
344	/*
345	* We must enforce atomic readout on 32-bit, otherwise the
346	* update on the remote CPU can hit in between the readout of
347	* the low 32-bit and the high 32-bit portion.
348	*/
349	remote_clock = cmpxchg64(&scd->clock, `0`, `0`);
350	#else
351	/*
352	* On 64-bit kernels the read of [my]scd->clock is atomic versus the
353	* update, so we can avoid the above 32-bit dance.
354	*/
355	sched_clock_local(scd: my_scd);
356	again:
357	this_clock = my_scd->clock;
358	remote_clock = scd->clock;
359	#endif
360
361	/*
362	* Use the opportunity that we have both locks
363	* taken to couple the two clocks: we take the
364	* larger time as the latest time for both
365	* runqueues. (this creates monotonic movement)
366	*/
367	if (likely((s64)(remote_clock - this_clock) < `0`)) {
368	ptr = &scd->clock;
369	old_val = remote_clock;
370	val = this_clock;
371	} else {
372	/*
373	* Should be rare, but possible:
374	*/
375	ptr = &my_scd->clock;
376	old_val = this_clock;
377	val = remote_clock;
378	}
379
380	if (!try_cmpxchg64(ptr, &old_val, val))
381	goto again;
382
383	return val;
384	}
385
386	/*
387	* Similar to cpu_clock(), but requires local IRQs to be disabled.
388	*
389	* See cpu_clock().
390	*/
391	notrace u64 sched_clock_cpu(int cpu)
392	{
393	struct sched_clock_data *scd;
394	u64 clock;
395
396	if (sched_clock_stable())
397	return sched_clock() + __sched_clock_offset;
398
399	if (!static_branch_likely(&sched_clock_running))
400	return sched_clock();
401
402	preempt_disable_notrace();
403	scd = cpu_sdc(cpu);
404
405	if (cpu != smp_processor_id())
406	clock = sched_clock_remote(scd);
407	else
408	clock = sched_clock_local(scd);
409	preempt_enable_notrace();
410
411	return clock;
412	}
413	EXPORT_SYMBOL_GPL(sched_clock_cpu);
414
415	notrace void sched_clock_tick(void)
416	{
417	struct sched_clock_data *scd;
418
419	if (sched_clock_stable())
420	return;
421
422	if (!static_branch_likely(&sched_clock_running))
423	return;
424
425	lockdep_assert_irqs_disabled();
426
427	scd = this_scd();
428	__scd_stamp(scd);
429	sched_clock_local(scd);
430	}
431
432	notrace void sched_clock_tick_stable(void)
433	{
434	if (!sched_clock_stable())
435	return;
436
437	/*
438	* Called under watchdog_lock.
439	*
440	* The watchdog just found this TSC to (still) be stable, so now is a
441	* good moment to update our __gtod_offset. Because once we find the
442	* TSC to be unstable, any computation will be computing crap.
443	*/
444	local_irq_disable();
445	__sched_clock_gtod_offset();
446	local_irq_enable();
447	}
448
449	/*
450	* We are going deep-idle (IRQs are disabled):
451	*/
452	notrace void sched_clock_idle_sleep_event(void)
453	{
454	sched_clock_cpu(smp_processor_id());
455	}
456	EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
457
458	/*
459	* We just idled; resync with ktime.
460	*/
461	notrace void sched_clock_idle_wakeup_event(void)
462	{
463	unsigned long flags;
464
465	if (sched_clock_stable())
466	return;
467
468	if (unlikely(timekeeping_suspended))
469	return;
470
471	local_irq_save(flags);
472	sched_clock_tick();
473	local_irq_restore(flags);
474	}
475	EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
476
477	#else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */
478
479	void __init sched_clock_init(void)
480	{
481	static_branch_inc(&sched_clock_running);
482	local_irq_disable();
483	generic_sched_clock_init();
484	local_irq_enable();
485	}
486
487	notrace u64 sched_clock_cpu(int cpu)
488	{
489	if (!static_branch_likely(&sched_clock_running))
490	return `0`;
491
492	return sched_clock();
493	}
494
495	#endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
496
497	/*
498	* Running clock - returns the time that has elapsed while a guest has been
499	* running.
500	* On a guest this value should be local_clock minus the time the guest was
501	* suspended by the hypervisor (for any reason).
502	* On bare metal this function should return the same as local_clock.
503	* Architectures and sub-architectures can override this.
504	*/
505	notrace u64 __weak running_clock(void)
506	{
507	return local_clock();
508	}
509

Browse the source code of Linux/kernel/sched/clock.c