// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/msr.h>

#include "cpu.h"

struct aperfmperf {
	seqcount_t	seq;
	unsigned long	last_update;
	u64		acnt;
	u64		mcnt;
	u64		aperf;
	u64		mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
	u64 aperf, mperf;

	rdmsrq(MSR_IA32_APERF, aperf);
	rdmsrq(MSR_IA32_MPERF, mperf);

	this_cpu_write(cpu_samples.aperf, aperf);
	this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by micro-controller and
 * our P-state setting is little more than a request/hint, we need to observe
 * the effective frequency 'BusyMHz', i.e. the average frequency over a time
 * interval after discarding idle time. This is given by:
 *
 *            BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max eventually grow >1, in which case we clip it to 1.
 */
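/*
 * Illustrative example with hypothetical numbers: on a part with a 2.0 GHz
 * freq_base and a 2.5 GHz 4C turbo used as freq_max, a tick interval with
 * delta_APERF / delta_MPERF == 1.15 corresponds to BusyMHz == 2300, so the
 * scheduler sees a ratio of freq_curr / freq_max == 2300 / 2500 == 0.92.
 */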

DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
					       arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
	u64 misc_en;
	int err;

	err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en);
	if (err)
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	int err;

	err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;	/* max P state */
	*turbo_freq = *turbo_freq & 0x3F;	/* 1C turbo    */

	return true;
}

#define X86_MATCH(vfm)					\
	X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_XEON_PHI_KNL),
	X86_MATCH(INTEL_XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_ATOM_GOLDMONT),
	X86_MATCH(INTEL_ATOM_GOLDMONT_D),
	X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
	{}
};

static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	i = 16;
	found = 0;
	do {
		if (found >= num_delta_fratio) {
			*turbo_freq = fratio;
			return true;
		}

		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}

		i += 8;
	} while (i < 64);

	return true;
}
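/*
 * Illustrative trace of knl_set_max_freq_ratio() with a hypothetical
 * MSR_TURBO_RATIO_LIMIT value: if bits 15:8 read 0x1E (ratio 30) and the
 * first non-zero 3-bit delta field (bits 23:21, first pass with i == 16)
 * reads 2, then with num_delta_fratio == 1 the loop reports
 * *turbo_freq == 28.
 */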

static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int err, i;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
	if (err)
		return false;

	for (i = 0; i < 64; i += 8) {
		group_size = (counts >> i) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> i) & 0xFF;
			return true;
		}
	}

	return false;
}
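/*
 * Illustrative example for skx_set_max_freq_ratio() with hypothetical
 * register contents: if the 8-bit group sizes in MSR_TURBO_RATIO_LIMIT1
 * read 2, 4, 8, ... and size == 4, the second group is the first one with
 * a core count >= 4, so *turbo_freq is taken from bits 15:8 of
 * MSR_TURBO_RATIO_LIMIT.
 */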

static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 msr;
	int err;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
	*turbo_freq = (msr >> 24) & 0xFF;	/* 4C turbo    */

	/* The CPU may have less than 4 cores */
	if (!*turbo_freq)
		*turbo_freq = msr & 0xFF;	/* 1C turbo    */

	return true;
}

static bool __init intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSRs with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}
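/*
 * Worked example with hypothetical ratios: base_freq == 20 (2.0 GHz) and
 * turbo_freq == 25 (2.5 GHz) give turbo_ratio == 25 * 1024 / 20 == 1280,
 * i.e. 1.25 in the SCHED_CAPACITY_SCALE based fixed-point format.
 */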

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}
	static_branch_enable_cpuslocked(&arch_scale_freq_key);
	register_freq_invariance_syscore_ops();
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;
	arch_set_max_freq_ratio(turbo_disabled);
	freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	if (intel_set_max_freq_ratio()) {
		guard(cpus_read_lock)();
		freq_invariance_enable();
	}
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
	int cpu;

	static_branch_disable(&arch_scale_freq_key);

	/*
	 * Set arch_freq_scale to a default value on all CPUs.
	 * This negates the effect of scaling.
	 */
	for_each_possible_cpu(cpu)
		per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);

struct arch_hybrid_cpu_scale {
	unsigned long capacity;
	unsigned long freq_ratio;
};

static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;

/**
 * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
 *
 * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
 * initialize it and set the static key controlling its code paths.
 *
 * Must be called before arch_set_cpu_capacity().
 */
bool arch_enable_hybrid_capacity_scale(void)
{
	int cpu;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
		WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
		return true;
	}

	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
	if (!arch_cpu_scale)
		return false;

	for_each_possible_cpu(cpu) {
		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
	}

	static_branch_enable(&arch_hybrid_cap_scale_key);

	pr_info("Hybrid CPU capacity scaling enabled\n");

	return true;
}

/**
 * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
 * @cpu: Target CPU.
 * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
 * @max_cap: System-wide maximum CPU capacity.
 * @cap_freq: Frequency of @cpu corresponding to @cap.
 * @base_freq: Frequency of @cpu at which MPERF counts.
 *
 * The units in which @cap and @max_cap are expressed do not matter, so long
 * as they are consistent, because the former is effectively divided by the
 * latter. Analogously for @cap_freq and @base_freq.
 *
 * After calling this function for all CPUs, call arch_rebuild_sched_domains()
 * to let the scheduler know that capacity-aware scheduling can be used going
 * forward.
 */
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
			   unsigned long cap_freq, unsigned long base_freq)
{
	if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
			   div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
			   div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
	} else {
		WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
	}
}
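/*
 * Worked example for arch_set_cpu_capacity() with hypothetical hybrid
 * values: cap == 3900 and max_cap == 5200 yield a capacity of
 * 3900 * 1024 / 5200 == 768, while cap_freq == 3900 MHz and
 * base_freq == 3000 MHz yield a freq_ratio of 3900 * 1024 / 3000 == 1331.
 */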

unsigned long arch_scale_cpu_capacity(int cpu)
{
	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);

	return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);

static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale, freq_ratio;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
	else
		freq_ratio = arch_max_freq_ratio;

	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
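/*
 * Worked example for scale_freq_tick() with hypothetical counts: with
 * SCHED_CAPACITY_SHIFT == 10 the computation above is
 * freq_scale == (acnt << 20) / (mcnt * freq_ratio). For acnt == 900000,
 * mcnt == 1000000 and freq_ratio == 1280 (freq_max == 1.25 * freq_base)
 * this gives freq_scale == 737, i.e. roughly 72% of freq_max.
 */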
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */

void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrq(MSR_IA32_APERF, aperf);
	rdmsrq(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)
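/* For example, HZ == 1000 gives 20 jiffies and HZ == 250 gives 5, both 20ms. */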

int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}
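/*
 * Illustrative example for arch_freq_get_on_cpu() with hypothetical counts:
 * cpu_khz == 2000000 (2 GHz base clock), acnt == 1200000 and
 * mcnt == 1000000 return 2000000 * 1200000 / 1000000 == 2400000 kHz, i.e.
 * the CPU averaged 2.4 GHz over the sampled tick interval.
 */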

static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs();
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs();
}