// SPDX-License-Identifier: GPL-2.0-only
/*
 * BTS PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/coredump.h>

#include <linux/sizes.h>
#include <asm/perf_event.h>
#include <asm/msr.h>

#include "../perf_event.h"

struct bts_ctx {
	struct perf_output_handle handle;
	struct debug_store ds_back;
	int state;
};

/* BTS context states: */
enum {
	/* no ongoing AUX transactions */
	BTS_STATE_STOPPED = 0,
	/* AUX transaction is on, BTS tracing is disabled */
	BTS_STATE_INACTIVE,
	/* AUX transaction is on, BTS tracing is running */
	BTS_STATE_ACTIVE,
};

static struct bts_ctx __percpu *bts_ctx;

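/*
 * Each BTS record is three 64-bit fields (branch-from, branch-to, flags),
 * i.e. 24 bytes. BTS_SAFETY_MARGIN is how far below the end of the current
 * chunk the PMI threshold is placed (see bts_config_buffer()), leaving room
 * for records that land between the threshold being crossed and BTS being
 * disabled in the interrupt handler.
 */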
#define BTS_RECORD_SIZE 24
#define BTS_SAFETY_MARGIN 4080

struct bts_phys {
	struct page *page;
	unsigned long size;
	unsigned long offset;
	unsigned long displacement;
};

struct bts_buffer {
	size_t real_size; /* multiple of BTS_RECORD_SIZE */
	unsigned int nr_pages;
	unsigned int nr_bufs;
	unsigned int cur_buf;
	bool snapshot;
	local_t data_size;
	local_t head;
	unsigned long end;
	void **data_pages;
	struct bts_phys buf[] __counted_by(nr_bufs);
};

static struct pmu bts_pmu;

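/*
 * High-order AUX pages carry their allocation order in page_private();
 * pages with PG_private clear are plain order-0 pages.
 */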
static int buf_nr_pages(struct page *page)
{
	if (!PagePrivate(page))
		return 1;

	return 1 << page_private(page);
}

static size_t buf_size(struct page *page)
{
	return buf_nr_pages(page) * PAGE_SIZE;
}

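/*
 * AUX buffer setup: group the mmap'ed AUX pages into physically contiguous
 * chunks (one struct bts_phys per high-order allocation) and trim each chunk
 * to a whole number of records, since the hardware only writes complete
 * BTS_RECORD_SIZE records.
 */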
static void *
bts_buffer_setup_aux(struct perf_event *event, void **pages,
		     int nr_pages, bool overwrite)
{
	struct bts_buffer *bb;
	struct page *page;
	int cpu = event->cpu;
	int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
	unsigned long offset;
	size_t size = nr_pages << PAGE_SHIFT;
	int pg, nr_buf, pad;

	/* count all the high order buffers */
	for (pg = 0, nr_buf = 0; pg < nr_pages;) {
		page = virt_to_page(pages[pg]);
		pg += buf_nr_pages(page);
		nr_buf++;
	}

	/*
	 * to avoid interrupts in overwrite mode, only allow one physical buffer
	 */
	if (overwrite && nr_buf > 1)
		return NULL;

	bb = kzalloc_node(struct_size(bb, buf, nr_buf), GFP_KERNEL, node);
	if (!bb)
		return NULL;

	bb->nr_pages = nr_pages;
	bb->nr_bufs = nr_buf;
	bb->snapshot = overwrite;
	bb->data_pages = pages;
	bb->real_size = size - size % BTS_RECORD_SIZE;

	for (pg = 0, nr_buf = 0, offset = 0, pad = 0; nr_buf < bb->nr_bufs; nr_buf++) {
		unsigned int __nr_pages;

		page = virt_to_page(pages[pg]);
		__nr_pages = buf_nr_pages(page);
		bb->buf[nr_buf].page = page;
		bb->buf[nr_buf].offset = offset;
		bb->buf[nr_buf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
		bb->buf[nr_buf].size = buf_size(page) - bb->buf[nr_buf].displacement;
		pad = bb->buf[nr_buf].size % BTS_RECORD_SIZE;
		bb->buf[nr_buf].size -= pad;

		pg += __nr_pages;
		offset += __nr_pages << PAGE_SHIFT;
	}

	return bb;
}

static void bts_buffer_free_aux(void *data)
{
	kfree(data);
}

static unsigned long bts_buffer_offset(struct bts_buffer *bb, unsigned int idx)
{
	return bb->buf[idx].offset + bb->buf[idx].displacement;
}

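/*
 * Program the DS area for the current chunk: buffer base, write index,
 * absolute maximum and PMI threshold. In snapshot mode the threshold is
 * pushed past the absolute maximum so the hardware never raises a PMI.
 */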
static void
bts_config_buffer(struct bts_buffer *bb)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_phys *phys = &bb->buf[bb->cur_buf];
	unsigned long index, thresh = 0, end = phys->size;
	struct page *page = phys->page;

	index = local_read(&bb->head);

	if (!bb->snapshot) {
		if (bb->end < phys->offset + buf_size(page))
			end = bb->end - phys->offset - phys->displacement;

		index -= phys->offset + phys->displacement;

		if (end - index > BTS_SAFETY_MARGIN)
			thresh = end - BTS_SAFETY_MARGIN;
		else if (end - index > BTS_RECORD_SIZE)
			thresh = end - BTS_RECORD_SIZE;
		else
			thresh = end;
	}

	ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
	ds->bts_index = ds->bts_buffer_base + index;
	ds->bts_absolute_maximum = ds->bts_buffer_base + end;
	ds->bts_interrupt_threshold = !bb->snapshot
		? ds->bts_buffer_base + thresh
		: ds->bts_absolute_maximum + BTS_RECORD_SIZE;
}

static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
{
	unsigned long index = head - phys->offset;

	memset(page_address(phys->page) + index, 0, phys->size - index);
}

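/*
 * Fold the hardware write pointer (ds->bts_index) back into the software
 * head and account the newly written bytes in data_size. Callers have BTS
 * disabled at this point, so the index does not move underneath us.
 */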
static void bts_update(struct bts_ctx *bts)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_buffer *bb = perf_get_aux(&bts->handle);
	unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;

	if (!bb)
		return;

	head = index + bts_buffer_offset(bb, bb->cur_buf);
	old = local_xchg(&bb->head, head);

	if (!bb->snapshot) {
		if (old == head)
			return;

		if (ds->bts_index >= ds->bts_absolute_maximum)
			perf_aux_output_flag(&bts->handle,
					     PERF_AUX_FLAG_TRUNCATED);

		/*
		 * old and head are always in the same physical buffer, so we
		 * can subtract them to get the data size.
		 */
		local_add(head - old, &bb->data_size);
	} else {
		local_set(&bb->data_size, head);
	}

	/*
	 * Since BTS is coherent, just add a compiler barrier to ensure
	 * BTS updating is ordered against bts::handle::event.
	 */
	barrier();
}

static int
bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle);

/*
 * Ordering PMU callbacks wrt themselves and the PMI is done by means
 * of bts::state, which:
 *  - is set when bts::handle::event is valid, that is, between
 *    perf_aux_output_begin() and perf_aux_output_end();
 *  - is zero otherwise;
 *  - is ordered against bts::handle::event with a compiler barrier.
 */

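/*
 * Build the BTS enable bits (interrupt on threshold unless in snapshot
 * mode, plus kernel/user filtering from the event attributes), program
 * the DS area and flip bts::state to ACTIVE before enabling tracing.
 */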
static void __bts_event_start(struct perf_event *event)
{
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct bts_buffer *bb = perf_get_aux(&bts->handle);
	u64 config = 0;

	if (!bb->snapshot)
		config |= ARCH_PERFMON_EVENTSEL_INT;
	if (!event->attr.exclude_kernel)
		config |= ARCH_PERFMON_EVENTSEL_OS;
	if (!event->attr.exclude_user)
		config |= ARCH_PERFMON_EVENTSEL_USR;

	bts_config_buffer(bb);

	/*
	 * local barrier to make sure that ds configuration made it
	 * before we enable BTS and bts::state goes ACTIVE
	 */
	wmb();

	/* INACTIVE/STOPPED -> ACTIVE */
	WRITE_ONCE(bts->state, BTS_STATE_ACTIVE);

	intel_pmu_enable_bts(config);
}

static void bts_event_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct bts_buffer *bb;

	bb = perf_aux_output_begin(&bts->handle, event);
	if (!bb)
		goto fail_stop;

	if (bts_buffer_reset(bb, &bts->handle))
		goto fail_end_stop;

	bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
	bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
	bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;

	perf_event_itrace_started(event);
	event->hw.state = 0;

	__bts_event_start(event);

	return;

fail_end_stop:
	perf_aux_output_end(&bts->handle, 0);

fail_stop:
	event->hw.state = PERF_HES_STOPPED;
}

static void __bts_event_stop(struct perf_event *event, int state)
{
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);

	/* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
	WRITE_ONCE(bts->state, state);

	/*
	 * No extra synchronization is mandated by the documentation to have
	 * BTS data stores globally visible.
	 */
	intel_pmu_disable_bts();
}

static void bts_event_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct bts_buffer *bb = NULL;
	int state = READ_ONCE(bts->state);

	if (state == BTS_STATE_ACTIVE)
		__bts_event_stop(event, BTS_STATE_STOPPED);

	if (state != BTS_STATE_STOPPED)
		bb = perf_get_aux(&bts->handle);

	event->hw.state |= PERF_HES_STOPPED;

	if (flags & PERF_EF_UPDATE) {
		bts_update(bts);

		if (bb) {
			if (bb->snapshot)
				bts->handle.head =
					local_xchg(&bb->data_size,
						   bb->nr_pages << PAGE_SHIFT);
			perf_aux_output_end(&bts->handle,
					    local_xchg(&bb->data_size, 0));
		}

		cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
		cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
		cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
		cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
	}
}

void intel_bts_enable_local(void)
{
	struct bts_ctx *bts;
	int state;

	if (!bts_ctx)
		return;

	bts = this_cpu_ptr(bts_ctx);
	state = READ_ONCE(bts->state);
	/*
	 * Here we transition from INACTIVE to ACTIVE;
	 * if we instead are STOPPED from the interrupt handler,
	 * stay that way. Can't be ACTIVE here though.
	 */
	if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE))
		return;

	if (state == BTS_STATE_STOPPED)
		return;

	if (bts->handle.event)
		__bts_event_start(bts->handle.event);
}

void intel_bts_disable_local(void)
{
	struct bts_ctx *bts;

	if (!bts_ctx)
		return;

	bts = this_cpu_ptr(bts_ctx);

	/*
	 * Here we transition from ACTIVE to INACTIVE;
	 * do nothing for STOPPED or INACTIVE.
	 */
	if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE)
		return;

	if (bts->handle.event)
		__bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
}

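/*
 * Pick the region of the AUX buffer that the hardware may write into next:
 * start at the current handle head, clamp to what the handle has free and
 * to the wakeup watermark, and hop to the next physical chunk (padding out
 * the rest of the current one) when too little room is left here.
 */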
static int
bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle)
{
	unsigned long head, space, next_space, pad, gap, skip, wakeup;
	unsigned int next_buf;
	struct bts_phys *phys, *next_phys;
	int ret;

	if (bb->snapshot)
		return 0;

	head = handle->head & ((bb->nr_pages << PAGE_SHIFT) - 1);

	phys = &bb->buf[bb->cur_buf];
	space = phys->offset + phys->displacement + phys->size - head;
	pad = space;
	if (space > handle->size) {
		space = handle->size;
		space -= space % BTS_RECORD_SIZE;
	}
	if (space <= BTS_SAFETY_MARGIN) {
		/* See if next phys buffer has more space */
		next_buf = bb->cur_buf + 1;
		if (next_buf >= bb->nr_bufs)
			next_buf = 0;
		next_phys = &bb->buf[next_buf];
		gap = buf_size(phys->page) - phys->displacement - phys->size +
			next_phys->displacement;
		skip = pad + gap;
		if (handle->size >= skip) {
			next_space = next_phys->size;
			if (next_space + skip > handle->size) {
				next_space = handle->size - skip;
				next_space -= next_space % BTS_RECORD_SIZE;
			}
			if (next_space > space || !space) {
				if (pad)
					bts_buffer_pad_out(phys, head);
				ret = perf_aux_output_skip(handle, skip);
				if (ret)
					return ret;
				/* Advance to next phys buffer */
				phys = next_phys;
				space = next_space;
				head = phys->offset + phys->displacement;
				/*
				 * After this, cur_buf and head won't match ds
				 * anymore, so we must not be racing with
				 * bts_update().
				 */
				bb->cur_buf = next_buf;
				local_set(&bb->head, head);
			}
		}
	}

	/* Don't go far beyond wakeup watermark */
	wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
		 handle->head;
	if (space > wakeup) {
		space = wakeup;
		space -= space % BTS_RECORD_SIZE;
	}

	bb->end = head + space;

	/*
	 * If we have no space, the lost notification would have been sent when
	 * we hit absolute_maximum - see bts_update()
	 */
	if (!space)
		return -ENOSPC;

	return 0;
}

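/*
 * PMI handler hook: claim the NMI if the hardware write pointer has crossed
 * the interrupt threshold, publish the data gathered so far and re-arm the
 * AUX handle. If the handle cannot be restarted, go to STOPPED so the other
 * callbacks know the transaction is over.
 */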
int intel_bts_interrupt(void)
{
	struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
	struct bts_ctx *bts;
	struct perf_event *event;
	struct bts_buffer *bb;
	s64 old_head;
	int err = -ENOSPC, handled = 0;

	if (!bts_ctx)
		return 0;

	bts = this_cpu_ptr(bts_ctx);
	event = bts->handle.event;
	/*
	 * The only surefire way of knowing if this NMI is ours is by checking
	 * the write ptr against the PMI threshold.
	 */
	if (ds && (ds->bts_index >= ds->bts_interrupt_threshold))
		handled = 1;

	/*
	 * this is wrapped in intel_bts_enable_local/intel_bts_disable_local,
	 * so we can only be INACTIVE or STOPPED
	 */
	if (READ_ONCE(bts->state) == BTS_STATE_STOPPED)
		return handled;

	bb = perf_get_aux(&bts->handle);
	if (!bb)
		return handled;

	/*
	 * Skip snapshot counters: they don't use the interrupt, but
	 * there's no other way of telling, because the pointer will
	 * keep moving
	 */
	if (bb->snapshot)
		return 0;

	old_head = local_read(&bb->head);
	bts_update(bts);

	/* no new data */
	if (old_head == local_read(&bb->head))
		return handled;

	perf_aux_output_end(&bts->handle, local_xchg(&bb->data_size, 0));

	bb = perf_aux_output_begin(&bts->handle, event);
	if (bb)
		err = bts_buffer_reset(bb, &bts->handle);

	if (err) {
		WRITE_ONCE(bts->state, BTS_STATE_STOPPED);

		if (bb) {
			/*
			 * BTS_STATE_STOPPED should be visible before
			 * cleared handle::event
			 */
			barrier();
			perf_aux_output_end(&bts->handle, 0);
		}
	}

	return 1;
}

static void bts_event_del(struct perf_event *event, int mode)
{
	bts_event_stop(event, PERF_EF_UPDATE);
}

static int bts_event_add(struct perf_event *event, int mode)
{
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	event->hw.state = PERF_HES_STOPPED;

	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
		return -EBUSY;

	if (bts->handle.event)
		return -EBUSY;

	if (mode & PERF_EF_START) {
		bts_event_start(event, 0);
		if (hwc->state & PERF_HES_STOPPED)
			return -EINVAL;
	}

	return 0;
}

static void bts_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	x86_del_exclusive(x86_lbr_exclusive_bts);
}

static int bts_event_init(struct perf_event *event)
{
	int ret;

	if (event->attr.type != bts_pmu.type)
		return -ENOENT;

	/*
	 * BTS leaks kernel addresses even when CPL0 tracing is
	 * disabled, so disallow intel_bts driver for unprivileged
	 * users on paranoid systems since it provides trace data
	 * to the user in a zero-copy fashion.
	 */
	if (event->attr.exclude_kernel) {
		ret = perf_allow_kernel();
		if (ret)
			return ret;
	}

	if (x86_add_exclusive(x86_lbr_exclusive_bts))
		return -EBUSY;

	ret = x86_reserve_hardware();
	if (ret) {
		x86_del_exclusive(x86_lbr_exclusive_bts);
		return ret;
	}

	event->destroy = bts_event_destroy;

	return 0;
}

static void bts_event_read(struct perf_event *event)
{
}

static __init int bts_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_DTES64))
		return -ENODEV;

	x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
	if (!x86_pmu.bts)
		return -ENODEV;

	if (boot_cpu_has(X86_FEATURE_PTI)) {
		/*
		 * BTS hardware writes through a virtual memory map, so we must
		 * either use the kernel physical map or the user mapping of
		 * the AUX buffer.
		 *
		 * However, since this driver supports per-CPU and per-task inherit
		 * we cannot use the user mapping since it will not be available
		 * if we're not running the owning process.
		 *
		 * With PTI we can't use the kernel map either, because it's not
		 * there when we run userspace.
		 *
		 * For now, disable this driver when using PTI.
		 */
		return -ENODEV;
	}

	bts_ctx = alloc_percpu(struct bts_ctx);
	if (!bts_ctx)
		return -ENOMEM;

	bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
			       PERF_PMU_CAP_EXCLUSIVE;
	bts_pmu.task_ctx_nr = perf_sw_context;
	bts_pmu.event_init = bts_event_init;
	bts_pmu.add = bts_event_add;
	bts_pmu.del = bts_event_del;
	bts_pmu.start = bts_event_start;
	bts_pmu.stop = bts_event_stop;
	bts_pmu.read = bts_event_read;
	bts_pmu.setup_aux = bts_buffer_setup_aux;
	bts_pmu.free_aux = bts_buffer_free_aux;

	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
early_initcall(bts_init);