// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events callchain code, extracted from core.c:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/perf_event.h>
#include <linux/slab.h>
#include <linux/sched/task_stack.h>
#include <linux/uprobes.h>

#include "internal.h"

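/*
 * Container for the per-CPU callchain scratch buffers: one
 * perf_callchain_entry region per possible CPU, freed via RCU so that
 * NMI-context users can safely dereference the global pointer.
 */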
struct callchain_cpus_entries {
	struct rcu_head			rcu_head;
	struct perf_callchain_entry	*cpu_entries[];
};

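/*
 * Sysctl knobs: maximum captured stack depth and maximum number of
 * PERF_CONTEXT_* marker entries per stack. six_hundred_forty_kb is the
 * upper bound enforced on perf_event_max_stack.
 */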
int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK;
static const int six_hundred_forty_kb = 640 * 1024;

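/*
 * Size of one callchain entry buffer: the fixed header plus room for
 * the configured maximum of stack entries and context markers.
 */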
static inline size_t perf_callchain_entry__sizeof(void)
{
	return (sizeof(struct perf_callchain_entry) +
		sizeof(__u64) * (sysctl_perf_event_max_stack +
				 sysctl_perf_event_max_contexts_per_stack));
}

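/*
 * callchain_recursion tracks, per CPU and per context level, whether a
 * callchain is already being captured, so nested captures don't clobber
 * the shared buffer. nr_callchain_events counts users of the buffers;
 * callchain_mutex serializes allocation/release and the sysctl handler.
 */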
static DEFINE_PER_CPU(u8, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
static struct callchain_cpus_entries *callchain_cpus_entries;

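/*
 * Weak default implementations; architectures that support callchain
 * sampling provide their own perf_callchain_kernel()/perf_callchain_user().
 */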
__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
				  struct pt_regs *regs)
{
}

__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
				struct pt_regs *regs)
{
}

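/* RCU callback: free each CPU's entry buffer, then the container itself. */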
static void release_callchain_buffers_rcu(struct rcu_head *head)
{
	struct callchain_cpus_entries *entries;
	int cpu;

	entries = container_of(head, struct callchain_cpus_entries, rcu_head);

	for_each_possible_cpu(cpu)
		kfree(entries->cpu_entries[cpu]);

	kfree(entries);
}

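/*
 * Unpublish the buffers and defer freeing to an RCU callback, since
 * in-flight (possibly NMI) users may still hold a reference obtained
 * through rcu_dereference().
 */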
static void release_callchain_buffers(void)
{
	struct callchain_cpus_entries *entries;

	entries = callchain_cpus_entries;
	RCU_INIT_POINTER(callchain_cpus_entries, NULL);
	call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
}

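/*
 * Allocate one entry buffer per possible CPU, each large enough for
 * PERF_NR_CONTEXTS nesting levels, then publish the container with
 * rcu_assign_pointer(). Called with callchain_mutex held.
 */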
static int alloc_callchain_buffers(void)
{
	int cpu;
	int size;
	struct callchain_cpus_entries *entries;

	/*
	 * We can't use the percpu allocation API for data that can be
	 * accessed from NMI. Use a temporary manual per cpu allocation
	 * until that gets sorted out.
	 */
	size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);

	entries = kzalloc(size, GFP_KERNEL);
	if (!entries)
		return -ENOMEM;

	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;

	for_each_possible_cpu(cpu) {
		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
							 cpu_to_node(cpu));
		if (!entries->cpu_entries[cpu])
			goto fail;
	}

	rcu_assign_pointer(callchain_cpus_entries, entries);

	return 0;

fail:
	for_each_possible_cpu(cpu)
		kfree(entries->cpu_entries[cpu]);
	kfree(entries);

	return -ENOMEM;
}

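/*
 * Take a reference on the shared callchain buffers, allocating them on
 * first use. Returns -EOVERFLOW if the event requests a deeper stack
 * than the perf_event_max_stack sysctl allows.
 */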
int get_callchain_buffers(int event_max_stack)
{
	int err = 0;
	int count;

	mutex_lock(&callchain_mutex);

	count = atomic_inc_return(&nr_callchain_events);
	if (WARN_ON_ONCE(count < 1)) {
		err = -EINVAL;
		goto exit;
	}

	/*
	 * If requesting per event more than the global cap,
	 * return a different error to help userspace figure
	 * this out.
	 *
	 * And also do it here so that we have &callchain_mutex held.
	 */
	if (event_max_stack > sysctl_perf_event_max_stack) {
		err = -EOVERFLOW;
		goto exit;
	}

	if (count == 1)
		err = alloc_callchain_buffers();
exit:
	if (err)
		atomic_dec(&nr_callchain_events);

	mutex_unlock(&callchain_mutex);

	return err;
}

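/* Drop a reference; the last user schedules the buffers for release. */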
void put_callchain_buffers(void)
{
	if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
		release_callchain_buffers();
		mutex_unlock(&callchain_mutex);
	}
}

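/*
 * Reserve this CPU's entry buffer for the current recursion context.
 * Returns NULL if we are already capturing at this context level or if
 * the buffers have not been allocated. Must be paired with
 * put_callchain_entry().
 */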
struct perf_callchain_entry *get_callchain_entry(int *rctx)
{
	int cpu;
	struct callchain_cpus_entries *entries;

	*rctx = get_recursion_context(this_cpu_ptr(callchain_recursion));
	if (*rctx == -1)
		return NULL;

	entries = rcu_dereference(callchain_cpus_entries);
	if (!entries) {
		put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx);
		return NULL;
	}

	cpu = smp_processor_id();

	return (((void *)entries->cpu_entries[cpu]) +
		(*rctx * perf_callchain_entry__sizeof()));
}

void
put_callchain_entry(int rctx)
{
	put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
}

static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entry,
					       int start_entry_idx)
{
#ifdef CONFIG_UPROBES
	struct uprobe_task *utask = current->utask;
	struct return_instance *ri;
	__u64 *cur_ip, *last_ip, tramp_addr;

	if (likely(!utask || !utask->return_instances))
		return;

	cur_ip = &entry->ip[start_entry_idx];
	last_ip = &entry->ip[entry->nr - 1];
	ri = utask->return_instances;
	tramp_addr = uprobe_get_trampoline_vaddr();

	/*
	 * If there are pending uretprobes for the current thread, they are
	 * recorded in a list inside utask->return_instances; each such
	 * pending uretprobe replaces the traced user function's return
	 * address on the stack, so when the stack trace is captured, instead
	 * of seeing the actual function's return address, we'll have one or
	 * many uretprobe trampoline addresses in the stack trace, which are
	 * not helpful and misleading to users.
	 * So here we go over the pending list of uretprobes, and each
	 * encountered trampoline address is replaced with the actual return
	 * address.
	 */
	while (ri && cur_ip <= last_ip) {
		if (*cur_ip == tramp_addr) {
			*cur_ip = ri->orig_ret_vaddr;
			ri = ri->next;
		}
		cur_ip++;
	}
#endif
}

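/*
 * Capture a callchain into this CPU's entry buffer. Kernel and/or user
 * frames are collected depending on @kernel and @user; PERF_CONTEXT_*
 * marker entries are stored when @add_mark is set. User stacks of
 * remote tasks (@crosstask) are not captured.
 */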
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
		   u32 max_stack, bool crosstask, bool add_mark)
{
	struct perf_callchain_entry *entry;
	struct perf_callchain_entry_ctx ctx;
	int rctx, start_entry_idx;

	/* crosstask is not supported for user stacks */
	if (crosstask && user && !kernel)
		return NULL;

	entry = get_callchain_entry(&rctx);
	if (!entry)
		return NULL;

	ctx.entry		= entry;
	ctx.max_stack		= max_stack;
	ctx.nr			= entry->nr = 0;
	ctx.contexts		= 0;
	ctx.contexts_maxed	= false;

	if (kernel && !user_mode(regs)) {
		if (add_mark)
			perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
		perf_callchain_kernel(&ctx, regs);
	}

	if (user && !crosstask) {
		if (!user_mode(regs)) {
			if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
				goto exit_put;
			regs = task_pt_regs(current);
		}

		if (add_mark)
			perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);

		start_entry_idx = entry->nr;
		perf_callchain_user(&ctx, regs);
		fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
	}

exit_put:
	put_callchain_entry(rctx);

	return entry;
}

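/*
 * Shared handler for both sysctls: the value may only change while no
 * callchain events exist, otherwise -EBUSY is returned.
 */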
static int perf_event_max_stack_handler(const struct ctl_table *table, int write,
					void *buffer, size_t *lenp, loff_t *ppos)
{
	int *value = table->data;
	int new_value = *value, ret;
	struct ctl_table new_table = *table;

	new_table.data = &new_value;
	ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
	if (ret || !write)
		return ret;

	mutex_lock(&callchain_mutex);
	if (atomic_read(&nr_callchain_events))
		ret = -EBUSY;
	else
		*value = new_value;

	mutex_unlock(&callchain_mutex);

	return ret;
}

static const struct ctl_table callchain_sysctl_table[] = {
	{
		.procname	= "perf_event_max_stack",
		.data		= &sysctl_perf_event_max_stack,
		.maxlen		= sizeof(sysctl_perf_event_max_stack),
		.mode		= 0644,
		.proc_handler	= perf_event_max_stack_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= (void *)&six_hundred_forty_kb,
	},
	{
		.procname	= "perf_event_max_contexts_per_stack",
		.data		= &sysctl_perf_event_max_contexts_per_stack,
		.maxlen		= sizeof(sysctl_perf_event_max_contexts_per_stack),
		.mode		= 0644,
		.proc_handler	= perf_event_max_stack_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE_THOUSAND,
	},
};

static int __init init_callchain_sysctls(void)
{
	register_sysctl_init("kernel", callchain_sysctl_table);
	return 0;
}
core_initcall(init_callchain_sysctls);