Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git...
[~shefty/rdma-dev.git] / tools / perf / builtin-record.c
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #define _FILE_OFFSET_BITS 64
9
10 #include "builtin.h"
11
12 #include "perf.h"
13
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
18
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
29
30 #include <unistd.h>
31 #include <sched.h>
32 #include <sys/mman.h>
33
#ifndef HAVE_ON_EXIT
/*
 * Minimal on_exit() emulation for C libraries that do not provide it.
 *
 * Handlers are kept in a fixed-size table and dispatched from a single
 * atexit() hook.  The exit status handed to the handlers is the value
 * captured by the exit() wrapper macro below, so it is only meaningful
 * when the process terminates through exit(); otherwise it stays 0.
 */
#ifndef ATEXIT_MAX
#define ATEXIT_MAX 32
#endif
static int __on_exit_count = 0;
typedef void (*on_exit_func_t) (int, void *);
static on_exit_func_t __on_exit_funcs[ATEXIT_MAX];
static void *__on_exit_args[ATEXIT_MAX];
static int __exitcode = 0;
static void __handle_on_exit_funcs(void);
static int on_exit(on_exit_func_t function, void *arg);
/* Remember the status passed to exit() so the handlers can receive it. */
#define exit(x) (exit)(__exitcode = (x))

/*
 * Register @function to be called with the exit status and @arg when
 * the process exits.
 *
 * Returns 0 on success, -ENOMEM when the table is full or the atexit()
 * hook could not be installed.
 */
static int on_exit(on_exit_func_t function, void *arg)
{
	if (__on_exit_count == ATEXIT_MAX)
		return -ENOMEM;

	/* The dispatcher is hooked up once, on the first registration. */
	if (__on_exit_count == 0 && atexit(__handle_on_exit_funcs) != 0)
		return -ENOMEM;

	__on_exit_funcs[__on_exit_count] = function;
	__on_exit_args[__on_exit_count++] = arg;
	return 0;
}

/*
 * atexit() hook: run the registered handlers.
 *
 * Like the real on_exit()/atexit(), handlers run in the reverse order
 * of their registration, so code registered last is cleaned up first.
 */
static void __handle_on_exit_funcs(void)
{
	int i;

	for (i = __on_exit_count - 1; i >= 0; i--)
		__on_exit_funcs[i](__exitcode, __on_exit_args[i]);
}
#endif
65
/* How __cmd_record() treats an output file that already exists. */
enum write_mode_t {
	WRITE_FORCE  = 0,	/* start over: the file is truncated on open */
	WRITE_APPEND = 1,	/* keep the existing contents and add to them */
};
70
71 struct perf_record {
72         struct perf_tool        tool;
73         struct perf_record_opts opts;
74         u64                     bytes_written;
75         const char              *output_name;
76         struct perf_evlist      *evlist;
77         struct perf_session     *session;
78         const char              *progname;
79         int                     output;
80         unsigned int            page_size;
81         int                     realtime_prio;
82         enum write_mode_t       write_mode;
83         bool                    no_buildid;
84         bool                    no_buildid_cache;
85         bool                    force;
86         bool                    file_new;
87         bool                    append_file;
88         long                    samples;
89         off_t                   post_processing_offset;
90 };
91
92 static void advance_output(struct perf_record *rec, size_t size)
93 {
94         rec->bytes_written += size;
95 }
96
97 static int write_output(struct perf_record *rec, void *buf, size_t size)
98 {
99         while (size) {
100                 int ret = write(rec->output, buf, size);
101
102                 if (ret < 0) {
103                         pr_err("failed to write\n");
104                         return -1;
105                 }
106
107                 size -= ret;
108                 buf += ret;
109
110                 rec->bytes_written += ret;
111         }
112
113         return 0;
114 }
115
116 static int process_synthesized_event(struct perf_tool *tool,
117                                      union perf_event *event,
118                                      struct perf_sample *sample __maybe_unused,
119                                      struct machine *machine __maybe_unused)
120 {
121         struct perf_record *rec = container_of(tool, struct perf_record, tool);
122         if (write_output(rec, event, event->header.size) < 0)
123                 return -1;
124
125         return 0;
126 }
127
128 static int perf_record__mmap_read(struct perf_record *rec,
129                                    struct perf_mmap *md)
130 {
131         unsigned int head = perf_mmap__read_head(md);
132         unsigned int old = md->prev;
133         unsigned char *data = md->base + rec->page_size;
134         unsigned long size;
135         void *buf;
136         int rc = 0;
137
138         if (old == head)
139                 return 0;
140
141         rec->samples++;
142
143         size = head - old;
144
145         if ((old & md->mask) + size != (head & md->mask)) {
146                 buf = &data[old & md->mask];
147                 size = md->mask + 1 - (old & md->mask);
148                 old += size;
149
150                 if (write_output(rec, buf, size) < 0) {
151                         rc = -1;
152                         goto out;
153                 }
154         }
155
156         buf = &data[old & md->mask];
157         size = head - old;
158         old += size;
159
160         if (write_output(rec, buf, size) < 0) {
161                 rc = -1;
162                 goto out;
163         }
164
165         md->prev = old;
166         perf_mmap__write_tail(md, old);
167
168 out:
169         return rc;
170 }
171
/* Flags shared between the signal handler and the record loop. */
static volatile int done;		/* a handled signal arrived */
static volatile int signr = -1;		/* last signal seen, -1 if none */
static volatile int child_finished;	/* SIGCHLD was received */

/*
 * Handler for the signals installed by __cmd_record(): note which
 * signal arrived and ask the record loop to stop.
 */
static void sig_handler(int sig)
{
	switch (sig) {
	case SIGCHLD:
		child_finished = 1;
		break;
	default:
		break;
	}

	done = 1;
	signr = sig;
}
184
185 static void perf_record__sig_exit(int exit_status __maybe_unused, void *arg)
186 {
187         struct perf_record *rec = arg;
188         int status;
189
190         if (rec->evlist->workload.pid > 0) {
191                 if (!child_finished)
192                         kill(rec->evlist->workload.pid, SIGTERM);
193
194                 wait(&status);
195                 if (WIFSIGNALED(status))
196                         psignal(WTERMSIG(status), rec->progname);
197         }
198
199         if (signr == -1 || signr == SIGUSR1)
200                 return;
201
202         signal(signr, SIG_DFL);
203         kill(getpid(), signr);
204 }
205
206 static bool perf_evlist__equal(struct perf_evlist *evlist,
207                                struct perf_evlist *other)
208 {
209         struct perf_evsel *pos, *pair;
210
211         if (evlist->nr_entries != other->nr_entries)
212                 return false;
213
214         pair = perf_evlist__first(other);
215
216         list_for_each_entry(pos, &evlist->entries, node) {
217                 if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
218                         return false;
219                 pair = perf_evsel__next(pair);
220         }
221
222         return true;
223 }
224
225 static int perf_record__open(struct perf_record *rec)
226 {
227         struct perf_evsel *pos;
228         struct perf_evlist *evlist = rec->evlist;
229         struct perf_session *session = rec->session;
230         struct perf_record_opts *opts = &rec->opts;
231         int rc = 0;
232
233         perf_evlist__config(evlist, opts);
234
235         list_for_each_entry(pos, &evlist->entries, node) {
236                 struct perf_event_attr *attr = &pos->attr;
237                 /*
238                  * Check if parse_single_tracepoint_event has already asked for
239                  * PERF_SAMPLE_TIME.
240                  *
241                  * XXX this is kludgy but short term fix for problems introduced by
242                  * eac23d1c that broke 'perf script' by having different sample_types
243                  * when using multiple tracepoint events when we use a perf binary
244                  * that tries to use sample_id_all on an older kernel.
245                  *
246                  * We need to move counter creation to perf_session, support
247                  * different sample_types, etc.
248                  */
249                 bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
250
251 fallback_missing_features:
252                 if (opts->exclude_guest_missing)
253                         attr->exclude_guest = attr->exclude_host = 0;
254 retry_sample_id:
255                 attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
256 try_again:
257                 if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
258                         int err = errno;
259
260                         if (err == EPERM || err == EACCES) {
261                                 ui__error_paranoid();
262                                 rc = -err;
263                                 goto out;
264                         } else if (err ==  ENODEV && opts->target.cpu_list) {
265                                 pr_err("No such device - did you specify"
266                                        " an out-of-range profile CPU?\n");
267                                 rc = -err;
268                                 goto out;
269                         } else if (err == EINVAL) {
270                                 if (!opts->exclude_guest_missing &&
271                                     (attr->exclude_guest || attr->exclude_host)) {
272                                         pr_debug("Old kernel, cannot exclude "
273                                                  "guest or host samples.\n");
274                                         opts->exclude_guest_missing = true;
275                                         goto fallback_missing_features;
276                                 } else if (!opts->sample_id_all_missing) {
277                                         /*
278                                          * Old kernel, no attr->sample_id_type_all field
279                                          */
280                                         opts->sample_id_all_missing = true;
281                                         if (!opts->sample_time && !opts->raw_samples && !time_needed)
282                                                 perf_evsel__reset_sample_bit(pos, TIME);
283
284                                         goto retry_sample_id;
285                                 }
286                         }
287
288                         /*
289                          * If it's cycles then fall back to hrtimer
290                          * based cpu-clock-tick sw counter, which
291                          * is always available even if no PMU support.
292                          *
293                          * PPC returns ENXIO until 2.6.37 (behavior changed
294                          * with commit b0a873e).
295                          */
296                         if ((err == ENOENT || err == ENXIO)
297                                         && attr->type == PERF_TYPE_HARDWARE
298                                         && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
299
300                                 if (verbose)
301                                         ui__warning("The cycles event is not supported, "
302                                                     "trying to fall back to cpu-clock-ticks\n");
303                                 attr->type = PERF_TYPE_SOFTWARE;
304                                 attr->config = PERF_COUNT_SW_CPU_CLOCK;
305                                 if (pos->name) {
306                                         free(pos->name);
307                                         pos->name = NULL;
308                                 }
309                                 goto try_again;
310                         }
311
312                         if (err == ENOENT) {
313                                 ui__error("The %s event is not supported.\n",
314                                           perf_evsel__name(pos));
315                                 rc = -err;
316                                 goto out;
317                         } else if ((err == EOPNOTSUPP) && (attr->precise_ip)) {
318                                 ui__error("\'precise\' request may not be supported. "
319                                           "Try removing 'p' modifier\n");
320                                 rc = -err;
321                                 goto out;
322                         }
323
324                         printf("\n");
325                         error("sys_perf_event_open() syscall returned with %d "
326                               "(%s) for event %s. /bin/dmesg may provide "
327                               "additional information.\n",
328                               err, strerror(err), perf_evsel__name(pos));
329
330 #if defined(__i386__) || defined(__x86_64__)
331                         if (attr->type == PERF_TYPE_HARDWARE &&
332                             err == EOPNOTSUPP) {
333                                 pr_err("No hardware sampling interrupt available."
334                                        " No APIC? If so then you can boot the kernel"
335                                        " with the \"lapic\" boot parameter to"
336                                        " force-enable it.\n");
337                                 rc = -err;
338                                 goto out;
339                         }
340 #endif
341
342                         pr_err("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
343                         rc = -err;
344                         goto out;
345                 }
346         }
347
348         if (perf_evlist__apply_filters(evlist)) {
349                 error("failed to set filter with %d (%s)\n", errno,
350                         strerror(errno));
351                 rc = -1;
352                 goto out;
353         }
354
355         if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
356                 if (errno == EPERM) {
357                         pr_err("Permission error mapping pages.\n"
358                                "Consider increasing "
359                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
360                                "or try again with a smaller value of -m/--mmap_pages.\n"
361                                "(current value: %d)\n", opts->mmap_pages);
362                         rc = -errno;
363                 } else if (!is_power_of_2(opts->mmap_pages) &&
364                            (opts->mmap_pages != UINT_MAX)) {
365                         pr_err("--mmap_pages/-m value must be a power of two.");
366                         rc = -EINVAL;
367                 } else {
368                         pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
369                         rc = -errno;
370                 }
371                 goto out;
372         }
373
374         if (rec->file_new)
375                 session->evlist = evlist;
376         else {
377                 if (!perf_evlist__equal(session->evlist, evlist)) {
378                         fprintf(stderr, "incompatible append\n");
379                         rc = -1;
380                         goto out;
381                 }
382         }
383
384         perf_session__set_id_hdr_size(session);
385 out:
386         return rc;
387 }
388
389 static int process_buildids(struct perf_record *rec)
390 {
391         u64 size = lseek(rec->output, 0, SEEK_CUR);
392
393         if (size == 0)
394                 return 0;
395
396         rec->session->fd = rec->output;
397         return __perf_session__process_events(rec->session, rec->post_processing_offset,
398                                               size - rec->post_processing_offset,
399                                               size, &build_id__mark_dso_hit_ops);
400 }
401
402 static void perf_record__exit(int status, void *arg)
403 {
404         struct perf_record *rec = arg;
405
406         if (status != 0)
407                 return;
408
409         if (!rec->opts.pipe_output) {
410                 rec->session->header.data_size += rec->bytes_written;
411
412                 if (!rec->no_buildid)
413                         process_buildids(rec);
414                 perf_session__write_header(rec->session, rec->evlist,
415                                            rec->output, true);
416                 perf_session__delete(rec->session);
417                 perf_evlist__delete(rec->evlist);
418                 symbol__exit();
419         }
420 }
421
422 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
423 {
424         int err;
425         struct perf_tool *tool = data;
426
427         if (machine__is_host(machine))
428                 return;
429
430         /*
431          *As for guest kernel when processing subcommand record&report,
432          *we arrange module mmap prior to guest kernel mmap and trigger
433          *a preload dso because default guest module symbols are loaded
434          *from guest kallsyms instead of /lib/modules/XXX/XXX. This
435          *method is used to avoid symbol missing when the first addr is
436          *in module instead of in guest kernel.
437          */
438         err = perf_event__synthesize_modules(tool, process_synthesized_event,
439                                              machine);
440         if (err < 0)
441                 pr_err("Couldn't record guest kernel [%d]'s reference"
442                        " relocation symbol.\n", machine->pid);
443
444         /*
445          * We use _stext for guest kernel because guest kernel's /proc/kallsyms
446          * have no _text sometimes.
447          */
448         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
449                                                  machine, "_text");
450         if (err < 0)
451                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
452                                                          machine, "_stext");
453         if (err < 0)
454                 pr_err("Couldn't record guest kernel [%d]'s reference"
455                        " relocation symbol.\n", machine->pid);
456 }
457
458 static struct perf_event_header finished_round_event = {
459         .size = sizeof(struct perf_event_header),
460         .type = PERF_RECORD_FINISHED_ROUND,
461 };
462
463 static int perf_record__mmap_read_all(struct perf_record *rec)
464 {
465         int i;
466         int rc = 0;
467
468         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
469                 if (rec->evlist->mmap[i].base) {
470                         if (perf_record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
471                                 rc = -1;
472                                 goto out;
473                         }
474                 }
475         }
476
477         if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
478                 rc = write_output(rec, &finished_round_event,
479                                   sizeof(finished_round_event));
480
481 out:
482         return rc;
483 }
484
485 static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
486 {
487         struct stat st;
488         int flags;
489         int err, output, feat;
490         unsigned long waking = 0;
491         const bool forks = argc > 0;
492         struct machine *machine;
493         struct perf_tool *tool = &rec->tool;
494         struct perf_record_opts *opts = &rec->opts;
495         struct perf_evlist *evsel_list = rec->evlist;
496         const char *output_name = rec->output_name;
497         struct perf_session *session;
498         bool disabled = false;
499
500         rec->progname = argv[0];
501
502         rec->page_size = sysconf(_SC_PAGE_SIZE);
503
504         on_exit(perf_record__sig_exit, rec);
505         signal(SIGCHLD, sig_handler);
506         signal(SIGINT, sig_handler);
507         signal(SIGUSR1, sig_handler);
508
509         if (!output_name) {
510                 if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
511                         opts->pipe_output = true;
512                 else
513                         rec->output_name = output_name = "perf.data";
514         }
515         if (output_name) {
516                 if (!strcmp(output_name, "-"))
517                         opts->pipe_output = true;
518                 else if (!stat(output_name, &st) && st.st_size) {
519                         if (rec->write_mode == WRITE_FORCE) {
520                                 char oldname[PATH_MAX];
521                                 snprintf(oldname, sizeof(oldname), "%s.old",
522                                          output_name);
523                                 unlink(oldname);
524                                 rename(output_name, oldname);
525                         }
526                 } else if (rec->write_mode == WRITE_APPEND) {
527                         rec->write_mode = WRITE_FORCE;
528                 }
529         }
530
531         flags = O_CREAT|O_RDWR;
532         if (rec->write_mode == WRITE_APPEND)
533                 rec->file_new = 0;
534         else
535                 flags |= O_TRUNC;
536
537         if (opts->pipe_output)
538                 output = STDOUT_FILENO;
539         else
540                 output = open(output_name, flags, S_IRUSR | S_IWUSR);
541         if (output < 0) {
542                 perror("failed to create output file");
543                 return -1;
544         }
545
546         rec->output = output;
547
548         session = perf_session__new(output_name, O_WRONLY,
549                                     rec->write_mode == WRITE_FORCE, false, NULL);
550         if (session == NULL) {
551                 pr_err("Not enough memory for reading perf file header\n");
552                 return -1;
553         }
554
555         rec->session = session;
556
557         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
558                 perf_header__set_feat(&session->header, feat);
559
560         if (rec->no_buildid)
561                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
562
563         if (!have_tracepoints(&evsel_list->entries))
564                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
565
566         if (!rec->opts.branch_stack)
567                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
568
569         if (!rec->file_new) {
570                 err = perf_session__read_header(session, output);
571                 if (err < 0)
572                         goto out_delete_session;
573         }
574
575         if (forks) {
576                 err = perf_evlist__prepare_workload(evsel_list, opts, argv);
577                 if (err < 0) {
578                         pr_err("Couldn't run the workload!\n");
579                         goto out_delete_session;
580                 }
581         }
582
583         if (perf_record__open(rec) != 0) {
584                 err = -1;
585                 goto out_delete_session;
586         }
587
588         /*
589          * perf_session__delete(session) will be called at perf_record__exit()
590          */
591         on_exit(perf_record__exit, rec);
592
593         if (opts->pipe_output) {
594                 err = perf_header__write_pipe(output);
595                 if (err < 0)
596                         goto out_delete_session;
597         } else if (rec->file_new) {
598                 err = perf_session__write_header(session, evsel_list,
599                                                  output, false);
600                 if (err < 0)
601                         goto out_delete_session;
602         }
603
604         if (!rec->no_buildid
605             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
606                 pr_err("Couldn't generate buildids. "
607                        "Use --no-buildid to profile anyway.\n");
608                 err = -1;
609                 goto out_delete_session;
610         }
611
612         rec->post_processing_offset = lseek(output, 0, SEEK_CUR);
613
614         machine = perf_session__find_host_machine(session);
615         if (!machine) {
616                 pr_err("Couldn't find native kernel information.\n");
617                 err = -1;
618                 goto out_delete_session;
619         }
620
621         if (opts->pipe_output) {
622                 err = perf_event__synthesize_attrs(tool, session,
623                                                    process_synthesized_event);
624                 if (err < 0) {
625                         pr_err("Couldn't synthesize attrs.\n");
626                         goto out_delete_session;
627                 }
628
629                 err = perf_event__synthesize_event_types(tool, process_synthesized_event,
630                                                          machine);
631                 if (err < 0) {
632                         pr_err("Couldn't synthesize event_types.\n");
633                         goto out_delete_session;
634                 }
635
636                 if (have_tracepoints(&evsel_list->entries)) {
637                         /*
638                          * FIXME err <= 0 here actually means that
639                          * there were no tracepoints so its not really
640                          * an error, just that we don't need to
641                          * synthesize anything.  We really have to
642                          * return this more properly and also
643                          * propagate errors that now are calling die()
644                          */
645                         err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
646                                                                   process_synthesized_event);
647                         if (err <= 0) {
648                                 pr_err("Couldn't record tracing data.\n");
649                                 goto out_delete_session;
650                         }
651                         advance_output(rec, err);
652                 }
653         }
654
655         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
656                                                  machine, "_text");
657         if (err < 0)
658                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
659                                                          machine, "_stext");
660         if (err < 0)
661                 pr_err("Couldn't record kernel reference relocation symbol\n"
662                        "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
663                        "Check /proc/kallsyms permission or run as root.\n");
664
665         err = perf_event__synthesize_modules(tool, process_synthesized_event,
666                                              machine);
667         if (err < 0)
668                 pr_err("Couldn't record kernel module information.\n"
669                        "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
670                        "Check /proc/modules permission or run as root.\n");
671
672         if (perf_guest)
673                 perf_session__process_machines(session, tool,
674                                                perf_event__synthesize_guest_os);
675
676         if (!opts->target.system_wide)
677                 err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
678                                                   process_synthesized_event,
679                                                   machine);
680         else
681                 err = perf_event__synthesize_threads(tool, process_synthesized_event,
682                                                machine);
683
684         if (err != 0)
685                 goto out_delete_session;
686
687         if (rec->realtime_prio) {
688                 struct sched_param param;
689
690                 param.sched_priority = rec->realtime_prio;
691                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
692                         pr_err("Could not set realtime priority.\n");
693                         err = -1;
694                         goto out_delete_session;
695                 }
696         }
697
698         /*
699          * When perf is starting the traced process, all the events
700          * (apart from group members) have enable_on_exec=1 set,
701          * so don't spoil it by prematurely enabling them.
702          */
703         if (!perf_target__none(&opts->target))
704                 perf_evlist__enable(evsel_list);
705
706         /*
707          * Let the child rip
708          */
709         if (forks)
710                 perf_evlist__start_workload(evsel_list);
711
712         for (;;) {
713                 int hits = rec->samples;
714
715                 if (perf_record__mmap_read_all(rec) < 0) {
716                         err = -1;
717                         goto out_delete_session;
718                 }
719
720                 if (hits == rec->samples) {
721                         if (done)
722                                 break;
723                         err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
724                         waking++;
725                 }
726
727                 /*
728                  * When perf is starting the traced process, at the end events
729                  * die with the process and we wait for that. Thus no need to
730                  * disable events in this case.
731                  */
732                 if (done && !disabled && !perf_target__none(&opts->target)) {
733                         perf_evlist__disable(evsel_list);
734                         disabled = true;
735                 }
736         }
737
738         if (quiet || signr == SIGUSR1)
739                 return 0;
740
741         fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
742
743         /*
744          * Approximate RIP event size: 24 bytes.
745          */
746         fprintf(stderr,
747                 "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
748                 (double)rec->bytes_written / 1024.0 / 1024.0,
749                 output_name,
750                 rec->bytes_written / 24);
751
752         return 0;
753
754 out_delete_session:
755         perf_session__delete(session);
756         return err;
757 }
758
759 #define BRANCH_OPT(n, m) \
760         { .name = n, .mode = (m) }
761
762 #define BRANCH_END { .name = NULL }
763
764 struct branch_mode {
765         const char *name;
766         int mode;
767 };
768
769 static const struct branch_mode branch_modes[] = {
770         BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
771         BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
772         BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
773         BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
774         BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
775         BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
776         BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
777         BRANCH_END
778 };
779
780 static int
781 parse_branch_stack(const struct option *opt, const char *str, int unset)
782 {
783 #define ONLY_PLM \
784         (PERF_SAMPLE_BRANCH_USER        |\
785          PERF_SAMPLE_BRANCH_KERNEL      |\
786          PERF_SAMPLE_BRANCH_HV)
787
788         uint64_t *mode = (uint64_t *)opt->value;
789         const struct branch_mode *br;
790         char *s, *os = NULL, *p;
791         int ret = -1;
792
793         if (unset)
794                 return 0;
795
796         /*
797          * cannot set it twice, -b + --branch-filter for instance
798          */
799         if (*mode)
800                 return -1;
801
802         /* str may be NULL in case no arg is passed to -b */
803         if (str) {
804                 /* because str is read-only */
805                 s = os = strdup(str);
806                 if (!s)
807                         return -1;
808
809                 for (;;) {
810                         p = strchr(s, ',');
811                         if (p)
812                                 *p = '\0';
813
814                         for (br = branch_modes; br->name; br++) {
815                                 if (!strcasecmp(s, br->name))
816                                         break;
817                         }
818                         if (!br->name) {
819                                 ui__warning("unknown branch filter %s,"
820                                             " check man page\n", s);
821                                 goto error;
822                         }
823
824                         *mode |= br->mode;
825
826                         if (!p)
827                                 break;
828
829                         s = p + 1;
830                 }
831         }
832         ret = 0;
833
834         /* default to any branch */
835         if ((*mode & ~ONLY_PLM) == 0) {
836                 *mode = PERF_SAMPLE_BRANCH_ANY;
837         }
838 error:
839         free(os);
840         return ret;
841 }
842
#ifdef LIBUNWIND_SUPPORT
/*
 * Parse the user supplied stack dump size for dwarf call graphs.
 *
 * @str:   textual size; any base strtoul() accepts with base 0
 * @_size: receives the size rounded up to a multiple of sizeof(u64);
 *         left untouched on failure
 *
 * The value must be non-zero, must not be followed by trailing characters
 * and must not exceed USHRT_MAX rounded down to a multiple of sizeof(u64).
 *
 * Returns 0 on success, -1 (after logging an error) otherwise.
 */
static int get_stack_size(char *str, unsigned long *_size)
{
	const unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));
	unsigned long size;
	char *endptr;

	size = strtoul(str, &endptr, 0);

	/*
	 * Range-check before rounding: max_size is itself aligned, so any
	 * value within (0, max_size] stays within it after round_up(), and
	 * huge inputs are rejected without relying on unsigned wrap-around.
	 */
	if (*endptr == '\0' && size != 0 && size <= max_size) {
		*_size = round_up(size, sizeof(u64));
		return 0;
	}

	pr_err("callchain: Incorrect stack dump size (max %lu): %s\n",
	       max_size, str);
	return -1;
}
#endif /* LIBUNWIND_SUPPORT */
870
871 int record_parse_callchain_opt(const struct option *opt,
872                                const char *arg, int unset)
873 {
874         struct perf_record_opts *opts = opt->value;
875         char *tok, *name, *saveptr = NULL;
876         char *buf;
877         int ret = -1;
878
879         /* --no-call-graph */
880         if (unset)
881                 return 0;
882
883         /* We specified default option if none is provided. */
884         BUG_ON(!arg);
885
886         /* We need buffer that we know we can write to. */
887         buf = malloc(strlen(arg) + 1);
888         if (!buf)
889                 return -ENOMEM;
890
891         strcpy(buf, arg);
892
893         tok = strtok_r((char *)buf, ",", &saveptr);
894         name = tok ? : (char *)buf;
895
896         do {
897                 /* Framepointer style */
898                 if (!strncmp(name, "fp", sizeof("fp"))) {
899                         if (!strtok_r(NULL, ",", &saveptr)) {
900                                 opts->call_graph = CALLCHAIN_FP;
901                                 ret = 0;
902                         } else
903                                 pr_err("callchain: No more arguments "
904                                        "needed for -g fp\n");
905                         break;
906
907 #ifdef LIBUNWIND_SUPPORT
908                 /* Dwarf style */
909                 } else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
910                         const unsigned long default_stack_dump_size = 8192;
911
912                         ret = 0;
913                         opts->call_graph = CALLCHAIN_DWARF;
914                         opts->stack_dump_size = default_stack_dump_size;
915
916                         tok = strtok_r(NULL, ",", &saveptr);
917                         if (tok) {
918                                 unsigned long size = 0;
919
920                                 ret = get_stack_size(tok, &size);
921                                 opts->stack_dump_size = size;
922                         }
923
924                         if (!ret)
925                                 pr_debug("callchain: stack dump size %d\n",
926                                          opts->stack_dump_size);
927 #endif /* LIBUNWIND_SUPPORT */
928                 } else {
929                         pr_err("callchain: Unknown -g option "
930                                "value: %s\n", arg);
931                         break;
932                 }
933
934         } while (0);
935
936         free(buf);
937
938         if (!ret)
939                 pr_debug("callchain: type %d\n", opts->call_graph);
940
941         return ret;
942 }
943
/* NULL terminated synopsis lines handed to the option parser. */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};
949
/*
 * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
 * because we need to have access to it in perf_record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
960 static struct perf_record record = {
961         .opts = {
962                 .mmap_pages          = UINT_MAX,
963                 .user_freq           = UINT_MAX,
964                 .user_interval       = ULLONG_MAX,
965                 .freq                = 4000,
966                 .target              = {
967                         .uses_mmap   = true,
968                 },
969         },
970         .write_mode = WRITE_FORCE,
971         .file_new   = true,
972 };
973
/* Common prefix of the -g help text; the supported modes are appended. */
#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "

/* "dwarf" is only offered when built with LIBUNWIND_SUPPORT. */
#ifdef LIBUNWIND_SUPPORT
#define CALLCHAIN_MODES "[fp] dwarf"
#else
#define CALLCHAIN_MODES "[fp]"
#endif

const char record_callchain_help[] = CALLCHAIN_HELP CALLCHAIN_MODES;
981
982 /*
983  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
984  * with it and switch to use the library functions in perf_evlist that came
985  * from builtin-record.c, i.e. use perf_record_opts,
986  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
987  * using pipes, etc.
988  */
989 const struct option record_options[] = {
990         OPT_CALLBACK('e', "event", &record.evlist, "event",
991                      "event selector. use 'perf list' to list available events",
992                      parse_events_option),
993         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
994                      "event filter", parse_filter),
995         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
996                     "record events on existing process id"),
997         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
998                     "record events on existing thread id"),
999         OPT_INTEGER('r', "realtime", &record.realtime_prio,
1000                     "collect data with this RT SCHED_FIFO priority"),
1001         OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
1002                     "collect data without buffering"),
1003         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1004                     "collect raw sample records from all opened counters"),
1005         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1006                             "system-wide collection from all CPUs"),
1007         OPT_BOOLEAN('A', "append", &record.append_file,
1008                             "append to the output file to do incremental profiling"),
1009         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1010                     "list of cpus to monitor"),
1011         OPT_BOOLEAN('f', "force", &record.force,
1012                         "overwrite existing data file (deprecated)"),
1013         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1014         OPT_STRING('o', "output", &record.output_name, "file",
1015                     "output file name"),
1016         OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
1017                     "child tasks do not inherit counters"),
1018         OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
1019         OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
1020                      "number of mmap data pages"),
1021         OPT_BOOLEAN(0, "group", &record.opts.group,
1022                     "put the counters into a counter group"),
1023         OPT_CALLBACK_DEFAULT('g', "call-graph", &record.opts,
1024                              "mode[,dump_size]", record_callchain_help,
1025                              &record_parse_callchain_opt, "fp"),
1026         OPT_INCR('v', "verbose", &verbose,
1027                     "be more verbose (show counter open errors, etc)"),
1028         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1029         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1030                     "per thread counts"),
1031         OPT_BOOLEAN('d', "data", &record.opts.sample_address,
1032                     "Sample addresses"),
1033         OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
1034         OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
1035         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1036                     "don't sample"),
1037         OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
1038                     "do not update the buildid cache"),
1039         OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
1040                     "do not collect buildids in perf.data"),
1041         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
1042                      "monitor event in cgroup name only",
1043                      parse_cgroups),
1044         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
1045                    "user to profile"),
1046
1047         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
1048                      "branch any", "sample any taken branches",
1049                      parse_branch_stack),
1050
1051         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
1052                      "branch filter mask", "branch stack filter modes",
1053                      parse_branch_stack),
1054         OPT_END()
1055 };
1056
1057 int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
1058 {
1059         int err = -ENOMEM;
1060         struct perf_evsel *pos;
1061         struct perf_evlist *evsel_list;
1062         struct perf_record *rec = &record;
1063         char errbuf[BUFSIZ];
1064
1065         evsel_list = perf_evlist__new(NULL, NULL);
1066         if (evsel_list == NULL)
1067                 return -ENOMEM;
1068
1069         rec->evlist = evsel_list;
1070
1071         argc = parse_options(argc, argv, record_options, record_usage,
1072                             PARSE_OPT_STOP_AT_NON_OPTION);
1073         if (!argc && perf_target__none(&rec->opts.target))
1074                 usage_with_options(record_usage, record_options);
1075
1076         if (rec->force && rec->append_file) {
1077                 ui__error("Can't overwrite and append at the same time."
1078                           " You need to choose between -f and -A");
1079                 usage_with_options(record_usage, record_options);
1080         } else if (rec->append_file) {
1081                 rec->write_mode = WRITE_APPEND;
1082         } else {
1083                 rec->write_mode = WRITE_FORCE;
1084         }
1085
1086         if (nr_cgroups && !rec->opts.target.system_wide) {
1087                 ui__error("cgroup monitoring only available in"
1088                           " system-wide mode\n");
1089                 usage_with_options(record_usage, record_options);
1090         }
1091
1092         symbol__init();
1093
1094         if (symbol_conf.kptr_restrict)
1095                 pr_warning(
1096 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1097 "check /proc/sys/kernel/kptr_restrict.\n\n"
1098 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1099 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1100 "Samples in kernel modules won't be resolved at all.\n\n"
1101 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1102 "even with a suitable vmlinux or kallsyms file.\n\n");
1103
1104         if (rec->no_buildid_cache || rec->no_buildid)
1105                 disable_buildid_cache();
1106
1107         if (evsel_list->nr_entries == 0 &&
1108             perf_evlist__add_default(evsel_list) < 0) {
1109                 pr_err("Not enough memory for event selector list\n");
1110                 goto out_symbol_exit;
1111         }
1112
1113         err = perf_target__validate(&rec->opts.target);
1114         if (err) {
1115                 perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1116                 ui__warning("%s", errbuf);
1117         }
1118
1119         err = perf_target__parse_uid(&rec->opts.target);
1120         if (err) {
1121                 int saved_errno = errno;
1122
1123                 perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1124                 ui__error("%s", errbuf);
1125
1126                 err = -saved_errno;
1127                 goto out_free_fd;
1128         }
1129
1130         err = -ENOMEM;
1131         if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
1132                 usage_with_options(record_usage, record_options);
1133
1134         list_for_each_entry(pos, &evsel_list->entries, node) {
1135                 if (perf_header__push_event(pos->attr.config, perf_evsel__name(pos)))
1136                         goto out_free_fd;
1137         }
1138
1139         if (rec->opts.user_interval != ULLONG_MAX)
1140                 rec->opts.default_interval = rec->opts.user_interval;
1141         if (rec->opts.user_freq != UINT_MAX)
1142                 rec->opts.freq = rec->opts.user_freq;
1143
1144         /*
1145          * User specified count overrides default frequency.
1146          */
1147         if (rec->opts.default_interval)
1148                 rec->opts.freq = 0;
1149         else if (rec->opts.freq) {
1150                 rec->opts.default_interval = rec->opts.freq;
1151         } else {
1152                 ui__error("frequency and count are zero, aborting\n");
1153                 err = -EINVAL;
1154                 goto out_free_fd;
1155         }
1156
1157         err = __cmd_record(&record, argc, argv);
1158 out_free_fd:
1159         perf_evlist__delete_maps(evsel_list);
1160 out_symbol_exit:
1161         symbol__exit();
1162         return err;
1163 }