Add the rt linux 4.1.3-rt3 as base
[kvmfornfv.git] / kernel / tools / perf / builtin-trace.c
1 #include <traceevent/event-parse.h>
2 #include "builtin.h"
3 #include "util/color.h"
4 #include "util/debug.h"
5 #include "util/evlist.h"
6 #include "util/machine.h"
7 #include "util/session.h"
8 #include "util/thread.h"
9 #include "util/parse-options.h"
10 #include "util/strlist.h"
11 #include "util/intlist.h"
12 #include "util/thread_map.h"
13 #include "util/stat.h"
14 #include "trace-event.h"
15 #include "util/parse-events.h"
16
17 #include <libaudit.h>
18 #include <stdlib.h>
19 #include <sys/eventfd.h>
20 #include <sys/mman.h>
21 #include <linux/futex.h>
22
/*
 * Fallback definitions for constants that the system headers of older
 * distros may not provide.  Each one is only defined when the headers
 * included above did not already define it.
 */
#if !defined(MAP_STACK)
# define MAP_STACK              0x20000
#endif

#if !defined(MADV_HWPOISON)
# define MADV_HWPOISON          100
#endif

#if !defined(MADV_MERGEABLE)
# define MADV_MERGEABLE         12
#endif

#if !defined(MADV_UNMERGEABLE)
# define MADV_UNMERGEABLE       13
#endif

#if !defined(EFD_SEMAPHORE)
# define EFD_SEMAPHORE          1
#endif
43
44 struct tp_field {
45         int offset;
46         union {
47                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
48                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
49         };
50 };
51
52 #define TP_UINT_FIELD(bits) \
53 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
54 { \
55         u##bits value; \
56         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
57         return value;  \
58 }
59
60 TP_UINT_FIELD(8);
61 TP_UINT_FIELD(16);
62 TP_UINT_FIELD(32);
63 TP_UINT_FIELD(64);
64
65 #define TP_UINT_FIELD__SWAPPED(bits) \
66 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
67 { \
68         u##bits value; \
69         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
70         return bswap_##bits(value);\
71 }
72
73 TP_UINT_FIELD__SWAPPED(16);
74 TP_UINT_FIELD__SWAPPED(32);
75 TP_UINT_FIELD__SWAPPED(64);
76
77 static int tp_field__init_uint(struct tp_field *field,
78                                struct format_field *format_field,
79                                bool needs_swap)
80 {
81         field->offset = format_field->offset;
82
83         switch (format_field->size) {
84         case 1:
85                 field->integer = tp_field__u8;
86                 break;
87         case 2:
88                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
89                 break;
90         case 4:
91                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
92                 break;
93         case 8:
94                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
95                 break;
96         default:
97                 return -1;
98         }
99
100         return 0;
101 }
102
103 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
104 {
105         return sample->raw_data + field->offset;
106 }
107
108 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
109 {
110         field->offset = format_field->offset;
111         field->pointer = tp_field__ptr;
112         return 0;
113 }
114
115 struct syscall_tp {
116         struct tp_field id;
117         union {
118                 struct tp_field args, ret;
119         };
120 };
121
122 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
123                                           struct tp_field *field,
124                                           const char *name)
125 {
126         struct format_field *format_field = perf_evsel__field(evsel, name);
127
128         if (format_field == NULL)
129                 return -1;
130
131         return tp_field__init_uint(field, format_field, evsel->needs_swap);
132 }
133
134 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
135         ({ struct syscall_tp *sc = evsel->priv;\
136            perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
137
/*
 * Look up the tracepoint field called @name on @evsel and set up @field as
 * a pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
                                         struct tp_field *field,
                                         const char *name)
{
        struct format_field *ff = perf_evsel__field(evsel, name);

        if (ff != NULL)
                return tp_field__init_ptr(field, ff);

        return -1;
}

/*
 * Convenience wrapper: initialize the struct syscall_tp member @name of
 * evsel->priv from the tracepoint field with the same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv; \
           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
153
154 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
155 {
156         zfree(&evsel->priv);
157         perf_evsel__delete(evsel);
158 }
159
160 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
161 {
162         evsel->priv = malloc(sizeof(struct syscall_tp));
163         if (evsel->priv != NULL) {
164                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
165                         goto out_delete;
166
167                 evsel->handler = handler;
168                 return 0;
169         }
170
171         return -ENOMEM;
172
173 out_delete:
174         zfree(&evsel->priv);
175         return -ENOENT;
176 }
177
/*
 * Create the evsel for the syscall tracepoint named @direction and
 * initialize its syscall private data with @handler.
 *
 * "raw_syscalls" is tried first, with a fallback to "syscalls" for older
 * kernels (e.g., RHEL6) that use syscalls:{enter,exit}.
 *
 * Returns the new evsel, or NULL if neither tracepoint could be created or
 * the private data could not be initialized.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
        struct perf_evsel *tp = perf_evsel__newtp("raw_syscalls", direction);

        if (tp == NULL)
                tp = perf_evsel__newtp("syscalls", direction);

        if (tp == NULL)
                return NULL;

        if (perf_evsel__init_syscall_tp(tp, handler)) {
                perf_evsel__delete_priv(tp);
                return NULL;
        }

        return tp;
}
197
/*
 * Read the struct syscall_tp member @name from @sample through the integer
 * reader installed on it; evaluates to the u64 the reader returns.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.integer(&fields->name, sample); })

/*
 * Same as above for members set up as pointers; evaluates to the void *
 * returned by the pointer reader.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.pointer(&fields->name, sample); })
205
206 static int perf_evlist__add_syscall_newtp(struct perf_evlist *evlist,
207                                           void *sys_enter_handler,
208                                           void *sys_exit_handler)
209 {
210         int ret = -1;
211         struct perf_evsel *sys_enter, *sys_exit;
212
213         sys_enter = perf_evsel__syscall_newtp("sys_enter", sys_enter_handler);
214         if (sys_enter == NULL)
215                 goto out;
216
217         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
218                 goto out_delete_sys_enter;
219
220         sys_exit = perf_evsel__syscall_newtp("sys_exit", sys_exit_handler);
221         if (sys_exit == NULL)
222                 goto out_delete_sys_enter;
223
224         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
225                 goto out_delete_sys_exit;
226
227         perf_evlist__add(evlist, sys_enter);
228         perf_evlist__add(evlist, sys_exit);
229
230         ret = 0;
231 out:
232         return ret;
233
234 out_delete_sys_exit:
235         perf_evsel__delete_priv(sys_exit);
236 out_delete_sys_enter:
237         perf_evsel__delete_priv(sys_enter);
238         goto out;
239 }
240
241
242 struct syscall_arg {
243         unsigned long val;
244         struct thread *thread;
245         struct trace  *trace;
246         void          *parm;
247         u8            idx;
248         u8            mask;
249 };
250
/*
 * Table mapping a contiguous range of integer values to names.
 *
 * @offset:     value that corresponds to entries[0].
 * @nr_entries: number of slots in @entries.
 * @entries:    the names; value v maps to entries[v - offset].
 */
struct strarray {
        int offset;
        int nr_entries;
        const char **entries;
};

/*
 * Define "strarray__<array>" wrapping the string array @array, with
 * entries[0] corresponding to value 0.
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
        .nr_entries = ARRAY_SIZE(array), \
        .entries    = array, \
}

/*
 * Same as DEFINE_STRARRAY(), but entries[0] corresponds to value @off.
 */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
        .offset     = off, \
        .nr_entries = ARRAY_SIZE(array), \
        .entries    = array, \
}
267
268 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
269                                                 const char *intfmt,
270                                                 struct syscall_arg *arg)
271 {
272         struct strarray *sa = arg->parm;
273         int idx = arg->val - sa->offset;
274
275         if (idx < 0 || idx >= sa->nr_entries)
276                 return scnprintf(bf, size, intfmt, arg->val);
277
278         return scnprintf(bf, size, "%s", sa->entries[idx]);
279 }
280
/*
 * strarray formatter whose numeric fallback is decimal: values that have
 * no name in the table passed via arg->parm are printed with "%d".
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size, struct syscall_arg *arg)
{
        return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

/* Short name used in the syscall_fmts table. */
#define SCA_STRARRAY syscall_arg__scnprintf_strarray
288
#if defined(__i386__) || defined(__x86_64__)
/*
 * strarray formatter whose numeric fallback is hexadecimal: values that
 * have no name in the table passed via arg->parm are printed with "%#x".
 *
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size, struct syscall_arg *arg)
{
        return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

/* Short name used in the syscall_fmts table. */
#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
302
303 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
304                                         struct syscall_arg *arg);
305
306 #define SCA_FD syscall_arg__scnprintf_fd
307
308 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
309                                            struct syscall_arg *arg)
310 {
311         int fd = arg->val;
312
313         if (fd == AT_FDCWD)
314                 return scnprintf(bf, size, "CWD");
315
316         return syscall_arg__scnprintf_fd(bf, size, arg);
317 }
318
319 #define SCA_FDAT syscall_arg__scnprintf_fd_at
320
/* Formatter for the fd argument of close(); defined further down in the file. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
325
326 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
327                                          struct syscall_arg *arg)
328 {
329         return scnprintf(bf, size, "%#lx", arg->val);
330 }
331
332 #define SCA_HEX syscall_arg__scnprintf_hex
333
334 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
335                                                struct syscall_arg *arg)
336 {
337         int printed = 0, prot = arg->val;
338
339         if (prot == PROT_NONE)
340                 return scnprintf(bf, size, "NONE");
341 #define P_MMAP_PROT(n) \
342         if (prot & PROT_##n) { \
343                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
344                 prot &= ~PROT_##n; \
345         }
346
347         P_MMAP_PROT(EXEC);
348         P_MMAP_PROT(READ);
349         P_MMAP_PROT(WRITE);
350 #ifdef PROT_SEM
351         P_MMAP_PROT(SEM);
352 #endif
353         P_MMAP_PROT(GROWSDOWN);
354         P_MMAP_PROT(GROWSUP);
355 #undef P_MMAP_PROT
356
357         if (prot)
358                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
359
360         return printed;
361 }
362
363 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
364
365 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
366                                                 struct syscall_arg *arg)
367 {
368         int printed = 0, flags = arg->val;
369
370 #define P_MMAP_FLAG(n) \
371         if (flags & MAP_##n) { \
372                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
373                 flags &= ~MAP_##n; \
374         }
375
376         P_MMAP_FLAG(SHARED);
377         P_MMAP_FLAG(PRIVATE);
378 #ifdef MAP_32BIT
379         P_MMAP_FLAG(32BIT);
380 #endif
381         P_MMAP_FLAG(ANONYMOUS);
382         P_MMAP_FLAG(DENYWRITE);
383         P_MMAP_FLAG(EXECUTABLE);
384         P_MMAP_FLAG(FILE);
385         P_MMAP_FLAG(FIXED);
386         P_MMAP_FLAG(GROWSDOWN);
387 #ifdef MAP_HUGETLB
388         P_MMAP_FLAG(HUGETLB);
389 #endif
390         P_MMAP_FLAG(LOCKED);
391         P_MMAP_FLAG(NONBLOCK);
392         P_MMAP_FLAG(NORESERVE);
393         P_MMAP_FLAG(POPULATE);
394         P_MMAP_FLAG(STACK);
395 #ifdef MAP_UNINITIALIZED
396         P_MMAP_FLAG(UNINITIALIZED);
397 #endif
398 #undef P_MMAP_FLAG
399
400         if (flags)
401                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
402
403         return printed;
404 }
405
406 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
407
408 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
409                                                   struct syscall_arg *arg)
410 {
411         int printed = 0, flags = arg->val;
412
413 #define P_MREMAP_FLAG(n) \
414         if (flags & MREMAP_##n) { \
415                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
416                 flags &= ~MREMAP_##n; \
417         }
418
419         P_MREMAP_FLAG(MAYMOVE);
420 #ifdef MREMAP_FIXED
421         P_MREMAP_FLAG(FIXED);
422 #endif
423 #undef P_MREMAP_FLAG
424
425         if (flags)
426                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
427
428         return printed;
429 }
430
431 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
432
433 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
434                                                       struct syscall_arg *arg)
435 {
436         int behavior = arg->val;
437
438         switch (behavior) {
439 #define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
440         P_MADV_BHV(NORMAL);
441         P_MADV_BHV(RANDOM);
442         P_MADV_BHV(SEQUENTIAL);
443         P_MADV_BHV(WILLNEED);
444         P_MADV_BHV(DONTNEED);
445         P_MADV_BHV(REMOVE);
446         P_MADV_BHV(DONTFORK);
447         P_MADV_BHV(DOFORK);
448         P_MADV_BHV(HWPOISON);
449 #ifdef MADV_SOFT_OFFLINE
450         P_MADV_BHV(SOFT_OFFLINE);
451 #endif
452         P_MADV_BHV(MERGEABLE);
453         P_MADV_BHV(UNMERGEABLE);
454 #ifdef MADV_HUGEPAGE
455         P_MADV_BHV(HUGEPAGE);
456 #endif
457 #ifdef MADV_NOHUGEPAGE
458         P_MADV_BHV(NOHUGEPAGE);
459 #endif
460 #ifdef MADV_DONTDUMP
461         P_MADV_BHV(DONTDUMP);
462 #endif
463 #ifdef MADV_DODUMP
464         P_MADV_BHV(DODUMP);
465 #endif
466 #undef P_MADV_PHV
467         default: break;
468         }
469
470         return scnprintf(bf, size, "%#x", behavior);
471 }
472
473 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
474
475 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
476                                            struct syscall_arg *arg)
477 {
478         int printed = 0, op = arg->val;
479
480         if (op == 0)
481                 return scnprintf(bf, size, "NONE");
482 #define P_CMD(cmd) \
483         if ((op & LOCK_##cmd) == LOCK_##cmd) { \
484                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
485                 op &= ~LOCK_##cmd; \
486         }
487
488         P_CMD(SH);
489         P_CMD(EX);
490         P_CMD(NB);
491         P_CMD(UN);
492         P_CMD(MAND);
493         P_CMD(RW);
494         P_CMD(READ);
495         P_CMD(WRITE);
496 #undef P_OP
497
498         if (op)
499                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
500
501         return printed;
502 }
503
504 #define SCA_FLOCK syscall_arg__scnprintf_flock
505
506 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
507 {
508         enum syscall_futex_args {
509                 SCF_UADDR   = (1 << 0),
510                 SCF_OP      = (1 << 1),
511                 SCF_VAL     = (1 << 2),
512                 SCF_TIMEOUT = (1 << 3),
513                 SCF_UADDR2  = (1 << 4),
514                 SCF_VAL3    = (1 << 5),
515         };
516         int op = arg->val;
517         int cmd = op & FUTEX_CMD_MASK;
518         size_t printed = 0;
519
520         switch (cmd) {
521 #define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
522         P_FUTEX_OP(WAIT);           arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
523         P_FUTEX_OP(WAKE);           arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
524         P_FUTEX_OP(FD);             arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
525         P_FUTEX_OP(REQUEUE);        arg->mask |= SCF_VAL3|SCF_TIMEOUT;            break;
526         P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;                     break;
527         P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;                     break;
528         P_FUTEX_OP(WAKE_OP);                                                      break;
529         P_FUTEX_OP(LOCK_PI);        arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
530         P_FUTEX_OP(UNLOCK_PI);      arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
531         P_FUTEX_OP(TRYLOCK_PI);     arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
532         P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;                      break;
533         P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;                      break;
534         P_FUTEX_OP(WAIT_REQUEUE_PI);                                              break;
535         default: printed = scnprintf(bf, size, "%#x", cmd);                       break;
536         }
537
538         if (op & FUTEX_PRIVATE_FLAG)
539                 printed += scnprintf(bf + printed, size - printed, "|PRIV");
540
541         if (op & FUTEX_CLOCK_REALTIME)
542                 printed += scnprintf(bf + printed, size - printed, "|CLKRT");
543
544         return printed;
545 }
546
547 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
548
549 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
550 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
551
552 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
553 static DEFINE_STRARRAY(itimers);
554
555 static const char *whences[] = { "SET", "CUR", "END",
556 #ifdef SEEK_DATA
557 "DATA",
558 #endif
559 #ifdef SEEK_HOLE
560 "HOLE",
561 #endif
562 };
563 static DEFINE_STRARRAY(whences);
564
565 static const char *fcntl_cmds[] = {
566         "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
567         "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
568         "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
569         "F_GETOWNER_UIDS",
570 };
571 static DEFINE_STRARRAY(fcntl_cmds);
572
573 static const char *rlimit_resources[] = {
574         "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
575         "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
576         "RTTIME",
577 };
578 static DEFINE_STRARRAY(rlimit_resources);
579
580 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
581 static DEFINE_STRARRAY(sighow);
582
583 static const char *clockid[] = {
584         "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
585         "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE",
586 };
587 static DEFINE_STRARRAY(clockid);
588
589 static const char *socket_families[] = {
590         "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
591         "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
592         "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
593         "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
594         "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
595         "ALG", "NFC", "VSOCK",
596 };
597 static DEFINE_STRARRAY(socket_families);
598
599 #ifndef SOCK_TYPE_MASK
600 #define SOCK_TYPE_MASK 0xf
601 #endif
602
603 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
604                                                       struct syscall_arg *arg)
605 {
606         size_t printed;
607         int type = arg->val,
608             flags = type & ~SOCK_TYPE_MASK;
609
610         type &= SOCK_TYPE_MASK;
611         /*
612          * Can't use a strarray, MIPS may override for ABI reasons.
613          */
614         switch (type) {
615 #define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
616         P_SK_TYPE(STREAM);
617         P_SK_TYPE(DGRAM);
618         P_SK_TYPE(RAW);
619         P_SK_TYPE(RDM);
620         P_SK_TYPE(SEQPACKET);
621         P_SK_TYPE(DCCP);
622         P_SK_TYPE(PACKET);
623 #undef P_SK_TYPE
624         default:
625                 printed = scnprintf(bf, size, "%#x", type);
626         }
627
628 #define P_SK_FLAG(n) \
629         if (flags & SOCK_##n) { \
630                 printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
631                 flags &= ~SOCK_##n; \
632         }
633
634         P_SK_FLAG(CLOEXEC);
635         P_SK_FLAG(NONBLOCK);
636 #undef P_SK_FLAG
637
638         if (flags)
639                 printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
640
641         return printed;
642 }
643
644 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
645
646 #ifndef MSG_PROBE
647 #define MSG_PROBE            0x10
648 #endif
649 #ifndef MSG_WAITFORONE
650 #define MSG_WAITFORONE  0x10000
651 #endif
652 #ifndef MSG_SENDPAGE_NOTLAST
653 #define MSG_SENDPAGE_NOTLAST 0x20000
654 #endif
655 #ifndef MSG_FASTOPEN
656 #define MSG_FASTOPEN         0x20000000
657 #endif
658
659 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
660                                                struct syscall_arg *arg)
661 {
662         int printed = 0, flags = arg->val;
663
664         if (flags == 0)
665                 return scnprintf(bf, size, "NONE");
666 #define P_MSG_FLAG(n) \
667         if (flags & MSG_##n) { \
668                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
669                 flags &= ~MSG_##n; \
670         }
671
672         P_MSG_FLAG(OOB);
673         P_MSG_FLAG(PEEK);
674         P_MSG_FLAG(DONTROUTE);
675         P_MSG_FLAG(TRYHARD);
676         P_MSG_FLAG(CTRUNC);
677         P_MSG_FLAG(PROBE);
678         P_MSG_FLAG(TRUNC);
679         P_MSG_FLAG(DONTWAIT);
680         P_MSG_FLAG(EOR);
681         P_MSG_FLAG(WAITALL);
682         P_MSG_FLAG(FIN);
683         P_MSG_FLAG(SYN);
684         P_MSG_FLAG(CONFIRM);
685         P_MSG_FLAG(RST);
686         P_MSG_FLAG(ERRQUEUE);
687         P_MSG_FLAG(NOSIGNAL);
688         P_MSG_FLAG(MORE);
689         P_MSG_FLAG(WAITFORONE);
690         P_MSG_FLAG(SENDPAGE_NOTLAST);
691         P_MSG_FLAG(FASTOPEN);
692         P_MSG_FLAG(CMSG_CLOEXEC);
693 #undef P_MSG_FLAG
694
695         if (flags)
696                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
697
698         return printed;
699 }
700
701 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
702
703 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
704                                                  struct syscall_arg *arg)
705 {
706         size_t printed = 0;
707         int mode = arg->val;
708
709         if (mode == F_OK) /* 0 */
710                 return scnprintf(bf, size, "F");
711 #define P_MODE(n) \
712         if (mode & n##_OK) { \
713                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
714                 mode &= ~n##_OK; \
715         }
716
717         P_MODE(R);
718         P_MODE(W);
719         P_MODE(X);
720 #undef P_MODE
721
722         if (mode)
723                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
724
725         return printed;
726 }
727
728 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
729
730 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
731                                                struct syscall_arg *arg)
732 {
733         int printed = 0, flags = arg->val;
734
735         if (!(flags & O_CREAT))
736                 arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
737
738         if (flags == 0)
739                 return scnprintf(bf, size, "RDONLY");
740 #define P_FLAG(n) \
741         if (flags & O_##n) { \
742                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
743                 flags &= ~O_##n; \
744         }
745
746         P_FLAG(APPEND);
747         P_FLAG(ASYNC);
748         P_FLAG(CLOEXEC);
749         P_FLAG(CREAT);
750         P_FLAG(DIRECT);
751         P_FLAG(DIRECTORY);
752         P_FLAG(EXCL);
753         P_FLAG(LARGEFILE);
754         P_FLAG(NOATIME);
755         P_FLAG(NOCTTY);
756 #ifdef O_NONBLOCK
757         P_FLAG(NONBLOCK);
758 #elif O_NDELAY
759         P_FLAG(NDELAY);
760 #endif
761 #ifdef O_PATH
762         P_FLAG(PATH);
763 #endif
764         P_FLAG(RDWR);
765 #ifdef O_DSYNC
766         if ((flags & O_SYNC) == O_SYNC)
767                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
768         else {
769                 P_FLAG(DSYNC);
770         }
771 #else
772         P_FLAG(SYNC);
773 #endif
774         P_FLAG(TRUNC);
775         P_FLAG(WRONLY);
776 #undef P_FLAG
777
778         if (flags)
779                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
780
781         return printed;
782 }
783
784 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
785
786 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
787                                                    struct syscall_arg *arg)
788 {
789         int printed = 0, flags = arg->val;
790
791         if (flags == 0)
792                 return scnprintf(bf, size, "NONE");
793 #define P_FLAG(n) \
794         if (flags & EFD_##n) { \
795                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
796                 flags &= ~EFD_##n; \
797         }
798
799         P_FLAG(SEMAPHORE);
800         P_FLAG(CLOEXEC);
801         P_FLAG(NONBLOCK);
802 #undef P_FLAG
803
804         if (flags)
805                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
806
807         return printed;
808 }
809
810 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
811
812 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
813                                                 struct syscall_arg *arg)
814 {
815         int printed = 0, flags = arg->val;
816
817 #define P_FLAG(n) \
818         if (flags & O_##n) { \
819                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
820                 flags &= ~O_##n; \
821         }
822
823         P_FLAG(CLOEXEC);
824         P_FLAG(NONBLOCK);
825 #undef P_FLAG
826
827         if (flags)
828                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
829
830         return printed;
831 }
832
833 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
834
835 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
836 {
837         int sig = arg->val;
838
839         switch (sig) {
840 #define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
841         P_SIGNUM(HUP);
842         P_SIGNUM(INT);
843         P_SIGNUM(QUIT);
844         P_SIGNUM(ILL);
845         P_SIGNUM(TRAP);
846         P_SIGNUM(ABRT);
847         P_SIGNUM(BUS);
848         P_SIGNUM(FPE);
849         P_SIGNUM(KILL);
850         P_SIGNUM(USR1);
851         P_SIGNUM(SEGV);
852         P_SIGNUM(USR2);
853         P_SIGNUM(PIPE);
854         P_SIGNUM(ALRM);
855         P_SIGNUM(TERM);
856         P_SIGNUM(CHLD);
857         P_SIGNUM(CONT);
858         P_SIGNUM(STOP);
859         P_SIGNUM(TSTP);
860         P_SIGNUM(TTIN);
861         P_SIGNUM(TTOU);
862         P_SIGNUM(URG);
863         P_SIGNUM(XCPU);
864         P_SIGNUM(XFSZ);
865         P_SIGNUM(VTALRM);
866         P_SIGNUM(PROF);
867         P_SIGNUM(WINCH);
868         P_SIGNUM(IO);
869         P_SIGNUM(PWR);
870         P_SIGNUM(SYS);
871 #ifdef SIGEMT
872         P_SIGNUM(EMT);
873 #endif
874 #ifdef SIGSTKFLT
875         P_SIGNUM(STKFLT);
876 #endif
877 #ifdef SIGSWI
878         P_SIGNUM(SWI);
879 #endif
880         default: break;
881         }
882
883         return scnprintf(bf, size, "%#x", sig);
884 }
885
886 #define SCA_SIGNUM syscall_arg__scnprintf_signum
887
888 #if defined(__i386__) || defined(__x86_64__)
889 /*
890  * FIXME: Make this available to all arches.
891  */
892 #define TCGETS          0x5401
893
894 static const char *tioctls[] = {
895         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
896         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
897         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
898         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
899         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
900         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
901         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
902         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
903         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
904         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
905         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
906         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
907         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
908         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
909         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
910 };
911
912 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
913 #endif /* defined(__i386__) || defined(__x86_64__) */
914
/*
 * Initializer fragment for a syscall_fmts entry: format argument number
 * @arg with the strarray formatter, using the table strarray__<array>.
 * @name is not used by the expansion; it only labels the argument at the
 * point of use.
 */
#define STRARRAY(arg, name, array) \
          .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
          .arg_parm      = { [arg] = &strarray__##array, }
918
/*
 * Per-syscall pretty-printing overrides.
 *
 * syscall_fmt__find() looks entries up with bsearch() using strcmp() on
 * ->name, so the initializers below MUST stay sorted by name in strcmp()
 * order; a misplaced entry will silently never be found.
 */
919 static struct syscall_fmt {
        /* Syscall name; this is the bsearch() key. */
920         const char *name;
        /*
         * Alternative name tried for the "sys_enter_<alias>" tracepoint when
         * no tracepoint is found under ->name (see trace__read_syscall_info()).
         */
921         const char *alias;
        /*
         * Optional formatter per argument position; a NULL slot means no
         * override for that argument (see syscall__set_arg_fmts()).
         */
922         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
        /* Optional per-argument parameter, handed to the formatter as arg->parm. */
923         void       *arg_parm[6];
        /*
         * NOTE(review): errmsg/timeout/hexret are not consumed anywhere in this
         * part of the file; presumably they select how the syscall's return
         * value is rendered - confirm against the sys_exit handling code.
         */
924         bool       errmsg;
925         bool       timeout;
926         bool       hexret;
927 } syscall_fmts[] = {
928         { .name     = "access",     .errmsg = true,
929           .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
930         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
931         { .name     = "brk",        .hexret = true,
932           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
933         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
934         { .name     = "close",      .errmsg = true,
935           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
936         { .name     = "connect",    .errmsg = true, },
937         { .name     = "dup",        .errmsg = true,
938           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
939         { .name     = "dup2",       .errmsg = true,
940           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
941         { .name     = "dup3",       .errmsg = true,
942           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
943         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
944         { .name     = "eventfd2",   .errmsg = true,
945           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
946         { .name     = "faccessat",  .errmsg = true,
947           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
948         { .name     = "fadvise64",  .errmsg = true,
949           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
950         { .name     = "fallocate",  .errmsg = true,
951           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
952         { .name     = "fchdir",     .errmsg = true,
953           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
954         { .name     = "fchmod",     .errmsg = true,
955           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
956         { .name     = "fchmodat",   .errmsg = true,
957           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
958         { .name     = "fchown",     .errmsg = true,
959           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
960         { .name     = "fchownat",   .errmsg = true,
961           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
962         { .name     = "fcntl",      .errmsg = true,
963           .arg_scnprintf = { [0] = SCA_FD, /* fd */
964                              [1] = SCA_STRARRAY, /* cmd */ },
965           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
966         { .name     = "fdatasync",  .errmsg = true,
967           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
968         { .name     = "flock",      .errmsg = true,
969           .arg_scnprintf = { [0] = SCA_FD, /* fd */
970                              [1] = SCA_FLOCK, /* cmd */ }, },
971         { .name     = "fsetxattr",  .errmsg = true,
972           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
973         { .name     = "fstat",      .errmsg = true, .alias = "newfstat",
974           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
975         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
976           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
977         { .name     = "fstatfs",    .errmsg = true,
978           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
979         { .name     = "fsync",    .errmsg = true,
980           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
981         { .name     = "ftruncate", .errmsg = true,
982           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
983         { .name     = "futex",      .errmsg = true,
984           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
985         { .name     = "futimesat", .errmsg = true,
986           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
987         { .name     = "getdents",   .errmsg = true,
988           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
989         { .name     = "getdents64", .errmsg = true,
990           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
991         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
992         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
993         { .name     = "ioctl",      .errmsg = true,
994           .arg_scnprintf = { [0] = SCA_FD, /* fd */
995 #if defined(__i386__) || defined(__x86_64__)
996 /*
997  * FIXME: Make this available to all arches.
998  */
999                              [1] = SCA_STRHEXARRAY, /* cmd */
1000                              [2] = SCA_HEX, /* arg */ },
1001           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
1002 #else
1003                              [2] = SCA_HEX, /* arg */ }, },
1004 #endif
1005         { .name     = "kill",       .errmsg = true,
1006           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1007         { .name     = "linkat",     .errmsg = true,
1008           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1009         { .name     = "lseek",      .errmsg = true,
1010           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1011                              [2] = SCA_STRARRAY, /* whence */ },
1012           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
1013         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
1014         { .name     = "madvise",    .errmsg = true,
1015           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
1016                              [2] = SCA_MADV_BHV, /* behavior */ }, },
1017         { .name     = "mkdirat",    .errmsg = true,
1018           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1019         { .name     = "mknodat",    .errmsg = true,
1020           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1021         { .name     = "mlock",      .errmsg = true,
1022           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1023         { .name     = "mlockall",   .errmsg = true,
1024           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1025         { .name     = "mmap",       .hexret = true,
1026           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
1027                              [2] = SCA_MMAP_PROT, /* prot */
1028                              [3] = SCA_MMAP_FLAGS, /* flags */
1029                              [4] = SCA_FD,        /* fd */ }, },
1030         { .name     = "mprotect",   .errmsg = true,
1031           .arg_scnprintf = { [0] = SCA_HEX, /* start */
1032                              [2] = SCA_MMAP_PROT, /* prot */ }, },
1033         { .name     = "mremap",     .hexret = true,
1034           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1035                              [3] = SCA_MREMAP_FLAGS, /* flags */
1036                              [4] = SCA_HEX, /* new_addr */ }, },
1037         { .name     = "munlock",    .errmsg = true,
1038           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1039         { .name     = "munmap",     .errmsg = true,
1040           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1041         { .name     = "name_to_handle_at", .errmsg = true,
1042           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1043         { .name     = "newfstatat", .errmsg = true,
1044           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1045         { .name     = "open",       .errmsg = true,
1046           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1047         { .name     = "open_by_handle_at", .errmsg = true,
1048           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1049                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1050         { .name     = "openat",     .errmsg = true,
1051           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1052                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1053         { .name     = "pipe2",      .errmsg = true,
1054           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1055         { .name     = "poll",       .errmsg = true, .timeout = true, },
1056         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
1057         { .name     = "pread",      .errmsg = true, .alias = "pread64",
1058           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1059         { .name     = "preadv",     .errmsg = true, .alias = "pread",
1060           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1061         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1062         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64",
1063           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1064         { .name     = "pwritev",    .errmsg = true,
1065           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1066         { .name     = "read",       .errmsg = true,
1067           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1068         { .name     = "readlinkat", .errmsg = true,
1069           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1070         { .name     = "readv",      .errmsg = true,
1071           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1072         { .name     = "recvfrom",   .errmsg = true,
1073           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1074         { .name     = "recvmmsg",   .errmsg = true,
1075           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1076         { .name     = "recvmsg",    .errmsg = true,
1077           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1078         { .name     = "renameat",   .errmsg = true,
1079           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1080         { .name     = "rt_sigaction", .errmsg = true,
1081           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1082         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1083         { .name     = "rt_sigqueueinfo", .errmsg = true,
1084           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1085         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
1086           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1087         { .name     = "select",     .errmsg = true, .timeout = true, },
1088         { .name     = "sendmmsg",    .errmsg = true,
1089           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1090         { .name     = "sendmsg",    .errmsg = true,
1091           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1092         { .name     = "sendto",     .errmsg = true,
1093           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1094         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1095         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1096         { .name     = "shutdown",   .errmsg = true,
1097           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1098         { .name     = "socket",     .errmsg = true,
1099           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1100                              [1] = SCA_SK_TYPE, /* type */ },
1101           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1102         { .name     = "socketpair", .errmsg = true,
1103           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1104                              [1] = SCA_SK_TYPE, /* type */ },
1105           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1106         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
1107         { .name     = "symlinkat",  .errmsg = true,
1108           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1109         { .name     = "tgkill",     .errmsg = true,
1110           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1111         { .name     = "tkill",      .errmsg = true,
1112           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1113         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
1114         { .name     = "unlinkat",   .errmsg = true,
1115           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1116         { .name     = "utimensat",  .errmsg = true,
1117           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
1118         { .name     = "write",      .errmsg = true,
1119           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1120         { .name     = "writev",     .errmsg = true,
1121           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1122 };
1123
1124 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1125 {
1126         const struct syscall_fmt *fmt = fmtp;
1127         return strcmp(name, fmt->name);
1128 }
1129
1130 static struct syscall_fmt *syscall_fmt__find(const char *name)
1131 {
1132         const int nmemb = ARRAY_SIZE(syscall_fmts);
1133         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1134 }
1135
/*
 * What is known about one syscall; these live in trace->syscalls.table,
 * indexed by syscall id, and are filled in by trace__read_syscall_info().
 */
struct syscall {
        /* Format of the "sys_enter_<name>" (or alias) tracepoint. */
        struct event_format *tp_format;
        /* Number of argument fields, not counting a leading "nr" field. */
        int                 nr_args;
        /* First argument field of tp_format (the "nr" field is skipped). */
        struct format_field *args;
        /* Syscall name; NULL means "slot not populated yet". */
        const char          *name;
        /* Excluded by the event qualifier list. */
        bool                filtered;
        /* Set for "exit" and "exit_group". */
        bool                is_exit;
        /* Entry in syscall_fmts[], or NULL if there are no overrides. */
        struct syscall_fmt  *fmt;
        /* Per-argument formatter, nr_args slots; NULL slot means default. */
        size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        /* Per-argument formatter parameter, borrowed from fmt->arg_parm. */
        void                **arg_parm;
};
1147
1148 static size_t fprintf_duration(unsigned long t, FILE *fp)
1149 {
1150         double duration = (double)t / NSEC_PER_MSEC;
1151         size_t printed = fprintf(fp, "(");
1152
1153         if (duration >= 1.0)
1154                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1155         else if (duration >= 0.01)
1156                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1157         else
1158                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1159         return printed + fprintf(fp, "): ");
1160 }
1161
1162 struct thread_trace {
1163         u64               entry_time;
1164         u64               exit_time;
1165         bool              entry_pending;
1166         unsigned long     nr_events;
1167         unsigned long     pfmaj, pfmin;
1168         char              *entry_str;
1169         double            runtime_ms;
1170         struct {
1171                 int       max;
1172                 char      **table;
1173         } paths;
1174
1175         struct intlist *syscall_stats;
1176 };
1177
1178 static struct thread_trace *thread_trace__new(void)
1179 {
1180         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1181
1182         if (ttrace)
1183                 ttrace->paths.max = -1;
1184
1185         ttrace->syscall_stats = intlist__new(NULL);
1186
1187         return ttrace;
1188 }
1189
1190 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1191 {
1192         struct thread_trace *ttrace;
1193
1194         if (thread == NULL)
1195                 goto fail;
1196
1197         if (thread__priv(thread) == NULL)
1198                 thread__set_priv(thread, thread_trace__new());
1199
1200         if (thread__priv(thread) == NULL)
1201                 goto fail;
1202
1203         ttrace = thread__priv(thread);
1204         ++ttrace->nr_events;
1205
1206         return ttrace;
1207 fail:
1208         color_fprintf(fp, PERF_COLOR_RED,
1209                       "WARNING: not enough memory, dropping samples!\n");
1210         return NULL;
1211 }
1212
/*
 * Bit flags for major / minor page faults.
 * NOTE(review): presumably OR-ed into trace->trace_pgfaults to select which
 * fault kinds are traced; the users are outside this part of the file.
 */
1213 #define TRACE_PFMAJ             (1 << 0)
1214 #define TRACE_PFMIN             (1 << 1)
1215
1216 struct trace {
1217         struct perf_tool        tool;
1218         struct {
1219                 int             machine;
1220                 int             open_id;
1221         }                       audit;
1222         struct {
1223                 int             max;
1224                 struct syscall  *table;
1225         } syscalls;
1226         struct record_opts      opts;
1227         struct perf_evlist      *evlist;
1228         struct machine          *host;
1229         struct thread           *current;
1230         u64                     base_time;
1231         FILE                    *output;
1232         unsigned long           nr_events;
1233         struct strlist          *ev_qualifier;
1234         const char              *last_vfs_getname;
1235         struct intlist          *tid_list;
1236         struct intlist          *pid_list;
1237         struct {
1238                 size_t          nr;
1239                 pid_t           *entries;
1240         }                       filter_pids;
1241         double                  duration_filter;
1242         double                  runtime_ms;
1243         struct {
1244                 u64             vfs_getname,
1245                                 proc_getname;
1246         } stats;
1247         bool                    not_ev_qualifier;
1248         bool                    live;
1249         bool                    full_time;
1250         bool                    sched;
1251         bool                    multiple_threads;
1252         bool                    summary;
1253         bool                    summary_only;
1254         bool                    show_comm;
1255         bool                    show_tool_stats;
1256         bool                    trace_syscalls;
1257         bool                    force;
1258         int                     trace_pgfaults;
1259 };
1260
1261 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1262 {
1263         struct thread_trace *ttrace = thread__priv(thread);
1264
1265         if (fd > ttrace->paths.max) {
1266                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1267
1268                 if (npath == NULL)
1269                         return -1;
1270
1271                 if (ttrace->paths.max != -1) {
1272                         memset(npath + ttrace->paths.max + 1, 0,
1273                                (fd - ttrace->paths.max) * sizeof(char *));
1274                 } else {
1275                         memset(npath, 0, (fd + 1) * sizeof(char *));
1276                 }
1277
1278                 ttrace->paths.table = npath;
1279                 ttrace->paths.max   = fd;
1280         }
1281
1282         ttrace->paths.table[fd] = strdup(pathname);
1283
1284         return ttrace->paths.table[fd] != NULL ? 0 : -1;
1285 }
1286
1287 static int thread__read_fd_path(struct thread *thread, int fd)
1288 {
1289         char linkname[PATH_MAX], pathname[PATH_MAX];
1290         struct stat st;
1291         int ret;
1292
1293         if (thread->pid_ == thread->tid) {
1294                 scnprintf(linkname, sizeof(linkname),
1295                           "/proc/%d/fd/%d", thread->pid_, fd);
1296         } else {
1297                 scnprintf(linkname, sizeof(linkname),
1298                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1299         }
1300
1301         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1302                 return -1;
1303
1304         ret = readlink(linkname, pathname, sizeof(pathname));
1305
1306         if (ret < 0 || ret > st.st_size)
1307                 return -1;
1308
1309         pathname[ret] = '\0';
1310         return trace__set_fd_pathname(thread, fd, pathname);
1311 }
1312
1313 static const char *thread__fd_path(struct thread *thread, int fd,
1314                                    struct trace *trace)
1315 {
1316         struct thread_trace *ttrace = thread__priv(thread);
1317
1318         if (ttrace == NULL)
1319                 return NULL;
1320
1321         if (fd < 0)
1322                 return NULL;
1323
1324         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1325                 if (!trace->live)
1326                         return NULL;
1327                 ++trace->stats.proc_getname;
1328                 if (thread__read_fd_path(thread, fd))
1329                         return NULL;
1330         }
1331
1332         return ttrace->paths.table[fd];
1333 }
1334
1335 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1336                                         struct syscall_arg *arg)
1337 {
1338         int fd = arg->val;
1339         size_t printed = scnprintf(bf, size, "%d", fd);
1340         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1341
1342         if (path)
1343                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1344
1345         return printed;
1346 }
1347
1348 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1349                                               struct syscall_arg *arg)
1350 {
1351         int fd = arg->val;
1352         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1353         struct thread_trace *ttrace = thread__priv(arg->thread);
1354
1355         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1356                 zfree(&ttrace->paths.table[fd]);
1357
1358         return printed;
1359 }
1360
1361 static bool trace__filter_duration(struct trace *trace, double t)
1362 {
1363         return t < (trace->duration_filter * NSEC_PER_MSEC);
1364 }
1365
1366 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1367 {
1368         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1369
1370         return fprintf(fp, "%10.3f ", ts);
1371 }
1372
/*
 * Flags shared between the signal handler and the main loop.
 *
 * They are written from signal context, so they are volatile sig_atomic_t
 * rather than plain bool: that is the only object type the C standard
 * guarantees can be safely stored to from an asynchronous signal handler.
 * Readers keep using them as truth values.
 */
static volatile sig_atomic_t done = 0;
static volatile sig_atomic_t interrupted = 0;

/*
 * Signal handler: request termination and record whether the request came
 * from SIGINT (as opposed to any other signal this handler is installed for).
 */
static void sig_handler(int sig)
{
        done = 1;
        interrupted = (sig == SIGINT);
}
1381
1382 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1383                                         u64 duration, u64 tstamp, FILE *fp)
1384 {
1385         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1386         printed += fprintf_duration(duration, fp);
1387
1388         if (trace->multiple_threads) {
1389                 if (trace->show_comm)
1390                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1391                 printed += fprintf(fp, "%d ", thread->tid);
1392         }
1393
1394         return printed;
1395 }
1396
1397 static int trace__process_event(struct trace *trace, struct machine *machine,
1398                                 union perf_event *event, struct perf_sample *sample)
1399 {
1400         int ret = 0;
1401
1402         switch (event->header.type) {
1403         case PERF_RECORD_LOST:
1404                 color_fprintf(trace->output, PERF_COLOR_RED,
1405                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1406                 ret = machine__process_lost_event(machine, event, sample);
1407         default:
1408                 ret = machine__process_event(machine, event, sample);
1409                 break;
1410         }
1411
1412         return ret;
1413 }
1414
1415 static int trace__tool_process(struct perf_tool *tool,
1416                                union perf_event *event,
1417                                struct perf_sample *sample,
1418                                struct machine *machine)
1419 {
1420         struct trace *trace = container_of(tool, struct trace, tool);
1421         return trace__process_event(trace, machine, event, sample);
1422 }
1423
1424 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1425 {
1426         int err = symbol__init(NULL);
1427
1428         if (err)
1429                 return err;
1430
1431         trace->host = machine__new_host();
1432         if (trace->host == NULL)
1433                 return -ENOMEM;
1434
1435         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1436                                             evlist->threads, trace__tool_process, false);
1437         if (err)
1438                 symbol__exit();
1439
1440         return err;
1441 }
1442
1443 static int syscall__set_arg_fmts(struct syscall *sc)
1444 {
1445         struct format_field *field;
1446         int idx = 0;
1447
1448         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1449         if (sc->arg_scnprintf == NULL)
1450                 return -1;
1451
1452         if (sc->fmt)
1453                 sc->arg_parm = sc->fmt->arg_parm;
1454
1455         for (field = sc->args; field; field = field->next) {
1456                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1457                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1458                 else if (field->flags & FIELD_IS_POINTER)
1459                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1460                 ++idx;
1461         }
1462
1463         return 0;
1464 }
1465
1466 static int trace__read_syscall_info(struct trace *trace, int id)
1467 {
1468         char tp_name[128];
1469         struct syscall *sc;
1470         const char *name = audit_syscall_to_name(id, trace->audit.machine);
1471
1472         if (name == NULL)
1473                 return -1;
1474
1475         if (id > trace->syscalls.max) {
1476                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1477
1478                 if (nsyscalls == NULL)
1479                         return -1;
1480
1481                 if (trace->syscalls.max != -1) {
1482                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1483                                (id - trace->syscalls.max) * sizeof(*sc));
1484                 } else {
1485                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1486                 }
1487
1488                 trace->syscalls.table = nsyscalls;
1489                 trace->syscalls.max   = id;
1490         }
1491
1492         sc = trace->syscalls.table + id;
1493         sc->name = name;
1494
1495         if (trace->ev_qualifier) {
1496                 bool in = strlist__find(trace->ev_qualifier, name) != NULL;
1497
1498                 if (!(in ^ trace->not_ev_qualifier)) {
1499                         sc->filtered = true;
1500                         /*
1501                          * No need to do read tracepoint information since this will be
1502                          * filtered out.
1503                          */
1504                         return 0;
1505                 }
1506         }
1507
1508         sc->fmt  = syscall_fmt__find(sc->name);
1509
1510         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1511         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1512
1513         if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
1514                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1515                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1516         }
1517
1518         if (sc->tp_format == NULL)
1519                 return -1;
1520
1521         sc->args = sc->tp_format->format.fields;
1522         sc->nr_args = sc->tp_format->format.nr_fields;
1523         /* drop nr field - not relevant here; does not exist on older kernels */
1524         if (sc->args && strcmp(sc->args->name, "nr") == 0) {
1525                 sc->args = sc->args->next;
1526                 --sc->nr_args;
1527         }
1528
1529         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1530
1531         return syscall__set_arg_fmts(sc);
1532 }
1533
1534 /*
1535  * args is to be interpreted as a series of longs but we need to handle
1536  * 8-byte unaligned accesses. args points to raw_data within the event
1537  * and raw_data is guaranteed to be 8-byte unaligned because it is
1538  * preceded by raw_size which is a u32. So we need to copy args to a temp
1539  * variable to read it. Most notably this avoids extended load instructions
1540  * on unaligned addresses
1541  */
1542
1543 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1544                                       unsigned char *args, struct trace *trace,
1545                                       struct thread *thread)
1546 {
1547         size_t printed = 0;
1548         unsigned char *p;
1549         unsigned long val;
1550
1551         if (sc->args != NULL) {
1552                 struct format_field *field;
1553                 u8 bit = 1;
1554                 struct syscall_arg arg = {
1555                         .idx    = 0,
1556                         .mask   = 0,
1557                         .trace  = trace,
1558                         .thread = thread,
1559                 };
1560
1561                 for (field = sc->args; field;
1562                      field = field->next, ++arg.idx, bit <<= 1) {
1563                         if (arg.mask & bit)
1564                                 continue;
1565
1566                         /* special care for unaligned accesses */
1567                         p = args + sizeof(unsigned long) * arg.idx;
1568                         memcpy(&val, p, sizeof(val));
1569
1570                         /*
1571                          * Suppress this argument if its value is zero and
1572                          * and we don't have a string associated in an
1573                          * strarray for it.
1574                          */
1575                         if (val == 0 &&
1576                             !(sc->arg_scnprintf &&
1577                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1578                               sc->arg_parm[arg.idx]))
1579                                 continue;
1580
1581                         printed += scnprintf(bf + printed, size - printed,
1582                                              "%s%s: ", printed ? ", " : "", field->name);
1583                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1584                                 arg.val = val;
1585                                 if (sc->arg_parm)
1586                                         arg.parm = sc->arg_parm[arg.idx];
1587                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1588                                                                       size - printed, &arg);
1589                         } else {
1590                                 printed += scnprintf(bf + printed, size - printed,
1591                                                      "%ld", val);
1592                         }
1593                 }
1594         } else {
1595                 int i = 0;
1596
1597                 while (i < 6) {
1598                         /* special care for unaligned accesses */
1599                         p = args + sizeof(unsigned long) * i;
1600                         memcpy(&val, p, sizeof(val));
1601                         printed += scnprintf(bf + printed, size - printed,
1602                                              "%sarg%d: %ld",
1603                                              printed ? ", " : "", i, val);
1604                         ++i;
1605                 }
1606         }
1607
1608         return printed;
1609 }
1610
/*
 * Function-pointer type for a per-tracepoint handler: it receives the trace
 * state, the evsel the sample belongs to, the raw event and the parsed
 * sample, and returns an int.
 * NOTE(review): the meaning of the return value is set by the callers,
 * which are outside this part of the file.
 */
1611 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1612                                   union perf_event *event,
1613                                   struct perf_sample *sample);
1614
1615 static struct syscall *trace__syscall_info(struct trace *trace,
1616                                            struct perf_evsel *evsel, int id)
1617 {
1618
1619         if (id < 0) {
1620
1621                 /*
1622                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1623                  * before that, leaving at a higher verbosity level till that is
1624                  * explained. Reproduced with plain ftrace with:
1625                  *
1626                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1627                  * grep "NR -1 " /t/trace_pipe
1628                  *
1629                  * After generating some load on the machine.
1630                  */
1631                 if (verbose > 1) {
1632                         static u64 n;
1633                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1634                                 id, perf_evsel__name(evsel), ++n);
1635                 }
1636                 return NULL;
1637         }
1638
1639         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1640             trace__read_syscall_info(trace, id))
1641                 goto out_cant_read;
1642
1643         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1644                 goto out_cant_read;
1645
1646         return &trace->syscalls.table[id];
1647
1648 out_cant_read:
1649         if (verbose) {
1650                 fprintf(trace->output, "Problems reading syscall %d", id);
1651                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1652                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1653                 fputs(" information\n", trace->output);
1654         }
1655         return NULL;
1656 }
1657
1658 static void thread__update_stats(struct thread_trace *ttrace,
1659                                  int id, struct perf_sample *sample)
1660 {
1661         struct int_node *inode;
1662         struct stats *stats;
1663         u64 duration = 0;
1664
1665         inode = intlist__findnew(ttrace->syscall_stats, id);
1666         if (inode == NULL)
1667                 return;
1668
1669         stats = inode->priv;
1670         if (stats == NULL) {
1671                 stats = malloc(sizeof(struct stats));
1672                 if (stats == NULL)
1673                         return;
1674                 init_stats(stats);
1675                 inode->priv = stats;
1676         }
1677
1678         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1679                 duration = sample->time - ttrace->entry_time;
1680
1681         update_stats(stats, duration);
1682 }
1683
1684 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1685 {
1686         struct thread_trace *ttrace;
1687         u64 duration;
1688         size_t printed;
1689
1690         if (trace->current == NULL)
1691                 return 0;
1692
1693         ttrace = thread__priv(trace->current);
1694
1695         if (!ttrace->entry_pending)
1696                 return 0;
1697
1698         duration = sample->time - ttrace->entry_time;
1699
1700         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1701         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1702         ttrace->entry_pending = false;
1703
1704         return printed;
1705 }
1706
1707 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1708                             union perf_event *event __maybe_unused,
1709                             struct perf_sample *sample)
1710 {
1711         char *msg;
1712         void *args;
1713         size_t printed = 0;
1714         struct thread *thread;
1715         int id = perf_evsel__sc_tp_uint(evsel, id, sample);
1716         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1717         struct thread_trace *ttrace;
1718
1719         if (sc == NULL)
1720                 return -1;
1721
1722         if (sc->filtered)
1723                 return 0;
1724
1725         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1726         ttrace = thread__trace(thread, trace->output);
1727         if (ttrace == NULL)
1728                 return -1;
1729
1730         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1731
1732         if (ttrace->entry_str == NULL) {
1733                 ttrace->entry_str = malloc(1024);
1734                 if (!ttrace->entry_str)
1735                         return -1;
1736         }
1737
1738         if (!trace->summary_only)
1739                 trace__printf_interrupted_entry(trace, sample);
1740
1741         ttrace->entry_time = sample->time;
1742         msg = ttrace->entry_str;
1743         printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
1744
1745         printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
1746                                            args, trace, thread);
1747
1748         if (sc->is_exit) {
1749                 if (!trace->duration_filter && !trace->summary_only) {
1750                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1751                         fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1752                 }
1753         } else
1754                 ttrace->entry_pending = true;
1755
1756         if (trace->current != thread) {
1757                 thread__put(trace->current);
1758                 trace->current = thread__get(thread);
1759         }
1760
1761         return 0;
1762 }
1763
1764 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1765                            union perf_event *event __maybe_unused,
1766                            struct perf_sample *sample)
1767 {
1768         long ret;
1769         u64 duration = 0;
1770         struct thread *thread;
1771         int id = perf_evsel__sc_tp_uint(evsel, id, sample);
1772         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1773         struct thread_trace *ttrace;
1774
1775         if (sc == NULL)
1776                 return -1;
1777
1778         if (sc->filtered)
1779                 return 0;
1780
1781         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1782         ttrace = thread__trace(thread, trace->output);
1783         if (ttrace == NULL)
1784                 return -1;
1785
1786         if (trace->summary)
1787                 thread__update_stats(ttrace, id, sample);
1788
1789         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1790
1791         if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
1792                 trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
1793                 trace->last_vfs_getname = NULL;
1794                 ++trace->stats.vfs_getname;
1795         }
1796
1797         ttrace->exit_time = sample->time;
1798
1799         if (ttrace->entry_time) {
1800                 duration = sample->time - ttrace->entry_time;
1801                 if (trace__filter_duration(trace, duration))
1802                         goto out;
1803         } else if (trace->duration_filter)
1804                 goto out;
1805
1806         if (trace->summary_only)
1807                 goto out;
1808
1809         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1810
1811         if (ttrace->entry_pending) {
1812                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1813         } else {
1814                 fprintf(trace->output, " ... [");
1815                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1816                 fprintf(trace->output, "]: %s()", sc->name);
1817         }
1818
1819         if (sc->fmt == NULL) {
1820 signed_print:
1821                 fprintf(trace->output, ") = %ld", ret);
1822         } else if (ret < 0 && sc->fmt->errmsg) {
1823                 char bf[STRERR_BUFSIZE];
1824                 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1825                            *e = audit_errno_to_name(-ret);
1826
1827                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1828         } else if (ret == 0 && sc->fmt->timeout)
1829                 fprintf(trace->output, ") = 0 Timeout");
1830         else if (sc->fmt->hexret)
1831                 fprintf(trace->output, ") = %#lx", ret);
1832         else
1833                 goto signed_print;
1834
1835         fputc('\n', trace->output);
1836 out:
1837         ttrace->entry_pending = false;
1838
1839         return 0;
1840 }
1841
1842 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1843                               union perf_event *event __maybe_unused,
1844                               struct perf_sample *sample)
1845 {
1846         trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1847         return 0;
1848 }
1849
1850 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1851                                      union perf_event *event __maybe_unused,
1852                                      struct perf_sample *sample)
1853 {
1854         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1855         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1856         struct thread *thread = machine__findnew_thread(trace->host,
1857                                                         sample->pid,
1858                                                         sample->tid);
1859         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1860
1861         if (ttrace == NULL)
1862                 goto out_dump;
1863
1864         ttrace->runtime_ms += runtime_ms;
1865         trace->runtime_ms += runtime_ms;
1866         return 0;
1867
1868 out_dump:
1869         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1870                evsel->name,
1871                perf_evsel__strval(evsel, sample, "comm"),
1872                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1873                runtime,
1874                perf_evsel__intval(evsel, sample, "vruntime"));
1875         return 0;
1876 }
1877
1878 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1879                                 union perf_event *event __maybe_unused,
1880                                 struct perf_sample *sample)
1881 {
1882         trace__printf_interrupted_entry(trace, sample);
1883         trace__fprintf_tstamp(trace, sample->time, trace->output);
1884
1885         if (trace->trace_syscalls)
1886                 fprintf(trace->output, "(         ): ");
1887
1888         fprintf(trace->output, "%s:", evsel->name);
1889
1890         if (evsel->tp_format) {
1891                 event_format__fprintf(evsel->tp_format, sample->cpu,
1892                                       sample->raw_data, sample->raw_size,
1893                                       trace->output);
1894         }
1895
1896         fprintf(trace->output, ")\n");
1897         return 0;
1898 }
1899
1900 static void print_location(FILE *f, struct perf_sample *sample,
1901                            struct addr_location *al,
1902                            bool print_dso, bool print_sym)
1903 {
1904
1905         if ((verbose || print_dso) && al->map)
1906                 fprintf(f, "%s@", al->map->dso->long_name);
1907
1908         if ((verbose || print_sym) && al->sym)
1909                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1910                         al->addr - al->sym->start);
1911         else if (al->map)
1912                 fprintf(f, "0x%" PRIx64, al->addr);
1913         else
1914                 fprintf(f, "0x%" PRIx64, sample->addr);
1915 }
1916
1917 static int trace__pgfault(struct trace *trace,
1918                           struct perf_evsel *evsel,
1919                           union perf_event *event,
1920                           struct perf_sample *sample)
1921 {
1922         struct thread *thread;
1923         u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
1924         struct addr_location al;
1925         char map_type = 'd';
1926         struct thread_trace *ttrace;
1927
1928         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1929         ttrace = thread__trace(thread, trace->output);
1930         if (ttrace == NULL)
1931                 return -1;
1932
1933         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1934                 ttrace->pfmaj++;
1935         else
1936                 ttrace->pfmin++;
1937
1938         if (trace->summary_only)
1939                 return 0;
1940
1941         thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
1942                               sample->ip, &al);
1943
1944         trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1945
1946         fprintf(trace->output, "%sfault [",
1947                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1948                 "maj" : "min");
1949
1950         print_location(trace->output, sample, &al, false, true);
1951
1952         fprintf(trace->output, "] => ");
1953
1954         thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
1955                                    sample->addr, &al);
1956
1957         if (!al.map) {
1958                 thread__find_addr_location(thread, cpumode,
1959                                            MAP__FUNCTION, sample->addr, &al);
1960
1961                 if (al.map)
1962                         map_type = 'x';
1963                 else
1964                         map_type = '?';
1965         }
1966
1967         print_location(trace->output, sample, &al, true, false);
1968
1969         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1970
1971         return 0;
1972 }
1973
1974 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1975 {
1976         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1977             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1978                 return false;
1979
1980         if (trace->pid_list || trace->tid_list)
1981                 return true;
1982
1983         return false;
1984 }
1985
1986 static int trace__process_sample(struct perf_tool *tool,
1987                                  union perf_event *event,
1988                                  struct perf_sample *sample,
1989                                  struct perf_evsel *evsel,
1990                                  struct machine *machine __maybe_unused)
1991 {
1992         struct trace *trace = container_of(tool, struct trace, tool);
1993         int err = 0;
1994
1995         tracepoint_handler handler = evsel->handler;
1996
1997         if (skip_sample(trace, sample))
1998                 return 0;
1999
2000         if (!trace->full_time && trace->base_time == 0)
2001                 trace->base_time = sample->time;
2002
2003         if (handler) {
2004                 ++trace->nr_events;
2005                 handler(trace, evsel, event, sample);
2006         }
2007
2008         return err;
2009 }
2010
2011 static int parse_target_str(struct trace *trace)
2012 {
2013         if (trace->opts.target.pid) {
2014                 trace->pid_list = intlist__new(trace->opts.target.pid);
2015                 if (trace->pid_list == NULL) {
2016                         pr_err("Error parsing process id string\n");
2017                         return -EINVAL;
2018                 }
2019         }
2020
2021         if (trace->opts.target.tid) {
2022                 trace->tid_list = intlist__new(trace->opts.target.tid);
2023                 if (trace->tid_list == NULL) {
2024                         pr_err("Error parsing thread id string\n");
2025                         return -EINVAL;
2026                 }
2027         }
2028
2029         return 0;
2030 }
2031
2032 static int trace__record(struct trace *trace, int argc, const char **argv)
2033 {
2034         unsigned int rec_argc, i, j;
2035         const char **rec_argv;
2036         const char * const record_args[] = {
2037                 "record",
2038                 "-R",
2039                 "-m", "1024",
2040                 "-c", "1",
2041         };
2042
2043         const char * const sc_args[] = { "-e", };
2044         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2045         const char * const majpf_args[] = { "-e", "major-faults" };
2046         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2047         const char * const minpf_args[] = { "-e", "minor-faults" };
2048         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2049
2050         /* +1 is for the event string below */
2051         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2052                 majpf_args_nr + minpf_args_nr + argc;
2053         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2054
2055         if (rec_argv == NULL)
2056                 return -ENOMEM;
2057
2058         j = 0;
2059         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2060                 rec_argv[j++] = record_args[i];
2061
2062         if (trace->trace_syscalls) {
2063                 for (i = 0; i < sc_args_nr; i++)
2064                         rec_argv[j++] = sc_args[i];
2065
2066                 /* event string may be different for older kernels - e.g., RHEL6 */
2067                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2068                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2069                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2070                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2071                 else {
2072                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2073                         return -1;
2074                 }
2075         }
2076
2077         if (trace->trace_pgfaults & TRACE_PFMAJ)
2078                 for (i = 0; i < majpf_args_nr; i++)
2079                         rec_argv[j++] = majpf_args[i];
2080
2081         if (trace->trace_pgfaults & TRACE_PFMIN)
2082                 for (i = 0; i < minpf_args_nr; i++)
2083                         rec_argv[j++] = minpf_args[i];
2084
2085         for (i = 0; i < (unsigned int)argc; i++)
2086                 rec_argv[j++] = argv[i];
2087
2088         return cmd_record(j, rec_argv, NULL);
2089 }
2090
2091 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2092
2093 static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2094 {
2095         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2096         if (evsel == NULL)
2097                 return;
2098
2099         if (perf_evsel__field(evsel, "pathname") == NULL) {
2100                 perf_evsel__delete(evsel);
2101                 return;
2102         }
2103
2104         evsel->handler = trace__vfs_getname;
2105         perf_evlist__add(evlist, evsel);
2106 }
2107
2108 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2109                                     u64 config)
2110 {
2111         struct perf_evsel *evsel;
2112         struct perf_event_attr attr = {
2113                 .type = PERF_TYPE_SOFTWARE,
2114                 .mmap_data = 1,
2115         };
2116
2117         attr.config = config;
2118         attr.sample_period = 1;
2119
2120         event_attr_init(&attr);
2121
2122         evsel = perf_evsel__new(&attr);
2123         if (!evsel)
2124                 return -ENOMEM;
2125
2126         evsel->handler = trace__pgfault;
2127         perf_evlist__add(evlist, evsel);
2128
2129         return 0;
2130 }
2131
2132 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2133 {
2134         const u32 type = event->header.type;
2135         struct perf_evsel *evsel;
2136
2137         if (!trace->full_time && trace->base_time == 0)
2138                 trace->base_time = sample->time;
2139
2140         if (type != PERF_RECORD_SAMPLE) {
2141                 trace__process_event(trace, trace->host, event, sample);
2142                 return;
2143         }
2144
2145         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2146         if (evsel == NULL) {
2147                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2148                 return;
2149         }
2150
2151         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2152             sample->raw_data == NULL) {
2153                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2154                        perf_evsel__name(evsel), sample->tid,
2155                        sample->cpu, sample->raw_size);
2156         } else {
2157                 tracepoint_handler handler = evsel->handler;
2158                 handler(trace, evsel, event, sample);
2159         }
2160 }
2161
2162 static int trace__run(struct trace *trace, int argc, const char **argv)
2163 {
2164         struct perf_evlist *evlist = trace->evlist;
2165         int err = -1, i;
2166         unsigned long before;
2167         const bool forks = argc > 0;
2168         bool draining = false;
2169
2170         trace->live = true;
2171
2172         if (trace->trace_syscalls &&
2173             perf_evlist__add_syscall_newtp(evlist, trace__sys_enter,
2174                                            trace__sys_exit))
2175                 goto out_error_raw_syscalls;
2176
2177         if (trace->trace_syscalls)
2178                 perf_evlist__add_vfs_getname(evlist);
2179
2180         if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2181             perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2182                 goto out_error_mem;
2183         }
2184
2185         if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2186             perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2187                 goto out_error_mem;
2188
2189         if (trace->sched &&
2190             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2191                                    trace__sched_stat_runtime))
2192                 goto out_error_sched_stat_runtime;
2193
2194         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2195         if (err < 0) {
2196                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2197                 goto out_delete_evlist;
2198         }
2199
2200         err = trace__symbols_init(trace, evlist);
2201         if (err < 0) {
2202                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2203                 goto out_delete_evlist;
2204         }
2205
2206         perf_evlist__config(evlist, &trace->opts);
2207
2208         signal(SIGCHLD, sig_handler);
2209         signal(SIGINT, sig_handler);
2210
2211         if (forks) {
2212                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2213                                                     argv, false, NULL);
2214                 if (err < 0) {
2215                         fprintf(trace->output, "Couldn't run the workload!\n");
2216                         goto out_delete_evlist;
2217                 }
2218         }
2219
2220         err = perf_evlist__open(evlist);
2221         if (err < 0)
2222                 goto out_error_open;
2223
2224         /*
2225          * Better not use !target__has_task() here because we need to cover the
2226          * case where no threads were specified in the command line, but a
2227          * workload was, and in that case we will fill in the thread_map when
2228          * we fork the workload in perf_evlist__prepare_workload.
2229          */
2230         if (trace->filter_pids.nr > 0)
2231                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2232         else if (evlist->threads->map[0] == -1)
2233                 err = perf_evlist__set_filter_pid(evlist, getpid());
2234
2235         if (err < 0) {
2236                 printf("err=%d,%s\n", -err, strerror(-err));
2237                 exit(1);
2238         }
2239
2240         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2241         if (err < 0)
2242                 goto out_error_mmap;
2243
2244         if (!target__none(&trace->opts.target))
2245                 perf_evlist__enable(evlist);
2246
2247         if (forks)
2248                 perf_evlist__start_workload(evlist);
2249
2250         trace->multiple_threads = evlist->threads->map[0] == -1 ||
2251                                   evlist->threads->nr > 1 ||
2252                                   perf_evlist__first(evlist)->attr.inherit;
2253 again:
2254         before = trace->nr_events;
2255
2256         for (i = 0; i < evlist->nr_mmaps; i++) {
2257                 union perf_event *event;
2258
2259                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2260                         struct perf_sample sample;
2261
2262                         ++trace->nr_events;
2263
2264                         err = perf_evlist__parse_sample(evlist, event, &sample);
2265                         if (err) {
2266                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2267                                 goto next_event;
2268                         }
2269
2270                         trace__handle_event(trace, event, &sample);
2271 next_event:
2272                         perf_evlist__mmap_consume(evlist, i);
2273
2274                         if (interrupted)
2275                                 goto out_disable;
2276
2277                         if (done && !draining) {
2278                                 perf_evlist__disable(evlist);
2279                                 draining = true;
2280                         }
2281                 }
2282         }
2283
2284         if (trace->nr_events == before) {
2285                 int timeout = done ? 100 : -1;
2286
2287                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2288                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2289                                 draining = true;
2290
2291                         goto again;
2292                 }
2293         } else {
2294                 goto again;
2295         }
2296
2297 out_disable:
2298         thread__zput(trace->current);
2299
2300         perf_evlist__disable(evlist);
2301
2302         if (!err) {
2303                 if (trace->summary)
2304                         trace__fprintf_thread_summary(trace, trace->output);
2305
2306                 if (trace->show_tool_stats) {
2307                         fprintf(trace->output, "Stats:\n "
2308                                                " vfs_getname : %" PRIu64 "\n"
2309                                                " proc_getname: %" PRIu64 "\n",
2310                                 trace->stats.vfs_getname,
2311                                 trace->stats.proc_getname);
2312                 }
2313         }
2314
2315 out_delete_evlist:
2316         perf_evlist__delete(evlist);
2317         trace->evlist = NULL;
2318         trace->live = false;
2319         return err;
2320 {
2321         char errbuf[BUFSIZ];
2322
2323 out_error_sched_stat_runtime:
2324         debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2325         goto out_error;
2326
2327 out_error_raw_syscalls:
2328         debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2329         goto out_error;
2330
2331 out_error_mmap:
2332         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2333         goto out_error;
2334
2335 out_error_open:
2336         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2337
2338 out_error:
2339         fprintf(trace->output, "%s\n", errbuf);
2340         goto out_delete_evlist;
2341 }
2342 out_error_mem:
2343         fprintf(trace->output, "Not enough memory to run!\n");
2344         goto out_delete_evlist;
2345 }
2346
2347 static int trace__replay(struct trace *trace)
2348 {
2349         const struct perf_evsel_str_handler handlers[] = {
2350                 { "probe:vfs_getname",       trace__vfs_getname, },
2351         };
2352         struct perf_data_file file = {
2353                 .path  = input_name,
2354                 .mode  = PERF_DATA_MODE_READ,
2355                 .force = trace->force,
2356         };
2357         struct perf_session *session;
2358         struct perf_evsel *evsel;
2359         int err = -1;
2360
2361         trace->tool.sample        = trace__process_sample;
2362         trace->tool.mmap          = perf_event__process_mmap;
2363         trace->tool.mmap2         = perf_event__process_mmap2;
2364         trace->tool.comm          = perf_event__process_comm;
2365         trace->tool.exit          = perf_event__process_exit;
2366         trace->tool.fork          = perf_event__process_fork;
2367         trace->tool.attr          = perf_event__process_attr;
2368         trace->tool.tracing_data = perf_event__process_tracing_data;
2369         trace->tool.build_id      = perf_event__process_build_id;
2370
2371         trace->tool.ordered_events = true;
2372         trace->tool.ordering_requires_timestamps = true;
2373
2374         /* add tid to output */
2375         trace->multiple_threads = true;
2376
2377         session = perf_session__new(&file, false, &trace->tool);
2378         if (session == NULL)
2379                 return -1;
2380
2381         if (symbol__init(&session->header.env) < 0)
2382                 goto out;
2383
2384         trace->host = &session->machines.host;
2385
2386         err = perf_session__set_tracepoints_handlers(session, handlers);
2387         if (err)
2388                 goto out;
2389
2390         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2391                                                      "raw_syscalls:sys_enter");
2392         /* older kernels have syscalls tp versus raw_syscalls */
2393         if (evsel == NULL)
2394                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2395                                                              "syscalls:sys_enter");
2396
2397         if (evsel &&
2398             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2399             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2400                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2401                 goto out;
2402         }
2403
2404         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2405                                                      "raw_syscalls:sys_exit");
2406         if (evsel == NULL)
2407                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2408                                                              "syscalls:sys_exit");
2409         if (evsel &&
2410             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2411             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2412                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2413                 goto out;
2414         }
2415
2416         evlist__for_each(session->evlist, evsel) {
2417                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2418                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2419                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2420                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2421                         evsel->handler = trace__pgfault;
2422         }
2423
2424         err = parse_target_str(trace);
2425         if (err != 0)
2426                 goto out;
2427
2428         setup_pager();
2429
2430         err = perf_session__process_events(session);
2431         if (err)
2432                 pr_err("Failed to process events, error %d", err);
2433
2434         else if (trace->summary)
2435                 trace__fprintf_thread_summary(trace, trace->output);
2436
2437 out:
2438         perf_session__delete(session);
2439
2440         return err;
2441 }
2442
/*
 * Write the heading of the per-thread summary to 'fp'.
 * Returns what fprintf() returned, converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
        return fprintf(fp, "\n Summary of events:\n\n");
}
2451
2452 static size_t thread__dump_stats(struct thread_trace *ttrace,
2453                                  struct trace *trace, FILE *fp)
2454 {
2455         struct stats *stats;
2456         size_t printed = 0;
2457         struct syscall *sc;
2458         struct int_node *inode = intlist__first(ttrace->syscall_stats);
2459
2460         if (inode == NULL)
2461                 return 0;
2462
2463         printed += fprintf(fp, "\n");
2464
2465         printed += fprintf(fp, "   syscall            calls      min       avg       max      stddev\n");
2466         printed += fprintf(fp, "                               (msec)    (msec)    (msec)        (%%)\n");
2467         printed += fprintf(fp, "   --------------- -------- --------- --------- ---------     ------\n");
2468
2469         /* each int_node is a syscall */
2470         while (inode) {
2471                 stats = inode->priv;
2472                 if (stats) {
2473                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2474                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2475                         double avg = avg_stats(stats);
2476                         double pct;
2477                         u64 n = (u64) stats->n;
2478
2479                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2480                         avg /= NSEC_PER_MSEC;
2481
2482                         sc = &trace->syscalls.table[inode->i];
2483                         printed += fprintf(fp, "   %-15s", sc->name);
2484                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2485                                            n, min, avg);
2486                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2487                 }
2488
2489                 inode = intlist__next(inode);
2490         }
2491
2492         printed += fprintf(fp, "\n\n");
2493
2494         return printed;
2495 }
2496
/*
 * Context handed to the per-thread summary callback through its
 * void *priv argument.
 */
struct summary_data {
	FILE *fp;		/* stream the summary is written to */
	struct trace *trace;	/* trace state (event totals, syscall table) */
	size_t printed;		/* running sum of fprintf() return values */
};
2503
2504 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2505 {
2506         struct summary_data *data = priv;
2507         FILE *fp = data->fp;
2508         size_t printed = data->printed;
2509         struct trace *trace = data->trace;
2510         struct thread_trace *ttrace = thread__priv(thread);
2511         double ratio;
2512
2513         if (ttrace == NULL)
2514                 return 0;
2515
2516         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2517
2518         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2519         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2520         printed += fprintf(fp, "%.1f%%", ratio);
2521         if (ttrace->pfmaj)
2522                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2523         if (ttrace->pfmin)
2524                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2525         printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2526         printed += thread__dump_stats(ttrace, trace, fp);
2527
2528         data->printed += printed;
2529
2530         return 0;
2531 }
2532
2533 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2534 {
2535         struct summary_data data = {
2536                 .fp = fp,
2537                 .trace = trace
2538         };
2539         data.printed = trace__fprintf_threads_header(fp);
2540
2541         machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2542
2543         return data.printed;
2544 }
2545
2546 static int trace__set_duration(const struct option *opt, const char *str,
2547                                int unset __maybe_unused)
2548 {
2549         struct trace *trace = opt->value;
2550
2551         trace->duration_filter = atof(str);
2552         return 0;
2553 }
2554
2555 static int trace__set_filter_pids(const struct option *opt, const char *str,
2556                                   int unset __maybe_unused)
2557 {
2558         int ret = -1;
2559         size_t i;
2560         struct trace *trace = opt->value;
2561         /*
2562          * FIXME: introduce a intarray class, plain parse csv and create a
2563          * { int nr, int entries[] } struct...
2564          */
2565         struct intlist *list = intlist__new(str);
2566
2567         if (list == NULL)
2568                 return -1;
2569
2570         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2571         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2572
2573         if (trace->filter_pids.entries == NULL)
2574                 goto out;
2575
2576         trace->filter_pids.entries[0] = getpid();
2577
2578         for (i = 1; i < trace->filter_pids.nr; ++i)
2579                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2580
2581         intlist__delete(list);
2582         ret = 0;
2583 out:
2584         return ret;
2585 }
2586
2587 static int trace__open_output(struct trace *trace, const char *filename)
2588 {
2589         struct stat st;
2590
2591         if (!stat(filename, &st) && st.st_size) {
2592                 char oldname[PATH_MAX];
2593
2594                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2595                 unlink(oldname);
2596                 rename(filename, oldname);
2597         }
2598
2599         trace->output = fopen(filename, "w");
2600
2601         return trace->output == NULL ? -errno : 0;
2602 }
2603
2604 static int parse_pagefaults(const struct option *opt, const char *str,
2605                             int unset __maybe_unused)
2606 {
2607         int *trace_pgfaults = opt->value;
2608
2609         if (strcmp(str, "all") == 0)
2610                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2611         else if (strcmp(str, "maj") == 0)
2612                 *trace_pgfaults |= TRACE_PFMAJ;
2613         else if (strcmp(str, "min") == 0)
2614                 *trace_pgfaults |= TRACE_PFMIN;
2615         else
2616                 return -1;
2617
2618         return 0;
2619 }
2620
2621 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2622 {
2623         struct perf_evsel *evsel;
2624
2625         evlist__for_each(evlist, evsel)
2626                 evsel->handler = handler;
2627 }
2628
2629 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2630 {
2631         const char *trace_usage[] = {
2632                 "perf trace [<options>] [<command>]",
2633                 "perf trace [<options>] -- <command> [<options>]",
2634                 "perf trace record [<options>] [<command>]",
2635                 "perf trace record [<options>] -- <command> [<options>]",
2636                 NULL
2637         };
2638         struct trace trace = {
2639                 .audit = {
2640                         .machine = audit_detect_machine(),
2641                         .open_id = audit_name_to_syscall("open", trace.audit.machine),
2642                 },
2643                 .syscalls = {
2644                         . max = -1,
2645                 },
2646                 .opts = {
2647                         .target = {
2648                                 .uid       = UINT_MAX,
2649                                 .uses_mmap = true,
2650                         },
2651                         .user_freq     = UINT_MAX,
2652                         .user_interval = ULLONG_MAX,
2653                         .no_buffering  = true,
2654                         .mmap_pages    = UINT_MAX,
2655                 },
2656                 .output = stdout,
2657                 .show_comm = true,
2658                 .trace_syscalls = true,
2659         };
2660         const char *output_name = NULL;
2661         const char *ev_qualifier_str = NULL;
2662         const struct option trace_options[] = {
2663         OPT_CALLBACK(0, "event", &trace.evlist, "event",
2664                      "event selector. use 'perf list' to list available events",
2665                      parse_events_option),
2666         OPT_BOOLEAN(0, "comm", &trace.show_comm,
2667                     "show the thread COMM next to its id"),
2668         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2669         OPT_STRING('e', "expr", &ev_qualifier_str, "expr",
2670                     "list of events to trace"),
2671         OPT_STRING('o', "output", &output_name, "file", "output file name"),
2672         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2673         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2674                     "trace events on existing process id"),
2675         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2676                     "trace events on existing thread id"),
2677         OPT_CALLBACK(0, "filter-pids", &trace, "float",
2678                      "show only events with duration > N.M ms", trace__set_filter_pids),
2679         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2680                     "system-wide collection from all CPUs"),
2681         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2682                     "list of cpus to monitor"),
2683         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2684                     "child tasks do not inherit counters"),
2685         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2686                      "number of mmap data pages",
2687                      perf_evlist__parse_mmap_pages),
2688         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2689                    "user to profile"),
2690         OPT_CALLBACK(0, "duration", &trace, "float",
2691                      "show only events with duration > N.M ms",
2692                      trace__set_duration),
2693         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2694         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2695         OPT_BOOLEAN('T', "time", &trace.full_time,
2696                     "Show full timestamp, not time relative to first start"),
2697         OPT_BOOLEAN('s', "summary", &trace.summary_only,
2698                     "Show only syscall summary with statistics"),
2699         OPT_BOOLEAN('S', "with-summary", &trace.summary,
2700                     "Show all syscalls and summary with statistics"),
2701         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2702                      "Trace pagefaults", parse_pagefaults, "maj"),
2703         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2704         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2705         OPT_END()
2706         };
2707         const char * const trace_subcommands[] = { "record", NULL };
2708         int err;
2709         char bf[BUFSIZ];
2710
2711         signal(SIGSEGV, sighandler_dump_stack);
2712         signal(SIGFPE, sighandler_dump_stack);
2713
2714         trace.evlist = perf_evlist__new();
2715         if (trace.evlist == NULL)
2716                 return -ENOMEM;
2717
2718         if (trace.evlist == NULL) {
2719                 pr_err("Not enough memory to run!\n");
2720                 goto out;
2721         }
2722
2723         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2724                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2725
2726         if (trace.trace_pgfaults) {
2727                 trace.opts.sample_address = true;
2728                 trace.opts.sample_time = true;
2729         }
2730
2731         if (trace.evlist->nr_entries > 0)
2732                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2733
2734         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2735                 return trace__record(&trace, argc-1, &argv[1]);
2736
2737         /* summary_only implies summary option, but don't overwrite summary if set */
2738         if (trace.summary_only)
2739                 trace.summary = trace.summary_only;
2740
2741         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2742             trace.evlist->nr_entries == 0 /* Was --events used? */) {
2743                 pr_err("Please specify something to trace.\n");
2744                 return -1;
2745         }
2746
2747         if (output_name != NULL) {
2748                 err = trace__open_output(&trace, output_name);
2749                 if (err < 0) {
2750                         perror("failed to create output file");
2751                         goto out;
2752                 }
2753         }
2754
2755         if (ev_qualifier_str != NULL) {
2756                 const char *s = ev_qualifier_str;
2757
2758                 trace.not_ev_qualifier = *s == '!';
2759                 if (trace.not_ev_qualifier)
2760                         ++s;
2761                 trace.ev_qualifier = strlist__new(true, s);
2762                 if (trace.ev_qualifier == NULL) {
2763                         fputs("Not enough memory to parse event qualifier",
2764                               trace.output);
2765                         err = -ENOMEM;
2766                         goto out_close;
2767                 }
2768         }
2769
2770         err = target__validate(&trace.opts.target);
2771         if (err) {
2772                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2773                 fprintf(trace.output, "%s", bf);
2774                 goto out_close;
2775         }
2776
2777         err = target__parse_uid(&trace.opts.target);
2778         if (err) {
2779                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2780                 fprintf(trace.output, "%s", bf);
2781                 goto out_close;
2782         }
2783
2784         if (!argc && target__none(&trace.opts.target))
2785                 trace.opts.target.system_wide = true;
2786
2787         if (input_name)
2788                 err = trace__replay(&trace);
2789         else
2790                 err = trace__run(&trace, argc, argv);
2791
2792 out_close:
2793         if (output_name != NULL)
2794                 fclose(trace.output);
2795 out:
2796         return err;
2797 }