/* kvmfornfv.git: kernel/tools/perf/builtin-kmem.c (base: rt-linux 4.1.3-rt3) */
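/*
 * perf kmem: record and analyze kernel memory allocator (SLAB and page
 * allocator) tracepoint data.  "perf kmem record" captures the kmem:*
 * tracepoints system-wide; "perf kmem stat" replays a perf.data file and
 * prints per-callsite/per-allocation tables plus a summary.
 */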
#include "builtin.h"
#include "perf.h"

#include "util/evlist.h"
#include "util/evsel.h"
#include "util/util.h"
#include "util/cache.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"

#include "util/parse-options.h"
#include "util/trace-event.h"
#include "util/data.h"
#include "util/cpumap.h"

#include "util/debug.h"

#include <linux/rbtree.h>
#include <linux/string.h>
#include <locale.h>

static int      kmem_slab;
static int      kmem_page;

static long     kmem_page_size;

struct alloc_stat;
typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);

static int                      alloc_flag;
static int                      caller_flag;

static int                      alloc_lines = -1;
static int                      caller_lines = -1;

static bool                     raw_ip;

struct alloc_stat {
        u64     call_site;
        u64     ptr;
        u64     bytes_req;
        u64     bytes_alloc;
        u32     hit;
        u32     pingpong;

        short   alloc_cpu;

        struct rb_node node;
};

static struct rb_root root_alloc_stat;
static struct rb_root root_alloc_sorted;
static struct rb_root root_caller_stat;
static struct rb_root root_caller_sorted;

static unsigned long total_requested, total_allocated;
static unsigned long nr_allocs, nr_cross_allocs;

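/*
 * Track one slab allocation, keyed by pointer.  Repeated hits on the same
 * pointer (an alloc after a free) accumulate into the existing node.
 */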
static int insert_alloc_stat(unsigned long call_site, unsigned long ptr,
                             int bytes_req, int bytes_alloc, int cpu)
{
        struct rb_node **node = &root_alloc_stat.rb_node;
        struct rb_node *parent = NULL;
        struct alloc_stat *data = NULL;

        while (*node) {
                parent = *node;
                data = rb_entry(*node, struct alloc_stat, node);

                if (ptr > data->ptr)
                        node = &(*node)->rb_right;
                else if (ptr < data->ptr)
                        node = &(*node)->rb_left;
                else
                        break;
        }

        if (data && data->ptr == ptr) {
                data->hit++;
                data->bytes_req += bytes_req;
                data->bytes_alloc += bytes_alloc;
        } else {
                data = malloc(sizeof(*data));
                if (!data) {
                        pr_err("%s: malloc failed\n", __func__);
                        return -1;
                }
                data->ptr = ptr;
                data->pingpong = 0;
                data->hit = 1;
                data->bytes_req = bytes_req;
                data->bytes_alloc = bytes_alloc;

                rb_link_node(&data->node, parent, node);
                rb_insert_color(&data->node, &root_alloc_stat);
        }
        data->call_site = call_site;
        data->alloc_cpu = cpu;
        return 0;
}

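/*
 * Track one allocation callsite, keyed by call_site address, accumulating
 * requested/allocated byte counts and hits across all pointers it returned.
 */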
static int insert_caller_stat(unsigned long call_site,
                              int bytes_req, int bytes_alloc)
{
        struct rb_node **node = &root_caller_stat.rb_node;
        struct rb_node *parent = NULL;
        struct alloc_stat *data = NULL;

        while (*node) {
                parent = *node;
                data = rb_entry(*node, struct alloc_stat, node);

                if (call_site > data->call_site)
                        node = &(*node)->rb_right;
                else if (call_site < data->call_site)
                        node = &(*node)->rb_left;
                else
                        break;
        }

        if (data && data->call_site == call_site) {
                data->hit++;
                data->bytes_req += bytes_req;
                data->bytes_alloc += bytes_alloc;
        } else {
                data = malloc(sizeof(*data));
                if (!data) {
                        pr_err("%s: malloc failed\n", __func__);
                        return -1;
                }
                data->call_site = call_site;
                data->pingpong = 0;
                data->hit = 1;
                data->bytes_req = bytes_req;
                data->bytes_alloc = bytes_alloc;

                rb_link_node(&data->node, parent, node);
                rb_insert_color(&data->node, &root_caller_stat);
        }

        return 0;
}

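/*
 * Handle kmem:kmalloc and kmem:kmem_cache_alloc samples: update both the
 * per-pointer and per-callsite trees and the global totals.
 */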
static int perf_evsel__process_alloc_event(struct perf_evsel *evsel,
                                           struct perf_sample *sample)
{
        unsigned long ptr = perf_evsel__intval(evsel, sample, "ptr"),
                      call_site = perf_evsel__intval(evsel, sample, "call_site");
        int bytes_req = perf_evsel__intval(evsel, sample, "bytes_req"),
            bytes_alloc = perf_evsel__intval(evsel, sample, "bytes_alloc");

        if (insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, sample->cpu) ||
            insert_caller_stat(call_site, bytes_req, bytes_alloc))
                return -1;

        total_requested += bytes_req;
        total_allocated += bytes_alloc;

        nr_allocs++;
        return 0;
}

static int perf_evsel__process_alloc_node_event(struct perf_evsel *evsel,
                                                struct perf_sample *sample)
{
        int ret = perf_evsel__process_alloc_event(evsel, sample);

        if (!ret) {
                int node1 = cpu__get_node(sample->cpu),
                    node2 = perf_evsel__intval(evsel, sample, "node");

                if (node1 != node2)
                        nr_cross_allocs++;
        }

        return ret;
}

static int ptr_cmp(struct alloc_stat *, struct alloc_stat *);
static int callsite_cmp(struct alloc_stat *, struct alloc_stat *);

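/*
 * Look up an alloc_stat node in @root using @sort_fn for ordering; the key
 * is a stack-local alloc_stat carrying only the ptr or call_site field.
 */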
static struct alloc_stat *search_alloc_stat(unsigned long ptr,
                                            unsigned long call_site,
                                            struct rb_root *root,
                                            sort_fn_t sort_fn)
{
        struct rb_node *node = root->rb_node;
        struct alloc_stat key = { .ptr = ptr, .call_site = call_site };

        while (node) {
                struct alloc_stat *data;
                int cmp;

                data = rb_entry(node, struct alloc_stat, node);

                cmp = sort_fn(&key, data);
                if (cmp < 0)
                        node = node->rb_left;
                else if (cmp > 0)
                        node = node->rb_right;
                else
                        return data;
        }
        return NULL;
}

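/*
 * Handle kmem:kfree and kmem:kmem_cache_free samples.  A free on a different
 * CPU than the allocation counts as a "ping-pong", both per pointer and per
 * callsite, since the object bounced between CPU caches.
 */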
static int perf_evsel__process_free_event(struct perf_evsel *evsel,
                                          struct perf_sample *sample)
{
        unsigned long ptr = perf_evsel__intval(evsel, sample, "ptr");
        struct alloc_stat *s_alloc, *s_caller;

        s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp);
        if (!s_alloc)
                return 0;

        if ((short)sample->cpu != s_alloc->alloc_cpu) {
                s_alloc->pingpong++;

                s_caller = search_alloc_stat(0, s_alloc->call_site,
                                             &root_caller_stat, callsite_cmp);
                if (!s_caller)
                        return -1;
                s_caller->pingpong++;
        }
        s_alloc->alloc_cpu = -1;

        return 0;
}

static u64 total_page_alloc_bytes;
static u64 total_page_free_bytes;
static u64 total_page_nomatch_bytes;
static u64 total_page_fail_bytes;
static unsigned long nr_page_allocs;
static unsigned long nr_page_frees;
static unsigned long nr_page_fails;
static unsigned long nr_page_nomatch;

static bool use_pfn;

#define MAX_MIGRATE_TYPES  6
#define MAX_PAGE_ORDER     11

static int order_stats[MAX_PAGE_ORDER][MAX_MIGRATE_TYPES];

struct page_stat {
        struct rb_node  node;
        u64             page;
        int             order;
        unsigned        gfp_flags;
        unsigned        migrate_type;
        u64             alloc_bytes;
        u64             free_bytes;
        int             nr_alloc;
        int             nr_free;
};

static struct rb_root page_tree;
static struct rb_root page_alloc_tree;
static struct rb_root page_alloc_sorted;

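/*
 * Find the page_stat for a live page in page_tree, keyed by pfn or struct
 * page address; optionally create the node if the page is not yet tracked.
 */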
static struct page_stat *search_page(unsigned long page, bool create)
{
        struct rb_node **node = &page_tree.rb_node;
        struct rb_node *parent = NULL;
        struct page_stat *data;

        while (*node) {
                s64 cmp;

                parent = *node;
                data = rb_entry(*node, struct page_stat, node);

                cmp = data->page - page;
                if (cmp < 0)
                        node = &parent->rb_left;
                else if (cmp > 0)
                        node = &parent->rb_right;
                else
                        return data;
        }

        if (!create)
                return NULL;

        data = zalloc(sizeof(*data));
        if (data != NULL) {
                data->page = page;

                rb_link_node(&data->node, parent, node);
                rb_insert_color(&data->node, &page_tree);
        }

        return data;
}

static int page_stat_cmp(struct page_stat *a, struct page_stat *b)
{
        if (a->page > b->page)
                return -1;
        if (a->page < b->page)
                return 1;
        if (a->order > b->order)
                return -1;
        if (a->order < b->order)
                return 1;
        if (a->migrate_type > b->migrate_type)
                return -1;
        if (a->migrate_type < b->migrate_type)
                return 1;
        if (a->gfp_flags > b->gfp_flags)
                return -1;
        if (a->gfp_flags < b->gfp_flags)
                return 1;
        return 0;
}

static struct page_stat *search_page_alloc_stat(struct page_stat *pstat, bool create)
{
        struct rb_node **node = &page_alloc_tree.rb_node;
        struct rb_node *parent = NULL;
        struct page_stat *data;

        while (*node) {
                s64 cmp;

                parent = *node;
                data = rb_entry(*node, struct page_stat, node);

                cmp = page_stat_cmp(data, pstat);
                if (cmp < 0)
                        node = &parent->rb_left;
                else if (cmp > 0)
                        node = &parent->rb_right;
                else
                        return data;
        }

        if (!create)
                return NULL;

        data = zalloc(sizeof(*data));
        if (data != NULL) {
                data->page = pstat->page;
                data->order = pstat->order;
                data->gfp_flags = pstat->gfp_flags;
                data->migrate_type = pstat->migrate_type;

                rb_link_node(&data->node, parent, node);
                rb_insert_color(&data->node, &page_alloc_tree);
        }

        return data;
}

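/*
 * A failed page allocation shows up as pfn == -1UL (when the event has a
 * "pfn" field) or as a NULL struct page pointer; treat either as invalid.
 */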
static bool valid_page(u64 pfn_or_page)
{
        if (use_pfn && pfn_or_page == -1UL)
                return false;
        if (!use_pfn && pfn_or_page == 0)
                return false;
        return true;
}

static int perf_evsel__process_page_alloc_event(struct perf_evsel *evsel,
                                                struct perf_sample *sample)
{
        u64 page;
        unsigned int order = perf_evsel__intval(evsel, sample, "order");
        unsigned int gfp_flags = perf_evsel__intval(evsel, sample, "gfp_flags");
        unsigned int migrate_type = perf_evsel__intval(evsel, sample,
                                                       "migratetype");
        u64 bytes = kmem_page_size << order;
        struct page_stat *pstat;
        struct page_stat this = {
                .order = order,
                .gfp_flags = gfp_flags,
                .migrate_type = migrate_type,
        };

        if (use_pfn)
                page = perf_evsel__intval(evsel, sample, "pfn");
        else
                page = perf_evsel__intval(evsel, sample, "page");

        nr_page_allocs++;
        total_page_alloc_bytes += bytes;

        if (!valid_page(page)) {
                nr_page_fails++;
                total_page_fail_bytes += bytes;

                return 0;
        }

        /*
         * Record the allocation details so the matching free event can
         * recover the correct gfp flags and migrate type for this page.
         */
        pstat = search_page(page, true);
        if (pstat == NULL)
                return -ENOMEM;

        pstat->order = order;
        pstat->gfp_flags = gfp_flags;
        pstat->migrate_type = migrate_type;

        this.page = page;
        pstat = search_page_alloc_stat(&this, true);
        if (pstat == NULL)
                return -ENOMEM;

        pstat->nr_alloc++;
        pstat->alloc_bytes += bytes;

        order_stats[order][migrate_type]++;

        return 0;
}

static int perf_evsel__process_page_free_event(struct perf_evsel *evsel,
                                                struct perf_sample *sample)
{
        u64 page;
        unsigned int order = perf_evsel__intval(evsel, sample, "order");
        u64 bytes = kmem_page_size << order;
        struct page_stat *pstat;
        struct page_stat this = {
                .order = order,
        };

        if (use_pfn)
                page = perf_evsel__intval(evsel, sample, "pfn");
        else
                page = perf_evsel__intval(evsel, sample, "page");

        nr_page_frees++;
        total_page_free_bytes += bytes;

        pstat = search_page(page, false);
        if (pstat == NULL) {
                pr_debug2("missing alloc for page %"PRIx64" (order: %d)\n",
                          page, order);

                nr_page_nomatch++;
                total_page_nomatch_bytes += bytes;

                return 0;
        }

        this.page = page;
        this.gfp_flags = pstat->gfp_flags;
        this.migrate_type = pstat->migrate_type;

        rb_erase(&pstat->node, &page_tree);
        free(pstat);

        pstat = search_page_alloc_stat(&this, false);
        if (pstat == NULL)
                return -ENOENT;

        pstat->nr_free++;
        pstat->free_bytes += bytes;

        return 0;
}

typedef int (*tracepoint_handler)(struct perf_evsel *evsel,
                                  struct perf_sample *sample);

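/*
 * Dispatch each sample to the per-tracepoint handler installed on its evsel
 * by perf_session__set_tracepoints_handlers().
 */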
static int process_sample_event(struct perf_tool *tool __maybe_unused,
                                union perf_event *event,
                                struct perf_sample *sample,
                                struct perf_evsel *evsel,
                                struct machine *machine)
{
        struct thread *thread = machine__findnew_thread(machine, sample->pid,
                                                        sample->tid);

        if (thread == NULL) {
                pr_debug("problem processing %d event, skipping it.\n",
                         event->header.type);
                return -1;
        }

        dump_printf(" ... thread: %s:%d\n", thread__comm_str(thread), thread->tid);

        if (evsel->handler != NULL) {
                tracepoint_handler f = evsel->handler;
                return f(evsel, sample);
        }

        return 0;
}

static struct perf_tool perf_kmem = {
        .sample          = process_sample_event,
        .comm            = perf_event__process_comm,
        .mmap            = perf_event__process_mmap,
        .mmap2           = perf_event__process_mmap2,
        .ordered_events  = true,
};

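/*
 * Internal fragmentation as a percentage of allocated bytes.  For example,
 * a 100-byte request served from a 128-byte slab object wastes
 * 100.0 - (100.0 * 100 / 128) = 21.875% of the allocation.
 */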
static double fragmentation(unsigned long n_req, unsigned long n_alloc)
{
        if (n_alloc == 0)
                return 0.0;
        else
                return 100.0 - (100.0 * n_req / n_alloc);
}

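/*
 * Print the sorted slab table, one line per callsite or allocation pointer,
 * resolving callsite addresses to kernel symbols unless --raw-ip was given.
 */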
static void __print_slab_result(struct rb_root *root,
                                struct perf_session *session,
                                int n_lines, int is_caller)
{
        struct rb_node *next;
        struct machine *machine = &session->machines.host;

        printf("%.105s\n", graph_dotted_line);
        printf(" %-34s |",  is_caller ? "Callsite": "Alloc Ptr");
        printf(" Total_alloc/Per | Total_req/Per   | Hit      | Ping-pong | Frag\n");
        printf("%.105s\n", graph_dotted_line);

        next = rb_first(root);

        while (next && n_lines--) {
                struct alloc_stat *data = rb_entry(next, struct alloc_stat,
                                                   node);
                struct symbol *sym = NULL;
                struct map *map;
                char buf[BUFSIZ];
                u64 addr;

                if (is_caller) {
                        addr = data->call_site;
                        if (!raw_ip)
                                sym = machine__find_kernel_function(machine, addr, &map, NULL);
                } else
                        addr = data->ptr;

                if (sym != NULL)
                        snprintf(buf, sizeof(buf), "%s+%" PRIx64 "", sym->name,
                                 addr - map->unmap_ip(map, sym->start));
                else
                        snprintf(buf, sizeof(buf), "%#" PRIx64 "", addr);
                printf(" %-34s |", buf);

                printf(" %9llu/%-5lu | %9llu/%-5lu | %8lu | %9lu | %6.3f%%\n",
                       (unsigned long long)data->bytes_alloc,
                       (unsigned long)data->bytes_alloc / data->hit,
                       (unsigned long long)data->bytes_req,
                       (unsigned long)data->bytes_req / data->hit,
                       (unsigned long)data->hit,
                       (unsigned long)data->pingpong,
                       fragmentation(data->bytes_req, data->bytes_alloc));

                next = rb_next(next);
        }

        if (n_lines == -1)
                printf(" ...                                | ...             | ...             | ...      | ...       | ...   \n");

        printf("%.105s\n", graph_dotted_line);
}

static const char * const migrate_type_str[] = {
        "UNMOVABL",
        "RECLAIM",
        "MOVABLE",
        "RESERVED",
        "CMA/ISLT",
        "UNKNOWN",
};

static void __print_page_result(struct rb_root *root,
                                struct perf_session *session __maybe_unused,
                                int n_lines)
{
        struct rb_node *next = rb_first(root);
        const char *format;

        printf("\n%.80s\n", graph_dotted_line);
        printf(" %-16s | Total alloc (KB) | Hits      | Order | Mig.type | GFP flags\n",
               use_pfn ? "PFN" : "Page");
        printf("%.80s\n", graph_dotted_line);

        if (use_pfn)
                format = " %16llu | %'16llu | %'9d | %5d | %8s |  %08lx\n";
        else
                format = " %016llx | %'16llu | %'9d | %5d | %8s |  %08lx\n";

        while (next && n_lines--) {
                struct page_stat *data;

                data = rb_entry(next, struct page_stat, node);

                printf(format, (unsigned long long)data->page,
                       (unsigned long long)data->alloc_bytes / 1024,
                       data->nr_alloc, data->order,
                       migrate_type_str[data->migrate_type],
                       (unsigned long)data->gfp_flags);

                next = rb_next(next);
        }

        if (n_lines == -1)
                printf(" ...              | ...              | ...       | ...   | ...      | ...     \n");

        printf("%.80s\n", graph_dotted_line);
}

static void print_slab_summary(void)
{
        printf("\nSUMMARY (SLAB allocator)");
        printf("\n========================\n");
        printf("Total bytes requested: %'lu\n", total_requested);
        printf("Total bytes allocated: %'lu\n", total_allocated);
        printf("Total bytes wasted on internal fragmentation: %'lu\n",
               total_allocated - total_requested);
        printf("Internal fragmentation: %f%%\n",
               fragmentation(total_requested, total_allocated));
        printf("Cross CPU allocations: %'lu/%'lu\n", nr_cross_allocs, nr_allocs);
}

static void print_page_summary(void)
{
        int o, m;
        u64 nr_alloc_freed = nr_page_frees - nr_page_nomatch;
        u64 total_alloc_freed_bytes = total_page_free_bytes - total_page_nomatch_bytes;

        printf("\nSUMMARY (page allocator)");
        printf("\n========================\n");
        printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total allocation requests",
               nr_page_allocs, total_page_alloc_bytes / 1024);
        printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total free requests",
               nr_page_frees, total_page_free_bytes / 1024);
        printf("\n");

        printf("%-30s: %'16"PRIu64"   [ %'16"PRIu64" KB ]\n", "Total alloc+freed requests",
               nr_alloc_freed, (total_alloc_freed_bytes) / 1024);
        printf("%-30s: %'16"PRIu64"   [ %'16"PRIu64" KB ]\n", "Total alloc-only requests",
               nr_page_allocs - nr_alloc_freed,
               (total_page_alloc_bytes - total_alloc_freed_bytes) / 1024);
        printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total free-only requests",
               nr_page_nomatch, total_page_nomatch_bytes / 1024);
        printf("\n");

        printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total allocation failures",
               nr_page_fails, total_page_fail_bytes / 1024);
        printf("\n");

        printf("%5s  %12s  %12s  %12s  %12s  %12s\n", "Order",  "Unmovable",
               "Reclaimable", "Movable", "Reserved", "CMA/Isolated");
        printf("%.5s  %.12s  %.12s  %.12s  %.12s  %.12s\n", graph_dotted_line,
               graph_dotted_line, graph_dotted_line, graph_dotted_line,
               graph_dotted_line, graph_dotted_line);

        for (o = 0; o < MAX_PAGE_ORDER; o++) {
                printf("%5d", o);
                for (m = 0; m < MAX_MIGRATE_TYPES - 1; m++) {
                        if (order_stats[o][m])
                                printf("  %'12d", order_stats[o][m]);
                        else
                                printf("  %12c", '.');
                }
                printf("\n");
        }
}

static void print_slab_result(struct perf_session *session)
{
        if (caller_flag)
                __print_slab_result(&root_caller_sorted, session, caller_lines, 1);
        if (alloc_flag)
                __print_slab_result(&root_alloc_sorted, session, alloc_lines, 0);
        print_slab_summary();
}

static void print_page_result(struct perf_session *session)
{
        if (alloc_flag)
                __print_page_result(&page_alloc_sorted, session, alloc_lines);
        print_page_summary();
}

static void print_result(struct perf_session *session)
{
        if (kmem_slab)
                print_slab_result(session);
        if (kmem_page)
                print_page_result(session);
}

struct sort_dimension {
        const char              name[20];
        sort_fn_t               cmp;
        struct list_head        list;
};

static LIST_HEAD(caller_sort);
static LIST_HEAD(alloc_sort);

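/*
 * Re-insert a node into the sorted tree, comparing with each requested sort
 * dimension in order and falling through to the next key on ties.
 */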
static void sort_slab_insert(struct rb_root *root, struct alloc_stat *data,
                             struct list_head *sort_list)
{
        struct rb_node **new = &(root->rb_node);
        struct rb_node *parent = NULL;
        struct sort_dimension *sort;

        while (*new) {
                struct alloc_stat *this;
                int cmp = 0;

                this = rb_entry(*new, struct alloc_stat, node);
                parent = *new;

                list_for_each_entry(sort, sort_list, list) {
                        cmp = sort->cmp(data, this);
                        if (cmp)
                                break;
                }

                if (cmp > 0)
                        new = &((*new)->rb_left);
                else
                        new = &((*new)->rb_right);
        }

        rb_link_node(&data->node, parent, new);
        rb_insert_color(&data->node, root);
}

static void __sort_slab_result(struct rb_root *root, struct rb_root *root_sorted,
                               struct list_head *sort_list)
{
        struct rb_node *node;
        struct alloc_stat *data;

        for (;;) {
                node = rb_first(root);
                if (!node)
                        break;

                rb_erase(node, root);
                data = rb_entry(node, struct alloc_stat, node);
                sort_slab_insert(root_sorted, data, sort_list);
        }
}

static void sort_page_insert(struct rb_root *root, struct page_stat *data)
{
        struct rb_node **new = &root->rb_node;
        struct rb_node *parent = NULL;

        while (*new) {
                struct page_stat *this;
                int cmp = 0;

                this = rb_entry(*new, struct page_stat, node);
                parent = *new;

                /* TODO: support more sort key */
                cmp = data->alloc_bytes - this->alloc_bytes;

                if (cmp > 0)
                        new = &parent->rb_left;
                else
                        new = &parent->rb_right;
        }

        rb_link_node(&data->node, parent, new);
        rb_insert_color(&data->node, root);
}

static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted)
{
        struct rb_node *node;
        struct page_stat *data;

        for (;;) {
                node = rb_first(root);
                if (!node)
                        break;

                rb_erase(node, root);
                data = rb_entry(node, struct page_stat, node);
                sort_page_insert(root_sorted, data);
        }
}

static void sort_result(void)
{
        if (kmem_slab) {
                __sort_slab_result(&root_alloc_stat, &root_alloc_sorted,
                                   &alloc_sort);
                __sort_slab_result(&root_caller_stat, &root_caller_sorted,
                                   &caller_sort);
        }
        if (kmem_page) {
                __sort_page_result(&page_alloc_tree, &page_alloc_sorted);
        }
}

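/*
 * Replay the recorded events: install a handler per kmem tracepoint, detect
 * whether the kmem:mm_page_alloc event carries a "pfn" field and use it when
 * present, then process the session and print the sorted results.
 */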
static int __cmd_kmem(struct perf_session *session)
{
        int err = -EINVAL;
        struct perf_evsel *evsel;
        const struct perf_evsel_str_handler kmem_tracepoints[] = {
                /* slab allocator */
                { "kmem:kmalloc",               perf_evsel__process_alloc_event, },
                { "kmem:kmem_cache_alloc",      perf_evsel__process_alloc_event, },
                { "kmem:kmalloc_node",          perf_evsel__process_alloc_node_event, },
                { "kmem:kmem_cache_alloc_node", perf_evsel__process_alloc_node_event, },
                { "kmem:kfree",                 perf_evsel__process_free_event, },
                { "kmem:kmem_cache_free",       perf_evsel__process_free_event, },
                /* page allocator */
                { "kmem:mm_page_alloc",         perf_evsel__process_page_alloc_event, },
                { "kmem:mm_page_free",          perf_evsel__process_page_free_event, },
        };

        if (!perf_session__has_traces(session, "kmem record"))
                goto out;

        if (perf_session__set_tracepoints_handlers(session, kmem_tracepoints)) {
                pr_err("Initializing perf session tracepoint handlers failed\n");
                goto out;
        }

        evlist__for_each(session->evlist, evsel) {
                if (!strcmp(perf_evsel__name(evsel), "kmem:mm_page_alloc") &&
                    perf_evsel__field(evsel, "pfn")) {
                        use_pfn = true;
                        break;
                }
        }

        setup_pager();
        err = perf_session__process_events(session);
        if (err != 0) {
                pr_err("error during process events: %d\n", err);
                goto out;
        }
        sort_result();
        print_result(session);
out:
        return err;
}

static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
        if (l->ptr < r->ptr)
                return -1;
        else if (l->ptr > r->ptr)
                return 1;
        return 0;
}

static struct sort_dimension ptr_sort_dimension = {
        .name   = "ptr",
        .cmp    = ptr_cmp,
};

static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
        if (l->call_site < r->call_site)
                return -1;
        else if (l->call_site > r->call_site)
                return 1;
        return 0;
}

static struct sort_dimension callsite_sort_dimension = {
        .name   = "callsite",
        .cmp    = callsite_cmp,
};

static int hit_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
        if (l->hit < r->hit)
                return -1;
        else if (l->hit > r->hit)
                return 1;
        return 0;
}

static struct sort_dimension hit_sort_dimension = {
        .name   = "hit",
        .cmp    = hit_cmp,
};

static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
        if (l->bytes_alloc < r->bytes_alloc)
                return -1;
        else if (l->bytes_alloc > r->bytes_alloc)
                return 1;
        return 0;
}

static struct sort_dimension bytes_sort_dimension = {
        .name   = "bytes",
        .cmp    = bytes_cmp,
};

static int frag_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
        double x, y;

        x = fragmentation(l->bytes_req, l->bytes_alloc);
        y = fragmentation(r->bytes_req, r->bytes_alloc);

        if (x < y)
                return -1;
        else if (x > y)
                return 1;
        return 0;
}

static struct sort_dimension frag_sort_dimension = {
        .name   = "frag",
        .cmp    = frag_cmp,
};

static int pingpong_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
        if (l->pingpong < r->pingpong)
                return -1;
        else if (l->pingpong > r->pingpong)
                return 1;
        return 0;
}

static struct sort_dimension pingpong_sort_dimension = {
        .name   = "pingpong",
        .cmp    = pingpong_cmp,
};

static struct sort_dimension *avail_sorts[] = {
        &ptr_sort_dimension,
        &callsite_sort_dimension,
        &hit_sort_dimension,
        &bytes_sort_dimension,
        &frag_sort_dimension,
        &pingpong_sort_dimension,
};

#define NUM_AVAIL_SORTS ((int)ARRAY_SIZE(avail_sorts))

static int sort_dimension__add(const char *tok, struct list_head *list)
{
        struct sort_dimension *sort;
        int i;

        for (i = 0; i < NUM_AVAIL_SORTS; i++) {
                if (!strcmp(avail_sorts[i]->name, tok)) {
                        sort = memdup(avail_sorts[i], sizeof(*avail_sorts[i]));
                        if (!sort) {
                                pr_err("%s: memdup failed\n", __func__);
                                return -1;
                        }
                        list_add_tail(&sort->list, list);
                        return 0;
                }
        }

        return -1;
}

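/* Parse a comma-separated --sort key list into a list of sort dimensions. */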
static int setup_sorting(struct list_head *sort_list, const char *arg)
{
        char *tok;
        char *str = strdup(arg);
        char *pos = str;

        if (!str) {
                pr_err("%s: strdup failed\n", __func__);
                return -1;
        }

        while (true) {
                tok = strsep(&pos, ",");
                if (!tok)
                        break;
                if (sort_dimension__add(tok, sort_list) < 0) {
                        error("Unknown --sort key: '%s'", tok);
                        free(str);
                        return -1;
                }
        }

        free(str);
        return 0;
}

static int parse_sort_opt(const struct option *opt __maybe_unused,
                          const char *arg, int unset __maybe_unused)
{
        if (!arg)
                return -1;

        if (caller_flag > alloc_flag)
                return setup_sorting(&caller_sort, arg);
        else
                return setup_sorting(&alloc_sort, arg);
}

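/*
 * --caller and --alloc record their relative order by setting each flag one
 * higher than the other; parse_sort_opt() and parse_line_opt() then apply a
 * following --sort or --line to whichever view was named last.
 */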
static int parse_caller_opt(const struct option *opt __maybe_unused,
                            const char *arg __maybe_unused,
                            int unset __maybe_unused)
{
        caller_flag = (alloc_flag + 1);
        return 0;
}

static int parse_alloc_opt(const struct option *opt __maybe_unused,
                           const char *arg __maybe_unused,
                           int unset __maybe_unused)
{
        alloc_flag = (caller_flag + 1);
        return 0;
}

static int parse_slab_opt(const struct option *opt __maybe_unused,
                          const char *arg __maybe_unused,
                          int unset __maybe_unused)
{
        kmem_slab = (kmem_page + 1);
        return 0;
}

static int parse_page_opt(const struct option *opt __maybe_unused,
                          const char *arg __maybe_unused,
                          int unset __maybe_unused)
{
        kmem_page = (kmem_slab + 1);
        return 0;
}

static int parse_line_opt(const struct option *opt __maybe_unused,
                          const char *arg, int unset __maybe_unused)
{
        int lines;

        if (!arg)
                return -1;

        lines = strtoul(arg, NULL, 10);

        if (caller_flag > alloc_flag)
                caller_lines = lines;
        else
                alloc_lines = lines;

        return 0;
}

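/*
 * Build and run the equivalent "perf record" command line: system-wide (-a),
 * raw samples (-R), one sample per event occurrence (-c 1), plus the slab
 * and/or page allocator tracepoints.
 */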
static int __cmd_record(int argc, const char **argv)
{
        const char * const record_args[] = {
        "record", "-a", "-R", "-c", "1",
        };
        const char * const slab_events[] = {
        "-e", "kmem:kmalloc",
        "-e", "kmem:kmalloc_node",
        "-e", "kmem:kfree",
        "-e", "kmem:kmem_cache_alloc",
        "-e", "kmem:kmem_cache_alloc_node",
        "-e", "kmem:kmem_cache_free",
        };
        const char * const page_events[] = {
        "-e", "kmem:mm_page_alloc",
        "-e", "kmem:mm_page_free",
        };
        unsigned int rec_argc, i, j;
        const char **rec_argv;

        rec_argc = ARRAY_SIZE(record_args) + argc - 1;
        if (kmem_slab)
                rec_argc += ARRAY_SIZE(slab_events);
        if (kmem_page)
                rec_argc += ARRAY_SIZE(page_events);

        rec_argv = calloc(rec_argc + 1, sizeof(char *));

        if (rec_argv == NULL)
                return -ENOMEM;

        for (i = 0; i < ARRAY_SIZE(record_args); i++)
                rec_argv[i] = strdup(record_args[i]);

        if (kmem_slab) {
                for (j = 0; j < ARRAY_SIZE(slab_events); j++, i++)
                        rec_argv[i] = strdup(slab_events[j]);
        }
        if (kmem_page) {
                for (j = 0; j < ARRAY_SIZE(page_events); j++, i++)
                        rec_argv[i] = strdup(page_events[j]);
        }

        for (j = 1; j < (unsigned int)argc; j++, i++)
                rec_argv[i] = argv[j];

        return cmd_record(i, rec_argv, NULL);
}

int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
{
        const char * const default_sort_order = "frag,hit,bytes";
        struct perf_data_file file = {
                .mode = PERF_DATA_MODE_READ,
        };
        const struct option kmem_options[] = {
        OPT_STRING('i', "input", &input_name, "file", "input file name"),
        OPT_INCR('v', "verbose", &verbose,
                 "be more verbose (show symbol address, etc)"),
        OPT_CALLBACK_NOOPT(0, "caller", NULL, NULL,
                           "show per-callsite statistics", parse_caller_opt),
        OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL,
                           "show per-allocation statistics", parse_alloc_opt),
        OPT_CALLBACK('s', "sort", NULL, "key[,key2...]",
                     "sort by keys: ptr, callsite, bytes, hit, pingpong, frag",
                     parse_sort_opt),
        OPT_CALLBACK('l', "line", NULL, "num", "show n lines", parse_line_opt),
        OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
        OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
        OPT_CALLBACK_NOOPT(0, "slab", NULL, NULL, "Analyze slab allocator",
                           parse_slab_opt),
        OPT_CALLBACK_NOOPT(0, "page", NULL, NULL, "Analyze page allocator",
                           parse_page_opt),
        OPT_END()
        };
        const char *const kmem_subcommands[] = { "record", "stat", NULL };
        const char *kmem_usage[] = {
                NULL,
                NULL
        };
        struct perf_session *session;
        int ret = -1;

        argc = parse_options_subcommand(argc, argv, kmem_options,
                                        kmem_subcommands, kmem_usage, 0);

        if (!argc)
                usage_with_options(kmem_usage, kmem_options);

        if (kmem_slab == 0 && kmem_page == 0)
                kmem_slab = 1;  /* for backward compatibility */

        if (!strncmp(argv[0], "rec", 3)) {
                symbol__init(NULL);
                return __cmd_record(argc, argv);
        }

        file.path = input_name;

        session = perf_session__new(&file, false, &perf_kmem);
        if (session == NULL)
                return -1;

        if (kmem_page) {
                struct perf_evsel *evsel = perf_evlist__first(session->evlist);

                if (evsel == NULL || evsel->tp_format == NULL) {
                        pr_err("invalid event found.. aborting\n");
                        goto out_delete;
                }

                kmem_page_size = pevent_get_page_size(evsel->tp_format->pevent);
        }

        symbol__init(&session->header.env);

        if (!strcmp(argv[0], "stat")) {
                setlocale(LC_ALL, "");

                if (cpu__setup_cpunode_map())
                        goto out_delete;

                if (list_empty(&caller_sort))
                        setup_sorting(&caller_sort, default_sort_order);
                if (list_empty(&alloc_sort))
                        setup_sorting(&alloc_sort, default_sort_order);

                ret = __cmd_kmem(session);
        } else
                usage_with_options(kmem_usage, kmem_options);

out_delete:
        perf_session__delete(session);

        return ret;
}