These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39                         struct nlmsghdr *n, u32 clid,
40                         struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42                          struct nlmsghdr *n, struct Qdisc *q,
43                          unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
60    qdisc's are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67
68    The goal of the routines in this file is to translate
69    information supplied by user in the form of handles
70    to more intelligible for kernel form, to make some sanity
71    checks and part of work, which is common to all qdiscs
72    and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
91    enqueue returns 0 if the packet was enqueued successfully.
92    If a packet (this one or another one) was dropped, it returns
93    a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
125 /* Protects list of registered TC modules. It is pure SMP lock. */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
134 /* The list of all installed queueing disciplines. */
135
136 static struct Qdisc_ops *qdisc_base;
137
138 /* Register/unregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* Get default qdisc if not otherwise specified */
204 void qdisc_get_default(char *name, size_t len)
205 {
206         read_lock(&qdisc_mod_lock);
207         strlcpy(name, default_qdisc_ops->id, len);
208         read_unlock(&qdisc_mod_lock);
209 }
210
211 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
212 {
213         struct Qdisc_ops *q = NULL;
214
215         for (q = qdisc_base; q; q = q->next) {
216                 if (!strcmp(name, q->id)) {
217                         if (!try_module_get(q->owner))
218                                 q = NULL;
219                         break;
220                 }
221         }
222
223         return q;
224 }
225
226 /* Set new default qdisc to use */
227 int qdisc_set_default(const char *name)
228 {
229         const struct Qdisc_ops *ops;
230
231         if (!capable(CAP_NET_ADMIN))
232                 return -EPERM;
233
234         write_lock(&qdisc_mod_lock);
235         ops = qdisc_lookup_default(name);
236         if (!ops) {
237                 /* Not found, drop lock and try to load module */
238                 write_unlock(&qdisc_mod_lock);
239                 request_module("sch_%s", name);
240                 write_lock(&qdisc_mod_lock);
241
242                 ops = qdisc_lookup_default(name);
243         }
244
245         if (ops) {
246                 /* Set new default */
247                 module_put(default_qdisc_ops->owner);
248                 default_qdisc_ops = ops;
249         }
250         write_unlock(&qdisc_mod_lock);
251
252         return ops ? 0 : -ENOENT;
253 }
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256  * (root qdisc, all its children, children of children etc.)
257  * Note: caller either uses rtnl or rcu_read_lock()
258  */
259
260 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
261 {
262         struct Qdisc *q;
263
264         if (!(root->flags & TCQ_F_BUILTIN) &&
265             root->handle == handle)
266                 return root;
267
268         list_for_each_entry_rcu(q, &root->list, list) {
269                 if (q->handle == handle)
270                         return q;
271         }
272         return NULL;
273 }
274
275 void qdisc_list_add(struct Qdisc *q)
276 {
277         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
278                 struct Qdisc *root = qdisc_dev(q)->qdisc;
279
280                 WARN_ON_ONCE(root == &noop_qdisc);
281                 ASSERT_RTNL();
282                 list_add_tail_rcu(&q->list, &root->list);
283         }
284 }
285 EXPORT_SYMBOL(qdisc_list_add);
286
287 void qdisc_list_del(struct Qdisc *q)
288 {
289         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
290                 ASSERT_RTNL();
291                 list_del_rcu(&q->list);
292         }
293 }
294 EXPORT_SYMBOL(qdisc_list_del);
295
296 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
297 {
298         struct Qdisc *q;
299
300         q = qdisc_match_from_root(dev->qdisc, handle);
301         if (q)
302                 goto out;
303
304         if (dev_ingress_queue(dev))
305                 q = qdisc_match_from_root(
306                         dev_ingress_queue(dev)->qdisc_sleeping,
307                         handle);
308 out:
309         return q;
310 }
311
312 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
313 {
314         unsigned long cl;
315         struct Qdisc *leaf;
316         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
317
318         if (cops == NULL)
319                 return NULL;
320         cl = cops->get(p, classid);
321
322         if (cl == 0)
323                 return NULL;
324         leaf = cops->leaf(p, cl);
325         cops->put(p, cl);
326         return leaf;
327 }
328
329 /* Find queueing discipline by name */
330
331 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
332 {
333         struct Qdisc_ops *q = NULL;
334
335         if (kind) {
336                 read_lock(&qdisc_mod_lock);
337                 for (q = qdisc_base; q; q = q->next) {
338                         if (nla_strcmp(kind, q->id) == 0) {
339                                 if (!try_module_get(q->owner))
340                                         q = NULL;
341                                 break;
342                         }
343                 }
344                 read_unlock(&qdisc_mod_lock);
345         }
346         return q;
347 }
348
349 /* The linklayer setting was not transferred from iproute2 in older
350  * versions, and the rate table lookup systems have been dropped in
351  * the kernel. To stay backward compatible with older iproute2 tc
352  * utils, we detect the linklayer setting by detecting whether the rate
353  * table was modified.
354  *
355  * For linklayer ATM table entries, the rate table will be aligned to
356  * 48 bytes, thus some table entries will contain the same value.  The
357  * mpu (min packet unit) is also encoded into the old rate table, thus
358  * starting from the mpu, we find low and high table entries for
359  * mapping this cell.  If these entries contain the same value, then
360  * the rate tables have been modified for linklayer ATM.
361  *
362  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
363  * and then roundup to the next cell, calc the table entry one below,
364  * and compare.
365  */
366 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
367 {
368         int low       = roundup(r->mpu, 48);
369         int high      = roundup(low+1, 48);
370         int cell_low  = low >> r->cell_log;
371         int cell_high = (high >> r->cell_log) - 1;
372
373         /* rtab is too inaccurate at rates > 100Mbit/s */
374         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
375                 pr_debug("TC linklayer: Giving up ATM detection\n");
376                 return TC_LINKLAYER_ETHERNET;
377         }
378
379         if ((cell_high > cell_low) && (cell_high < 256)
380             && (rtab[cell_low] == rtab[cell_high])) {
381                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
382                          cell_low, cell_high, rtab[cell_high]);
383                 return TC_LINKLAYER_ATM;
384         }
385         return TC_LINKLAYER_ETHERNET;
386 }
387
388 static struct qdisc_rate_table *qdisc_rtab_list;
389
390 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
391 {
392         struct qdisc_rate_table *rtab;
393
394         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
395             nla_len(tab) != TC_RTAB_SIZE)
396                 return NULL;
397
398         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
399                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
400                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
401                         rtab->refcnt++;
402                         return rtab;
403                 }
404         }
405
406         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
407         if (rtab) {
408                 rtab->rate = *r;
409                 rtab->refcnt = 1;
410                 memcpy(rtab->data, nla_data(tab), 1024);
411                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
412                         r->linklayer = __detect_linklayer(r, rtab->data);
413                 rtab->next = qdisc_rtab_list;
414                 qdisc_rtab_list = rtab;
415         }
416         return rtab;
417 }
418 EXPORT_SYMBOL(qdisc_get_rtab);
419
420 void qdisc_put_rtab(struct qdisc_rate_table *tab)
421 {
422         struct qdisc_rate_table *rtab, **rtabp;
423
424         if (!tab || --tab->refcnt)
425                 return;
426
427         for (rtabp = &qdisc_rtab_list;
428              (rtab = *rtabp) != NULL;
429              rtabp = &rtab->next) {
430                 if (rtab == tab) {
431                         *rtabp = rtab->next;
432                         kfree(rtab);
433                         return;
434                 }
435         }
436 }
437 EXPORT_SYMBOL(qdisc_put_rtab);
438
439 static LIST_HEAD(qdisc_stab_list);
440 static DEFINE_SPINLOCK(qdisc_stab_lock);
441
442 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
443         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
444         [TCA_STAB_DATA] = { .type = NLA_BINARY },
445 };
446
447 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
448 {
449         struct nlattr *tb[TCA_STAB_MAX + 1];
450         struct qdisc_size_table *stab;
451         struct tc_sizespec *s;
452         unsigned int tsize = 0;
453         u16 *tab = NULL;
454         int err;
455
456         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
457         if (err < 0)
458                 return ERR_PTR(err);
459         if (!tb[TCA_STAB_BASE])
460                 return ERR_PTR(-EINVAL);
461
462         s = nla_data(tb[TCA_STAB_BASE]);
463
464         if (s->tsize > 0) {
465                 if (!tb[TCA_STAB_DATA])
466                         return ERR_PTR(-EINVAL);
467                 tab = nla_data(tb[TCA_STAB_DATA]);
468                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
469         }
470
471         if (tsize != s->tsize || (!tab && tsize > 0))
472                 return ERR_PTR(-EINVAL);
473
474         spin_lock(&qdisc_stab_lock);
475
476         list_for_each_entry(stab, &qdisc_stab_list, list) {
477                 if (memcmp(&stab->szopts, s, sizeof(*s)))
478                         continue;
479                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
480                         continue;
481                 stab->refcnt++;
482                 spin_unlock(&qdisc_stab_lock);
483                 return stab;
484         }
485
486         spin_unlock(&qdisc_stab_lock);
487
488         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
489         if (!stab)
490                 return ERR_PTR(-ENOMEM);
491
492         stab->refcnt = 1;
493         stab->szopts = *s;
494         if (tsize > 0)
495                 memcpy(stab->data, tab, tsize * sizeof(u16));
496
497         spin_lock(&qdisc_stab_lock);
498         list_add_tail(&stab->list, &qdisc_stab_list);
499         spin_unlock(&qdisc_stab_lock);
500
501         return stab;
502 }
503
504 static void stab_kfree_rcu(struct rcu_head *head)
505 {
506         kfree(container_of(head, struct qdisc_size_table, rcu));
507 }
508
509 void qdisc_put_stab(struct qdisc_size_table *tab)
510 {
511         if (!tab)
512                 return;
513
514         spin_lock(&qdisc_stab_lock);
515
516         if (--tab->refcnt == 0) {
517                 list_del(&tab->list);
518                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
519         }
520
521         spin_unlock(&qdisc_stab_lock);
522 }
523 EXPORT_SYMBOL(qdisc_put_stab);
524
525 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
526 {
527         struct nlattr *nest;
528
529         nest = nla_nest_start(skb, TCA_STAB);
530         if (nest == NULL)
531                 goto nla_put_failure;
532         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
533                 goto nla_put_failure;
534         nla_nest_end(skb, nest);
535
536         return skb->len;
537
538 nla_put_failure:
539         return -1;
540 }
541
542 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
543 {
544         int pkt_len, slot;
545
546         pkt_len = skb->len + stab->szopts.overhead;
547         if (unlikely(!stab->szopts.tsize))
548                 goto out;
549
550         slot = pkt_len + stab->szopts.cell_align;
551         if (unlikely(slot < 0))
552                 slot = 0;
553
554         slot >>= stab->szopts.cell_log;
555         if (likely(slot < stab->szopts.tsize))
556                 pkt_len = stab->data[slot];
557         else
558                 pkt_len = stab->data[stab->szopts.tsize - 1] *
559                                 (slot / stab->szopts.tsize) +
560                                 stab->data[slot % stab->szopts.tsize];
561
562         pkt_len <<= stab->szopts.size_log;
563 out:
564         if (unlikely(pkt_len < 1))
565                 pkt_len = 1;
566         qdisc_skb_cb(skb)->pkt_len = pkt_len;
567 }
568 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
569
570 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
571 {
572         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
573                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
574                         txt, qdisc->ops->id, qdisc->handle >> 16);
575                 qdisc->flags |= TCQ_F_WARN_NONWC;
576         }
577 }
578 EXPORT_SYMBOL(qdisc_warn_nonwc);
579
580 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
581 {
582         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
583                                                  timer);
584
585         rcu_read_lock();
586         qdisc_unthrottled(wd->qdisc);
587         __netif_schedule(qdisc_root(wd->qdisc));
588         rcu_read_unlock();
589
590         return HRTIMER_NORESTART;
591 }
592
593 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
594 {
595         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
596         wd->timer.function = qdisc_watchdog;
597         wd->qdisc = qdisc;
598 }
599 EXPORT_SYMBOL(qdisc_watchdog_init);
600
601 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle)
602 {
603         if (test_bit(__QDISC_STATE_DEACTIVATED,
604                      &qdisc_root_sleeping(wd->qdisc)->state))
605                 return;
606
607         if (throttle)
608                 qdisc_throttled(wd->qdisc);
609
610         hrtimer_start(&wd->timer,
611                       ns_to_ktime(expires),
612                       HRTIMER_MODE_ABS_PINNED);
613 }
614 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
615
616 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
617 {
618         hrtimer_cancel(&wd->timer);
619         qdisc_unthrottled(wd->qdisc);
620 }
621 EXPORT_SYMBOL(qdisc_watchdog_cancel);
622
623 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
624 {
625         unsigned int size = n * sizeof(struct hlist_head), i;
626         struct hlist_head *h;
627
628         if (size <= PAGE_SIZE)
629                 h = kmalloc(size, GFP_KERNEL);
630         else
631                 h = (struct hlist_head *)
632                         __get_free_pages(GFP_KERNEL, get_order(size));
633
634         if (h != NULL) {
635                 for (i = 0; i < n; i++)
636                         INIT_HLIST_HEAD(&h[i]);
637         }
638         return h;
639 }
640
641 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
642 {
643         unsigned int size = n * sizeof(struct hlist_head);
644
645         if (size <= PAGE_SIZE)
646                 kfree(h);
647         else
648                 free_pages((unsigned long)h, get_order(size));
649 }
650
651 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
652 {
653         struct Qdisc_class_common *cl;
654         struct hlist_node *next;
655         struct hlist_head *nhash, *ohash;
656         unsigned int nsize, nmask, osize;
657         unsigned int i, h;
658
659         /* Rehash when load factor exceeds 0.75 */
660         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
661                 return;
662         nsize = clhash->hashsize * 2;
663         nmask = nsize - 1;
664         nhash = qdisc_class_hash_alloc(nsize);
665         if (nhash == NULL)
666                 return;
667
668         ohash = clhash->hash;
669         osize = clhash->hashsize;
670
671         sch_tree_lock(sch);
672         for (i = 0; i < osize; i++) {
673                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
674                         h = qdisc_class_hash(cl->classid, nmask);
675                         hlist_add_head(&cl->hnode, &nhash[h]);
676                 }
677         }
678         clhash->hash     = nhash;
679         clhash->hashsize = nsize;
680         clhash->hashmask = nmask;
681         sch_tree_unlock(sch);
682
683         qdisc_class_hash_free(ohash, osize);
684 }
685 EXPORT_SYMBOL(qdisc_class_hash_grow);
686
687 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
688 {
689         unsigned int size = 4;
690
691         clhash->hash = qdisc_class_hash_alloc(size);
692         if (clhash->hash == NULL)
693                 return -ENOMEM;
694         clhash->hashsize  = size;
695         clhash->hashmask  = size - 1;
696         clhash->hashelems = 0;
697         return 0;
698 }
699 EXPORT_SYMBOL(qdisc_class_hash_init);
700
701 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
702 {
703         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
704 }
705 EXPORT_SYMBOL(qdisc_class_hash_destroy);
706
707 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
708                              struct Qdisc_class_common *cl)
709 {
710         unsigned int h;
711
712         INIT_HLIST_NODE(&cl->hnode);
713         h = qdisc_class_hash(cl->classid, clhash->hashmask);
714         hlist_add_head(&cl->hnode, &clhash->hash[h]);
715         clhash->hashelems++;
716 }
717 EXPORT_SYMBOL(qdisc_class_hash_insert);
718
719 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
720                              struct Qdisc_class_common *cl)
721 {
722         hlist_del(&cl->hnode);
723         clhash->hashelems--;
724 }
725 EXPORT_SYMBOL(qdisc_class_hash_remove);
726
727 /* Allocate an unique handle from space managed by kernel
728  * Possible range is [8000-FFFF]:0000 (0x8000 values)
729  */
730 static u32 qdisc_alloc_handle(struct net_device *dev)
731 {
732         int i = 0x8000;
733         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
734
735         do {
736                 autohandle += TC_H_MAKE(0x10000U, 0);
737                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
738                         autohandle = TC_H_MAKE(0x80000000U, 0);
739                 if (!qdisc_lookup(dev, autohandle))
740                         return autohandle;
741                 cond_resched();
742         } while (--i > 0);
743
744         return 0;
745 }
746
747 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
748 {
749         const struct Qdisc_class_ops *cops;
750         unsigned long cl;
751         u32 parentid;
752         int drops;
753
754         if (n == 0)
755                 return;
756         drops = max_t(int, n, 0);
757         rcu_read_lock();
758         while ((parentid = sch->parent)) {
759                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
760                         break;
761
762                 if (sch->flags & TCQ_F_NOPARENT)
763                         break;
764                 /* TODO: perform the search on a per txq basis */
765                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
766                 if (sch == NULL) {
767                         WARN_ON_ONCE(parentid != TC_H_ROOT);
768                         break;
769                 }
770                 cops = sch->ops->cl_ops;
771                 if (cops->qlen_notify) {
772                         cl = cops->get(sch, parentid);
773                         cops->qlen_notify(sch, cl);
774                         cops->put(sch, cl);
775                 }
776                 sch->q.qlen -= n;
777                 __qdisc_qstats_drop(sch, drops);
778         }
779         rcu_read_unlock();
780 }
781 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
782
783 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
784                                struct nlmsghdr *n, u32 clid,
785                                struct Qdisc *old, struct Qdisc *new)
786 {
787         if (new || old)
788                 qdisc_notify(net, skb, n, clid, old, new);
789
790         if (old)
791                 qdisc_destroy(old);
792 }
793
794 /* Graft qdisc "new" to class "classid" of qdisc "parent", or to device
795  * "dev" itself when "parent" is NULL (ingress if either qdisc is ingress).
796  *
797  * A netlink notification built from "skb" and "n" is sent through
798  * notify_and_destroy(), which also destroys the qdisc being replaced.
799  *
800  * Returns 0 on success, -ENOENT or -EOPNOTSUPP (or the graft op's error).
801  */
802
803 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
804                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
805                        struct Qdisc *new, struct Qdisc *old)
806 {
807         struct Qdisc *q = old;
808         struct net *net = dev_net(dev);
809         int err = 0;
810
811         if (parent == NULL) {
812                 unsigned int i, num_q, ingress;
813
814                 ingress = 0;
815                 num_q = dev->num_tx_queues;
816                 if ((q && q->flags & TCQ_F_INGRESS) ||
817                     (new && new->flags & TCQ_F_INGRESS)) {
818                         num_q = 1;
819                         ingress = 1;
820                         if (!dev_ingress_queue(dev))
821                                 return -ENOENT;
822                 }
823                 /* Deactivate an up device for the swap; re-activated below. */
824                 if (dev->flags & IFF_UP)
825                         dev_deactivate(dev);
826                 /* With an ->attach() op the per-queue graft is skipped; attach() runs below. */
827                 if (new && new->ops->attach)
828                         goto skip;
829                 /* Install "new" on every tx queue, or on the single ingress queue. */
830                 for (i = 0; i < num_q; i++) {
831                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
832
833                         if (!ingress)
834                                 dev_queue = netdev_get_tx_queue(dev, i);
835                         /* Swap in "new"; each queue past the first takes an extra ref. */
836                         old = dev_graft_qdisc(dev_queue, new);
837                         if (new && i > 0)
838                                 atomic_inc(&new->refcnt);
839                         /* Egress: the per-queue old qdisc goes to qdisc_destroy() right away. */
840                         if (!ingress)
841                                 qdisc_destroy(old);
842                 }
843
844 skip:
845                 if (!ingress) {
846                         notify_and_destroy(net, skb, n, classid,
847                                            dev->qdisc, new);
848                         if (new && !new->ops->attach)
849                                 atomic_inc(&new->refcnt);
850                         dev->qdisc = new ? : &noop_qdisc;
851
852                         if (new && new->ops->attach)
853                                 new->ops->attach(new);
854                 } else {
855                         notify_and_destroy(net, skb, n, classid, old, new);
856                 }
857
858                 if (dev->flags & IFF_UP)
859                         dev_activate(dev);
860         } else {
861                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
862                 /* Classful parent: delegate to its graft op, if it has one. */
863                 err = -EOPNOTSUPP;
864                 if (cops && cops->graft) {
865                         unsigned long cl = cops->get(parent, classid);
866                         if (cl) {
867                                 err = cops->graft(parent, cl, new, &old);
868                                 cops->put(parent, cl);
869                         } else
870                                 err = -ENOENT;
871                 }
872                 if (!err)
873                         notify_and_destroy(net, skb, n, classid, old, new);
874         }
875         return err;
876 }
877
878 /* lockdep annotation is needed for ingress; egress gets it only for name */
879 static struct lock_class_key qdisc_tx_lock;
880 static struct lock_class_key qdisc_rx_lock;
881
882 /*
883    Allocate and initialize new qdisc.
884
885    Parameters are passed via opt.
886  */
887
888 static struct Qdisc *
889 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
890              struct Qdisc *p, u32 parent, u32 handle,
891              struct nlattr **tca, int *errp)
892 {
893         int err;
894         struct nlattr *kind = tca[TCA_KIND];
895         struct Qdisc *sch;
896         struct Qdisc_ops *ops;
897         struct qdisc_size_table *stab;
898
899         ops = qdisc_lookup_ops(kind);
900 #ifdef CONFIG_MODULES
901         if (ops == NULL && kind != NULL) {
902                 char name[IFNAMSIZ];
903                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
904                         /* We dropped the RTNL semaphore in order to
905                          * perform the module load.  So, even if we
906                          * succeeded in loading the module we have to
907                          * tell the caller to replay the request.  We
908                          * indicate this using -EAGAIN.
909                          * We replay the request because the device may
910                          * go away in the mean time.
911                          */
912                         rtnl_unlock();
913                         request_module("sch_%s", name);
914                         rtnl_lock();
915                         ops = qdisc_lookup_ops(kind);
916                         if (ops != NULL) {
917                                 /* We will try again qdisc_lookup_ops,
918                                  * so don't keep a reference.
919                                  */
920                                 module_put(ops->owner);
921                                 err = -EAGAIN;
922                                 goto err_out;
923                         }
924                 }
925         }
926 #endif
927
928         err = -ENOENT;
929         if (ops == NULL)
930                 goto err_out;
931
932         sch = qdisc_alloc(dev_queue, ops);
933         if (IS_ERR(sch)) {
934                 err = PTR_ERR(sch);
935                 goto err_out2;
936         }
937
938         sch->parent = parent;
939
940         if (handle == TC_H_INGRESS) {
941                 sch->flags |= TCQ_F_INGRESS;
942                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
943                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
944         } else {
945                 if (handle == 0) {
946                         handle = qdisc_alloc_handle(dev);
947                         err = -ENOMEM;
948                         if (handle == 0)
949                                 goto err_out3;
950                 }
951                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
952                 if (!netif_is_multiqueue(dev))
953                         sch->flags |= TCQ_F_ONETXQUEUE;
954         }
955
956         sch->handle = handle;
957
958         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
959                 if (qdisc_is_percpu_stats(sch)) {
960                         sch->cpu_bstats =
961                                 netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
962                         if (!sch->cpu_bstats)
963                                 goto err_out4;
964
965                         sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
966                         if (!sch->cpu_qstats)
967                                 goto err_out4;
968                 }
969
970                 if (tca[TCA_STAB]) {
971                         stab = qdisc_get_stab(tca[TCA_STAB]);
972                         if (IS_ERR(stab)) {
973                                 err = PTR_ERR(stab);
974                                 goto err_out4;
975                         }
976                         rcu_assign_pointer(sch->stab, stab);
977                 }
978                 if (tca[TCA_RATE]) {
979                         spinlock_t *root_lock;
980
981                         err = -EOPNOTSUPP;
982                         if (sch->flags & TCQ_F_MQROOT)
983                                 goto err_out4;
984
985                         if ((sch->parent != TC_H_ROOT) &&
986                             !(sch->flags & TCQ_F_INGRESS) &&
987                             (!p || !(p->flags & TCQ_F_MQROOT)))
988                                 root_lock = qdisc_root_sleeping_lock(sch);
989                         else
990                                 root_lock = qdisc_lock(sch);
991
992                         err = gen_new_estimator(&sch->bstats,
993                                                 sch->cpu_bstats,
994                                                 &sch->rate_est,
995                                                 root_lock,
996                                                 tca[TCA_RATE]);
997                         if (err)
998                                 goto err_out4;
999                 }
1000
1001                 qdisc_list_add(sch);
1002
1003                 return sch;
1004         }
1005 err_out3:
1006         dev_put(dev);
1007         kfree((char *) sch - sch->padded);
1008 err_out2:
1009         module_put(ops->owner);
1010 err_out:
1011         *errp = err;
1012         return NULL;
1013
1014 err_out4:
1015         free_percpu(sch->cpu_bstats);
1016         free_percpu(sch->cpu_qstats);
1017         /*
1018          * Any broken qdiscs that would require a ops->reset() here?
1019          * The qdisc was never in action so it shouldn't be necessary.
1020          */
1021         qdisc_put_stab(rtnl_dereference(sch->stab));
1022         if (ops->destroy)
1023                 ops->destroy(sch);
1024         goto err_out3;
1025 }
1026
1027 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
1028 {
1029         struct qdisc_size_table *ostab, *stab = NULL;
1030         int err = 0;
1031
1032         if (tca[TCA_OPTIONS]) {
1033                 if (sch->ops->change == NULL)
1034                         return -EINVAL;
1035                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
1036                 if (err)
1037                         return err;
1038         }
1039
1040         if (tca[TCA_STAB]) {
1041                 stab = qdisc_get_stab(tca[TCA_STAB]);
1042                 if (IS_ERR(stab))
1043                         return PTR_ERR(stab);
1044         }
1045
1046         ostab = rtnl_dereference(sch->stab);
1047         rcu_assign_pointer(sch->stab, stab);
1048         qdisc_put_stab(ostab);
1049
1050         if (tca[TCA_RATE]) {
1051                 /* NB: ignores errors from replace_estimator
1052                    because change can't be undone. */
1053                 if (sch->flags & TCQ_F_MQROOT)
1054                         goto out;
1055                 gen_replace_estimator(&sch->bstats,
1056                                       sch->cpu_bstats,
1057                                       &sch->rate_est,
1058                                       qdisc_root_sleeping_lock(sch),
1059                                       tca[TCA_RATE]);
1060         }
1061 out:
1062         return 0;
1063 }
1064
1065 struct check_loop_arg {
1066         struct qdisc_walker     w;
1067         struct Qdisc            *p;
1068         int                     depth;
1069 };
1070
1071 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1072
1073 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1074 {
1075         struct check_loop_arg   arg;
1076
1077         if (q->ops->cl_ops == NULL)
1078                 return 0;
1079
1080         arg.w.stop = arg.w.skip = arg.w.count = 0;
1081         arg.w.fn = check_loop_fn;
1082         arg.depth = depth;
1083         arg.p = p;
1084         q->ops->cl_ops->walk(q, &arg.w);
1085         return arg.w.stop ? -ELOOP : 0;
1086 }
1087
1088 static int
1089 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1090 {
1091         struct Qdisc *leaf;
1092         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1093         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1094
1095         leaf = cops->leaf(q, cl);
1096         if (leaf) {
1097                 if (leaf == arg->p || arg->depth > 7)
1098                         return -ELOOP;
1099                 return check_loop(leaf, arg->p, arg->depth + 1);
1100         }
1101         return 0;
1102 }
1103
1104 /*
1105  * Delete/get qdisc.
1106  */
1107
1108 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1109 {
1110         struct net *net = sock_net(skb->sk);
1111         struct tcmsg *tcm = nlmsg_data(n);
1112         struct nlattr *tca[TCA_MAX + 1];
1113         struct net_device *dev;
1114         u32 clid;
1115         struct Qdisc *q = NULL;
1116         struct Qdisc *p = NULL;
1117         int err;
1118
1119         if ((n->nlmsg_type != RTM_GETQDISC) &&
1120             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1121                 return -EPERM;
1122
1123         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1124         if (err < 0)
1125                 return err;
1126
1127         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1128         if (!dev)
1129                 return -ENODEV;
1130
1131         clid = tcm->tcm_parent;
1132         if (clid) {
1133                 if (clid != TC_H_ROOT) {
1134                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1135                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1136                                 if (!p)
1137                                         return -ENOENT;
1138                                 q = qdisc_leaf(p, clid);
1139                         } else if (dev_ingress_queue(dev)) {
1140                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1141                         }
1142                 } else {
1143                         q = dev->qdisc;
1144                 }
1145                 if (!q)
1146                         return -ENOENT;
1147
1148                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1149                         return -EINVAL;
1150         } else {
1151                 q = qdisc_lookup(dev, tcm->tcm_handle);
1152                 if (!q)
1153                         return -ENOENT;
1154         }
1155
1156         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1157                 return -EINVAL;
1158
1159         if (n->nlmsg_type == RTM_DELQDISC) {
1160                 if (!clid)
1161                         return -EINVAL;
1162                 if (q->handle == 0)
1163                         return -ENOENT;
1164                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1165                 if (err != 0)
1166                         return err;
1167         } else {
1168                 qdisc_notify(net, skb, n, clid, NULL, q);
1169         }
1170         return 0;
1171 }
1172
1173 /*
1174  * Create/change qdisc.
1175  */
1176
1177 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1178 {
1179         struct net *net = sock_net(skb->sk);
1180         struct tcmsg *tcm;
1181         struct nlattr *tca[TCA_MAX + 1];
1182         struct net_device *dev;
1183         u32 clid;
1184         struct Qdisc *q, *p;
1185         int err;
1186
1187         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1188                 return -EPERM;
1189
1190 replay:
1191         /* Reinit, just in case something touches this. */
1192         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1193         if (err < 0)
1194                 return err;
1195
1196         tcm = nlmsg_data(n);
1197         clid = tcm->tcm_parent;
1198         q = p = NULL;
1199
1200         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1201         if (!dev)
1202                 return -ENODEV;
1203
1204
1205         if (clid) {
1206                 if (clid != TC_H_ROOT) {
1207                         if (clid != TC_H_INGRESS) {
1208                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1209                                 if (!p)
1210                                         return -ENOENT;
1211                                 q = qdisc_leaf(p, clid);
1212                         } else if (dev_ingress_queue_create(dev)) {
1213                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1214                         }
1215                 } else {
1216                         q = dev->qdisc;
1217                 }
1218
1219                 /* It may be default qdisc, ignore it */
1220                 if (q && q->handle == 0)
1221                         q = NULL;
1222
1223                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1224                         if (tcm->tcm_handle) {
1225                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1226                                         return -EEXIST;
1227                                 if (TC_H_MIN(tcm->tcm_handle))
1228                                         return -EINVAL;
1229                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1230                                 if (!q)
1231                                         goto create_n_graft;
1232                                 if (n->nlmsg_flags & NLM_F_EXCL)
1233                                         return -EEXIST;
1234                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1235                                         return -EINVAL;
1236                                 if (q == p ||
1237                                     (p && check_loop(q, p, 0)))
1238                                         return -ELOOP;
1239                                 atomic_inc(&q->refcnt);
1240                                 goto graft;
1241                         } else {
1242                                 if (!q)
1243                                         goto create_n_graft;
1244
1245                                 /* This magic test requires explanation.
1246                                  *
1247                                  *   We know, that some child q is already
1248                                  *   attached to this parent and have choice:
1249                                  *   either to change it or to create/graft new one.
1250                                  *
1251                                  *   1. We are allowed to create/graft only
1252                                  *   if CREATE and REPLACE flags are set.
1253                                  *
1254                                  *   2. If EXCL is set, requestor wanted to say,
1255                                  *   that qdisc tcm_handle is not expected
1256                                  *   to exist, so that we choose create/graft too.
1257                                  *
1258                                  *   3. The last case is when no flags are set.
1259                                  *   Alas, it is sort of hole in API, we
1260                                  *   cannot decide what to do unambiguously.
1261                                  *   For now we select create/graft, if
1262                                  *   user gave KIND, which does not match existing.
1263                                  */
1264                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1265                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1266                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1267                                      (tca[TCA_KIND] &&
1268                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1269                                         goto create_n_graft;
1270                         }
1271                 }
1272         } else {
1273                 if (!tcm->tcm_handle)
1274                         return -EINVAL;
1275                 q = qdisc_lookup(dev, tcm->tcm_handle);
1276         }
1277
1278         /* Change qdisc parameters */
1279         if (q == NULL)
1280                 return -ENOENT;
1281         if (n->nlmsg_flags & NLM_F_EXCL)
1282                 return -EEXIST;
1283         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1284                 return -EINVAL;
1285         err = qdisc_change(q, tca);
1286         if (err == 0)
1287                 qdisc_notify(net, skb, n, clid, NULL, q);
1288         return err;
1289
1290 create_n_graft:
1291         if (!(n->nlmsg_flags & NLM_F_CREATE))
1292                 return -ENOENT;
1293         if (clid == TC_H_INGRESS) {
1294                 if (dev_ingress_queue(dev))
1295                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1296                                          tcm->tcm_parent, tcm->tcm_parent,
1297                                          tca, &err);
1298                 else
1299                         err = -ENOENT;
1300         } else {
1301                 struct netdev_queue *dev_queue;
1302
1303                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1304                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1305                 else if (p)
1306                         dev_queue = p->dev_queue;
1307                 else
1308                         dev_queue = netdev_get_tx_queue(dev, 0);
1309
1310                 q = qdisc_create(dev, dev_queue, p,
1311                                  tcm->tcm_parent, tcm->tcm_handle,
1312                                  tca, &err);
1313         }
1314         if (q == NULL) {
1315                 if (err == -EAGAIN)
1316                         goto replay;
1317                 return err;
1318         }
1319
1320 graft:
1321         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1322         if (err) {
1323                 if (q)
1324                         qdisc_destroy(q);
1325                 return err;
1326         }
1327
1328         return 0;
1329 }
1330
1331 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1332                          u32 portid, u32 seq, u16 flags, int event)
1333 {
1334         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
1335         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
1336         struct tcmsg *tcm;
1337         struct nlmsghdr  *nlh;
1338         unsigned char *b = skb_tail_pointer(skb);
1339         struct gnet_dump d;
1340         struct qdisc_size_table *stab;
1341         __u32 qlen;
1342
1343         cond_resched();
1344         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1345         if (!nlh)
1346                 goto out_nlmsg_trim;
1347         tcm = nlmsg_data(nlh);
1348         tcm->tcm_family = AF_UNSPEC;
1349         tcm->tcm__pad1 = 0;
1350         tcm->tcm__pad2 = 0;
1351         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1352         tcm->tcm_parent = clid;
1353         tcm->tcm_handle = q->handle;
1354         tcm->tcm_info = atomic_read(&q->refcnt);
1355         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1356                 goto nla_put_failure;
1357         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1358                 goto nla_put_failure;
1359         qlen = q->q.qlen;
1360
1361         stab = rtnl_dereference(q->stab);
1362         if (stab && qdisc_dump_stab(skb, stab) < 0)
1363                 goto nla_put_failure;
1364
1365         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1366                                          qdisc_root_sleeping_lock(q), &d) < 0)
1367                 goto nla_put_failure;
1368
1369         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1370                 goto nla_put_failure;
1371
1372         if (qdisc_is_percpu_stats(q)) {
1373                 cpu_bstats = q->cpu_bstats;
1374                 cpu_qstats = q->cpu_qstats;
1375         }
1376
1377         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
1378             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1379             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
1380                 goto nla_put_failure;
1381
1382         if (gnet_stats_finish_copy(&d) < 0)
1383                 goto nla_put_failure;
1384
1385         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1386         return skb->len;
1387
1388 out_nlmsg_trim:
1389 nla_put_failure:
1390         nlmsg_trim(skb, b);
1391         return -1;
1392 }
1393
1394 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1395 {
1396         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1397 }
1398
1399 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1400                         struct nlmsghdr *n, u32 clid,
1401                         struct Qdisc *old, struct Qdisc *new)
1402 {
1403         struct sk_buff *skb;
1404         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1405
1406         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1407         if (!skb)
1408                 return -ENOBUFS;
1409
1410         if (old && !tc_qdisc_dump_ignore(old)) {
1411                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1412                                   0, RTM_DELQDISC) < 0)
1413                         goto err_out;
1414         }
1415         if (new && !tc_qdisc_dump_ignore(new)) {
1416                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1417                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1418                         goto err_out;
1419         }
1420
1421         if (skb->len)
1422                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1423                                       n->nlmsg_flags & NLM_F_ECHO);
1424
1425 err_out:
1426         kfree_skb(skb);
1427         return -EINVAL;
1428 }
1429
1430 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1431                               struct netlink_callback *cb,
1432                               int *q_idx_p, int s_q_idx)
1433 {
1434         int ret = 0, q_idx = *q_idx_p;
1435         struct Qdisc *q;
1436
1437         if (!root)
1438                 return 0;
1439
1440         q = root;
1441         if (q_idx < s_q_idx) {
1442                 q_idx++;
1443         } else {
1444                 if (!tc_qdisc_dump_ignore(q) &&
1445                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1446                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1447                         goto done;
1448                 q_idx++;
1449         }
1450         list_for_each_entry(q, &root->list, list) {
1451                 if (q_idx < s_q_idx) {
1452                         q_idx++;
1453                         continue;
1454                 }
1455                 if (!tc_qdisc_dump_ignore(q) &&
1456                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1457                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1458                         goto done;
1459                 q_idx++;
1460         }
1461
1462 out:
1463         *q_idx_p = q_idx;
1464         return ret;
1465 done:
1466         ret = -1;
1467         goto out;
1468 }
1469
1470 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1471 {
1472         struct net *net = sock_net(skb->sk);
1473         int idx, q_idx;
1474         int s_idx, s_q_idx;
1475         struct net_device *dev;
1476
1477         s_idx = cb->args[0];
1478         s_q_idx = q_idx = cb->args[1];
1479
1480         idx = 0;
1481         ASSERT_RTNL();
1482         for_each_netdev(net, dev) {
1483                 struct netdev_queue *dev_queue;
1484
1485                 if (idx < s_idx)
1486                         goto cont;
1487                 if (idx > s_idx)
1488                         s_q_idx = 0;
1489                 q_idx = 0;
1490
1491                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1492                         goto done;
1493
1494                 dev_queue = dev_ingress_queue(dev);
1495                 if (dev_queue &&
1496                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1497                                        &q_idx, s_q_idx) < 0)
1498                         goto done;
1499
1500 cont:
1501                 idx++;
1502         }
1503
1504 done:
1505         cb->args[0] = idx;
1506         cb->args[1] = q_idx;
1507
1508         return skb->len;
1509 }
1510
1511
1512
1513 /************************************************
1514  *      Traffic classes manipulation.           *
1515  ************************************************/
1516
1517
1518
1519 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1520 {
1521         struct net *net = sock_net(skb->sk);
1522         struct tcmsg *tcm = nlmsg_data(n);
1523         struct nlattr *tca[TCA_MAX + 1];
1524         struct net_device *dev;
1525         struct Qdisc *q = NULL;
1526         const struct Qdisc_class_ops *cops;
1527         unsigned long cl = 0;
1528         unsigned long new_cl;
1529         u32 portid;
1530         u32 clid;
1531         u32 qid;
1532         int err;
1533
1534         if ((n->nlmsg_type != RTM_GETTCLASS) &&
1535             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1536                 return -EPERM;
1537
1538         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1539         if (err < 0)
1540                 return err;
1541
1542         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1543         if (!dev)
1544                 return -ENODEV;
1545
1546         /*
1547            parent == TC_H_UNSPEC - unspecified parent.
1548            parent == TC_H_ROOT   - class is root, which has no parent.
1549            parent == X:0         - parent is root class.
1550            parent == X:Y         - parent is a node in hierarchy.
1551            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1552
1553            handle == 0:0         - generate handle from kernel pool.
1554            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1555            handle == X:Y         - clear.
1556            handle == X:0         - root class.
1557          */
1558
1559         /* Step 1. Determine qdisc handle X:0 */
1560
1561         portid = tcm->tcm_parent;
1562         clid = tcm->tcm_handle;
1563         qid = TC_H_MAJ(clid);
1564
1565         if (portid != TC_H_ROOT) {
1566                 u32 qid1 = TC_H_MAJ(portid);
1567
1568                 if (qid && qid1) {
1569                         /* If both majors are known, they must be identical. */
1570                         if (qid != qid1)
1571                                 return -EINVAL;
1572                 } else if (qid1) {
1573                         qid = qid1;
1574                 } else if (qid == 0)
1575                         qid = dev->qdisc->handle;
1576
1577                 /* Now qid is genuine qdisc handle consistent
1578                  * both with parent and child.
1579                  *
1580                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1581                  */
1582                 if (portid)
1583                         portid = TC_H_MAKE(qid, portid);
1584         } else {
1585                 if (qid == 0)
1586                         qid = dev->qdisc->handle;
1587         }
1588
1589         /* OK. Locate qdisc */
1590         q = qdisc_lookup(dev, qid);
1591         if (!q)
1592                 return -ENOENT;
1593
1594         /* An check that it supports classes */
1595         cops = q->ops->cl_ops;
1596         if (cops == NULL)
1597                 return -EINVAL;
1598
1599         /* Now try to get class */
1600         if (clid == 0) {
1601                 if (portid == TC_H_ROOT)
1602                         clid = qid;
1603         } else
1604                 clid = TC_H_MAKE(qid, clid);
1605
1606         if (clid)
1607                 cl = cops->get(q, clid);
1608
1609         if (cl == 0) {
1610                 err = -ENOENT;
1611                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1612                     !(n->nlmsg_flags & NLM_F_CREATE))
1613                         goto out;
1614         } else {
1615                 switch (n->nlmsg_type) {
1616                 case RTM_NEWTCLASS:
1617                         err = -EEXIST;
1618                         if (n->nlmsg_flags & NLM_F_EXCL)
1619                                 goto out;
1620                         break;
1621                 case RTM_DELTCLASS:
1622                         err = -EOPNOTSUPP;
1623                         if (cops->delete)
1624                                 err = cops->delete(q, cl);
1625                         if (err == 0)
1626                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1627                         goto out;
1628                 case RTM_GETTCLASS:
1629                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1630                         goto out;
1631                 default:
1632                         err = -EINVAL;
1633                         goto out;
1634                 }
1635         }
1636
1637         new_cl = cl;
1638         err = -EOPNOTSUPP;
1639         if (cops->change)
1640                 err = cops->change(q, clid, portid, tca, &new_cl);
1641         if (err == 0)
1642                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1643
1644 out:
1645         if (cl)
1646                 cops->put(q, cl);
1647
1648         return err;
1649 }
1650
1651
1652 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1653                           unsigned long cl,
1654                           u32 portid, u32 seq, u16 flags, int event)
1655 {
1656         struct tcmsg *tcm;
1657         struct nlmsghdr  *nlh;
1658         unsigned char *b = skb_tail_pointer(skb);
1659         struct gnet_dump d;
1660         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1661
1662         cond_resched();
1663         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1664         if (!nlh)
1665                 goto out_nlmsg_trim;
1666         tcm = nlmsg_data(nlh);
1667         tcm->tcm_family = AF_UNSPEC;
1668         tcm->tcm__pad1 = 0;
1669         tcm->tcm__pad2 = 0;
1670         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1671         tcm->tcm_parent = q->handle;
1672         tcm->tcm_handle = q->handle;
1673         tcm->tcm_info = 0;
1674         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1675                 goto nla_put_failure;
1676         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1677                 goto nla_put_failure;
1678
1679         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1680                                          qdisc_root_sleeping_lock(q), &d) < 0)
1681                 goto nla_put_failure;
1682
1683         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1684                 goto nla_put_failure;
1685
1686         if (gnet_stats_finish_copy(&d) < 0)
1687                 goto nla_put_failure;
1688
1689         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1690         return skb->len;
1691
1692 out_nlmsg_trim:
1693 nla_put_failure:
1694         nlmsg_trim(skb, b);
1695         return -1;
1696 }
1697
1698 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1699                          struct nlmsghdr *n, struct Qdisc *q,
1700                          unsigned long cl, int event)
1701 {
1702         struct sk_buff *skb;
1703         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1704
1705         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1706         if (!skb)
1707                 return -ENOBUFS;
1708
1709         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1710                 kfree_skb(skb);
1711                 return -EINVAL;
1712         }
1713
1714         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1715                               n->nlmsg_flags & NLM_F_ECHO);
1716 }
1717
1718 struct qdisc_dump_args {
1719         struct qdisc_walker     w;
1720         struct sk_buff          *skb;
1721         struct netlink_callback *cb;
1722 };
1723
1724 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1725 {
1726         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1727
1728         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1729                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1730 }
1731
1732 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1733                                 struct tcmsg *tcm, struct netlink_callback *cb,
1734                                 int *t_p, int s_t)
1735 {
1736         struct qdisc_dump_args arg;
1737
1738         if (tc_qdisc_dump_ignore(q) ||
1739             *t_p < s_t || !q->ops->cl_ops ||
1740             (tcm->tcm_parent &&
1741              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1742                 (*t_p)++;
1743                 return 0;
1744         }
1745         if (*t_p > s_t)
1746                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1747         arg.w.fn = qdisc_class_dump;
1748         arg.skb = skb;
1749         arg.cb = cb;
1750         arg.w.stop  = 0;
1751         arg.w.skip = cb->args[1];
1752         arg.w.count = 0;
1753         q->ops->cl_ops->walk(q, &arg.w);
1754         cb->args[1] = arg.w.count;
1755         if (arg.w.stop)
1756                 return -1;
1757         (*t_p)++;
1758         return 0;
1759 }
1760
1761 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1762                                struct tcmsg *tcm, struct netlink_callback *cb,
1763                                int *t_p, int s_t)
1764 {
1765         struct Qdisc *q;
1766
1767         if (!root)
1768                 return 0;
1769
1770         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1771                 return -1;
1772
1773         list_for_each_entry(q, &root->list, list) {
1774                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1775                         return -1;
1776         }
1777
1778         return 0;
1779 }
1780
1781 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1782 {
1783         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1784         struct net *net = sock_net(skb->sk);
1785         struct netdev_queue *dev_queue;
1786         struct net_device *dev;
1787         int t, s_t;
1788
1789         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1790                 return 0;
1791         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1792         if (!dev)
1793                 return 0;
1794
1795         s_t = cb->args[0];
1796         t = 0;
1797
1798         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1799                 goto done;
1800
1801         dev_queue = dev_ingress_queue(dev);
1802         if (dev_queue &&
1803             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1804                                 &t, s_t) < 0)
1805                 goto done;
1806
1807 done:
1808         cb->args[0] = t;
1809
1810         dev_put(dev);
1811         return skb->len;
1812 }
1813
1814 /* Main classifier routine: scans classifier chain attached
1815  * to this qdisc, (optionally) tests for protocol and asks
1816  * specific classifiers.
1817  */
1818 int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
1819                 struct tcf_result *res, bool compat_mode)
1820 {
1821         __be16 protocol = tc_skb_protocol(skb);
1822 #ifdef CONFIG_NET_CLS_ACT
1823         const struct tcf_proto *old_tp = tp;
1824         int limit = 0;
1825
1826 reclassify:
1827 #endif
1828         for (; tp; tp = rcu_dereference_bh(tp->next)) {
1829                 int err;
1830
1831                 if (tp->protocol != protocol &&
1832                     tp->protocol != htons(ETH_P_ALL))
1833                         continue;
1834
1835                 err = tp->classify(skb, tp, res);
1836 #ifdef CONFIG_NET_CLS_ACT
1837                 if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode))
1838                         goto reset;
1839 #endif
1840                 if (err >= 0)
1841                         return err;
1842         }
1843
1844         return -1;
1845 #ifdef CONFIG_NET_CLS_ACT
1846 reset:
1847         if (unlikely(limit++ >= MAX_REC_LOOP)) {
1848                 net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
1849                                        tp->q->ops->id, tp->prio & 0xffff,
1850                                        ntohs(tp->protocol));
1851                 return TC_ACT_SHOT;
1852         }
1853
1854         tp = old_tp;
1855         protocol = tc_skb_protocol(skb);
1856         goto reclassify;
1857 #endif
1858 }
1859 EXPORT_SYMBOL(tc_classify);
1860
1861 bool tcf_destroy(struct tcf_proto *tp, bool force)
1862 {
1863         if (tp->ops->destroy(tp, force)) {
1864                 module_put(tp->ops->owner);
1865                 kfree_rcu(tp, rcu);
1866                 return true;
1867         }
1868
1869         return false;
1870 }
1871
1872 void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1873 {
1874         struct tcf_proto *tp;
1875
1876         while ((tp = rtnl_dereference(*fl)) != NULL) {
1877                 RCU_INIT_POINTER(*fl, tp->next);
1878                 tcf_destroy(tp, true);
1879         }
1880 }
1881 EXPORT_SYMBOL(tcf_destroy_chain);
1882
1883 #ifdef CONFIG_PROC_FS
1884 static int psched_show(struct seq_file *seq, void *v)
1885 {
1886         seq_printf(seq, "%08x %08x %08x %08x\n",
1887                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1888                    1000000,
1889                    (u32)NSEC_PER_SEC / hrtimer_resolution);
1890
1891         return 0;
1892 }
1893
1894 static int psched_open(struct inode *inode, struct file *file)
1895 {
1896         return single_open(file, psched_show, NULL);
1897 }
1898
1899 static const struct file_operations psched_fops = {
1900         .owner = THIS_MODULE,
1901         .open = psched_open,
1902         .read  = seq_read,
1903         .llseek = seq_lseek,
1904         .release = single_release,
1905 };
1906
1907 static int __net_init psched_net_init(struct net *net)
1908 {
1909         struct proc_dir_entry *e;
1910
1911         e = proc_create("psched", 0, net->proc_net, &psched_fops);
1912         if (e == NULL)
1913                 return -ENOMEM;
1914
1915         return 0;
1916 }
1917
1918 static void __net_exit psched_net_exit(struct net *net)
1919 {
1920         remove_proc_entry("psched", net->proc_net);
1921 }
1922 #else
1923 static int __net_init psched_net_init(struct net *net)
1924 {
1925         return 0;
1926 }
1927
1928 static void __net_exit psched_net_exit(struct net *net)
1929 {
1930 }
1931 #endif
1932
1933 static struct pernet_operations psched_net_ops = {
1934         .init = psched_net_init,
1935         .exit = psched_net_exit,
1936 };
1937
1938 static int __init pktsched_init(void)
1939 {
1940         int err;
1941
1942         err = register_pernet_subsys(&psched_net_ops);
1943         if (err) {
1944                 pr_err("pktsched_init: "
1945                        "cannot initialize per netns operations\n");
1946                 return err;
1947         }
1948
1949         register_qdisc(&pfifo_fast_ops);
1950         register_qdisc(&pfifo_qdisc_ops);
1951         register_qdisc(&bfifo_qdisc_ops);
1952         register_qdisc(&pfifo_head_drop_qdisc_ops);
1953         register_qdisc(&mq_qdisc_ops);
1954         register_qdisc(&noqueue_qdisc_ops);
1955
1956         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1957         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1958         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1959         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1960         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1961         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1962
1963         return 0;
1964 }
1965
1966 subsys_initcall(pktsched_init);