Add the rt linux 4.1.3-rt3 as base
[kvmfornfv.git] / kernel / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
/* Forward declarations of the rtnetlink notification helpers.
 * qdisc_notify() is called from notify_and_destroy() below, ahead of its
 * definition; both helpers are static, so they are defined later in this
 * file.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
                        struct nlmsghdr *n, u32 clid,
                        struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
                         struct nlmsghdr *n, struct Qdisc *q,
                         unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
   Qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67
   The goal of the routines in this file is to translate
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the part of the work that is common to all
   qdiscs, and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
/* Protects the list of registered TC modules (qdisc_base) and the
 * default_qdisc_ops pointer. It is a pure SMP lock: register/unregister
 * and qdisc_set_default() take it for writing, the lookup and
 * qdisc_get_default() paths take it for reading.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
/* The list of all installed queueing disciplines: a singly linked list
 * chained through Qdisc_ops::next. register_qdisc() appends at the tail.
 */

static struct Qdisc_ops *qdisc_base;
137
138 /* Register/unregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* Get default qdisc if not otherwise specified */
204 void qdisc_get_default(char *name, size_t len)
205 {
206         read_lock(&qdisc_mod_lock);
207         strlcpy(name, default_qdisc_ops->id, len);
208         read_unlock(&qdisc_mod_lock);
209 }
210
211 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
212 {
213         struct Qdisc_ops *q = NULL;
214
215         for (q = qdisc_base; q; q = q->next) {
216                 if (!strcmp(name, q->id)) {
217                         if (!try_module_get(q->owner))
218                                 q = NULL;
219                         break;
220                 }
221         }
222
223         return q;
224 }
225
226 /* Set new default qdisc to use */
227 int qdisc_set_default(const char *name)
228 {
229         const struct Qdisc_ops *ops;
230
231         if (!capable(CAP_NET_ADMIN))
232                 return -EPERM;
233
234         write_lock(&qdisc_mod_lock);
235         ops = qdisc_lookup_default(name);
236         if (!ops) {
237                 /* Not found, drop lock and try to load module */
238                 write_unlock(&qdisc_mod_lock);
239                 request_module("sch_%s", name);
240                 write_lock(&qdisc_mod_lock);
241
242                 ops = qdisc_lookup_default(name);
243         }
244
245         if (ops) {
246                 /* Set new default */
247                 module_put(default_qdisc_ops->owner);
248                 default_qdisc_ops = ops;
249         }
250         write_unlock(&qdisc_mod_lock);
251
252         return ops ? 0 : -ENOENT;
253 }
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256    (root qdisc, all its children, children of children etc.)
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!(root->flags & TCQ_F_BUILTIN) &&
264             root->handle == handle)
265                 return root;
266
267         list_for_each_entry(q, &root->list, list) {
268                 if (q->handle == handle)
269                         return q;
270         }
271         return NULL;
272 }
273
274 void qdisc_list_add(struct Qdisc *q)
275 {
276         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
277                 struct Qdisc *root = qdisc_dev(q)->qdisc;
278
279                 WARN_ON_ONCE(root == &noop_qdisc);
280                 list_add_tail(&q->list, &root->list);
281         }
282 }
283 EXPORT_SYMBOL(qdisc_list_add);
284
285 void qdisc_list_del(struct Qdisc *q)
286 {
287         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
288                 list_del(&q->list);
289 }
290 EXPORT_SYMBOL(qdisc_list_del);
291
292 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
293 {
294         struct Qdisc *q;
295
296         q = qdisc_match_from_root(dev->qdisc, handle);
297         if (q)
298                 goto out;
299
300         if (dev_ingress_queue(dev))
301                 q = qdisc_match_from_root(
302                         dev_ingress_queue(dev)->qdisc_sleeping,
303                         handle);
304 out:
305         return q;
306 }
307
308 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
309 {
310         unsigned long cl;
311         struct Qdisc *leaf;
312         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
313
314         if (cops == NULL)
315                 return NULL;
316         cl = cops->get(p, classid);
317
318         if (cl == 0)
319                 return NULL;
320         leaf = cops->leaf(p, cl);
321         cops->put(p, cl);
322         return leaf;
323 }
324
325 /* Find queueing discipline by name */
326
327 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
328 {
329         struct Qdisc_ops *q = NULL;
330
331         if (kind) {
332                 read_lock(&qdisc_mod_lock);
333                 for (q = qdisc_base; q; q = q->next) {
334                         if (nla_strcmp(kind, q->id) == 0) {
335                                 if (!try_module_get(q->owner))
336                                         q = NULL;
337                                 break;
338                         }
339                 }
340                 read_unlock(&qdisc_mod_lock);
341         }
342         return q;
343 }
344
345 /* The linklayer setting were not transferred from iproute2, in older
346  * versions, and the rate tables lookup systems have been dropped in
347  * the kernel. To keep backward compatible with older iproute2 tc
348  * utils, we detect the linklayer setting by detecting if the rate
349  * table were modified.
350  *
351  * For linklayer ATM table entries, the rate table will be aligned to
352  * 48 bytes, thus some table entries will contain the same value.  The
353  * mpu (min packet unit) is also encoded into the old rate table, thus
354  * starting from the mpu, we find low and high table entries for
355  * mapping this cell.  If these entries contain the same value, when
356  * the rate tables have been modified for linklayer ATM.
357  *
358  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
359  * and then roundup to the next cell, calc the table entry one below,
360  * and compare.
361  */
362 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
363 {
364         int low       = roundup(r->mpu, 48);
365         int high      = roundup(low+1, 48);
366         int cell_low  = low >> r->cell_log;
367         int cell_high = (high >> r->cell_log) - 1;
368
369         /* rtab is too inaccurate at rates > 100Mbit/s */
370         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
371                 pr_debug("TC linklayer: Giving up ATM detection\n");
372                 return TC_LINKLAYER_ETHERNET;
373         }
374
375         if ((cell_high > cell_low) && (cell_high < 256)
376             && (rtab[cell_low] == rtab[cell_high])) {
377                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
378                          cell_low, cell_high, rtab[cell_high]);
379                 return TC_LINKLAYER_ATM;
380         }
381         return TC_LINKLAYER_ETHERNET;
382 }
383
384 static struct qdisc_rate_table *qdisc_rtab_list;
385
386 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
387 {
388         struct qdisc_rate_table *rtab;
389
390         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
391             nla_len(tab) != TC_RTAB_SIZE)
392                 return NULL;
393
394         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
395                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
396                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
397                         rtab->refcnt++;
398                         return rtab;
399                 }
400         }
401
402         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
403         if (rtab) {
404                 rtab->rate = *r;
405                 rtab->refcnt = 1;
406                 memcpy(rtab->data, nla_data(tab), 1024);
407                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
408                         r->linklayer = __detect_linklayer(r, rtab->data);
409                 rtab->next = qdisc_rtab_list;
410                 qdisc_rtab_list = rtab;
411         }
412         return rtab;
413 }
414 EXPORT_SYMBOL(qdisc_get_rtab);
415
416 void qdisc_put_rtab(struct qdisc_rate_table *tab)
417 {
418         struct qdisc_rate_table *rtab, **rtabp;
419
420         if (!tab || --tab->refcnt)
421                 return;
422
423         for (rtabp = &qdisc_rtab_list;
424              (rtab = *rtabp) != NULL;
425              rtabp = &rtab->next) {
426                 if (rtab == tab) {
427                         *rtabp = rtab->next;
428                         kfree(rtab);
429                         return;
430                 }
431         }
432 }
433 EXPORT_SYMBOL(qdisc_put_rtab);
434
/* All size tables currently in use.  Entries are shared and reference
 * counted by qdisc_get_stab() / qdisc_put_stab().
 */
static LIST_HEAD(qdisc_stab_list);
/* Held around every traversal and modification of qdisc_stab_list and
 * around changes to the refcnt of its entries.
 */
static DEFINE_SPINLOCK(qdisc_stab_lock);

/* Netlink policy for the attributes nested inside TCA_STAB. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
        [TCA_STAB_DATA] = { .type = NLA_BINARY },
};
442
443 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
444 {
445         struct nlattr *tb[TCA_STAB_MAX + 1];
446         struct qdisc_size_table *stab;
447         struct tc_sizespec *s;
448         unsigned int tsize = 0;
449         u16 *tab = NULL;
450         int err;
451
452         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
453         if (err < 0)
454                 return ERR_PTR(err);
455         if (!tb[TCA_STAB_BASE])
456                 return ERR_PTR(-EINVAL);
457
458         s = nla_data(tb[TCA_STAB_BASE]);
459
460         if (s->tsize > 0) {
461                 if (!tb[TCA_STAB_DATA])
462                         return ERR_PTR(-EINVAL);
463                 tab = nla_data(tb[TCA_STAB_DATA]);
464                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
465         }
466
467         if (tsize != s->tsize || (!tab && tsize > 0))
468                 return ERR_PTR(-EINVAL);
469
470         spin_lock(&qdisc_stab_lock);
471
472         list_for_each_entry(stab, &qdisc_stab_list, list) {
473                 if (memcmp(&stab->szopts, s, sizeof(*s)))
474                         continue;
475                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
476                         continue;
477                 stab->refcnt++;
478                 spin_unlock(&qdisc_stab_lock);
479                 return stab;
480         }
481
482         spin_unlock(&qdisc_stab_lock);
483
484         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
485         if (!stab)
486                 return ERR_PTR(-ENOMEM);
487
488         stab->refcnt = 1;
489         stab->szopts = *s;
490         if (tsize > 0)
491                 memcpy(stab->data, tab, tsize * sizeof(u16));
492
493         spin_lock(&qdisc_stab_lock);
494         list_add_tail(&stab->list, &qdisc_stab_list);
495         spin_unlock(&qdisc_stab_lock);
496
497         return stab;
498 }
499
500 static void stab_kfree_rcu(struct rcu_head *head)
501 {
502         kfree(container_of(head, struct qdisc_size_table, rcu));
503 }
504
505 void qdisc_put_stab(struct qdisc_size_table *tab)
506 {
507         if (!tab)
508                 return;
509
510         spin_lock(&qdisc_stab_lock);
511
512         if (--tab->refcnt == 0) {
513                 list_del(&tab->list);
514                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
515         }
516
517         spin_unlock(&qdisc_stab_lock);
518 }
519 EXPORT_SYMBOL(qdisc_put_stab);
520
521 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
522 {
523         struct nlattr *nest;
524
525         nest = nla_nest_start(skb, TCA_STAB);
526         if (nest == NULL)
527                 goto nla_put_failure;
528         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
529                 goto nla_put_failure;
530         nla_nest_end(skb, nest);
531
532         return skb->len;
533
534 nla_put_failure:
535         return -1;
536 }
537
538 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
539 {
540         int pkt_len, slot;
541
542         pkt_len = skb->len + stab->szopts.overhead;
543         if (unlikely(!stab->szopts.tsize))
544                 goto out;
545
546         slot = pkt_len + stab->szopts.cell_align;
547         if (unlikely(slot < 0))
548                 slot = 0;
549
550         slot >>= stab->szopts.cell_log;
551         if (likely(slot < stab->szopts.tsize))
552                 pkt_len = stab->data[slot];
553         else
554                 pkt_len = stab->data[stab->szopts.tsize - 1] *
555                                 (slot / stab->szopts.tsize) +
556                                 stab->data[slot % stab->szopts.tsize];
557
558         pkt_len <<= stab->szopts.size_log;
559 out:
560         if (unlikely(pkt_len < 1))
561                 pkt_len = 1;
562         qdisc_skb_cb(skb)->pkt_len = pkt_len;
563 }
564 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
565
566 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
567 {
568         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
569                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
570                         txt, qdisc->ops->id, qdisc->handle >> 16);
571                 qdisc->flags |= TCQ_F_WARN_NONWC;
572         }
573 }
574 EXPORT_SYMBOL(qdisc_warn_nonwc);
575
576 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
577 {
578         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
579                                                  timer);
580
581         rcu_read_lock();
582         qdisc_unthrottled(wd->qdisc);
583         __netif_schedule(qdisc_root(wd->qdisc));
584         rcu_read_unlock();
585
586         return HRTIMER_NORESTART;
587 }
588
589 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
590 {
591         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
592         wd->timer.function = qdisc_watchdog;
593         wd->qdisc = qdisc;
594 }
595 EXPORT_SYMBOL(qdisc_watchdog_init);
596
597 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle)
598 {
599         if (test_bit(__QDISC_STATE_DEACTIVATED,
600                      &qdisc_root_sleeping(wd->qdisc)->state))
601                 return;
602
603         if (throttle)
604                 qdisc_throttled(wd->qdisc);
605
606         hrtimer_start(&wd->timer,
607                       ns_to_ktime(expires),
608                       HRTIMER_MODE_ABS_PINNED);
609 }
610 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
611
612 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
613 {
614         hrtimer_cancel(&wd->timer);
615         qdisc_unthrottled(wd->qdisc);
616 }
617 EXPORT_SYMBOL(qdisc_watchdog_cancel);
618
619 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
620 {
621         unsigned int size = n * sizeof(struct hlist_head), i;
622         struct hlist_head *h;
623
624         if (size <= PAGE_SIZE)
625                 h = kmalloc(size, GFP_KERNEL);
626         else
627                 h = (struct hlist_head *)
628                         __get_free_pages(GFP_KERNEL, get_order(size));
629
630         if (h != NULL) {
631                 for (i = 0; i < n; i++)
632                         INIT_HLIST_HEAD(&h[i]);
633         }
634         return h;
635 }
636
637 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
638 {
639         unsigned int size = n * sizeof(struct hlist_head);
640
641         if (size <= PAGE_SIZE)
642                 kfree(h);
643         else
644                 free_pages((unsigned long)h, get_order(size));
645 }
646
647 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
648 {
649         struct Qdisc_class_common *cl;
650         struct hlist_node *next;
651         struct hlist_head *nhash, *ohash;
652         unsigned int nsize, nmask, osize;
653         unsigned int i, h;
654
655         /* Rehash when load factor exceeds 0.75 */
656         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
657                 return;
658         nsize = clhash->hashsize * 2;
659         nmask = nsize - 1;
660         nhash = qdisc_class_hash_alloc(nsize);
661         if (nhash == NULL)
662                 return;
663
664         ohash = clhash->hash;
665         osize = clhash->hashsize;
666
667         sch_tree_lock(sch);
668         for (i = 0; i < osize; i++) {
669                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
670                         h = qdisc_class_hash(cl->classid, nmask);
671                         hlist_add_head(&cl->hnode, &nhash[h]);
672                 }
673         }
674         clhash->hash     = nhash;
675         clhash->hashsize = nsize;
676         clhash->hashmask = nmask;
677         sch_tree_unlock(sch);
678
679         qdisc_class_hash_free(ohash, osize);
680 }
681 EXPORT_SYMBOL(qdisc_class_hash_grow);
682
683 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
684 {
685         unsigned int size = 4;
686
687         clhash->hash = qdisc_class_hash_alloc(size);
688         if (clhash->hash == NULL)
689                 return -ENOMEM;
690         clhash->hashsize  = size;
691         clhash->hashmask  = size - 1;
692         clhash->hashelems = 0;
693         return 0;
694 }
695 EXPORT_SYMBOL(qdisc_class_hash_init);
696
697 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
698 {
699         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
700 }
701 EXPORT_SYMBOL(qdisc_class_hash_destroy);
702
703 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
704                              struct Qdisc_class_common *cl)
705 {
706         unsigned int h;
707
708         INIT_HLIST_NODE(&cl->hnode);
709         h = qdisc_class_hash(cl->classid, clhash->hashmask);
710         hlist_add_head(&cl->hnode, &clhash->hash[h]);
711         clhash->hashelems++;
712 }
713 EXPORT_SYMBOL(qdisc_class_hash_insert);
714
715 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
716                              struct Qdisc_class_common *cl)
717 {
718         hlist_del(&cl->hnode);
719         clhash->hashelems--;
720 }
721 EXPORT_SYMBOL(qdisc_class_hash_remove);
722
723 /* Allocate an unique handle from space managed by kernel
724  * Possible range is [8000-FFFF]:0000 (0x8000 values)
725  */
726 static u32 qdisc_alloc_handle(struct net_device *dev)
727 {
728         int i = 0x8000;
729         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
730
731         do {
732                 autohandle += TC_H_MAKE(0x10000U, 0);
733                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
734                         autohandle = TC_H_MAKE(0x80000000U, 0);
735                 if (!qdisc_lookup(dev, autohandle))
736                         return autohandle;
737                 cond_resched();
738         } while (--i > 0);
739
740         return 0;
741 }
742
743 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
744 {
745         const struct Qdisc_class_ops *cops;
746         unsigned long cl;
747         u32 parentid;
748         int drops;
749
750         if (n == 0)
751                 return;
752         drops = max_t(int, n, 0);
753         while ((parentid = sch->parent)) {
754                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
755                         return;
756
757                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
758                 if (sch == NULL) {
759                         WARN_ON(parentid != TC_H_ROOT);
760                         return;
761                 }
762                 cops = sch->ops->cl_ops;
763                 if (cops->qlen_notify) {
764                         cl = cops->get(sch, parentid);
765                         cops->qlen_notify(sch, cl);
766                         cops->put(sch, cl);
767                 }
768                 sch->q.qlen -= n;
769                 __qdisc_qstats_drop(sch, drops);
770         }
771 }
772 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
773
774 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
775                                struct nlmsghdr *n, u32 clid,
776                                struct Qdisc *old, struct Qdisc *new)
777 {
778         if (new || old)
779                 qdisc_notify(net, skb, n, clid, old, new);
780
781         if (old)
782                 qdisc_destroy(old);
783 }
784
785 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
786  * to device "dev".
787  *
788  * When appropriate send a netlink notification using 'skb'
789  * and "n".
790  *
791  * On success, destroy old qdisc.
792  */
793
794 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
795                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
796                        struct Qdisc *new, struct Qdisc *old)
797 {
798         struct Qdisc *q = old;
799         struct net *net = dev_net(dev);
800         int err = 0;
801
802         if (parent == NULL) {
803                 unsigned int i, num_q, ingress;
804
805                 ingress = 0;
806                 num_q = dev->num_tx_queues;
807                 if ((q && q->flags & TCQ_F_INGRESS) ||
808                     (new && new->flags & TCQ_F_INGRESS)) {
809                         num_q = 1;
810                         ingress = 1;
811                         if (!dev_ingress_queue(dev))
812                                 return -ENOENT;
813                 }
814
815                 if (dev->flags & IFF_UP)
816                         dev_deactivate(dev);
817
818                 if (new && new->ops->attach)
819                         goto skip;
820
821                 for (i = 0; i < num_q; i++) {
822                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
823
824                         if (!ingress)
825                                 dev_queue = netdev_get_tx_queue(dev, i);
826
827                         old = dev_graft_qdisc(dev_queue, new);
828                         if (new && i > 0)
829                                 atomic_inc(&new->refcnt);
830
831                         if (!ingress)
832                                 qdisc_destroy(old);
833                 }
834
835 skip:
836                 if (!ingress) {
837                         notify_and_destroy(net, skb, n, classid,
838                                            dev->qdisc, new);
839                         if (new && !new->ops->attach)
840                                 atomic_inc(&new->refcnt);
841                         dev->qdisc = new ? : &noop_qdisc;
842
843                         if (new && new->ops->attach)
844                                 new->ops->attach(new);
845                 } else {
846                         notify_and_destroy(net, skb, n, classid, old, new);
847                 }
848
849                 if (dev->flags & IFF_UP)
850                         dev_activate(dev);
851         } else {
852                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
853
854                 err = -EOPNOTSUPP;
855                 if (cops && cops->graft) {
856                         unsigned long cl = cops->get(parent, classid);
857                         if (cl) {
858                                 err = cops->graft(parent, cl, new, &old);
859                                 cops->put(parent, cl);
860                         } else
861                                 err = -ENOENT;
862                 }
863                 if (!err)
864                         notify_and_destroy(net, skb, n, classid, old, new);
865         }
866         return err;
867 }
868
/* lockdep annotation is needed for ingress; egress gets it only for name.
 * qdisc_create() assigns qdisc_rx_lock to the lock of ingress qdiscs and
 * qdisc_tx_lock to the lock of all other qdiscs.
 */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;
872
873 /*
874    Allocate and initialize new qdisc.
875
876    Parameters are passed via opt.
877  */
878
879 static struct Qdisc *
880 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
881              struct Qdisc *p, u32 parent, u32 handle,
882              struct nlattr **tca, int *errp)
883 {
884         int err;
885         struct nlattr *kind = tca[TCA_KIND];
886         struct Qdisc *sch;
887         struct Qdisc_ops *ops;
888         struct qdisc_size_table *stab;
889
890         ops = qdisc_lookup_ops(kind);
891 #ifdef CONFIG_MODULES
892         if (ops == NULL && kind != NULL) {
893                 char name[IFNAMSIZ];
894                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
895                         /* We dropped the RTNL semaphore in order to
896                          * perform the module load.  So, even if we
897                          * succeeded in loading the module we have to
898                          * tell the caller to replay the request.  We
899                          * indicate this using -EAGAIN.
900                          * We replay the request because the device may
901                          * go away in the mean time.
902                          */
903                         rtnl_unlock();
904                         request_module("sch_%s", name);
905                         rtnl_lock();
906                         ops = qdisc_lookup_ops(kind);
907                         if (ops != NULL) {
908                                 /* We will try again qdisc_lookup_ops,
909                                  * so don't keep a reference.
910                                  */
911                                 module_put(ops->owner);
912                                 err = -EAGAIN;
913                                 goto err_out;
914                         }
915                 }
916         }
917 #endif
918
919         err = -ENOENT;
920         if (ops == NULL)
921                 goto err_out;
922
923         sch = qdisc_alloc(dev_queue, ops);
924         if (IS_ERR(sch)) {
925                 err = PTR_ERR(sch);
926                 goto err_out2;
927         }
928
929         sch->parent = parent;
930
931         if (handle == TC_H_INGRESS) {
932                 sch->flags |= TCQ_F_INGRESS;
933                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
934                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
935         } else {
936                 if (handle == 0) {
937                         handle = qdisc_alloc_handle(dev);
938                         err = -ENOMEM;
939                         if (handle == 0)
940                                 goto err_out3;
941                 }
942                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
943                 if (!netif_is_multiqueue(dev))
944                         sch->flags |= TCQ_F_ONETXQUEUE;
945         }
946
947         sch->handle = handle;
948
949         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
950                 if (qdisc_is_percpu_stats(sch)) {
951                         sch->cpu_bstats =
952                                 netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
953                         if (!sch->cpu_bstats)
954                                 goto err_out4;
955
956                         sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
957                         if (!sch->cpu_qstats)
958                                 goto err_out4;
959                 }
960
961                 if (tca[TCA_STAB]) {
962                         stab = qdisc_get_stab(tca[TCA_STAB]);
963                         if (IS_ERR(stab)) {
964                                 err = PTR_ERR(stab);
965                                 goto err_out4;
966                         }
967                         rcu_assign_pointer(sch->stab, stab);
968                 }
969                 if (tca[TCA_RATE]) {
970                         spinlock_t *root_lock;
971
972                         err = -EOPNOTSUPP;
973                         if (sch->flags & TCQ_F_MQROOT)
974                                 goto err_out4;
975
976                         if ((sch->parent != TC_H_ROOT) &&
977                             !(sch->flags & TCQ_F_INGRESS) &&
978                             (!p || !(p->flags & TCQ_F_MQROOT)))
979                                 root_lock = qdisc_root_sleeping_lock(sch);
980                         else
981                                 root_lock = qdisc_lock(sch);
982
983                         err = gen_new_estimator(&sch->bstats,
984                                                 sch->cpu_bstats,
985                                                 &sch->rate_est,
986                                                 root_lock,
987                                                 tca[TCA_RATE]);
988                         if (err)
989                                 goto err_out4;
990                 }
991
992                 qdisc_list_add(sch);
993
994                 return sch;
995         }
996 err_out3:
997         dev_put(dev);
998         kfree((char *) sch - sch->padded);
999 err_out2:
1000         module_put(ops->owner);
1001 err_out:
1002         *errp = err;
1003         return NULL;
1004
1005 err_out4:
1006         free_percpu(sch->cpu_bstats);
1007         free_percpu(sch->cpu_qstats);
1008         /*
1009          * Any broken qdiscs that would require a ops->reset() here?
1010          * The qdisc was never in action so it shouldn't be necessary.
1011          */
1012         qdisc_put_stab(rtnl_dereference(sch->stab));
1013         if (ops->destroy)
1014                 ops->destroy(sch);
1015         goto err_out3;
1016 }
1017
1018 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
1019 {
1020         struct qdisc_size_table *ostab, *stab = NULL;
1021         int err = 0;
1022
1023         if (tca[TCA_OPTIONS]) {
1024                 if (sch->ops->change == NULL)
1025                         return -EINVAL;
1026                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
1027                 if (err)
1028                         return err;
1029         }
1030
1031         if (tca[TCA_STAB]) {
1032                 stab = qdisc_get_stab(tca[TCA_STAB]);
1033                 if (IS_ERR(stab))
1034                         return PTR_ERR(stab);
1035         }
1036
1037         ostab = rtnl_dereference(sch->stab);
1038         rcu_assign_pointer(sch->stab, stab);
1039         qdisc_put_stab(ostab);
1040
1041         if (tca[TCA_RATE]) {
1042                 /* NB: ignores errors from replace_estimator
1043                    because change can't be undone. */
1044                 if (sch->flags & TCQ_F_MQROOT)
1045                         goto out;
1046                 gen_replace_estimator(&sch->bstats,
1047                                       sch->cpu_bstats,
1048                                       &sch->rate_est,
1049                                       qdisc_root_sleeping_lock(sch),
1050                                       tca[TCA_RATE]);
1051         }
1052 out:
1053         return 0;
1054 }
1055
1056 struct check_loop_arg {
1057         struct qdisc_walker     w;
1058         struct Qdisc            *p;
1059         int                     depth;
1060 };
1061
1062 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1063
1064 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1065 {
1066         struct check_loop_arg   arg;
1067
1068         if (q->ops->cl_ops == NULL)
1069                 return 0;
1070
1071         arg.w.stop = arg.w.skip = arg.w.count = 0;
1072         arg.w.fn = check_loop_fn;
1073         arg.depth = depth;
1074         arg.p = p;
1075         q->ops->cl_ops->walk(q, &arg.w);
1076         return arg.w.stop ? -ELOOP : 0;
1077 }
1078
1079 static int
1080 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1081 {
1082         struct Qdisc *leaf;
1083         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1084         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1085
1086         leaf = cops->leaf(q, cl);
1087         if (leaf) {
1088                 if (leaf == arg->p || arg->depth > 7)
1089                         return -ELOOP;
1090                 return check_loop(leaf, arg->p, arg->depth + 1);
1091         }
1092         return 0;
1093 }
1094
1095 /*
1096  * Delete/get qdisc.
1097  */
1098
1099 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1100 {
1101         struct net *net = sock_net(skb->sk);
1102         struct tcmsg *tcm = nlmsg_data(n);
1103         struct nlattr *tca[TCA_MAX + 1];
1104         struct net_device *dev;
1105         u32 clid;
1106         struct Qdisc *q = NULL;
1107         struct Qdisc *p = NULL;
1108         int err;
1109
1110         if ((n->nlmsg_type != RTM_GETQDISC) &&
1111             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1112                 return -EPERM;
1113
1114         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1115         if (err < 0)
1116                 return err;
1117
1118         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1119         if (!dev)
1120                 return -ENODEV;
1121
1122         clid = tcm->tcm_parent;
1123         if (clid) {
1124                 if (clid != TC_H_ROOT) {
1125                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1126                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1127                                 if (!p)
1128                                         return -ENOENT;
1129                                 q = qdisc_leaf(p, clid);
1130                         } else if (dev_ingress_queue(dev)) {
1131                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1132                         }
1133                 } else {
1134                         q = dev->qdisc;
1135                 }
1136                 if (!q)
1137                         return -ENOENT;
1138
1139                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1140                         return -EINVAL;
1141         } else {
1142                 q = qdisc_lookup(dev, tcm->tcm_handle);
1143                 if (!q)
1144                         return -ENOENT;
1145         }
1146
1147         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1148                 return -EINVAL;
1149
1150         if (n->nlmsg_type == RTM_DELQDISC) {
1151                 if (!clid)
1152                         return -EINVAL;
1153                 if (q->handle == 0)
1154                         return -ENOENT;
1155                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1156                 if (err != 0)
1157                         return err;
1158         } else {
1159                 qdisc_notify(net, skb, n, clid, NULL, q);
1160         }
1161         return 0;
1162 }
1163
1164 /*
1165  * Create/change qdisc.
1166  */
1167
1168 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1169 {
1170         struct net *net = sock_net(skb->sk);
1171         struct tcmsg *tcm;
1172         struct nlattr *tca[TCA_MAX + 1];
1173         struct net_device *dev;
1174         u32 clid;
1175         struct Qdisc *q, *p;
1176         int err;
1177
1178         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1179                 return -EPERM;
1180
1181 replay:
1182         /* Reinit, just in case something touches this. */
1183         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1184         if (err < 0)
1185                 return err;
1186
1187         tcm = nlmsg_data(n);
1188         clid = tcm->tcm_parent;
1189         q = p = NULL;
1190
1191         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1192         if (!dev)
1193                 return -ENODEV;
1194
1195
1196         if (clid) {
1197                 if (clid != TC_H_ROOT) {
1198                         if (clid != TC_H_INGRESS) {
1199                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1200                                 if (!p)
1201                                         return -ENOENT;
1202                                 q = qdisc_leaf(p, clid);
1203                         } else if (dev_ingress_queue_create(dev)) {
1204                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1205                         }
1206                 } else {
1207                         q = dev->qdisc;
1208                 }
1209
1210                 /* It may be default qdisc, ignore it */
1211                 if (q && q->handle == 0)
1212                         q = NULL;
1213
1214                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1215                         if (tcm->tcm_handle) {
1216                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1217                                         return -EEXIST;
1218                                 if (TC_H_MIN(tcm->tcm_handle))
1219                                         return -EINVAL;
1220                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1221                                 if (!q)
1222                                         goto create_n_graft;
1223                                 if (n->nlmsg_flags & NLM_F_EXCL)
1224                                         return -EEXIST;
1225                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1226                                         return -EINVAL;
1227                                 if (q == p ||
1228                                     (p && check_loop(q, p, 0)))
1229                                         return -ELOOP;
1230                                 atomic_inc(&q->refcnt);
1231                                 goto graft;
1232                         } else {
1233                                 if (!q)
1234                                         goto create_n_graft;
1235
1236                                 /* This magic test requires explanation.
1237                                  *
1238                                  *   We know, that some child q is already
1239                                  *   attached to this parent and have choice:
1240                                  *   either to change it or to create/graft new one.
1241                                  *
1242                                  *   1. We are allowed to create/graft only
1243                                  *   if CREATE and REPLACE flags are set.
1244                                  *
1245                                  *   2. If EXCL is set, requestor wanted to say,
1246                                  *   that qdisc tcm_handle is not expected
1247                                  *   to exist, so that we choose create/graft too.
1248                                  *
1249                                  *   3. The last case is when no flags are set.
1250                                  *   Alas, it is sort of hole in API, we
1251                                  *   cannot decide what to do unambiguously.
1252                                  *   For now we select create/graft, if
1253                                  *   user gave KIND, which does not match existing.
1254                                  */
1255                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1256                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1257                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1258                                      (tca[TCA_KIND] &&
1259                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1260                                         goto create_n_graft;
1261                         }
1262                 }
1263         } else {
1264                 if (!tcm->tcm_handle)
1265                         return -EINVAL;
1266                 q = qdisc_lookup(dev, tcm->tcm_handle);
1267         }
1268
1269         /* Change qdisc parameters */
1270         if (q == NULL)
1271                 return -ENOENT;
1272         if (n->nlmsg_flags & NLM_F_EXCL)
1273                 return -EEXIST;
1274         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1275                 return -EINVAL;
1276         err = qdisc_change(q, tca);
1277         if (err == 0)
1278                 qdisc_notify(net, skb, n, clid, NULL, q);
1279         return err;
1280
1281 create_n_graft:
1282         if (!(n->nlmsg_flags & NLM_F_CREATE))
1283                 return -ENOENT;
1284         if (clid == TC_H_INGRESS) {
1285                 if (dev_ingress_queue(dev))
1286                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1287                                          tcm->tcm_parent, tcm->tcm_parent,
1288                                          tca, &err);
1289                 else
1290                         err = -ENOENT;
1291         } else {
1292                 struct netdev_queue *dev_queue;
1293
1294                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1295                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1296                 else if (p)
1297                         dev_queue = p->dev_queue;
1298                 else
1299                         dev_queue = netdev_get_tx_queue(dev, 0);
1300
1301                 q = qdisc_create(dev, dev_queue, p,
1302                                  tcm->tcm_parent, tcm->tcm_handle,
1303                                  tca, &err);
1304         }
1305         if (q == NULL) {
1306                 if (err == -EAGAIN)
1307                         goto replay;
1308                 return err;
1309         }
1310
1311 graft:
1312         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1313         if (err) {
1314                 if (q)
1315                         qdisc_destroy(q);
1316                 return err;
1317         }
1318
1319         return 0;
1320 }
1321
1322 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1323                          u32 portid, u32 seq, u16 flags, int event)
1324 {
1325         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
1326         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
1327         struct tcmsg *tcm;
1328         struct nlmsghdr  *nlh;
1329         unsigned char *b = skb_tail_pointer(skb);
1330         struct gnet_dump d;
1331         struct qdisc_size_table *stab;
1332         __u32 qlen;
1333
1334         cond_resched();
1335         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1336         if (!nlh)
1337                 goto out_nlmsg_trim;
1338         tcm = nlmsg_data(nlh);
1339         tcm->tcm_family = AF_UNSPEC;
1340         tcm->tcm__pad1 = 0;
1341         tcm->tcm__pad2 = 0;
1342         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1343         tcm->tcm_parent = clid;
1344         tcm->tcm_handle = q->handle;
1345         tcm->tcm_info = atomic_read(&q->refcnt);
1346         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1347                 goto nla_put_failure;
1348         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1349                 goto nla_put_failure;
1350         qlen = q->q.qlen;
1351
1352         stab = rtnl_dereference(q->stab);
1353         if (stab && qdisc_dump_stab(skb, stab) < 0)
1354                 goto nla_put_failure;
1355
1356         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1357                                          qdisc_root_sleeping_lock(q), &d) < 0)
1358                 goto nla_put_failure;
1359
1360         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1361                 goto nla_put_failure;
1362
1363         if (qdisc_is_percpu_stats(q)) {
1364                 cpu_bstats = q->cpu_bstats;
1365                 cpu_qstats = q->cpu_qstats;
1366         }
1367
1368         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
1369             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1370             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
1371                 goto nla_put_failure;
1372
1373         if (gnet_stats_finish_copy(&d) < 0)
1374                 goto nla_put_failure;
1375
1376         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1377         return skb->len;
1378
1379 out_nlmsg_trim:
1380 nla_put_failure:
1381         nlmsg_trim(skb, b);
1382         return -1;
1383 }
1384
1385 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1386 {
1387         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1388 }
1389
1390 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1391                         struct nlmsghdr *n, u32 clid,
1392                         struct Qdisc *old, struct Qdisc *new)
1393 {
1394         struct sk_buff *skb;
1395         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1396
1397         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1398         if (!skb)
1399                 return -ENOBUFS;
1400
1401         if (old && !tc_qdisc_dump_ignore(old)) {
1402                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1403                                   0, RTM_DELQDISC) < 0)
1404                         goto err_out;
1405         }
1406         if (new && !tc_qdisc_dump_ignore(new)) {
1407                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1408                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1409                         goto err_out;
1410         }
1411
1412         if (skb->len)
1413                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1414                                       n->nlmsg_flags & NLM_F_ECHO);
1415
1416 err_out:
1417         kfree_skb(skb);
1418         return -EINVAL;
1419 }
1420
1421 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1422                               struct netlink_callback *cb,
1423                               int *q_idx_p, int s_q_idx)
1424 {
1425         int ret = 0, q_idx = *q_idx_p;
1426         struct Qdisc *q;
1427
1428         if (!root)
1429                 return 0;
1430
1431         q = root;
1432         if (q_idx < s_q_idx) {
1433                 q_idx++;
1434         } else {
1435                 if (!tc_qdisc_dump_ignore(q) &&
1436                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1437                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1438                         goto done;
1439                 q_idx++;
1440         }
1441         list_for_each_entry(q, &root->list, list) {
1442                 if (q_idx < s_q_idx) {
1443                         q_idx++;
1444                         continue;
1445                 }
1446                 if (!tc_qdisc_dump_ignore(q) &&
1447                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1448                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1449                         goto done;
1450                 q_idx++;
1451         }
1452
1453 out:
1454         *q_idx_p = q_idx;
1455         return ret;
1456 done:
1457         ret = -1;
1458         goto out;
1459 }
1460
1461 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1462 {
1463         struct net *net = sock_net(skb->sk);
1464         int idx, q_idx;
1465         int s_idx, s_q_idx;
1466         struct net_device *dev;
1467
1468         s_idx = cb->args[0];
1469         s_q_idx = q_idx = cb->args[1];
1470
1471         idx = 0;
1472         ASSERT_RTNL();
1473         for_each_netdev(net, dev) {
1474                 struct netdev_queue *dev_queue;
1475
1476                 if (idx < s_idx)
1477                         goto cont;
1478                 if (idx > s_idx)
1479                         s_q_idx = 0;
1480                 q_idx = 0;
1481
1482                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1483                         goto done;
1484
1485                 dev_queue = dev_ingress_queue(dev);
1486                 if (dev_queue &&
1487                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1488                                        &q_idx, s_q_idx) < 0)
1489                         goto done;
1490
1491 cont:
1492                 idx++;
1493         }
1494
1495 done:
1496         cb->args[0] = idx;
1497         cb->args[1] = q_idx;
1498
1499         return skb->len;
1500 }
1501
1502
1503
1504 /************************************************
1505  *      Traffic classes manipulation.           *
1506  ************************************************/
1507
1508
1509
1510 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1511 {
1512         struct net *net = sock_net(skb->sk);
1513         struct tcmsg *tcm = nlmsg_data(n);
1514         struct nlattr *tca[TCA_MAX + 1];
1515         struct net_device *dev;
1516         struct Qdisc *q = NULL;
1517         const struct Qdisc_class_ops *cops;
1518         unsigned long cl = 0;
1519         unsigned long new_cl;
1520         u32 portid;
1521         u32 clid;
1522         u32 qid;
1523         int err;
1524
1525         if ((n->nlmsg_type != RTM_GETTCLASS) &&
1526             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1527                 return -EPERM;
1528
1529         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1530         if (err < 0)
1531                 return err;
1532
1533         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1534         if (!dev)
1535                 return -ENODEV;
1536
1537         /*
1538            parent == TC_H_UNSPEC - unspecified parent.
1539            parent == TC_H_ROOT   - class is root, which has no parent.
1540            parent == X:0         - parent is root class.
1541            parent == X:Y         - parent is a node in hierarchy.
1542            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1543
1544            handle == 0:0         - generate handle from kernel pool.
1545            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1546            handle == X:Y         - clear.
1547            handle == X:0         - root class.
1548          */
1549
1550         /* Step 1. Determine qdisc handle X:0 */
1551
1552         portid = tcm->tcm_parent;
1553         clid = tcm->tcm_handle;
1554         qid = TC_H_MAJ(clid);
1555
1556         if (portid != TC_H_ROOT) {
1557                 u32 qid1 = TC_H_MAJ(portid);
1558
1559                 if (qid && qid1) {
1560                         /* If both majors are known, they must be identical. */
1561                         if (qid != qid1)
1562                                 return -EINVAL;
1563                 } else if (qid1) {
1564                         qid = qid1;
1565                 } else if (qid == 0)
1566                         qid = dev->qdisc->handle;
1567
1568                 /* Now qid is genuine qdisc handle consistent
1569                  * both with parent and child.
1570                  *
1571                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1572                  */
1573                 if (portid)
1574                         portid = TC_H_MAKE(qid, portid);
1575         } else {
1576                 if (qid == 0)
1577                         qid = dev->qdisc->handle;
1578         }
1579
1580         /* OK. Locate qdisc */
1581         q = qdisc_lookup(dev, qid);
1582         if (!q)
1583                 return -ENOENT;
1584
1585         /* An check that it supports classes */
1586         cops = q->ops->cl_ops;
1587         if (cops == NULL)
1588                 return -EINVAL;
1589
1590         /* Now try to get class */
1591         if (clid == 0) {
1592                 if (portid == TC_H_ROOT)
1593                         clid = qid;
1594         } else
1595                 clid = TC_H_MAKE(qid, clid);
1596
1597         if (clid)
1598                 cl = cops->get(q, clid);
1599
1600         if (cl == 0) {
1601                 err = -ENOENT;
1602                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1603                     !(n->nlmsg_flags & NLM_F_CREATE))
1604                         goto out;
1605         } else {
1606                 switch (n->nlmsg_type) {
1607                 case RTM_NEWTCLASS:
1608                         err = -EEXIST;
1609                         if (n->nlmsg_flags & NLM_F_EXCL)
1610                                 goto out;
1611                         break;
1612                 case RTM_DELTCLASS:
1613                         err = -EOPNOTSUPP;
1614                         if (cops->delete)
1615                                 err = cops->delete(q, cl);
1616                         if (err == 0)
1617                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1618                         goto out;
1619                 case RTM_GETTCLASS:
1620                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1621                         goto out;
1622                 default:
1623                         err = -EINVAL;
1624                         goto out;
1625                 }
1626         }
1627
1628         new_cl = cl;
1629         err = -EOPNOTSUPP;
1630         if (cops->change)
1631                 err = cops->change(q, clid, portid, tca, &new_cl);
1632         if (err == 0)
1633                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1634
1635 out:
1636         if (cl)
1637                 cops->put(q, cl);
1638
1639         return err;
1640 }
1641
1642
1643 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1644                           unsigned long cl,
1645                           u32 portid, u32 seq, u16 flags, int event)
1646 {
1647         struct tcmsg *tcm;
1648         struct nlmsghdr  *nlh;
1649         unsigned char *b = skb_tail_pointer(skb);
1650         struct gnet_dump d;
1651         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1652
1653         cond_resched();
1654         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1655         if (!nlh)
1656                 goto out_nlmsg_trim;
1657         tcm = nlmsg_data(nlh);
1658         tcm->tcm_family = AF_UNSPEC;
1659         tcm->tcm__pad1 = 0;
1660         tcm->tcm__pad2 = 0;
1661         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1662         tcm->tcm_parent = q->handle;
1663         tcm->tcm_handle = q->handle;
1664         tcm->tcm_info = 0;
1665         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1666                 goto nla_put_failure;
1667         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1668                 goto nla_put_failure;
1669
1670         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1671                                          qdisc_root_sleeping_lock(q), &d) < 0)
1672                 goto nla_put_failure;
1673
1674         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1675                 goto nla_put_failure;
1676
1677         if (gnet_stats_finish_copy(&d) < 0)
1678                 goto nla_put_failure;
1679
1680         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1681         return skb->len;
1682
1683 out_nlmsg_trim:
1684 nla_put_failure:
1685         nlmsg_trim(skb, b);
1686         return -1;
1687 }
1688
1689 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1690                          struct nlmsghdr *n, struct Qdisc *q,
1691                          unsigned long cl, int event)
1692 {
1693         struct sk_buff *skb;
1694         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1695
1696         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1697         if (!skb)
1698                 return -ENOBUFS;
1699
1700         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1701                 kfree_skb(skb);
1702                 return -EINVAL;
1703         }
1704
1705         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1706                               n->nlmsg_flags & NLM_F_ECHO);
1707 }
1708
1709 struct qdisc_dump_args {
1710         struct qdisc_walker     w;
1711         struct sk_buff          *skb;
1712         struct netlink_callback *cb;
1713 };
1714
1715 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1716 {
1717         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1718
1719         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1720                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1721 }
1722
1723 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1724                                 struct tcmsg *tcm, struct netlink_callback *cb,
1725                                 int *t_p, int s_t)
1726 {
1727         struct qdisc_dump_args arg;
1728
1729         if (tc_qdisc_dump_ignore(q) ||
1730             *t_p < s_t || !q->ops->cl_ops ||
1731             (tcm->tcm_parent &&
1732              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1733                 (*t_p)++;
1734                 return 0;
1735         }
1736         if (*t_p > s_t)
1737                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1738         arg.w.fn = qdisc_class_dump;
1739         arg.skb = skb;
1740         arg.cb = cb;
1741         arg.w.stop  = 0;
1742         arg.w.skip = cb->args[1];
1743         arg.w.count = 0;
1744         q->ops->cl_ops->walk(q, &arg.w);
1745         cb->args[1] = arg.w.count;
1746         if (arg.w.stop)
1747                 return -1;
1748         (*t_p)++;
1749         return 0;
1750 }
1751
1752 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1753                                struct tcmsg *tcm, struct netlink_callback *cb,
1754                                int *t_p, int s_t)
1755 {
1756         struct Qdisc *q;
1757
1758         if (!root)
1759                 return 0;
1760
1761         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1762                 return -1;
1763
1764         list_for_each_entry(q, &root->list, list) {
1765                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1766                         return -1;
1767         }
1768
1769         return 0;
1770 }
1771
1772 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1773 {
1774         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1775         struct net *net = sock_net(skb->sk);
1776         struct netdev_queue *dev_queue;
1777         struct net_device *dev;
1778         int t, s_t;
1779
1780         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1781                 return 0;
1782         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1783         if (!dev)
1784                 return 0;
1785
1786         s_t = cb->args[0];
1787         t = 0;
1788
1789         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1790                 goto done;
1791
1792         dev_queue = dev_ingress_queue(dev);
1793         if (dev_queue &&
1794             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1795                                 &t, s_t) < 0)
1796                 goto done;
1797
1798 done:
1799         cb->args[0] = t;
1800
1801         dev_put(dev);
1802         return skb->len;
1803 }
1804
1805 /* Main classifier routine: scans classifier chain attached
1806  * to this qdisc, (optionally) tests for protocol and asks
1807  * specific classifiers.
1808  */
1809 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1810                        struct tcf_result *res)
1811 {
1812         __be16 protocol = tc_skb_protocol(skb);
1813         int err;
1814
1815         for (; tp; tp = rcu_dereference_bh(tp->next)) {
1816                 if (tp->protocol != protocol &&
1817                     tp->protocol != htons(ETH_P_ALL))
1818                         continue;
1819                 err = tp->classify(skb, tp, res);
1820
1821                 if (err >= 0) {
1822 #ifdef CONFIG_NET_CLS_ACT
1823                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1824                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1825 #endif
1826                         return err;
1827                 }
1828         }
1829         return -1;
1830 }
1831 EXPORT_SYMBOL(tc_classify_compat);
1832
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is a plain call to
 * tc_classify_compat(). With it, a TC_ACT_RECLASSIFY result restarts
 * classification from the head of the chain; the number of restarts is
 * kept in skb->tc_verd and, once it reaches MAX_REC_LOOP, a rate-limited
 * notice is logged and TC_ACT_SHOT is returned.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
                struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
        const struct tcf_proto *head = tp;

        for (;;) {
                int ret = tc_classify_compat(skb, head, res);
                u32 depth;

                if (ret != TC_ACT_RECLASSIFY)
                        return ret;

                depth = G_TC_VERD(skb->tc_verd);
                if (depth >= MAX_REC_LOOP) {
                        net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
                                               head->q->ops->id,
                                               head->prio & 0xffff,
                                               ntohs(head->protocol));
                        return TC_ACT_SHOT;
                }

                /* Record one more pass before starting over from the head. */
                depth++;
                skb->tc_verd = SET_TC_VERD(skb->tc_verd, depth);
        }
#else
        return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1862
1863 bool tcf_destroy(struct tcf_proto *tp, bool force)
1864 {
1865         if (tp->ops->destroy(tp, force)) {
1866                 module_put(tp->ops->owner);
1867                 kfree_rcu(tp, rcu);
1868                 return true;
1869         }
1870
1871         return false;
1872 }
1873
1874 void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1875 {
1876         struct tcf_proto *tp;
1877
1878         while ((tp = rtnl_dereference(*fl)) != NULL) {
1879                 RCU_INIT_POINTER(*fl, tp->next);
1880                 tcf_destroy(tp, true);
1881         }
1882 }
1883 EXPORT_SYMBOL(tcf_destroy_chain);
1884
1885 #ifdef CONFIG_PROC_FS
1886 static int psched_show(struct seq_file *seq, void *v)
1887 {
1888         struct timespec ts;
1889
1890         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1891         seq_printf(seq, "%08x %08x %08x %08x\n",
1892                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1893                    1000000,
1894                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1895
1896         return 0;
1897 }
1898
1899 static int psched_open(struct inode *inode, struct file *file)
1900 {
1901         return single_open(file, psched_show, NULL);
1902 }
1903
1904 static const struct file_operations psched_fops = {
1905         .owner = THIS_MODULE,
1906         .open = psched_open,
1907         .read  = seq_read,
1908         .llseek = seq_lseek,
1909         .release = single_release,
1910 };
1911
1912 static int __net_init psched_net_init(struct net *net)
1913 {
1914         struct proc_dir_entry *e;
1915
1916         e = proc_create("psched", 0, net->proc_net, &psched_fops);
1917         if (e == NULL)
1918                 return -ENOMEM;
1919
1920         return 0;
1921 }
1922
1923 static void __net_exit psched_net_exit(struct net *net)
1924 {
1925         remove_proc_entry("psched", net->proc_net);
1926 }
1927 #else
1928 static int __net_init psched_net_init(struct net *net)
1929 {
1930         return 0;
1931 }
1932
1933 static void __net_exit psched_net_exit(struct net *net)
1934 {
1935 }
1936 #endif
1937
1938 static struct pernet_operations psched_net_ops = {
1939         .init = psched_net_init,
1940         .exit = psched_net_exit,
1941 };
1942
1943 static int __init pktsched_init(void)
1944 {
1945         int err;
1946
1947         err = register_pernet_subsys(&psched_net_ops);
1948         if (err) {
1949                 pr_err("pktsched_init: "
1950                        "cannot initialize per netns operations\n");
1951                 return err;
1952         }
1953
1954         register_qdisc(&pfifo_fast_ops);
1955         register_qdisc(&pfifo_qdisc_ops);
1956         register_qdisc(&bfifo_qdisc_ops);
1957         register_qdisc(&pfifo_head_drop_qdisc_ops);
1958         register_qdisc(&mq_qdisc_ops);
1959
1960         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1961         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1962         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1963         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1964         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1965         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1966
1967         return 0;
1968 }
1969
1970 subsys_initcall(pktsched_init);