These changes are a raw update to a vanilla kernel 4.1.10, with the
[kvmfornfv.git] / kernel / net / core / dev.c
1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <net/mpls.h>
122 #include <linux/ipv6.h>
123 #include <linux/in.h>
124 #include <linux/jhash.h>
125 #include <linux/random.h>
126 #include <trace/events/napi.h>
127 #include <trace/events/net.h>
128 #include <trace/events/skb.h>
129 #include <linux/pci.h>
130 #include <linux/inetdevice.h>
131 #include <linux/cpu_rmap.h>
132 #include <linux/static_key.h>
133 #include <linux/hashtable.h>
134 #include <linux/vmalloc.h>
135 #include <linux/if_macvlan.h>
136 #include <linux/errqueue.h>
137 #include <linux/hrtimer.h>
138
139 #include "net-sysfs.h"
140
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143
144 /* This should be increased if a protocol with a bigger head is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146
147 static DEFINE_SPINLOCK(ptype_lock);
148 static DEFINE_SPINLOCK(offload_lock);
149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
150 struct list_head ptype_all __read_mostly;       /* Taps */
151 static struct list_head offload_base __read_mostly;
152
153 static int netif_rx_internal(struct sk_buff *skb);
154 static int call_netdevice_notifiers_info(unsigned long val,
155                                          struct net_device *dev,
156                                          struct netdev_notifier_info *info);
157
158 /*
159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
160  * semaphore.
161  *
162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
163  *
164  * Writers must hold the rtnl semaphore while they loop through the
165  * dev_base_head list, and hold dev_base_lock for writing when they do the
166  * actual updates.  This allows pure readers to access the list even
167  * while a writer is preparing to update it.
168  *
169  * To put it another way, dev_base_lock is held for writing only to
170  * protect against pure readers; the rtnl semaphore provides the
171  * protection against other writers.
172  *
173  * See, for example usages, register_netdevice() and
174  * unregister_netdevice(), which must be called with the rtnl
175  * semaphore held.
176  */
177 DEFINE_RWLOCK(dev_base_lock);
178 EXPORT_SYMBOL(dev_base_lock);
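/*
 * Illustrative sketch, not part of the original file: a pure reader walking
 * the device list under the locking rules described above.
 * example_count_devs() is a hypothetical helper name.
 */
static int example_count_devs(struct net *net)
{
        struct net_device *dev;
        int count = 0;

        /* Pure readers may use RCU (or read_lock(&dev_base_lock)). */
        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                count++;
        rcu_read_unlock();

        return count;
}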
179
180 /* protects napi_hash addition/deletion and napi_gen_id */
181 static DEFINE_SPINLOCK(napi_hash_lock);
182
183 static unsigned int napi_gen_id;
184 static DEFINE_HASHTABLE(napi_hash, 8);
185
186 static seqcount_t devnet_rename_seq;
187 static DEFINE_MUTEX(devnet_rename_mutex);
188
189 static inline void dev_base_seq_inc(struct net *net)
190 {
191         while (++net->dev_base_seq == 0);
192 }
193
194 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
195 {
196         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
197
198         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
199 }
200
201 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
202 {
203         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
204 }
205
206 static inline void rps_lock(struct softnet_data *sd)
207 {
208 #ifdef CONFIG_RPS
209         raw_spin_lock(&sd->input_pkt_queue.raw_lock);
210 #endif
211 }
212
213 static inline void rps_unlock(struct softnet_data *sd)
214 {
215 #ifdef CONFIG_RPS
216         raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
217 #endif
218 }
219
220 /* Device list insertion */
221 static void list_netdevice(struct net_device *dev)
222 {
223         struct net *net = dev_net(dev);
224
225         ASSERT_RTNL();
226
227         write_lock_bh(&dev_base_lock);
228         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
229         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
230         hlist_add_head_rcu(&dev->index_hlist,
231                            dev_index_hash(net, dev->ifindex));
232         write_unlock_bh(&dev_base_lock);
233
234         dev_base_seq_inc(net);
235 }
236
237 /* Device list removal
238  * caller must respect an RCU grace period before freeing/reusing dev
239  */
240 static void unlist_netdevice(struct net_device *dev)
241 {
242         ASSERT_RTNL();
243
244         /* Unlink dev from the device chain */
245         write_lock_bh(&dev_base_lock);
246         list_del_rcu(&dev->dev_list);
247         hlist_del_rcu(&dev->name_hlist);
248         hlist_del_rcu(&dev->index_hlist);
249         write_unlock_bh(&dev_base_lock);
250
251         dev_base_seq_inc(dev_net(dev));
252 }
253
254 /*
255  *      Our notifier list
256  */
257
258 static RAW_NOTIFIER_HEAD(netdev_chain);
259
260 /*
261  *      Device drivers call our routines to queue packets here. We empty the
262  *      queue in the local softnet handler.
263  */
264
265 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
266 EXPORT_PER_CPU_SYMBOL(softnet_data);
267
268 #ifdef CONFIG_LOCKDEP
269 /*
270  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
271  * according to dev->type
272  */
273 static const unsigned short netdev_lock_type[] =
274         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
275          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
276          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
277          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
278          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
279          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
280          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
281          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
282          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
283          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
284          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
285          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
286          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
287          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
288          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
289
290 static const char *const netdev_lock_name[] =
291         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
292          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
293          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
294          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
295          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
296          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
297          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
298          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
299          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
300          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
301          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
302          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
303          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
304          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
305          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
306
307 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
308 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
309
310 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
311 {
312         int i;
313
314         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
315                 if (netdev_lock_type[i] == dev_type)
316                         return i;
317         /* the last key is used by default */
318         return ARRAY_SIZE(netdev_lock_type) - 1;
319 }
320
321 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
322                                                  unsigned short dev_type)
323 {
324         int i;
325
326         i = netdev_lock_pos(dev_type);
327         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
328                                    netdev_lock_name[i]);
329 }
330
331 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
332 {
333         int i;
334
335         i = netdev_lock_pos(dev->type);
336         lockdep_set_class_and_name(&dev->addr_list_lock,
337                                    &netdev_addr_lock_key[i],
338                                    netdev_lock_name[i]);
339 }
340 #else
341 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
342                                                  unsigned short dev_type)
343 {
344 }
345 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
346 {
347 }
348 #endif
349
350 /*******************************************************************************
351
352                 Protocol management and registration routines
353
354 *******************************************************************************/
355
356 /*
357  *      Add a protocol ID to the list. Now that the input handler is
358  *      smarter we can dispense with all the messy stuff that used to be
359  *      here.
360  *
361  *      BEWARE!!! Protocol handlers, mangling input packets,
362  *      MUST BE last in hash buckets and checking protocol handlers
363  *      MUST start from promiscuous ptype_all chain in net_bh.
364  *      It is true now, do not change it.
365  *      Explanation follows: if protocol handler, mangling packet, will
366  *      be the first on list, it is not able to sense, that packet
367  *      is cloned and should be copied-on-write, so that it will
368  *      change it and subsequent readers will get broken packet.
369  *                                                      --ANK (980803)
370  */
371
372 static inline struct list_head *ptype_head(const struct packet_type *pt)
373 {
374         if (pt->type == htons(ETH_P_ALL))
375                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
376         else
377                 return pt->dev ? &pt->dev->ptype_specific :
378                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
379 }
380
381 /**
382  *      dev_add_pack - add packet handler
383  *      @pt: packet type declaration
384  *
385  *      Add a protocol handler to the networking stack. The passed &packet_type
386  *      is linked into kernel lists and may not be freed until it has been
387  *      removed from the kernel lists.
388  *
389  *      This call does not sleep, therefore it cannot
390  *      guarantee that all CPUs in the middle of receiving packets
391  *      will see the new packet type (until the next received packet).
392  */
393
394 void dev_add_pack(struct packet_type *pt)
395 {
396         struct list_head *head = ptype_head(pt);
397
398         spin_lock(&ptype_lock);
399         list_add_rcu(&pt->list, head);
400         spin_unlock(&ptype_lock);
401 }
402 EXPORT_SYMBOL(dev_add_pack);
403
404 /**
405  *      __dev_remove_pack        - remove packet handler
406  *      @pt: packet type declaration
407  *
408  *      Remove a protocol handler that was previously added to the kernel
409  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
410  *      from the kernel lists and can be freed or reused once this function
411  *      returns.
412  *
413  *      The packet type might still be in use by receivers
414  *      and must not be freed until after all the CPUs have gone
415  *      through a quiescent state.
416  */
417 void __dev_remove_pack(struct packet_type *pt)
418 {
419         struct list_head *head = ptype_head(pt);
420         struct packet_type *pt1;
421
422         spin_lock(&ptype_lock);
423
424         list_for_each_entry(pt1, head, list) {
425                 if (pt == pt1) {
426                         list_del_rcu(&pt->list);
427                         goto out;
428                 }
429         }
430
431         pr_warn("dev_remove_pack: %p not found\n", pt);
432 out:
433         spin_unlock(&ptype_lock);
434 }
435 EXPORT_SYMBOL(__dev_remove_pack);
436
437 /**
438  *      dev_remove_pack  - remove packet handler
439  *      @pt: packet type declaration
440  *
441  *      Remove a protocol handler that was previously added to the kernel
442  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
443  *      from the kernel lists and can be freed or reused once this function
444  *      returns.
445  *
446  *      This call sleeps to guarantee that no CPU is looking at the packet
447  *      type after return.
448  */
449 void dev_remove_pack(struct packet_type *pt)
450 {
451         __dev_remove_pack(pt);
452
453         synchronize_net();
454 }
455 EXPORT_SYMBOL(dev_remove_pack);
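/*
 * Illustrative sketch, not part of the original file: a minimal ETH_P_ALL
 * tap built on dev_add_pack()/dev_remove_pack().  example_tap_rcv() and
 * example_tap are hypothetical names.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        /* The skb may be shared with other taps; just drop our reference. */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
        .type = cpu_to_be16(ETH_P_ALL),
        .func = example_tap_rcv,
};

/* Registration: dev_add_pack(&example_tap);
 * Teardown:     dev_remove_pack(&example_tap);  (sleeps, see above)
 */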
456
457
458 /**
459  *      dev_add_offload - register offload handlers
460  *      @po: protocol offload declaration
461  *
462  *      Add protocol offload handlers to the networking stack. The passed
463  *      &proto_offload is linked into kernel lists and may not be freed until
464  *      it has been removed from the kernel lists.
465  *
466  *      This call does not sleep, therefore it cannot
467  *      guarantee that all CPUs in the middle of receiving packets
468  *      will see the new offload handlers (until the next received packet).
469  */
470 void dev_add_offload(struct packet_offload *po)
471 {
472         struct list_head *head = &offload_base;
473
474         spin_lock(&offload_lock);
475         list_add_rcu(&po->list, head);
476         spin_unlock(&offload_lock);
477 }
478 EXPORT_SYMBOL(dev_add_offload);
479
480 /**
481  *      __dev_remove_offload     - remove offload handler
482  *      @po: packet offload declaration
483  *
484  *      Remove a protocol offload handler that was previously added to the
485  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
486  *      is removed from the kernel lists and can be freed or reused once this
487  *      function returns.
488  *
489  *      The packet type might still be in use by receivers
490  *      and must not be freed until after all the CPUs have gone
491  *      through a quiescent state.
492  */
493 static void __dev_remove_offload(struct packet_offload *po)
494 {
495         struct list_head *head = &offload_base;
496         struct packet_offload *po1;
497
498         spin_lock(&offload_lock);
499
500         list_for_each_entry(po1, head, list) {
501                 if (po == po1) {
502                         list_del_rcu(&po->list);
503                         goto out;
504                 }
505         }
506
507         pr_warn("dev_remove_offload: %p not found\n", po);
508 out:
509         spin_unlock(&offload_lock);
510 }
511
512 /**
513  *      dev_remove_offload       - remove packet offload handler
514  *      @po: packet offload declaration
515  *
516  *      Remove a packet offload handler that was previously added to the kernel
517  *      offload handlers by dev_add_offload(). The passed &offload_type is
518  *      removed from the kernel lists and can be freed or reused once this
519  *      function returns.
520  *
521  *      This call sleeps to guarantee that no CPU is looking at the packet
522  *      type after return.
523  */
524 void dev_remove_offload(struct packet_offload *po)
525 {
526         __dev_remove_offload(po);
527
528         synchronize_net();
529 }
530 EXPORT_SYMBOL(dev_remove_offload);
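/*
 * Illustrative sketch, not part of the original file: the shape of a
 * protocol offload registration.  The callbacks are inert stubs and all
 * names are hypothetical; a real user fills in GRO/GSO logic.
 */
static struct sk_buff **example_gro_receive(struct sk_buff **head,
                                            struct sk_buff *skb)
{
        return NULL;            /* stub: never merges anything */
}

static int example_gro_complete(struct sk_buff *skb, int nhoff)
{
        return 0;               /* stub: nothing to fix up */
}

static struct packet_offload example_offload __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),
        .callbacks = {
                .gro_receive  = example_gro_receive,
                .gro_complete = example_gro_complete,
        },
};

/* dev_add_offload(&example_offload); ... dev_remove_offload(&example_offload); */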
531
532 /******************************************************************************
533
534                       Device Boot-time Settings Routines
535
536 *******************************************************************************/
537
538 /* Boot time configuration table */
539 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
540
541 /**
542  *      netdev_boot_setup_add   - add new setup entry
543  *      @name: name of the device
544  *      @map: configured settings for the device
545  *
546  *      Adds a new setup entry to the dev_boot_setup list.  The function
547  *      returns 0 on error and 1 on success.  This is a generic routine
548  *      for all netdevices.
549  */
550 static int netdev_boot_setup_add(char *name, struct ifmap *map)
551 {
552         struct netdev_boot_setup *s;
553         int i;
554
555         s = dev_boot_setup;
556         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
557                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
558                         memset(s[i].name, 0, sizeof(s[i].name));
559                         strlcpy(s[i].name, name, IFNAMSIZ);
560                         memcpy(&s[i].map, map, sizeof(s[i].map));
561                         break;
562                 }
563         }
564
565         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
566 }
567
568 /**
569  *      netdev_boot_setup_check - check boot time settings
570  *      @dev: the netdevice
571  *
572  *      Check boot time settings for the device.
573  *      Any settings found are applied to the device to be used
574  *      later in device probing.
575  *      Returns 0 if no settings are found, 1 if they are.
576  */
577 int netdev_boot_setup_check(struct net_device *dev)
578 {
579         struct netdev_boot_setup *s = dev_boot_setup;
580         int i;
581
582         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
583                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
584                     !strcmp(dev->name, s[i].name)) {
585                         dev->irq        = s[i].map.irq;
586                         dev->base_addr  = s[i].map.base_addr;
587                         dev->mem_start  = s[i].map.mem_start;
588                         dev->mem_end    = s[i].map.mem_end;
589                         return 1;
590                 }
591         }
592         return 0;
593 }
594 EXPORT_SYMBOL(netdev_boot_setup_check);
595
596
597 /**
598  *      netdev_boot_base        - get address from boot time settings
599  *      @prefix: prefix for network device
600  *      @unit: id for network device
601  *
602  *      Check boot time settings for the base address of the device.
603  *      Returns the configured base address if a setting is found,
604  *      1 if the device is already registered (do not probe it),
605  *      and 0 if no settings are found.
606  */
607 unsigned long netdev_boot_base(const char *prefix, int unit)
608 {
609         const struct netdev_boot_setup *s = dev_boot_setup;
610         char name[IFNAMSIZ];
611         int i;
612
613         sprintf(name, "%s%d", prefix, unit);
614
615         /*
616          * If device already registered then return base of 1
617          * to indicate not to probe for this interface
618          */
619         if (__dev_get_by_name(&init_net, name))
620                 return 1;
621
622         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
623                 if (!strcmp(name, s[i].name))
624                         return s[i].map.base_addr;
625         return 0;
626 }
627
628 /*
629  * Saves at boot time configured settings for any netdevice.
630  */
631 int __init netdev_boot_setup(char *str)
632 {
633         int ints[5];
634         struct ifmap map;
635
636         str = get_options(str, ARRAY_SIZE(ints), ints);
637         if (!str || !*str)
638                 return 0;
639
640         /* Save settings */
641         memset(&map, 0, sizeof(map));
642         if (ints[0] > 0)
643                 map.irq = ints[1];
644         if (ints[0] > 1)
645                 map.base_addr = ints[2];
646         if (ints[0] > 2)
647                 map.mem_start = ints[3];
648         if (ints[0] > 3)
649                 map.mem_end = ints[4];
650
651         /* Add new entry to the list */
652         return netdev_boot_setup_add(str, &map);
653 }
654
655 __setup("netdev=", netdev_boot_setup);
656
657 /*******************************************************************************
658
659                             Device Interface Subroutines
660
661 *******************************************************************************/
662
663 /**
664  *      dev_get_iflink  - get 'iflink' value of an interface
665  *      @dev: targeted interface
666  *
667  *      Indicates the ifindex the interface is linked to.
668  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
669  */
670
671 int dev_get_iflink(const struct net_device *dev)
672 {
673         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
674                 return dev->netdev_ops->ndo_get_iflink(dev);
675
676         return dev->ifindex;
677 }
678 EXPORT_SYMBOL(dev_get_iflink);
679
680 /**
681  *      __dev_get_by_name       - find a device by its name
682  *      @net: the applicable net namespace
683  *      @name: name to find
684  *
685  *      Find an interface by name. Must be called under RTNL semaphore
686  *      or @dev_base_lock. If the name is found a pointer to the device
687  *      is returned. If the name is not found then %NULL is returned. The
688  *      reference counters are not incremented so the caller must be
689  *      careful with locks.
690  */
691
692 struct net_device *__dev_get_by_name(struct net *net, const char *name)
693 {
694         struct net_device *dev;
695         struct hlist_head *head = dev_name_hash(net, name);
696
697         hlist_for_each_entry(dev, head, name_hlist)
698                 if (!strncmp(dev->name, name, IFNAMSIZ))
699                         return dev;
700
701         return NULL;
702 }
703 EXPORT_SYMBOL(__dev_get_by_name);
704
705 /**
706  *      dev_get_by_name_rcu     - find a device by its name
707  *      @net: the applicable net namespace
708  *      @name: name to find
709  *
710  *      Find an interface by name.
711  *      If the name is found a pointer to the device is returned.
712  *      If the name is not found then %NULL is returned.
713  *      The reference counters are not incremented so the caller must be
714  *      careful with locks. The caller must hold RCU lock.
715  */
716
717 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
718 {
719         struct net_device *dev;
720         struct hlist_head *head = dev_name_hash(net, name);
721
722         hlist_for_each_entry_rcu(dev, head, name_hlist)
723                 if (!strncmp(dev->name, name, IFNAMSIZ))
724                         return dev;
725
726         return NULL;
727 }
728 EXPORT_SYMBOL(dev_get_by_name_rcu);
729
730 /**
731  *      dev_get_by_name         - find a device by its name
732  *      @net: the applicable net namespace
733  *      @name: name to find
734  *
735  *      Find an interface by name. This can be called from any
736  *      context and does its own locking. The returned handle has
737  *      the usage count incremented and the caller must use dev_put() to
738  *      release it when it is no longer needed. %NULL is returned if no
739  *      matching device is found.
740  */
741
742 struct net_device *dev_get_by_name(struct net *net, const char *name)
743 {
744         struct net_device *dev;
745
746         rcu_read_lock();
747         dev = dev_get_by_name_rcu(net, name);
748         if (dev)
749                 dev_hold(dev);
750         rcu_read_unlock();
751         return dev;
752 }
753 EXPORT_SYMBOL(dev_get_by_name);
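/*
 * Illustrative sketch, not part of the original file: a typical lockless
 * lookup with dev_get_by_name_rcu().  example_name_is_up() is a hypothetical
 * helper.
 */
static bool example_name_is_up(struct net *net, const char *name)
{
        struct net_device *dev;
        bool up = false;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, name);
        if (dev)
                up = !!(dev->flags & IFF_UP);
        rcu_read_unlock();

        return up;
}

/*
 * Alternatively dev_get_by_name() may be called from any context; the caller
 * then owns a reference and must release it with dev_put().
 */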
754
755 /**
756  *      __dev_get_by_index - find a device by its ifindex
757  *      @net: the applicable net namespace
758  *      @ifindex: index of device
759  *
760  *      Search for an interface by index. Returns a pointer to the device,
761  *      or %NULL if it is not found. The device has not
762  *      had its reference counter increased so the caller must be careful
763  *      about locking. The caller must hold either the RTNL semaphore
764  *      or @dev_base_lock.
765  */
766
767 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
768 {
769         struct net_device *dev;
770         struct hlist_head *head = dev_index_hash(net, ifindex);
771
772         hlist_for_each_entry(dev, head, index_hlist)
773                 if (dev->ifindex == ifindex)
774                         return dev;
775
776         return NULL;
777 }
778 EXPORT_SYMBOL(__dev_get_by_index);
779
780 /**
781  *      dev_get_by_index_rcu - find a device by its ifindex
782  *      @net: the applicable net namespace
783  *      @ifindex: index of device
784  *
785  *      Search for an interface by index. Returns a pointer to the device,
786  *      or %NULL if it is not found. The device has not
787  *      had its reference counter increased so the caller must be careful
788  *      about locking. The caller must hold RCU lock.
789  */
790
791 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
792 {
793         struct net_device *dev;
794         struct hlist_head *head = dev_index_hash(net, ifindex);
795
796         hlist_for_each_entry_rcu(dev, head, index_hlist)
797                 if (dev->ifindex == ifindex)
798                         return dev;
799
800         return NULL;
801 }
802 EXPORT_SYMBOL(dev_get_by_index_rcu);
803
804
805 /**
806  *      dev_get_by_index - find a device by its ifindex
807  *      @net: the applicable net namespace
808  *      @ifindex: index of device
809  *
810  *      Search for an interface by index. Returns a pointer to the device,
811  *      or NULL if it is not found. The device returned has
812  *      had a reference added and the pointer is safe until the user calls
813  *      dev_put to indicate they have finished with it.
814  */
815
816 struct net_device *dev_get_by_index(struct net *net, int ifindex)
817 {
818         struct net_device *dev;
819
820         rcu_read_lock();
821         dev = dev_get_by_index_rcu(net, ifindex);
822         if (dev)
823                 dev_hold(dev);
824         rcu_read_unlock();
825         return dev;
826 }
827 EXPORT_SYMBOL(dev_get_by_index);
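/*
 * Illustrative sketch, not part of the original file: a refcounted ifindex
 * lookup.  example_get_mtu_by_index() is a hypothetical helper.
 */
static int example_get_mtu_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;
        int mtu;

        dev = dev_get_by_index(net, ifindex);
        if (!dev)
                return -ENODEV;

        mtu = dev->mtu;
        dev_put(dev);           /* drop the reference taken above */
        return mtu;
}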
828
829 /**
830  *      netdev_get_name - get a netdevice name, knowing its ifindex.
831  *      @net: network namespace
832  *      @name: a pointer to the buffer where the name will be stored.
833  *      @ifindex: the ifindex of the interface to get the name from.
834  *
835  *      The use of raw_seqcount_begin() and of taking devnet_rename_mutex
836  *      before retrying is required as we want to give the writers a chance
837  *      to complete when CONFIG_PREEMPT is not set.
838  */
839 int netdev_get_name(struct net *net, char *name, int ifindex)
840 {
841         struct net_device *dev;
842         unsigned int seq;
843
844 retry:
845         seq = raw_seqcount_begin(&devnet_rename_seq);
846         rcu_read_lock();
847         dev = dev_get_by_index_rcu(net, ifindex);
848         if (!dev) {
849                 rcu_read_unlock();
850                 return -ENODEV;
851         }
852
853         strcpy(name, dev->name);
854         rcu_read_unlock();
855         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
856                 mutex_lock(&devnet_rename_mutex);
857                 mutex_unlock(&devnet_rename_mutex);
858                 goto retry;
859         }
860
861         return 0;
862 }
863
864 /**
865  *      dev_getbyhwaddr_rcu - find a device by its hardware address
866  *      @net: the applicable net namespace
867  *      @type: media type of device
868  *      @ha: hardware address
869  *
870  *      Search for an interface by MAC address. Returns a pointer to the
871  *      device, or NULL if it is not found.
872  *      The caller must hold RCU or RTNL.
873  *      The returned device has not had its ref count increased
874  *      and the caller must therefore be careful about locking
875  *
876  */
877
878 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
879                                        const char *ha)
880 {
881         struct net_device *dev;
882
883         for_each_netdev_rcu(net, dev)
884                 if (dev->type == type &&
885                     !memcmp(dev->dev_addr, ha, dev->addr_len))
886                         return dev;
887
888         return NULL;
889 }
890 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
891
892 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
893 {
894         struct net_device *dev;
895
896         ASSERT_RTNL();
897         for_each_netdev(net, dev)
898                 if (dev->type == type)
899                         return dev;
900
901         return NULL;
902 }
903 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
904
905 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
906 {
907         struct net_device *dev, *ret = NULL;
908
909         rcu_read_lock();
910         for_each_netdev_rcu(net, dev)
911                 if (dev->type == type) {
912                         dev_hold(dev);
913                         ret = dev;
914                         break;
915                 }
916         rcu_read_unlock();
917         return ret;
918 }
919 EXPORT_SYMBOL(dev_getfirstbyhwtype);
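/*
 * Illustrative example, not part of the original file: fetch the first
 * Ethernet device in a namespace; the caller must dev_put() the result.
 *
 *      dev = dev_getfirstbyhwtype(net, ARPHRD_ETHER);
 */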
920
921 /**
922  *      __dev_get_by_flags - find any device with given flags
923  *      @net: the applicable net namespace
924  *      @if_flags: IFF_* values
925  *      @mask: bitmask of bits in if_flags to check
926  *
927  *      Search for any interface with the given flags. Returns a pointer to
928  *      the first matching device, or NULL if none is found. Must be called
929  *      inside rtnl_lock(), and the result's refcount is unchanged.
930  */
931
932 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
933                                       unsigned short mask)
934 {
935         struct net_device *dev, *ret;
936
937         ASSERT_RTNL();
938
939         ret = NULL;
940         for_each_netdev(net, dev) {
941                 if (((dev->flags ^ if_flags) & mask) == 0) {
942                         ret = dev;
943                         break;
944                 }
945         }
946         return ret;
947 }
948 EXPORT_SYMBOL(__dev_get_by_flags);
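/*
 * Illustrative example, not part of the original file: under rtnl_lock(),
 * find any interface that currently has IFF_LOOPBACK set.
 *
 *      dev = __dev_get_by_flags(net, IFF_LOOPBACK, IFF_LOOPBACK);
 */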
949
950 /**
951  *      dev_valid_name - check if name is okay for network device
952  *      @name: name string
953  *
954  *      Network device names need to be valid file names
955  *      to allow sysfs to work.  We also disallow any kind of
956  *      whitespace.
957  */
958 bool dev_valid_name(const char *name)
959 {
960         if (*name == '\0')
961                 return false;
962         if (strlen(name) >= IFNAMSIZ)
963                 return false;
964         if (!strcmp(name, ".") || !strcmp(name, ".."))
965                 return false;
966
967         while (*name) {
968                 if (*name == '/' || *name == ':' || isspace(*name))
969                         return false;
970                 name++;
971         }
972         return true;
973 }
974 EXPORT_SYMBOL(dev_valid_name);
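/*
 * Illustrative examples, not part of the original file, of what
 * dev_valid_name() accepts:
 *
 *      dev_valid_name("eth0")    -> true
 *      dev_valid_name("br-lan")  -> true
 *      dev_valid_name("")        -> false  (empty)
 *      dev_valid_name(".")       -> false  (reserved)
 *      dev_valid_name("a/b")     -> false  ('/' not allowed)
 *      dev_valid_name("my if")   -> false  (whitespace)
 */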
975
976 /**
977  *      __dev_alloc_name - allocate a name for a device
978  *      @net: network namespace to allocate the device name in
979  *      @name: name format string
980  *      @buf:  scratch buffer and result name string
981  *
982  *      Passed a format string - eg "lt%d" - it will try to find a suitable
983  *      id. It scans the list of devices to build up a free map, then chooses
984  *      the first empty slot. The caller must hold the dev_base or rtnl lock
985  *      while allocating the name and adding the device in order to avoid
986  *      duplicates.
987  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
988  *      Returns the number of the unit assigned or a negative errno code.
989  */
990
991 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
992 {
993         int i = 0;
994         const char *p;
995         const int max_netdevices = 8*PAGE_SIZE;
996         unsigned long *inuse;
997         struct net_device *d;
998
999         p = strnchr(name, IFNAMSIZ-1, '%');
1000         if (p) {
1001                 /*
1002                  * Verify the string as this thing may have come from
1003                  * the user.  There must be either one "%d" and no other "%"
1004                  * characters.
1005                  */
1006                 if (p[1] != 'd' || strchr(p + 2, '%'))
1007                         return -EINVAL;
1008
1009                 /* Use one page as a bit array of possible slots */
1010                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1011                 if (!inuse)
1012                         return -ENOMEM;
1013
1014                 for_each_netdev(net, d) {
1015                         if (!sscanf(d->name, name, &i))
1016                                 continue;
1017                         if (i < 0 || i >= max_netdevices)
1018                                 continue;
1019
1020                         /*  avoid cases where sscanf is not exact inverse of printf */
1021                         snprintf(buf, IFNAMSIZ, name, i);
1022                         if (!strncmp(buf, d->name, IFNAMSIZ))
1023                                 set_bit(i, inuse);
1024                 }
1025
1026                 i = find_first_zero_bit(inuse, max_netdevices);
1027                 free_page((unsigned long) inuse);
1028         }
1029
1030         if (buf != name)
1031                 snprintf(buf, IFNAMSIZ, name, i);
1032         if (!__dev_get_by_name(net, buf))
1033                 return i;
1034
1035         /* It is possible to run out of possible slots
1036          * when the name is long and there isn't enough space left
1037          * for the digits, or if all bits are used.
1038          */
1039         return -ENFILE;
1040 }
1041
1042 /**
1043  *      dev_alloc_name - allocate a name for a device
1044  *      @dev: device
1045  *      @name: name format string
1046  *
1047  *      Passed a format string - eg "lt%d" - it will try to find a suitable
1048  *      id. It scans the list of devices to build up a free map, then chooses
1049  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1050  *      while allocating the name and adding the device in order to avoid
1051  *      duplicates.
1052  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1053  *      Returns the number of the unit assigned or a negative errno code.
1054  */
1055
1056 int dev_alloc_name(struct net_device *dev, const char *name)
1057 {
1058         char buf[IFNAMSIZ];
1059         struct net *net;
1060         int ret;
1061
1062         BUG_ON(!dev_net(dev));
1063         net = dev_net(dev);
1064         ret = __dev_alloc_name(net, name, buf);
1065         if (ret >= 0)
1066                 strlcpy(dev->name, buf, IFNAMSIZ);
1067         return ret;
1068 }
1069 EXPORT_SYMBOL(dev_alloc_name);
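/*
 * Illustrative sketch, not part of the original file: how a driver typically
 * picks a unit number before registration.  The "example%d" prefix and the
 * helper name are hypothetical; the caller holds rtnl_lock().
 */
static int example_name_device(struct net_device *dev)
{
        int err;

        err = dev_alloc_name(dev, "example%d");
        if (err < 0)
                return err;     /* -EINVAL, -ENOMEM or -ENFILE */

        return 0;               /* dev->name now holds e.g. "example0" */
}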
1070
1071 static int dev_alloc_name_ns(struct net *net,
1072                              struct net_device *dev,
1073                              const char *name)
1074 {
1075         char buf[IFNAMSIZ];
1076         int ret;
1077
1078         ret = __dev_alloc_name(net, name, buf);
1079         if (ret >= 0)
1080                 strlcpy(dev->name, buf, IFNAMSIZ);
1081         return ret;
1082 }
1083
1084 static int dev_get_valid_name(struct net *net,
1085                               struct net_device *dev,
1086                               const char *name)
1087 {
1088         BUG_ON(!net);
1089
1090         if (!dev_valid_name(name))
1091                 return -EINVAL;
1092
1093         if (strchr(name, '%'))
1094                 return dev_alloc_name_ns(net, dev, name);
1095         else if (__dev_get_by_name(net, name))
1096                 return -EEXIST;
1097         else if (dev->name != name)
1098                 strlcpy(dev->name, name, IFNAMSIZ);
1099
1100         return 0;
1101 }
1102
1103 /**
1104  *      dev_change_name - change name of a device
1105  *      @dev: device
1106  *      @newname: name (or format string) must be at least IFNAMSIZ
1107  *
1108  *      Change name of a device; format strings such as "eth%d" can be
1109  *      passed for wildcarding.
1110  */
1111 int dev_change_name(struct net_device *dev, const char *newname)
1112 {
1113         unsigned char old_assign_type;
1114         char oldname[IFNAMSIZ];
1115         int err = 0;
1116         int ret;
1117         struct net *net;
1118
1119         ASSERT_RTNL();
1120         BUG_ON(!dev_net(dev));
1121
1122         net = dev_net(dev);
1123         if (dev->flags & IFF_UP)
1124                 return -EBUSY;
1125
1126         mutex_lock(&devnet_rename_mutex);
1127         __raw_write_seqcount_begin(&devnet_rename_seq);
1128
1129         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1130                 goto outunlock;
1131
1132         memcpy(oldname, dev->name, IFNAMSIZ);
1133
1134         err = dev_get_valid_name(net, dev, newname);
1135         if (err < 0)
1136                 goto outunlock;
1137
1138         if (oldname[0] && !strchr(oldname, '%'))
1139                 netdev_info(dev, "renamed from %s\n", oldname);
1140
1141         old_assign_type = dev->name_assign_type;
1142         dev->name_assign_type = NET_NAME_RENAMED;
1143
1144 rollback:
1145         ret = device_rename(&dev->dev, dev->name);
1146         if (ret) {
1147                 memcpy(dev->name, oldname, IFNAMSIZ);
1148                 dev->name_assign_type = old_assign_type;
1149                 err = ret;
1150                 goto outunlock;
1151         }
1152
1153         __raw_write_seqcount_end(&devnet_rename_seq);
1154         mutex_unlock(&devnet_rename_mutex);
1155
1156         netdev_adjacent_rename_links(dev, oldname);
1157
1158         write_lock_bh(&dev_base_lock);
1159         hlist_del_rcu(&dev->name_hlist);
1160         write_unlock_bh(&dev_base_lock);
1161
1162         synchronize_rcu();
1163
1164         write_lock_bh(&dev_base_lock);
1165         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1166         write_unlock_bh(&dev_base_lock);
1167
1168         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1169         ret = notifier_to_errno(ret);
1170
1171         if (ret) {
1172                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1173                 if (err >= 0) {
1174                         err = ret;
1175                         mutex_lock(&devnet_rename_mutex);
1176                         __raw_write_seqcount_begin(&devnet_rename_seq);
1177                         memcpy(dev->name, oldname, IFNAMSIZ);
1178                         memcpy(oldname, newname, IFNAMSIZ);
1179                         dev->name_assign_type = old_assign_type;
1180                         old_assign_type = NET_NAME_RENAMED;
1181                         goto rollback;
1182                 } else {
1183                         pr_err("%s: name change rollback failed: %d\n",
1184                                dev->name, ret);
1185                 }
1186         }
1187
1188         return err;
1189
1190 outunlock:
1191         __raw_write_seqcount_end(&devnet_rename_seq);
1192         mutex_unlock(&devnet_rename_mutex);
1193         return err;
1194 }
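/*
 * Illustrative sketch, not part of the original file: renaming an interface
 * from kernel code.  The "uplink%d" pattern is hypothetical; the device must
 * be down and the caller takes the RTNL lock around the call.
 */
static int example_rename(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_change_name(dev, "uplink%d");
        rtnl_unlock();

        return err;
}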
1195
1196 /**
1197  *      dev_set_alias - change ifalias of a device
1198  *      @dev: device
1199  *      @alias: name up to IFALIASZ
1200  *      @len: limit of bytes to copy from @alias
1201  *
1202  *      Set ifalias for a device.
1203  */
1204 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1205 {
1206         char *new_ifalias;
1207
1208         ASSERT_RTNL();
1209
1210         if (len >= IFALIASZ)
1211                 return -EINVAL;
1212
1213         if (!len) {
1214                 kfree(dev->ifalias);
1215                 dev->ifalias = NULL;
1216                 return 0;
1217         }
1218
1219         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1220         if (!new_ifalias)
1221                 return -ENOMEM;
1222         dev->ifalias = new_ifalias;
1223
1224         strlcpy(dev->ifalias, alias, len+1);
1225         return len;
1226 }
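/*
 * Illustrative sketch, not part of the original file: attaching a
 * human-readable alias under RTNL.  The alias string is arbitrary.
 */
static int example_set_alias(struct net_device *dev)
{
        static const char alias[] = "uplink to core switch";

        ASSERT_RTNL();
        return dev_set_alias(dev, alias, strlen(alias));
}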
1227
1228
1229 /**
1230  *      netdev_features_change - device changes features
1231  *      @dev: device to cause notification
1232  *
1233  *      Called to indicate a device has changed features.
1234  */
1235 void netdev_features_change(struct net_device *dev)
1236 {
1237         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1238 }
1239 EXPORT_SYMBOL(netdev_features_change);
1240
1241 /**
1242  *      netdev_state_change - device changes state
1243  *      @dev: device to cause notification
1244  *
1245  *      Called to indicate a device has changed state. This function calls
1246  *      the notifier chains for netdev_chain and sends a NEWLINK message
1247  *      to the routing socket.
1248  */
1249 void netdev_state_change(struct net_device *dev)
1250 {
1251         if (dev->flags & IFF_UP) {
1252                 struct netdev_notifier_change_info change_info;
1253
1254                 change_info.flags_changed = 0;
1255                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1256                                               &change_info.info);
1257                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1258         }
1259 }
1260 EXPORT_SYMBOL(netdev_state_change);
1261
1262 /**
1263  *      netdev_notify_peers - notify network peers about existence of @dev
1264  *      @dev: network device
1265  *
1266  * Generate traffic such that interested network peers are aware of
1267  * @dev, such as by generating a gratuitous ARP. This may be used when
1268  * a device wants to inform the rest of the network about some sort of
1269  * reconfiguration such as a failover event or virtual machine
1270  * migration.
1271  */
1272 void netdev_notify_peers(struct net_device *dev)
1273 {
1274         rtnl_lock();
1275         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1276         rtnl_unlock();
1277 }
1278 EXPORT_SYMBOL(netdev_notify_peers);
1279
1280 static int __dev_open(struct net_device *dev)
1281 {
1282         const struct net_device_ops *ops = dev->netdev_ops;
1283         int ret;
1284
1285         ASSERT_RTNL();
1286
1287         if (!netif_device_present(dev))
1288                 return -ENODEV;
1289
1290         /* Block netpoll from trying to do any rx path servicing.
1291          * If we don't do this there is a chance ndo_poll_controller
1292          * or ndo_poll may be running while we open the device
1293          */
1294         netpoll_poll_disable(dev);
1295
1296         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1297         ret = notifier_to_errno(ret);
1298         if (ret)
1299                 return ret;
1300
1301         set_bit(__LINK_STATE_START, &dev->state);
1302
1303         if (ops->ndo_validate_addr)
1304                 ret = ops->ndo_validate_addr(dev);
1305
1306         if (!ret && ops->ndo_open)
1307                 ret = ops->ndo_open(dev);
1308
1309         netpoll_poll_enable(dev);
1310
1311         if (ret)
1312                 clear_bit(__LINK_STATE_START, &dev->state);
1313         else {
1314                 dev->flags |= IFF_UP;
1315                 dev_set_rx_mode(dev);
1316                 dev_activate(dev);
1317                 add_device_randomness(dev->dev_addr, dev->addr_len);
1318         }
1319
1320         return ret;
1321 }
1322
1323 /**
1324  *      dev_open        - prepare an interface for use.
1325  *      @dev:   device to open
1326  *
1327  *      Takes a device from down to up state. The device's private open
1328  *      function is invoked and then the multicast lists are loaded. Finally
1329  *      the device is moved into the up state and a %NETDEV_UP message is
1330  *      sent to the netdev notifier chain.
1331  *
1332  *      Calling this function on an active interface is a nop. On a failure
1333  *      a negative errno code is returned.
1334  */
1335 int dev_open(struct net_device *dev)
1336 {
1337         int ret;
1338
1339         if (dev->flags & IFF_UP)
1340                 return 0;
1341
1342         ret = __dev_open(dev);
1343         if (ret < 0)
1344                 return ret;
1345
1346         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1347         call_netdevice_notifiers(NETDEV_UP, dev);
1348
1349         return ret;
1350 }
1351 EXPORT_SYMBOL(dev_open);
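/*
 * Illustrative sketch, not part of the original file: bringing an interface
 * up from kernel code; the caller takes RTNL, and dev_open() is a no-op if
 * the device is already up.
 */
static int example_bring_up(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_open(dev);
        rtnl_unlock();

        return err;
}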
1352
1353 static int __dev_close_many(struct list_head *head)
1354 {
1355         struct net_device *dev;
1356
1357         ASSERT_RTNL();
1358         might_sleep();
1359
1360         list_for_each_entry(dev, head, close_list) {
1361                 /* Temporarily disable netpoll until the interface is down */
1362                 netpoll_poll_disable(dev);
1363
1364                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1365
1366                 clear_bit(__LINK_STATE_START, &dev->state);
1367
1368                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1369                  * can be even on different cpu. So just clear netif_running().
1370                  *
1371                  * dev->stop() will invoke napi_disable() on all of its
1372                  * napi_struct instances on this device.
1373                  */
1374                 smp_mb__after_atomic(); /* Commit netif_running(). */
1375         }
1376
1377         dev_deactivate_many(head);
1378
1379         list_for_each_entry(dev, head, close_list) {
1380                 const struct net_device_ops *ops = dev->netdev_ops;
1381
1382                 /*
1383                  *      Call the device specific close. This cannot fail.
1384                  *      Only if device is UP
1385                  *
1386                  *      We allow it to be called even after a DETACH hot-plug
1387                  *      event.
1388                  */
1389                 if (ops->ndo_stop)
1390                         ops->ndo_stop(dev);
1391
1392                 dev->flags &= ~IFF_UP;
1393                 netpoll_poll_enable(dev);
1394         }
1395
1396         return 0;
1397 }
1398
1399 static int __dev_close(struct net_device *dev)
1400 {
1401         int retval;
1402         LIST_HEAD(single);
1403
1404         list_add(&dev->close_list, &single);
1405         retval = __dev_close_many(&single);
1406         list_del(&single);
1407
1408         return retval;
1409 }
1410
1411 int dev_close_many(struct list_head *head, bool unlink)
1412 {
1413         struct net_device *dev, *tmp;
1414
1415         /* Remove the devices that don't need to be closed */
1416         list_for_each_entry_safe(dev, tmp, head, close_list)
1417                 if (!(dev->flags & IFF_UP))
1418                         list_del_init(&dev->close_list);
1419
1420         __dev_close_many(head);
1421
1422         list_for_each_entry_safe(dev, tmp, head, close_list) {
1423                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1424                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1425                 if (unlink)
1426                         list_del_init(&dev->close_list);
1427         }
1428
1429         return 0;
1430 }
1431 EXPORT_SYMBOL(dev_close_many);
1432
1433 /**
1434  *      dev_close - shutdown an interface.
1435  *      @dev: device to shutdown
1436  *
1437  *      This function moves an active device into down state. A
1438  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1439  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1440  *      chain.
1441  */
1442 int dev_close(struct net_device *dev)
1443 {
1444         if (dev->flags & IFF_UP) {
1445                 LIST_HEAD(single);
1446
1447                 list_add(&dev->close_list, &single);
1448                 dev_close_many(&single, true);
1449                 list_del(&single);
1450         }
1451         return 0;
1452 }
1453 EXPORT_SYMBOL(dev_close);
1454
1455
1456 /**
1457  *      dev_disable_lro - disable Large Receive Offload on a device
1458  *      @dev: device
1459  *
1460  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1461  *      called under RTNL.  This is needed if received packets may be
1462  *      forwarded to another interface.
1463  */
1464 void dev_disable_lro(struct net_device *dev)
1465 {
1466         struct net_device *lower_dev;
1467         struct list_head *iter;
1468
1469         dev->wanted_features &= ~NETIF_F_LRO;
1470         netdev_update_features(dev);
1471
1472         if (unlikely(dev->features & NETIF_F_LRO))
1473                 netdev_WARN(dev, "failed to disable LRO!\n");
1474
1475         netdev_for_each_lower_dev(dev, lower_dev, iter)
1476                 dev_disable_lro(lower_dev);
1477 }
1478 EXPORT_SYMBOL(dev_disable_lro);
1479
1480 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1481                                    struct net_device *dev)
1482 {
1483         struct netdev_notifier_info info;
1484
1485         netdev_notifier_info_init(&info, dev);
1486         return nb->notifier_call(nb, val, &info);
1487 }
1488
1489 static int dev_boot_phase = 1;
1490
1491 /**
1492  *      register_netdevice_notifier - register a network notifier block
1493  *      @nb: notifier
1494  *
1495  *      Register a notifier to be called when network device events occur.
1496  *      The notifier passed is linked into the kernel structures and must
1497  *      not be reused until it has been unregistered. A negative errno code
1498  *      is returned on a failure.
1499  *
1500  *      When registered, all registration and up events are replayed
1501  *      to the new notifier to allow the caller to have a race-free
1502  *      view of the network device list.
1503  */
1504
1505 int register_netdevice_notifier(struct notifier_block *nb)
1506 {
1507         struct net_device *dev;
1508         struct net_device *last;
1509         struct net *net;
1510         int err;
1511
1512         rtnl_lock();
1513         err = raw_notifier_chain_register(&netdev_chain, nb);
1514         if (err)
1515                 goto unlock;
1516         if (dev_boot_phase)
1517                 goto unlock;
1518         for_each_net(net) {
1519                 for_each_netdev(net, dev) {
1520                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1521                         err = notifier_to_errno(err);
1522                         if (err)
1523                                 goto rollback;
1524
1525                         if (!(dev->flags & IFF_UP))
1526                                 continue;
1527
1528                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1529                 }
1530         }
1531
1532 unlock:
1533         rtnl_unlock();
1534         return err;
1535
1536 rollback:
1537         last = dev;
1538         for_each_net(net) {
1539                 for_each_netdev(net, dev) {
1540                         if (dev == last)
1541                                 goto outroll;
1542
1543                         if (dev->flags & IFF_UP) {
1544                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1545                                                         dev);
1546                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1547                         }
1548                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1549                 }
1550         }
1551
1552 outroll:
1553         raw_notifier_chain_unregister(&netdev_chain, nb);
1554         goto unlock;
1555 }
1556 EXPORT_SYMBOL(register_netdevice_notifier);
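/* Example (illustrative sketch, not part of this file): a minimal notifier
 * that logs NETDEV_UP events.  The names my_netdev_event and my_netdev_nb
 * are hypothetical; only the notifier API calls are real.
 *
 *      static int my_netdev_event(struct notifier_block *nb,
 *                                 unsigned long event, void *ptr)
 *      {
 *              struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *              if (event == NETDEV_UP)
 *                      pr_info("%s is up\n", dev->name);
 *              return NOTIFY_DONE;
 *      }
 *
 *      static struct notifier_block my_netdev_nb = {
 *              .notifier_call = my_netdev_event,
 *      };
 *
 *      err = register_netdevice_notifier(&my_netdev_nb);
 *      ...
 *      unregister_netdevice_notifier(&my_netdev_nb);
 */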
1557
1558 /**
1559  *      unregister_netdevice_notifier - unregister a network notifier block
1560  *      @nb: notifier
1561  *
1562  *      Unregister a notifier previously registered by
1563  *      register_netdevice_notifier(). The notifier is unlinked from the
1564  *      kernel structures and may then be reused. A negative errno code
1565  *      is returned on a failure.
1566  *
1567  *      After unregistering, unregister and down device events are synthesized
1568  *      for all devices on the device list and sent to the removed notifier,
1569  *      removing the need for special-case cleanup code.
1570  */
1571
1572 int unregister_netdevice_notifier(struct notifier_block *nb)
1573 {
1574         struct net_device *dev;
1575         struct net *net;
1576         int err;
1577
1578         rtnl_lock();
1579         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1580         if (err)
1581                 goto unlock;
1582
1583         for_each_net(net) {
1584                 for_each_netdev(net, dev) {
1585                         if (dev->flags & IFF_UP) {
1586                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1587                                                         dev);
1588                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1589                         }
1590                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1591                 }
1592         }
1593 unlock:
1594         rtnl_unlock();
1595         return err;
1596 }
1597 EXPORT_SYMBOL(unregister_netdevice_notifier);
1598
1599 /**
1600  *      call_netdevice_notifiers_info - call all network notifier blocks
1601  *      @val: value passed unmodified to notifier function
1602  *      @dev: net_device pointer passed unmodified to notifier function
1603  *      @info: notifier information data
1604  *
1605  *      Call all network notifier blocks.  Parameters and return value
1606  *      are as for raw_notifier_call_chain().
1607  */
1608
1609 static int call_netdevice_notifiers_info(unsigned long val,
1610                                          struct net_device *dev,
1611                                          struct netdev_notifier_info *info)
1612 {
1613         ASSERT_RTNL();
1614         netdev_notifier_info_init(info, dev);
1615         return raw_notifier_call_chain(&netdev_chain, val, info);
1616 }
1617
1618 /**
1619  *      call_netdevice_notifiers - call all network notifier blocks
1620  *      @val: value passed unmodified to notifier function
1621  *      @dev: net_device pointer passed unmodified to notifier function
1622  *
1623  *      Call all network notifier blocks.  Parameters and return value
1624  *      are as for raw_notifier_call_chain().
1625  */
1626
1627 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1628 {
1629         struct netdev_notifier_info info;
1630
1631         return call_netdevice_notifiers_info(val, dev, &info);
1632 }
1633 EXPORT_SYMBOL(call_netdevice_notifiers);
1634
1635 #ifdef CONFIG_NET_CLS_ACT
1636 static struct static_key ingress_needed __read_mostly;
1637
1638 void net_inc_ingress_queue(void)
1639 {
1640         static_key_slow_inc(&ingress_needed);
1641 }
1642 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1643
1644 void net_dec_ingress_queue(void)
1645 {
1646         static_key_slow_dec(&ingress_needed);
1647 }
1648 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1649 #endif
1650
1651 static struct static_key netstamp_needed __read_mostly;
1652 #ifdef HAVE_JUMP_LABEL
1653 /* We are not allowed to call static_key_slow_dec() from irq context
1654  * If net_disable_timestamp() is called from irq context, defer the
1655  * static_key_slow_dec() calls.
1656  */
1657 static atomic_t netstamp_needed_deferred;
1658 #endif
1659
1660 void net_enable_timestamp(void)
1661 {
1662 #ifdef HAVE_JUMP_LABEL
1663         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1664
1665         if (deferred) {
1666                 while (--deferred)
1667                         static_key_slow_dec(&netstamp_needed);
1668                 return;
1669         }
1670 #endif
1671         static_key_slow_inc(&netstamp_needed);
1672 }
1673 EXPORT_SYMBOL(net_enable_timestamp);
1674
1675 void net_disable_timestamp(void)
1676 {
1677 #ifdef HAVE_JUMP_LABEL
1678         if (in_interrupt()) {
1679                 atomic_inc(&netstamp_needed_deferred);
1680                 return;
1681         }
1682 #endif
1683         static_key_slow_dec(&netstamp_needed);
1684 }
1685 EXPORT_SYMBOL(net_disable_timestamp);
1686
1687 static inline void net_timestamp_set(struct sk_buff *skb)
1688 {
1689         skb->tstamp.tv64 = 0;
1690         if (static_key_false(&netstamp_needed))
1691                 __net_timestamp(skb);
1692 }
1693
1694 #define net_timestamp_check(COND, SKB)                  \
1695         if (static_key_false(&netstamp_needed)) {               \
1696                 if ((COND) && !(SKB)->tstamp.tv64)      \
1697                         __net_timestamp(SKB);           \
1698         }                                               \
1699
1700 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1701 {
1702         unsigned int len;
1703
1704         if (!(dev->flags & IFF_UP))
1705                 return false;
1706
1707         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1708         if (skb->len <= len)
1709                 return true;
1710
1711         /* if TSO is enabled, we don't care about the length as the packet
1712          * could be forwarded without being segmented beforehand
1713          */
1714         if (skb_is_gso(skb))
1715                 return true;
1716
1717         return false;
1718 }
1719 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1720
1721 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1722 {
1723         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1724             unlikely(!is_skb_forwardable(dev, skb))) {
1725                 atomic_long_inc(&dev->rx_dropped);
1726                 kfree_skb(skb);
1727                 return NET_RX_DROP;
1728         }
1729
1730         skb_scrub_packet(skb, true);
1731         skb->priority = 0;
1732         skb->protocol = eth_type_trans(skb, dev);
1733         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1734
1735         return 0;
1736 }
1737 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1738
1739 /**
1740  * dev_forward_skb - loopback an skb to another netif
1741  *
1742  * @dev: destination network device
1743  * @skb: buffer to forward
1744  *
1745  * return values:
1746  *      NET_RX_SUCCESS  (no congestion)
1747  *      NET_RX_DROP     (packet was dropped, but freed)
1748  *
1749  * dev_forward_skb can be used for injecting an skb from the
1750  * start_xmit function of one device into the receive queue
1751  * of another device.
1752  *
1753  * The receiving device may be in another namespace, so
1754  * we have to clear all information in the skb that could
1755  * impact namespace isolation.
1756  */
1757 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1758 {
1759         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1760 }
1761 EXPORT_SYMBOL_GPL(dev_forward_skb);
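/* Example (illustrative sketch): a hypothetical paired device, in the same
 * spirit as veth, handing frames from its ndo_start_xmit to the peer's
 * receive path via dev_forward_skb(); my_priv and peer are made-up names.
 *
 *      static netdev_tx_t my_start_xmit(struct sk_buff *skb,
 *                                       struct net_device *dev)
 *      {
 *              struct my_priv *priv = netdev_priv(dev);
 *
 *              if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
 *                      dev->stats.tx_dropped++;
 *              return NETDEV_TX_OK;
 *      }
 */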
1762
1763 static inline int deliver_skb(struct sk_buff *skb,
1764                               struct packet_type *pt_prev,
1765                               struct net_device *orig_dev)
1766 {
1767         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1768                 return -ENOMEM;
1769         atomic_inc(&skb->users);
1770         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1771 }
1772
1773 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1774                                           struct packet_type **pt,
1775                                           struct net_device *orig_dev,
1776                                           __be16 type,
1777                                           struct list_head *ptype_list)
1778 {
1779         struct packet_type *ptype, *pt_prev = *pt;
1780
1781         list_for_each_entry_rcu(ptype, ptype_list, list) {
1782                 if (ptype->type != type)
1783                         continue;
1784                 if (pt_prev)
1785                         deliver_skb(skb, pt_prev, orig_dev);
1786                 pt_prev = ptype;
1787         }
1788         *pt = pt_prev;
1789 }
1790
1791 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1792 {
1793         if (!ptype->af_packet_priv || !skb->sk)
1794                 return false;
1795
1796         if (ptype->id_match)
1797                 return ptype->id_match(ptype, skb->sk);
1798         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1799                 return true;
1800
1801         return false;
1802 }
1803
1804 /*
1805  *      Support routine. Sends outgoing frames to any network
1806  *      taps currently in use.
1807  */
1808
1809 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1810 {
1811         struct packet_type *ptype;
1812         struct sk_buff *skb2 = NULL;
1813         struct packet_type *pt_prev = NULL;
1814         struct list_head *ptype_list = &ptype_all;
1815
1816         rcu_read_lock();
1817 again:
1818         list_for_each_entry_rcu(ptype, ptype_list, list) {
1819                 /* Never send packets back to the socket
1820                  * they originated from - MvS (miquels@drinkel.ow.org)
1821                  */
1822                 if (skb_loop_sk(ptype, skb))
1823                         continue;
1824
1825                 if (pt_prev) {
1826                         deliver_skb(skb2, pt_prev, skb->dev);
1827                         pt_prev = ptype;
1828                         continue;
1829                 }
1830
1831                 /* need to clone skb, done only once */
1832                 skb2 = skb_clone(skb, GFP_ATOMIC);
1833                 if (!skb2)
1834                         goto out_unlock;
1835
1836                 net_timestamp_set(skb2);
1837
1838                 /* skb->nh should be correctly
1839                  * set by sender, so that the second statement is
1840                  * just protection against buggy protocols.
1841                  */
1842                 skb_reset_mac_header(skb2);
1843
1844                 if (skb_network_header(skb2) < skb2->data ||
1845                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1846                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1847                                              ntohs(skb2->protocol),
1848                                              dev->name);
1849                         skb_reset_network_header(skb2);
1850                 }
1851
1852                 skb2->transport_header = skb2->network_header;
1853                 skb2->pkt_type = PACKET_OUTGOING;
1854                 pt_prev = ptype;
1855         }
1856
1857         if (ptype_list == &ptype_all) {
1858                 ptype_list = &dev->ptype_all;
1859                 goto again;
1860         }
1861 out_unlock:
1862         if (pt_prev)
1863                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1864         rcu_read_unlock();
1865 }
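/* Example (illustrative sketch): the taps serviced above are packet_type
 * entries on ptype_all (or dev->ptype_all), registered via dev_add_pack()
 * with ETH_P_ALL - this is how AF_PACKET sockets see outgoing frames.
 * A hypothetical tap that only counts PACKET_OUTGOING frames; my_tap_rcv,
 * my_tap and tx_seen are made-up names.
 *
 *      static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *                            struct packet_type *pt,
 *                            struct net_device *orig_dev)
 *      {
 *              if (skb->pkt_type == PACKET_OUTGOING)
 *                      atomic_long_inc(&tx_seen);
 *              kfree_skb(skb);
 *              return 0;
 *      }
 *
 *      static struct packet_type my_tap = {
 *              .type = cpu_to_be16(ETH_P_ALL),
 *              .func = my_tap_rcv,
 *      };
 *
 *      dev_add_pack(&my_tap);
 *      ...
 *      dev_remove_pack(&my_tap);
 */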
1866
1867 /**
1868  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1869  * @dev: Network device
1870  * @txq: number of queues available
1871  *
1872  * If real_num_tx_queues is changed the tc mappings may no longer be
1873  * valid. To resolve this verify that each tc mapping remains valid and,
1874  * if not, zero the mapping. With no priorities mapping to an
1875  * offset/count pair it will no longer be used. In the worst case, when
1876  * TC0 is invalid, nothing can be done, so priority mappings are disabled.
1877  * It is expected that drivers will fix this mapping if they can before
1878  * calling netif_set_real_num_tx_queues.
1879  */
1880 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1881 {
1882         int i;
1883         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1884
1885         /* If TC0 is invalidated disable TC mapping */
1886         if (tc->offset + tc->count > txq) {
1887                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1888                 dev->num_tc = 0;
1889                 return;
1890         }
1891
1892         /* Invalidated prio to tc mappings set to TC0 */
1893         for (i = 1; i < TC_BITMASK + 1; i++) {
1894                 int q = netdev_get_prio_tc_map(dev, i);
1895
1896                 tc = &dev->tc_to_txq[q];
1897                 if (tc->offset + tc->count > txq) {
1898                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1899                                 i, q);
1900                         netdev_set_prio_tc_map(dev, i, 0);
1901                 }
1902         }
1903 }
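/* Example (illustrative sketch): how a driver typically establishes the tc
 * mappings that netif_setup_tc() above has to keep consistent - two traffic
 * classes of four queues each, with priorities 0-3 on TC0 and the remaining
 * priorities on TC1.  The queue layout is hypothetical and error handling
 * is omitted.
 *
 *      netdev_set_num_tc(dev, 2);
 *      netdev_set_tc_queue(dev, 0, 4, 0);      (TC0: queues 0-3)
 *      netdev_set_tc_queue(dev, 1, 4, 4);      (TC1: queues 4-7)
 *      for (prio = 0; prio <= TC_BITMASK; prio++)
 *              netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);
 */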
1904
1905 #ifdef CONFIG_XPS
1906 static DEFINE_MUTEX(xps_map_mutex);
1907 #define xmap_dereference(P)             \
1908         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1909
1910 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1911                                         int cpu, u16 index)
1912 {
1913         struct xps_map *map = NULL;
1914         int pos;
1915
1916         if (dev_maps)
1917                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1918
1919         for (pos = 0; map && pos < map->len; pos++) {
1920                 if (map->queues[pos] == index) {
1921                         if (map->len > 1) {
1922                                 map->queues[pos] = map->queues[--map->len];
1923                         } else {
1924                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1925                                 kfree_rcu(map, rcu);
1926                                 map = NULL;
1927                         }
1928                         break;
1929                 }
1930         }
1931
1932         return map;
1933 }
1934
1935 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1936 {
1937         struct xps_dev_maps *dev_maps;
1938         int cpu, i;
1939         bool active = false;
1940
1941         mutex_lock(&xps_map_mutex);
1942         dev_maps = xmap_dereference(dev->xps_maps);
1943
1944         if (!dev_maps)
1945                 goto out_no_maps;
1946
1947         for_each_possible_cpu(cpu) {
1948                 for (i = index; i < dev->num_tx_queues; i++) {
1949                         if (!remove_xps_queue(dev_maps, cpu, i))
1950                                 break;
1951                 }
1952                 if (i == dev->num_tx_queues)
1953                         active = true;
1954         }
1955
1956         if (!active) {
1957                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1958                 kfree_rcu(dev_maps, rcu);
1959         }
1960
1961         for (i = index; i < dev->num_tx_queues; i++)
1962                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1963                                              NUMA_NO_NODE);
1964
1965 out_no_maps:
1966         mutex_unlock(&xps_map_mutex);
1967 }
1968
1969 static struct xps_map *expand_xps_map(struct xps_map *map,
1970                                       int cpu, u16 index)
1971 {
1972         struct xps_map *new_map;
1973         int alloc_len = XPS_MIN_MAP_ALLOC;
1974         int i, pos;
1975
1976         for (pos = 0; map && pos < map->len; pos++) {
1977                 if (map->queues[pos] != index)
1978                         continue;
1979                 return map;
1980         }
1981
1982         /* Need to add queue to this CPU's existing map */
1983         if (map) {
1984                 if (pos < map->alloc_len)
1985                         return map;
1986
1987                 alloc_len = map->alloc_len * 2;
1988         }
1989
1990         /* Need to allocate a new map to store the queue in this CPU's map */
1991         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1992                                cpu_to_node(cpu));
1993         if (!new_map)
1994                 return NULL;
1995
1996         for (i = 0; i < pos; i++)
1997                 new_map->queues[i] = map->queues[i];
1998         new_map->alloc_len = alloc_len;
1999         new_map->len = pos;
2000
2001         return new_map;
2002 }
2003
2004 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2005                         u16 index)
2006 {
2007         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2008         struct xps_map *map, *new_map;
2009         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2010         int cpu, numa_node_id = -2;
2011         bool active = false;
2012
2013         mutex_lock(&xps_map_mutex);
2014
2015         dev_maps = xmap_dereference(dev->xps_maps);
2016
2017         /* allocate memory for queue storage */
2018         for_each_online_cpu(cpu) {
2019                 if (!cpumask_test_cpu(cpu, mask))
2020                         continue;
2021
2022                 if (!new_dev_maps)
2023                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2024                 if (!new_dev_maps) {
2025                         mutex_unlock(&xps_map_mutex);
2026                         return -ENOMEM;
2027                 }
2028
2029                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2030                                  NULL;
2031
2032                 map = expand_xps_map(map, cpu, index);
2033                 if (!map)
2034                         goto error;
2035
2036                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2037         }
2038
2039         if (!new_dev_maps)
2040                 goto out_no_new_maps;
2041
2042         for_each_possible_cpu(cpu) {
2043                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2044                         /* add queue to CPU maps */
2045                         int pos = 0;
2046
2047                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2048                         while ((pos < map->len) && (map->queues[pos] != index))
2049                                 pos++;
2050
2051                         if (pos == map->len)
2052                                 map->queues[map->len++] = index;
2053 #ifdef CONFIG_NUMA
2054                         if (numa_node_id == -2)
2055                                 numa_node_id = cpu_to_node(cpu);
2056                         else if (numa_node_id != cpu_to_node(cpu))
2057                                 numa_node_id = -1;
2058 #endif
2059                 } else if (dev_maps) {
2060                         /* fill in the new device map from the old device map */
2061                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2062                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2063                 }
2064
2065         }
2066
2067         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2068
2069         /* Cleanup old maps */
2070         if (dev_maps) {
2071                 for_each_possible_cpu(cpu) {
2072                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2073                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2074                         if (map && map != new_map)
2075                                 kfree_rcu(map, rcu);
2076                 }
2077
2078                 kfree_rcu(dev_maps, rcu);
2079         }
2080
2081         dev_maps = new_dev_maps;
2082         active = true;
2083
2084 out_no_new_maps:
2085         /* update Tx queue numa node */
2086         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2087                                      (numa_node_id >= 0) ? numa_node_id :
2088                                      NUMA_NO_NODE);
2089
2090         if (!dev_maps)
2091                 goto out_no_maps;
2092
2093         /* removes queue from unused CPUs */
2094         for_each_possible_cpu(cpu) {
2095                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2096                         continue;
2097
2098                 if (remove_xps_queue(dev_maps, cpu, index))
2099                         active = true;
2100         }
2101
2102         /* free map if not active */
2103         if (!active) {
2104                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2105                 kfree_rcu(dev_maps, rcu);
2106         }
2107
2108 out_no_maps:
2109         mutex_unlock(&xps_map_mutex);
2110
2111         return 0;
2112 error:
2113         /* remove any maps that we added */
2114         for_each_possible_cpu(cpu) {
2115                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2116                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2117                                  NULL;
2118                 if (new_map && new_map != map)
2119                         kfree(new_map);
2120         }
2121
2122         mutex_unlock(&xps_map_mutex);
2123
2124         kfree(new_dev_maps);
2125         return -ENOMEM;
2126 }
2127 EXPORT_SYMBOL(netif_set_xps_queue);
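/* Example (illustrative sketch): a driver spreading its TX queues across
 * the online CPUs, one CPU per queue; "nqueues" and the on-stack cpumask
 * are hypothetical and error handling is omitted.
 *
 *      for (i = 0; i < nqueues; i++) {
 *              cpumask_clear(&mask);
 *              cpumask_set_cpu(i % num_online_cpus(), &mask);
 *              netif_set_xps_queue(dev, &mask, i);
 *      }
 *
 * The same mapping can be changed from user space by writing a CPU mask to
 * /sys/class/net/<dev>/queues/tx-<n>/xps_cpus.
 */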
2128
2129 #endif
2130 /*
2131  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2132  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2133  */
2134 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2135 {
2136         int rc;
2137
2138         if (txq < 1 || txq > dev->num_tx_queues)
2139                 return -EINVAL;
2140
2141         if (dev->reg_state == NETREG_REGISTERED ||
2142             dev->reg_state == NETREG_UNREGISTERING) {
2143                 ASSERT_RTNL();
2144
2145                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2146                                                   txq);
2147                 if (rc)
2148                         return rc;
2149
2150                 if (dev->num_tc)
2151                         netif_setup_tc(dev, txq);
2152
2153                 if (txq < dev->real_num_tx_queues) {
2154                         qdisc_reset_all_tx_gt(dev, txq);
2155 #ifdef CONFIG_XPS
2156                         netif_reset_xps_queues_gt(dev, txq);
2157 #endif
2158                 }
2159         }
2160
2161         dev->real_num_tx_queues = txq;
2162         return 0;
2163 }
2164 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2165
2166 #ifdef CONFIG_SYSFS
2167 /**
2168  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2169  *      @dev: Network device
2170  *      @rxq: Actual number of RX queues
2171  *
2172  *      This must be called either with the rtnl_lock held or before
2173  *      registration of the net device.  Returns 0 on success, or a
2174  *      negative error code.  If called before registration, it always
2175  *      succeeds.
2176  */
2177 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2178 {
2179         int rc;
2180
2181         if (rxq < 1 || rxq > dev->num_rx_queues)
2182                 return -EINVAL;
2183
2184         if (dev->reg_state == NETREG_REGISTERED) {
2185                 ASSERT_RTNL();
2186
2187                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2188                                                   rxq);
2189                 if (rc)
2190                         return rc;
2191         }
2192
2193         dev->real_num_rx_queues = rxq;
2194         return 0;
2195 }
2196 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2197 #endif
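/* Example (illustrative sketch): a multiqueue driver that allocated the
 * maximum number of queues at alloc_etherdev_mq() time but only exposes
 * the queue pairs its hardware actually provides; "hw_queues" is
 * hypothetical.
 *
 *      rtnl_lock();
 *      err = netif_set_real_num_tx_queues(dev, hw_queues);
 *      if (!err)
 *              err = netif_set_real_num_rx_queues(dev, hw_queues);
 *      rtnl_unlock();
 *
 * Before register_netdevice() the same calls may be made without holding
 * the RTNL lock, as noted for the RX variant above.
 */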
2198
2199 /**
2200  * netif_get_num_default_rss_queues - default number of RSS queues
2201  *
2202  * This routine returns the default upper limit on the number of RSS
2203  * queues used by multiqueue devices.
2204  */
2205 int netif_get_num_default_rss_queues(void)
2206 {
2207         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2208 }
2209 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2210
2211 static inline void __netif_reschedule(struct Qdisc *q)
2212 {
2213         struct softnet_data *sd;
2214         unsigned long flags;
2215
2216         local_irq_save(flags);
2217         sd = this_cpu_ptr(&softnet_data);
2218         q->next_sched = NULL;
2219         *sd->output_queue_tailp = q;
2220         sd->output_queue_tailp = &q->next_sched;
2221         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2222         local_irq_restore(flags);
2223         preempt_check_resched_rt();
2224 }
2225
2226 void __netif_schedule(struct Qdisc *q)
2227 {
2228         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2229                 __netif_reschedule(q);
2230 }
2231 EXPORT_SYMBOL(__netif_schedule);
2232
2233 struct dev_kfree_skb_cb {
2234         enum skb_free_reason reason;
2235 };
2236
2237 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2238 {
2239         return (struct dev_kfree_skb_cb *)skb->cb;
2240 }
2241
2242 void netif_schedule_queue(struct netdev_queue *txq)
2243 {
2244         rcu_read_lock();
2245         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2246                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2247
2248                 __netif_schedule(q);
2249         }
2250         rcu_read_unlock();
2251 }
2252 EXPORT_SYMBOL(netif_schedule_queue);
2253
2254 /**
2255  *      netif_wake_subqueue - allow sending packets on subqueue
2256  *      @dev: network device
2257  *      @queue_index: sub queue index
2258  *
2259  * Resume individual transmit queue of a device with multiple transmit queues.
2260  */
2261 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2262 {
2263         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2264
2265         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2266                 struct Qdisc *q;
2267
2268                 rcu_read_lock();
2269                 q = rcu_dereference(txq->qdisc);
2270                 __netif_schedule(q);
2271                 rcu_read_unlock();
2272         }
2273 }
2274 EXPORT_SYMBOL(netif_wake_subqueue);
2275
2276 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2277 {
2278         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2279                 struct Qdisc *q;
2280
2281                 rcu_read_lock();
2282                 q = rcu_dereference(dev_queue->qdisc);
2283                 __netif_schedule(q);
2284                 rcu_read_unlock();
2285         }
2286 }
2287 EXPORT_SYMBOL(netif_tx_wake_queue);
2288
2289 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2290 {
2291         unsigned long flags;
2292
2293         if (likely(atomic_read(&skb->users) == 1)) {
2294                 smp_rmb();
2295                 atomic_set(&skb->users, 0);
2296         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2297                 return;
2298         }
2299         get_kfree_skb_cb(skb)->reason = reason;
2300         local_irq_save(flags);
2301         skb->next = __this_cpu_read(softnet_data.completion_queue);
2302         __this_cpu_write(softnet_data.completion_queue, skb);
2303         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2304         local_irq_restore(flags);
2305         preempt_check_resched_rt();
2306 }
2307 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2308
2309 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2310 {
2311         if (in_irq() || irqs_disabled())
2312                 __dev_kfree_skb_irq(skb, reason);
2313         else
2314                 dev_kfree_skb(skb);
2315 }
2316 EXPORT_SYMBOL(__dev_kfree_skb_any);
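/* Example (illustrative sketch): a TX completion handler that may run in
 * hard-IRQ context and therefore uses the _any variants, which end up in
 * __dev_kfree_skb_irq() above when IRQs are off; the ring helper is
 * hypothetical.
 *
 *      while ((skb = my_ring_next_completed(ring)) != NULL) {
 *              dev->stats.tx_packets++;
 *              dev->stats.tx_bytes += skb->len;
 *              dev_consume_skb_any(skb);       (successfully transmitted)
 *      }
 *
 * dev_kfree_skb_any() is the matching helper for frames that are being
 * dropped rather than consumed.
 */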
2317
2318
2319 /**
2320  * netif_device_detach - mark device as removed
2321  * @dev: network device
2322  *
2323  * Mark device as removed from system and therefore no longer available.
2324  */
2325 void netif_device_detach(struct net_device *dev)
2326 {
2327         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2328             netif_running(dev)) {
2329                 netif_tx_stop_all_queues(dev);
2330         }
2331 }
2332 EXPORT_SYMBOL(netif_device_detach);
2333
2334 /**
2335  * netif_device_attach - mark device as attached
2336  * @dev: network device
2337  *
2338  * Mark device as attached from system and restart if needed.
2339  */
2340 void netif_device_attach(struct net_device *dev)
2341 {
2342         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2343             netif_running(dev)) {
2344                 netif_tx_wake_all_queues(dev);
2345                 __netdev_watchdog_up(dev);
2346         }
2347 }
2348 EXPORT_SYMBOL(netif_device_attach);
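/* Example (illustrative sketch): the usual suspend/resume pairing of
 * detach/attach in a driver; my_hw_suspend() and my_hw_resume() are
 * hypothetical.
 *
 *      static int my_suspend(struct device *d)
 *      {
 *              struct net_device *dev = dev_get_drvdata(d);
 *
 *              netif_device_detach(dev);
 *              my_hw_suspend(dev);
 *              return 0;
 *      }
 *
 *      static int my_resume(struct device *d)
 *      {
 *              struct net_device *dev = dev_get_drvdata(d);
 *
 *              my_hw_resume(dev);
 *              netif_device_attach(dev);
 *              return 0;
 *      }
 */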
2349
2350 static void skb_warn_bad_offload(const struct sk_buff *skb)
2351 {
2352         static const netdev_features_t null_features = 0;
2353         struct net_device *dev = skb->dev;
2354         const char *driver = "";
2355
2356         if (!net_ratelimit())
2357                 return;
2358
2359         if (dev && dev->dev.parent)
2360                 driver = dev_driver_string(dev->dev.parent);
2361
2362         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2363              "gso_type=%d ip_summed=%d\n",
2364              driver, dev ? &dev->features : &null_features,
2365              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2366              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2367              skb_shinfo(skb)->gso_type, skb->ip_summed);
2368 }
2369
2370 /*
2371  * Invalidate hardware checksum when packet is to be mangled, and
2372  * complete checksum manually on outgoing path.
2373  */
2374 int skb_checksum_help(struct sk_buff *skb)
2375 {
2376         __wsum csum;
2377         int ret = 0, offset;
2378
2379         if (skb->ip_summed == CHECKSUM_COMPLETE)
2380                 goto out_set_summed;
2381
2382         if (unlikely(skb_shinfo(skb)->gso_size)) {
2383                 skb_warn_bad_offload(skb);
2384                 return -EINVAL;
2385         }
2386
2387         /* Before computing a checksum, we should make sure no frag could
2388          * be modified by an external entity: the checksum could be wrong.
2389          */
2390         if (skb_has_shared_frag(skb)) {
2391                 ret = __skb_linearize(skb);
2392                 if (ret)
2393                         goto out;
2394         }
2395
2396         offset = skb_checksum_start_offset(skb);
2397         BUG_ON(offset >= skb_headlen(skb));
2398         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2399
2400         offset += skb->csum_offset;
2401         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2402
2403         if (skb_cloned(skb) &&
2404             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2405                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2406                 if (ret)
2407                         goto out;
2408         }
2409
2410         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2411 out_set_summed:
2412         skb->ip_summed = CHECKSUM_NONE;
2413 out:
2414         return ret;
2415 }
2416 EXPORT_SYMBOL(skb_checksum_help);
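/* Example (illustrative sketch): an xmit path falling back to a software
 * checksum when the hardware cannot checksum this particular protocol,
 * mirroring what validate_xmit_skb() does further down; my_hw_can_csum()
 * is hypothetical.
 *
 *      if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *          !my_hw_can_csum(skb) &&
 *          skb_checksum_help(skb))
 *              goto drop;
 */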
2417
2418 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2419 {
2420         __be16 type = skb->protocol;
2421
2422         /* Tunnel gso handlers can set protocol to ethernet. */
2423         if (type == htons(ETH_P_TEB)) {
2424                 struct ethhdr *eth;
2425
2426                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2427                         return 0;
2428
2429                 eth = (struct ethhdr *)skb_mac_header(skb);
2430                 type = eth->h_proto;
2431         }
2432
2433         return __vlan_get_protocol(skb, type, depth);
2434 }
2435
2436 /**
2437  *      skb_mac_gso_segment - mac layer segmentation handler.
2438  *      @skb: buffer to segment
2439  *      @features: features for the output path (see dev->features)
2440  */
2441 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2442                                     netdev_features_t features)
2443 {
2444         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2445         struct packet_offload *ptype;
2446         int vlan_depth = skb->mac_len;
2447         __be16 type = skb_network_protocol(skb, &vlan_depth);
2448
2449         if (unlikely(!type))
2450                 return ERR_PTR(-EINVAL);
2451
2452         __skb_pull(skb, vlan_depth);
2453
2454         rcu_read_lock();
2455         list_for_each_entry_rcu(ptype, &offload_base, list) {
2456                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2457                         segs = ptype->callbacks.gso_segment(skb, features);
2458                         break;
2459                 }
2460         }
2461         rcu_read_unlock();
2462
2463         __skb_push(skb, skb->data - skb_mac_header(skb));
2464
2465         return segs;
2466 }
2467 EXPORT_SYMBOL(skb_mac_gso_segment);
2468
2469
2470 /* openvswitch calls this on rx path, so we need a different check.
2471  */
2472 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2473 {
2474         if (tx_path)
2475                 return skb->ip_summed != CHECKSUM_PARTIAL;
2476         else
2477                 return skb->ip_summed == CHECKSUM_NONE;
2478 }
2479
2480 /**
2481  *      __skb_gso_segment - Perform segmentation on skb.
2482  *      @skb: buffer to segment
2483  *      @features: features for the output path (see dev->features)
2484  *      @tx_path: whether it is called in TX path
2485  *
2486  *      This function segments the given skb and returns a list of segments.
2487  *
2488  *      It may return NULL if the skb requires no segmentation.  This is
2489  *      only possible when GSO is used for verifying header integrity.
2490  */
2491 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2492                                   netdev_features_t features, bool tx_path)
2493 {
2494         if (unlikely(skb_needs_check(skb, tx_path))) {
2495                 int err;
2496
2497                 skb_warn_bad_offload(skb);
2498
2499                 err = skb_cow_head(skb, 0);
2500                 if (err < 0)
2501                         return ERR_PTR(err);
2502         }
2503
2504         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2505         SKB_GSO_CB(skb)->encap_level = 0;
2506
2507         skb_reset_mac_header(skb);
2508         skb_reset_mac_len(skb);
2509
2510         return skb_mac_gso_segment(skb, features);
2511 }
2512 EXPORT_SYMBOL(__skb_gso_segment);
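/* Example (illustrative sketch): segmenting an oversized skb and walking
 * the resulting list, roughly what validate_xmit_skb() below does before
 * handing segments to the driver; my_xmit_one() is hypothetical.
 *
 *      segs = skb_gso_segment(skb, features);
 *      if (IS_ERR(segs))
 *              goto drop;
 *      if (segs) {
 *              consume_skb(skb);
 *              while (segs) {
 *                      struct sk_buff *next = segs->next;
 *
 *                      segs->next = NULL;
 *                      my_xmit_one(segs);
 *                      segs = next;
 *              }
 *      }
 */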
2513
2514 /* Take action when hardware reception checksum errors are detected. */
2515 #ifdef CONFIG_BUG
2516 void netdev_rx_csum_fault(struct net_device *dev)
2517 {
2518         if (net_ratelimit()) {
2519                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2520                 dump_stack();
2521         }
2522 }
2523 EXPORT_SYMBOL(netdev_rx_csum_fault);
2524 #endif
2525
2526 /* Actually, we should eliminate this check as soon as we know that:
2527  * 1. An IOMMU is present and allows mapping all the memory.
2528  * 2. No high memory really exists on this machine.
2529  */
2530
2531 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2532 {
2533 #ifdef CONFIG_HIGHMEM
2534         int i;
2535         if (!(dev->features & NETIF_F_HIGHDMA)) {
2536                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2537                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2538                         if (PageHighMem(skb_frag_page(frag)))
2539                                 return 1;
2540                 }
2541         }
2542
2543         if (PCI_DMA_BUS_IS_PHYS) {
2544                 struct device *pdev = dev->dev.parent;
2545
2546                 if (!pdev)
2547                         return 0;
2548                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2549                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2550                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2551                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2552                                 return 1;
2553                 }
2554         }
2555 #endif
2556         return 0;
2557 }
2558
2559 /* For an MPLS offload request, verify we are testing hardware MPLS features
2560  * instead of standard features for the netdev.
2561  */
2562 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2563 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2564                                            netdev_features_t features,
2565                                            __be16 type)
2566 {
2567         if (eth_p_mpls(type))
2568                 features &= skb->dev->mpls_features;
2569
2570         return features;
2571 }
2572 #else
2573 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2574                                            netdev_features_t features,
2575                                            __be16 type)
2576 {
2577         return features;
2578 }
2579 #endif
2580
2581 static netdev_features_t harmonize_features(struct sk_buff *skb,
2582         netdev_features_t features)
2583 {
2584         int tmp;
2585         __be16 type;
2586
2587         type = skb_network_protocol(skb, &tmp);
2588         features = net_mpls_features(skb, features, type);
2589
2590         if (skb->ip_summed != CHECKSUM_NONE &&
2591             !can_checksum_protocol(features, type)) {
2592                 features &= ~NETIF_F_ALL_CSUM;
2593         } else if (illegal_highdma(skb->dev, skb)) {
2594                 features &= ~NETIF_F_SG;
2595         }
2596
2597         return features;
2598 }
2599
2600 netdev_features_t passthru_features_check(struct sk_buff *skb,
2601                                           struct net_device *dev,
2602                                           netdev_features_t features)
2603 {
2604         return features;
2605 }
2606 EXPORT_SYMBOL(passthru_features_check);
2607
2608 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2609                                              struct net_device *dev,
2610                                              netdev_features_t features)
2611 {
2612         return vlan_features_check(skb, features);
2613 }
2614
2615 netdev_features_t netif_skb_features(struct sk_buff *skb)
2616 {
2617         struct net_device *dev = skb->dev;
2618         netdev_features_t features = dev->features;
2619         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2620
2621         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2622                 features &= ~NETIF_F_GSO_MASK;
2623
2624         /* For an encapsulation offload request, verify we are testing
2625          * hardware encapsulation features instead of standard
2626          * features for the netdev.
2627          */
2628         if (skb->encapsulation)
2629                 features &= dev->hw_enc_features;
2630
2631         if (skb_vlan_tagged(skb))
2632                 features = netdev_intersect_features(features,
2633                                                      dev->vlan_features |
2634                                                      NETIF_F_HW_VLAN_CTAG_TX |
2635                                                      NETIF_F_HW_VLAN_STAG_TX);
2636
2637         if (dev->netdev_ops->ndo_features_check)
2638                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2639                                                                 features);
2640         else
2641                 features &= dflt_features_check(skb, dev, features);
2642
2643         return harmonize_features(skb, features);
2644 }
2645 EXPORT_SYMBOL(netif_skb_features);
2646
2647 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2648                     struct netdev_queue *txq, bool more)
2649 {
2650         unsigned int len;
2651         int rc;
2652
2653         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2654                 dev_queue_xmit_nit(skb, dev);
2655
2656         len = skb->len;
2657         trace_net_dev_start_xmit(skb, dev);
2658         rc = netdev_start_xmit(skb, dev, txq, more);
2659         trace_net_dev_xmit(skb, rc, dev, len);
2660
2661         return rc;
2662 }
2663
2664 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2665                                     struct netdev_queue *txq, int *ret)
2666 {
2667         struct sk_buff *skb = first;
2668         int rc = NETDEV_TX_OK;
2669
2670         while (skb) {
2671                 struct sk_buff *next = skb->next;
2672
2673                 skb->next = NULL;
2674                 rc = xmit_one(skb, dev, txq, next != NULL);
2675                 if (unlikely(!dev_xmit_complete(rc))) {
2676                         skb->next = next;
2677                         goto out;
2678                 }
2679
2680                 skb = next;
2681                 if (netif_xmit_stopped(txq) && skb) {
2682                         rc = NETDEV_TX_BUSY;
2683                         break;
2684                 }
2685         }
2686
2687 out:
2688         *ret = rc;
2689         return skb;
2690 }
2691
2692 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2693                                           netdev_features_t features)
2694 {
2695         if (skb_vlan_tag_present(skb) &&
2696             !vlan_hw_offload_capable(features, skb->vlan_proto))
2697                 skb = __vlan_hwaccel_push_inside(skb);
2698         return skb;
2699 }
2700
2701 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2702 {
2703         netdev_features_t features;
2704
2705         if (skb->next)
2706                 return skb;
2707
2708         features = netif_skb_features(skb);
2709         skb = validate_xmit_vlan(skb, features);
2710         if (unlikely(!skb))
2711                 goto out_null;
2712
2713         if (netif_needs_gso(skb, features)) {
2714                 struct sk_buff *segs;
2715
2716                 segs = skb_gso_segment(skb, features);
2717                 if (IS_ERR(segs)) {
2718                         goto out_kfree_skb;
2719                 } else if (segs) {
2720                         consume_skb(skb);
2721                         skb = segs;
2722                 }
2723         } else {
2724                 if (skb_needs_linearize(skb, features) &&
2725                     __skb_linearize(skb))
2726                         goto out_kfree_skb;
2727
2728                 /* If packet is not checksummed and device does not
2729                  * support checksumming for this protocol, complete
2730                  * checksumming here.
2731                  */
2732                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2733                         if (skb->encapsulation)
2734                                 skb_set_inner_transport_header(skb,
2735                                                                skb_checksum_start_offset(skb));
2736                         else
2737                                 skb_set_transport_header(skb,
2738                                                          skb_checksum_start_offset(skb));
2739                         if (!(features & NETIF_F_ALL_CSUM) &&
2740                             skb_checksum_help(skb))
2741                                 goto out_kfree_skb;
2742                 }
2743         }
2744
2745         return skb;
2746
2747 out_kfree_skb:
2748         kfree_skb(skb);
2749 out_null:
2750         return NULL;
2751 }
2752
2753 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2754 {
2755         struct sk_buff *next, *head = NULL, *tail;
2756
2757         for (; skb != NULL; skb = next) {
2758                 next = skb->next;
2759                 skb->next = NULL;
2760
2761                 /* in case skb won't be segmented, point to itself */
2762                 skb->prev = skb;
2763
2764                 skb = validate_xmit_skb(skb, dev);
2765                 if (!skb)
2766                         continue;
2767
2768                 if (!head)
2769                         head = skb;
2770                 else
2771                         tail->next = skb;
2772                 /* If skb was segmented, skb->prev points to
2773                  * the last segment. If not, it still contains skb.
2774                  */
2775                 tail = skb->prev;
2776         }
2777         return head;
2778 }
2779
2780 static void qdisc_pkt_len_init(struct sk_buff *skb)
2781 {
2782         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2783
2784         qdisc_skb_cb(skb)->pkt_len = skb->len;
2785
2786         /* To get more precise estimation of bytes sent on wire,
2787          * we add to pkt_len the headers size of all segments
2788          */
2789         if (shinfo->gso_size)  {
2790                 unsigned int hdr_len;
2791                 u16 gso_segs = shinfo->gso_segs;
2792
2793                 /* mac layer + network layer */
2794                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2795
2796                 /* + transport layer */
2797                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2798                         hdr_len += tcp_hdrlen(skb);
2799                 else
2800                         hdr_len += sizeof(struct udphdr);
2801
2802                 if (shinfo->gso_type & SKB_GSO_DODGY)
2803                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2804                                                 shinfo->gso_size);
2805
2806                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2807         }
2808 }
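/* Worked example (illustrative): a TSO skb with skb->len = 65214,
 * gso_size = 1448 and gso_segs = 45, carrying 54 bytes of mac+ip+tcp
 * headers, gets pkt_len = 65214 + 44 * 54 = 67590 - the same byte count
 * that the 45 individual segments will occupy on the wire.
 */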
2809
2810 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2811                                  struct net_device *dev,
2812                                  struct netdev_queue *txq)
2813 {
2814         spinlock_t *root_lock = qdisc_lock(q);
2815         bool contended;
2816         int rc;
2817
2818         qdisc_pkt_len_init(skb);
2819         qdisc_calculate_pkt_len(skb, q);
2820         /*
2821          * Heuristic to force contended enqueues to serialize on a
2822          * separate lock before trying to get qdisc main lock.
2823          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2824          * often and dequeue packets faster.
2825          */
2826         contended = qdisc_is_running(q);
2827         if (unlikely(contended))
2828                 spin_lock(&q->busylock);
2829
2830         spin_lock(root_lock);
2831         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2832                 kfree_skb(skb);
2833                 rc = NET_XMIT_DROP;
2834         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2835                    qdisc_run_begin(q)) {
2836                 /*
2837                  * This is a work-conserving queue; there are no old skbs
2838                  * waiting to be sent out; and the qdisc is not running -
2839                  * xmit the skb directly.
2840                  */
2841
2842                 qdisc_bstats_update(q, skb);
2843
2844                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2845                         if (unlikely(contended)) {
2846                                 spin_unlock(&q->busylock);
2847                                 contended = false;
2848                         }
2849                         __qdisc_run(q);
2850                 } else
2851                         qdisc_run_end(q);
2852
2853                 rc = NET_XMIT_SUCCESS;
2854         } else {
2855                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2856                 if (qdisc_run_begin(q)) {
2857                         if (unlikely(contended)) {
2858                                 spin_unlock(&q->busylock);
2859                                 contended = false;
2860                         }
2861                         __qdisc_run(q);
2862                 }
2863         }
2864         spin_unlock(root_lock);
2865         if (unlikely(contended))
2866                 spin_unlock(&q->busylock);
2867         return rc;
2868 }
2869
2870 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2871 static void skb_update_prio(struct sk_buff *skb)
2872 {
2873         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2874
2875         if (!skb->priority && skb->sk && map) {
2876                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2877
2878                 if (prioidx < map->priomap_len)
2879                         skb->priority = map->priomap[prioidx];
2880         }
2881 }
2882 #else
2883 #define skb_update_prio(skb)
2884 #endif
2885
2886 DEFINE_PER_CPU(int, xmit_recursion);
2887 EXPORT_SYMBOL(xmit_recursion);
2888
2889 #define RECURSION_LIMIT 10
2890
2891 /**
2892  *      dev_loopback_xmit - loop back @skb
2893  *      @skb: buffer to transmit
2894  */
2895 int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
2896 {
2897         skb_reset_mac_header(skb);
2898         __skb_pull(skb, skb_network_offset(skb));
2899         skb->pkt_type = PACKET_LOOPBACK;
2900         skb->ip_summed = CHECKSUM_UNNECESSARY;
2901         WARN_ON(!skb_dst(skb));
2902         skb_dst_force(skb);
2903         netif_rx_ni(skb);
2904         return 0;
2905 }
2906 EXPORT_SYMBOL(dev_loopback_xmit);
2907
2908 /**
2909  *      __dev_queue_xmit - transmit a buffer
2910  *      @skb: buffer to transmit
2911  *      @accel_priv: private data used for L2 forwarding offload
2912  *
2913  *      Queue a buffer for transmission to a network device. The caller must
2914  *      have set the device and priority and built the buffer before calling
2915  *      this function. The function can be called from an interrupt.
2916  *
2917  *      A negative errno code is returned on a failure. A success does not
2918  *      guarantee the frame will be transmitted as it may be dropped due
2919  *      to congestion or traffic shaping.
2920  *
2921  * -----------------------------------------------------------------------------------
2922  *      I notice this method can also return errors from the queue disciplines,
2923  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2924  *      be positive.
2925  *
2926  *      Regardless of the return value, the skb is consumed, so it is currently
2927  *      difficult to retry a send to this method.  (You can bump the ref count
2928  *      before sending to hold a reference for retry if you are careful.)
2929  *
2930  *      When calling this method, interrupts MUST be enabled.  This is because
2931  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2932  *          --BLG
2933  */
2934 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2935 {
2936         struct net_device *dev = skb->dev;
2937         struct netdev_queue *txq;
2938         struct Qdisc *q;
2939         int rc = -ENOMEM;
2940
2941         skb_reset_mac_header(skb);
2942
2943         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2944                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2945
2946         /* Disable soft irqs for various locks below. Also
2947          * stops preemption for RCU.
2948          */
2949         rcu_read_lock_bh();
2950
2951         skb_update_prio(skb);
2952
2953         /* If device/qdisc don't need skb->dst, release it right now while
2954          * it's hot in this CPU's cache.
2955          */
2956         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2957                 skb_dst_drop(skb);
2958         else
2959                 skb_dst_force(skb);
2960
2961         txq = netdev_pick_tx(dev, skb, accel_priv);
2962         q = rcu_dereference_bh(txq->qdisc);
2963
2964 #ifdef CONFIG_NET_CLS_ACT
2965         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2966 #endif
2967         trace_net_dev_queue(skb);
2968         if (q->enqueue) {
2969                 rc = __dev_xmit_skb(skb, q, dev, txq);
2970                 goto out;
2971         }
2972
2973         /* The device has no queue. Common case for software devices:
2974            loopback, all the sorts of tunnels...
2975
2976            Really, it is unlikely that netif_tx_lock protection is necessary
2977            here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2978            counters.)
2979            However, it is possible that they rely on the protection
2980            made by us here.
2981
2982            Check this and take the lock; it is not prone to deadlocks.
2983            Either that, or shoot the noqueue qdisc - it is even simpler 8)
2984          */
2985         if (dev->flags & IFF_UP) {
2986                 int cpu = smp_processor_id(); /* ok because BHs are off */
2987
2988                 if (txq->xmit_lock_owner != cpu) {
2989
2990                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2991                                 goto recursion_alert;
2992
2993                         skb = validate_xmit_skb(skb, dev);
2994                         if (!skb)
2995                                 goto drop;
2996
2997                         HARD_TX_LOCK(dev, txq, cpu);
2998
2999                         if (!netif_xmit_stopped(txq)) {
3000                                 __this_cpu_inc(xmit_recursion);
3001                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3002                                 __this_cpu_dec(xmit_recursion);
3003                                 if (dev_xmit_complete(rc)) {
3004                                         HARD_TX_UNLOCK(dev, txq);
3005                                         goto out;
3006                                 }
3007                         }
3008                         HARD_TX_UNLOCK(dev, txq);
3009                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3010                                              dev->name);
3011                 } else {
3012                         /* Recursion is detected! It is possible,
3013                          * unfortunately
3014                          */
3015 recursion_alert:
3016                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3017                                              dev->name);
3018                 }
3019         }
3020
3021         rc = -ENETDOWN;
3022 drop:
3023         rcu_read_unlock_bh();
3024
3025         atomic_long_inc(&dev->tx_dropped);
3026         kfree_skb_list(skb);
3027         return rc;
3028 out:
3029         rcu_read_unlock_bh();
3030         return rc;
3031 }
3032
3033 int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
3034 {
3035         return __dev_queue_xmit(skb, NULL);
3036 }
3037 EXPORT_SYMBOL(dev_queue_xmit_sk);
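/* Example (illustrative sketch): transmitting a pre-built frame from inside
 * the kernel via the dev_queue_xmit() wrapper, much like the ARP/ND output
 * paths do.  ETH_P_MY_PROTO and dest_hw are hypothetical; remember that the
 * skb is consumed whatever the return value is.
 *
 *      skb->dev = dev;
 *      skb->protocol = htons(ETH_P_MY_PROTO);
 *      if (dev_hard_header(skb, dev, ntohs(skb->protocol),
 *                          dest_hw, dev->dev_addr, skb->len) < 0)
 *              goto drop;
 *      dev_queue_xmit(skb);
 */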
3038
3039 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3040 {
3041         return __dev_queue_xmit(skb, accel_priv);
3042 }
3043 EXPORT_SYMBOL(dev_queue_xmit_accel);
3044
3045
3046 /*=======================================================================
3047                         Receiver routines
3048   =======================================================================*/
3049
3050 int netdev_max_backlog __read_mostly = 1000;
3051 EXPORT_SYMBOL(netdev_max_backlog);
3052
3053 int netdev_tstamp_prequeue __read_mostly = 1;
3054 int netdev_budget __read_mostly = 300;
3055 int weight_p __read_mostly = 64;            /* old backlog weight */
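/* These tunables are exposed through sysctl; they typically appear as
 * net.core.netdev_max_backlog, net.core.netdev_tstamp_prequeue,
 * net.core.netdev_budget and net.core.dev_weight (i.e. the files under
 * /proc/sys/net/core/), so e.g. raising netdev_max_backlog grows the
 * per-CPU input backlog limit checked on the receive path below.
 */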
3056
3057 /* Called with irq disabled */
3058 static inline void ____napi_schedule(struct softnet_data *sd,
3059                                      struct napi_struct *napi)
3060 {
3061         list_add_tail(&napi->poll_list, &sd->poll_list);
3062         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3063 }
3064
3065 #ifdef CONFIG_RPS
3066
3067 /* One global table that all flow-based protocols share. */
3068 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3069 EXPORT_SYMBOL(rps_sock_flow_table);
3070 u32 rps_cpu_mask __read_mostly;
3071 EXPORT_SYMBOL(rps_cpu_mask);
3072
3073 struct static_key rps_needed __read_mostly;
3074
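/* Record @next_cpu in the per-rx-queue flow table entry for this flow.
 * With CONFIG_RFS_ACCEL and a NETIF_F_NTUPLE capable device, also ask the
 * driver (via ndo_rx_flow_steer()) to move the hardware filter for this
 * flow to the RX queue that is mapped to @next_cpu.
 */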
3075 static struct rps_dev_flow *
3076 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3077             struct rps_dev_flow *rflow, u16 next_cpu)
3078 {
3079         if (next_cpu < nr_cpu_ids) {
3080 #ifdef CONFIG_RFS_ACCEL
3081                 struct netdev_rx_queue *rxqueue;
3082                 struct rps_dev_flow_table *flow_table;
3083                 struct rps_dev_flow *old_rflow;
3084                 u32 flow_id;
3085                 u16 rxq_index;
3086                 int rc;
3087
3088                 /* Should we steer this flow to a different hardware queue? */
3089                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3090                     !(dev->features & NETIF_F_NTUPLE))
3091                         goto out;
3092                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3093                 if (rxq_index == skb_get_rx_queue(skb))
3094                         goto out;
3095
3096                 rxqueue = dev->_rx + rxq_index;
3097                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3098                 if (!flow_table)
3099                         goto out;
3100                 flow_id = skb_get_hash(skb) & flow_table->mask;
3101                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3102                                                         rxq_index, flow_id);
3103                 if (rc < 0)
3104                         goto out;
3105                 old_rflow = rflow;
3106                 rflow = &flow_table->flows[flow_id];
3107                 rflow->filter = rc;
3108                 if (old_rflow->filter == rflow->filter)
3109                         old_rflow->filter = RPS_NO_FILTER;
3110         out:
3111 #endif
3112                 rflow->last_qtail =
3113                         per_cpu(softnet_data, next_cpu).input_queue_head;
3114         }
3115
3116         rflow->cpu = next_cpu;
3117         return rflow;
3118 }
3119
3120 /*
3121  * get_rps_cpu is called from netif_receive_skb and returns the target
3122  * CPU from the RPS map of the receiving queue for a given skb.
3123  * rcu_read_lock must be held on entry.
3124  */
3125 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3126                        struct rps_dev_flow **rflowp)
3127 {
3128         const struct rps_sock_flow_table *sock_flow_table;
3129         struct netdev_rx_queue *rxqueue = dev->_rx;
3130         struct rps_dev_flow_table *flow_table;
3131         struct rps_map *map;
3132         int cpu = -1;
3133         u32 tcpu;
3134         u32 hash;
3135
3136         if (skb_rx_queue_recorded(skb)) {
3137                 u16 index = skb_get_rx_queue(skb);
3138
3139                 if (unlikely(index >= dev->real_num_rx_queues)) {
3140                         WARN_ONCE(dev->real_num_rx_queues > 1,
3141                                   "%s received packet on queue %u, but number "
3142                                   "of RX queues is %u\n",
3143                                   dev->name, index, dev->real_num_rx_queues);
3144                         goto done;
3145                 }
3146                 rxqueue += index;
3147         }
3148
3149         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3150
3151         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3152         map = rcu_dereference(rxqueue->rps_map);
3153         if (!flow_table && !map)
3154                 goto done;
3155
3156         skb_reset_network_header(skb);
3157         hash = skb_get_hash(skb);
3158         if (!hash)
3159                 goto done;
3160
3161         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3162         if (flow_table && sock_flow_table) {
3163                 struct rps_dev_flow *rflow;
3164                 u32 next_cpu;
3165                 u32 ident;
3166
3167                 /* First check the global flow table for a match */
3168                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3169                 if ((ident ^ hash) & ~rps_cpu_mask)
3170                         goto try_rps;
3171
3172                 next_cpu = ident & rps_cpu_mask;
3173
3174                 /* OK, now we know there is a match,
3175                  * we can look at the local (per receive queue) flow table
3176                  */
3177                 rflow = &flow_table->flows[hash & flow_table->mask];
3178                 tcpu = rflow->cpu;
3179
3180                 /*
3181                  * If the desired CPU (where last recvmsg was done) is
3182                  * different from current CPU (one in the rx-queue flow
3183                  * table entry), switch if one of the following holds:
3184                  *   - Current CPU is unset (>= nr_cpu_ids).
3185                  *   - Current CPU is offline.
3186                  *   - The current CPU's queue tail has advanced beyond the
3187                  *     last packet that was enqueued using this table entry.
3188                  *     This guarantees that all previous packets for the flow
3189                  *     have been dequeued, thus preserving in order delivery.
3190                  */
3191                 if (unlikely(tcpu != next_cpu) &&
3192                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3193                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3194                       rflow->last_qtail)) >= 0)) {
3195                         tcpu = next_cpu;
3196                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3197                 }
3198
3199                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3200                         *rflowp = rflow;
3201                         cpu = tcpu;
3202                         goto done;
3203                 }
3204         }
3205
3206 try_rps:
3207
3208         if (map) {
3209                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3210                 if (cpu_online(tcpu)) {
3211                         cpu = tcpu;
3212                         goto done;
3213                 }
3214         }
3215
3216 done:
3217         return cpu;
3218 }
3219
3220 #ifdef CONFIG_RFS_ACCEL
3221
3222 /**
3223  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3224  * @dev: Device on which the filter was set
3225  * @rxq_index: RX queue index
3226  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3227  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3228  *
3229  * Drivers that implement ndo_rx_flow_steer() should periodically call
3230  * this function for each installed filter and remove the filters for
3231  * which it returns %true.
3232  */
3233 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3234                          u32 flow_id, u16 filter_id)
3235 {
3236         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3237         struct rps_dev_flow_table *flow_table;
3238         struct rps_dev_flow *rflow;
3239         bool expire = true;
3240         unsigned int cpu;
3241
3242         rcu_read_lock();
3243         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3244         if (flow_table && flow_id <= flow_table->mask) {
3245                 rflow = &flow_table->flows[flow_id];
3246                 cpu = ACCESS_ONCE(rflow->cpu);
3247                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3248                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3249                            rflow->last_qtail) <
3250                      (int)(10 * flow_table->mask)))
3251                         expire = false;
3252         }
3253         rcu_read_unlock();
3254         return expire;
3255 }
3256 EXPORT_SYMBOL(rps_may_expire_flow);
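
/* A minimal sketch of the expected caller pattern (the adapter/filter names
 * are hypothetical, not defined in this file): a driver implementing
 * ndo_rx_flow_steer() would periodically walk its installed filters and do
 *
 *	if (rps_may_expire_flow(adapter->netdev, filter->rxq_index,
 *				filter->flow_id, filter->filter_id))
 *		my_remove_hw_filter(adapter, filter);
 *
 * so that stale hardware steering entries are eventually reclaimed.
 */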
3257
3258 #endif /* CONFIG_RFS_ACCEL */
3259
3260 /* Called from hardirq (IPI) context */
3261 static void rps_trigger_softirq(void *data)
3262 {
3263         struct softnet_data *sd = data;
3264
3265         ____napi_schedule(sd, &sd->backlog);
3266         sd->received_rps++;
3267 }
3268
3269 #endif /* CONFIG_RPS */
3270
3271 /*
3272  * Check if this softnet_data structure belongs to another CPU.
3273  * If yes, queue it on our IPI list and return 1.
3274  * If not, return 0.
3275  */
3276 static int rps_ipi_queued(struct softnet_data *sd)
3277 {
3278 #ifdef CONFIG_RPS
3279         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3280
3281         if (sd != mysd) {
3282                 sd->rps_ipi_next = mysd->rps_ipi_list;
3283                 mysd->rps_ipi_list = sd;
3284
3285                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3286                 return 1;
3287         }
3288 #endif /* CONFIG_RPS */
3289         return 0;
3290 }
3291
3292 #ifdef CONFIG_NET_FLOW_LIMIT
3293 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3294 #endif
3295
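/* Per-flow backlog limiting (CONFIG_NET_FLOW_LIMIT): once the backlog is at
 * least half full, track the last FLOW_LIMIT_HISTORY enqueued flows on this
 * CPU and report "drop" when a single flow accounts for more than half of
 * that history, so one heavy flow cannot monopolise the remaining space.
 */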
3296 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3297 {
3298 #ifdef CONFIG_NET_FLOW_LIMIT
3299         struct sd_flow_limit *fl;
3300         struct softnet_data *sd;
3301         unsigned int old_flow, new_flow;
3302
3303         if (qlen < (netdev_max_backlog >> 1))
3304                 return false;
3305
3306         sd = this_cpu_ptr(&softnet_data);
3307
3308         rcu_read_lock();
3309         fl = rcu_dereference(sd->flow_limit);
3310         if (fl) {
3311                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3312                 old_flow = fl->history[fl->history_head];
3313                 fl->history[fl->history_head] = new_flow;
3314
3315                 fl->history_head++;
3316                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3317
3318                 if (likely(fl->buckets[old_flow]))
3319                         fl->buckets[old_flow]--;
3320
3321                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3322                         fl->count++;
3323                         rcu_read_unlock();
3324                         return true;
3325                 }
3326         }
3327         rcu_read_unlock();
3328 #endif
3329         return false;
3330 }
3331
3332 /*
3333  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3334  * queue (may be a remote CPU queue).
3335  */
3336 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3337                               unsigned int *qtail)
3338 {
3339         struct softnet_data *sd;
3340         unsigned long flags;
3341         unsigned int qlen;
3342
3343         sd = &per_cpu(softnet_data, cpu);
3344
3345         local_irq_save(flags);
3346
3347         rps_lock(sd);
3348         if (!netif_running(skb->dev))
3349                 goto drop;
3350         qlen = skb_queue_len(&sd->input_pkt_queue);
3351         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3352                 if (qlen) {
3353 enqueue:
3354                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3355                         input_queue_tail_incr_save(sd, qtail);
3356                         rps_unlock(sd);
3357                         local_irq_restore(flags);
3358                         return NET_RX_SUCCESS;
3359                 }
3360
3361                 /* Schedule NAPI for the backlog device.
3362                  * We can use a non-atomic operation since we own the queue lock.
3363                  */
3364                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3365                         if (!rps_ipi_queued(sd))
3366                                 ____napi_schedule(sd, &sd->backlog);
3367                 }
3368                 goto enqueue;
3369         }
3370
3371 drop:
3372         sd->dropped++;
3373         rps_unlock(sd);
3374
3375         local_irq_restore(flags);
3376         preempt_check_resched_rt();
3377
3378         atomic_long_inc(&skb->dev->rx_dropped);
3379         kfree_skb(skb);
3380         return NET_RX_DROP;
3381 }
3382
3383 static int netif_rx_internal(struct sk_buff *skb)
3384 {
3385         int ret;
3386
3387         net_timestamp_check(netdev_tstamp_prequeue, skb);
3388
3389         trace_netif_rx(skb);
3390 #ifdef CONFIG_RPS
3391         if (static_key_false(&rps_needed)) {
3392                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3393                 int cpu;
3394
3395                 migrate_disable();
3396                 rcu_read_lock();
3397
3398                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3399                 if (cpu < 0)
3400                         cpu = smp_processor_id();
3401
3402                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3403
3404                 rcu_read_unlock();
3405                 migrate_enable();
3406         } else
3407 #endif
3408         {
3409                 unsigned int qtail;
3410                 ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
3411                 put_cpu_light();
3412         }
3413         return ret;
3414 }
3415
3416 /**
3417  *      netif_rx        -       post buffer to the network code
3418  *      @skb: buffer to post
3419  *
3420  *      This function receives a packet from a device driver and queues it for
3421  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3422  *      may be dropped during processing for congestion control or by the
3423  *      protocol layers.
3424  *
3425  *      return values:
3426  *      NET_RX_SUCCESS  (no congestion)
3427  *      NET_RX_DROP     (packet was dropped)
3428  *
3429  */
3430
3431 int netif_rx(struct sk_buff *skb)
3432 {
3433         trace_netif_rx_entry(skb);
3434
3435         return netif_rx_internal(skb);
3436 }
3437 EXPORT_SYMBOL(netif_rx);
3438
3439 int netif_rx_ni(struct sk_buff *skb)
3440 {
3441         int err;
3442
3443         trace_netif_rx_ni_entry(skb);
3444
3445         local_bh_disable();
3446         err = netif_rx_internal(skb);
3447         local_bh_enable();
3448
3449         return err;
3450 }
3451 EXPORT_SYMBOL(netif_rx_ni);
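
/* A minimal usage sketch: a driver that has built a complete skb hands it
 * to the stack with
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *
 * from hard-irq or BH context, while code running in process context with
 * BHs enabled should use netif_rx_ni() so the softirq raised here gets a
 * chance to run.
 */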
3452
3453 #ifdef CONFIG_PREEMPT_RT_FULL
3454 /*
3455  * RT runs ksoftirqd as a real time thread and the root_lock is a
3456  * "sleeping spinlock". If the trylock fails then we can go into an
3457  * infinite loop when ksoftirqd preempted the task which actually
3458  * holds the lock, because we requeue q and raise NET_TX softirq
3459  * causing ksoftirqd to loop forever.
3460  *
3461  * It's safe to use spin_lock on RT here as softirqs run in thread
3462  * context and cannot deadlock against the thread which is holding
3463  * root_lock.
3464  *
3465  * On !RT the trylock might fail, but there we bail out from the
3466  * softirq loop after 10 attempts which we can't do on RT. And the
3467  * task holding root_lock cannot be preempted, so the only downside of
3468  * that trylock is that we need 10 loops to decide that we should have
3469  * given up in the first one :)
3470  */
3471 static inline int take_root_lock(spinlock_t *lock)
3472 {
3473         spin_lock(lock);
3474         return 1;
3475 }
3476 #else
3477 static inline int take_root_lock(spinlock_t *lock)
3478 {
3479         return spin_trylock(lock);
3480 }
3481 #endif
3482
3483 static void net_tx_action(struct softirq_action *h)
3484 {
3485         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3486
3487         if (sd->completion_queue) {
3488                 struct sk_buff *clist;
3489
3490                 local_irq_disable();
3491                 clist = sd->completion_queue;
3492                 sd->completion_queue = NULL;
3493                 local_irq_enable();
3494
3495                 while (clist) {
3496                         struct sk_buff *skb = clist;
3497                         clist = clist->next;
3498
3499                         WARN_ON(atomic_read(&skb->users));
3500                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3501                                 trace_consume_skb(skb);
3502                         else
3503                                 trace_kfree_skb(skb, net_tx_action);
3504                         __kfree_skb(skb);
3505                 }
3506         }
3507
3508         if (sd->output_queue) {
3509                 struct Qdisc *head;
3510
3511                 local_irq_disable();
3512                 head = sd->output_queue;
3513                 sd->output_queue = NULL;
3514                 sd->output_queue_tailp = &sd->output_queue;
3515                 local_irq_enable();
3516
3517                 while (head) {
3518                         struct Qdisc *q = head;
3519                         spinlock_t *root_lock;
3520
3521                         head = head->next_sched;
3522
3523                         root_lock = qdisc_lock(q);
3524                         if (take_root_lock(root_lock)) {
3525                                 smp_mb__before_atomic();
3526                                 clear_bit(__QDISC_STATE_SCHED,
3527                                           &q->state);
3528                                 qdisc_run(q);
3529                                 spin_unlock(root_lock);
3530                         } else {
3531                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3532                                               &q->state)) {
3533                                         __netif_reschedule(q);
3534                                 } else {
3535                                         smp_mb__before_atomic();
3536                                         clear_bit(__QDISC_STATE_SCHED,
3537                                                   &q->state);
3538                                 }
3539                         }
3540                 }
3541         }
3542 }
3543
3544 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3545     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3546 /* This hook is defined here for ATM LANE */
3547 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3548                              unsigned char *addr) __read_mostly;
3549 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3550 #endif
3551
3552 #ifdef CONFIG_NET_CLS_ACT
3553 /* TODO: Maybe we should just force sch_ingress to be compiled in
3554  * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless instructions
3555  * (a compare and 2 extra stores) right now if we don't have it on
3556  * but do have CONFIG_NET_CLS_ACT.
3557  * NOTE: This doesn't stop any functionality; if you don't have
3558  * the ingress scheduler, you just can't add policies on ingress.
3559  *
3560  */
3561 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3562 {
3563         struct net_device *dev = skb->dev;
3564         u32 ttl = G_TC_RTTL(skb->tc_verd);
3565         int result = TC_ACT_OK;
3566         struct Qdisc *q;
3567
3568         if (unlikely(MAX_RED_LOOP < ttl++)) {
3569                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3570                                      skb->skb_iif, dev->ifindex);
3571                 return TC_ACT_SHOT;
3572         }
3573
3574         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3575         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3576
3577         q = rcu_dereference(rxq->qdisc);
3578         if (q != &noop_qdisc) {
3579                 spin_lock(qdisc_lock(q));
3580                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3581                         result = qdisc_enqueue_root(skb, q);
3582                 spin_unlock(qdisc_lock(q));
3583         }
3584
3585         return result;
3586 }
3587
3588 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3589                                          struct packet_type **pt_prev,
3590                                          int *ret, struct net_device *orig_dev)
3591 {
3592         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3593
3594         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3595                 return skb;
3596
3597         if (*pt_prev) {
3598                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3599                 *pt_prev = NULL;
3600         }
3601
3602         switch (ing_filter(skb, rxq)) {
3603         case TC_ACT_SHOT:
3604         case TC_ACT_STOLEN:
3605                 kfree_skb(skb);
3606                 return NULL;
3607         }
3608
3609         return skb;
3610 }
3611 #endif
3612
3613 /**
3614  *      netdev_rx_handler_register - register receive handler
3615  *      @dev: device to register a handler for
3616  *      @rx_handler: receive handler to register
3617  *      @rx_handler_data: data pointer that is used by rx handler
3618  *
3619  *      Register a receive handler for a device. This handler will then be
3620  *      called from __netif_receive_skb. A negative errno code is returned
3621  *      on a failure.
3622  *
3623  *      The caller must hold the rtnl_mutex.
3624  *
3625  *      For a general description of rx_handler, see enum rx_handler_result.
3626  */
3627 int netdev_rx_handler_register(struct net_device *dev,
3628                                rx_handler_func_t *rx_handler,
3629                                void *rx_handler_data)
3630 {
3631         ASSERT_RTNL();
3632
3633         if (dev->rx_handler)
3634                 return -EBUSY;
3635
3636         /* Note: rx_handler_data must be set before rx_handler */
3637         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3638         rcu_assign_pointer(dev->rx_handler, rx_handler);
3639
3640         return 0;
3641 }
3642 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
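
/* A minimal sketch of the expected usage (my_rx_handler and my_priv are
 * hypothetical): an upper device such as a bridge or bonding driver, while
 * holding RTNL, attaches itself to a lower device with
 *
 *	err = netdev_rx_handler_register(lower_dev, my_rx_handler, my_priv);
 *
 * my_rx_handler() then returns one of the rx_handler_result codes
 * (RX_HANDLER_CONSUMED, RX_HANDLER_ANOTHER, RX_HANDLER_EXACT or
 * RX_HANDLER_PASS) that __netif_receive_skb_core() acts on below.
 */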
3643
3644 /**
3645  *      netdev_rx_handler_unregister - unregister receive handler
3646  *      @dev: device to unregister a handler from
3647  *
3648  *      Unregister a receive handler from a device.
3649  *
3650  *      The caller must hold the rtnl_mutex.
3651  */
3652 void netdev_rx_handler_unregister(struct net_device *dev)
3653 {
3654
3655         ASSERT_RTNL();
3656         RCU_INIT_POINTER(dev->rx_handler, NULL);
3657         /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3658          * section is guaranteed to see a non-NULL rx_handler_data
3659          * as well.
3660          */
3661         synchronize_net();
3662         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3663 }
3664 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3665
3666 /*
3667  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3668  * the special handling of PFMEMALLOC skbs.
3669  */
3670 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3671 {
3672         switch (skb->protocol) {
3673         case htons(ETH_P_ARP):
3674         case htons(ETH_P_IP):
3675         case htons(ETH_P_IPV6):
3676         case htons(ETH_P_8021Q):
3677         case htons(ETH_P_8021AD):
3678                 return true;
3679         default:
3680                 return false;
3681         }
3682 }
3683
3684 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3685 {
3686         struct packet_type *ptype, *pt_prev;
3687         rx_handler_func_t *rx_handler;
3688         struct net_device *orig_dev;
3689         bool deliver_exact = false;
3690         int ret = NET_RX_DROP;
3691         __be16 type;
3692
3693         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3694
3695         trace_netif_receive_skb(skb);
3696
3697         orig_dev = skb->dev;
3698
3699         skb_reset_network_header(skb);
3700         if (!skb_transport_header_was_set(skb))
3701                 skb_reset_transport_header(skb);
3702         skb_reset_mac_len(skb);
3703
3704         pt_prev = NULL;
3705
3706 another_round:
3707         skb->skb_iif = skb->dev->ifindex;
3708
3709         __this_cpu_inc(softnet_data.processed);
3710
3711         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3712             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3713                 skb = skb_vlan_untag(skb);
3714                 if (unlikely(!skb))
3715                         goto out;
3716         }
3717
3718 #ifdef CONFIG_NET_CLS_ACT
3719         if (skb->tc_verd & TC_NCLS) {
3720                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3721                 goto ncls;
3722         }
3723 #endif
3724
3725         if (pfmemalloc)
3726                 goto skip_taps;
3727
3728         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3729                 if (pt_prev)
3730                         ret = deliver_skb(skb, pt_prev, orig_dev);
3731                 pt_prev = ptype;
3732         }
3733
3734         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3735                 if (pt_prev)
3736                         ret = deliver_skb(skb, pt_prev, orig_dev);
3737                 pt_prev = ptype;
3738         }
3739
3740 skip_taps:
3741 #ifdef CONFIG_NET_CLS_ACT
3742         if (static_key_false(&ingress_needed)) {
3743                 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3744                 if (!skb)
3745                         goto out;
3746         }
3747
3748         skb->tc_verd = 0;
3749 ncls:
3750 #endif
3751         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3752                 goto drop;
3753
3754         if (skb_vlan_tag_present(skb)) {
3755                 if (pt_prev) {
3756                         ret = deliver_skb(skb, pt_prev, orig_dev);
3757                         pt_prev = NULL;
3758                 }
3759                 if (vlan_do_receive(&skb))
3760                         goto another_round;
3761                 else if (unlikely(!skb))
3762                         goto out;
3763         }
3764
3765         rx_handler = rcu_dereference(skb->dev->rx_handler);
3766         if (rx_handler) {
3767                 if (pt_prev) {
3768                         ret = deliver_skb(skb, pt_prev, orig_dev);
3769                         pt_prev = NULL;
3770                 }
3771                 switch (rx_handler(&skb)) {
3772                 case RX_HANDLER_CONSUMED:
3773                         ret = NET_RX_SUCCESS;
3774                         goto out;
3775                 case RX_HANDLER_ANOTHER:
3776                         goto another_round;
3777                 case RX_HANDLER_EXACT:
3778                         deliver_exact = true;
3779                 case RX_HANDLER_PASS:
3780                         break;
3781                 default:
3782                         BUG();
3783                 }
3784         }
3785
3786         if (unlikely(skb_vlan_tag_present(skb))) {
3787                 if (skb_vlan_tag_get_id(skb))
3788                         skb->pkt_type = PACKET_OTHERHOST;
3789                 /* Note: we might in the future use prio bits
3790                  * and set skb->priority like in vlan_do_receive().
3791                  * For the time being, just ignore the Priority Code Point.
3792                  */
3793                 skb->vlan_tci = 0;
3794         }
3795
3796         type = skb->protocol;
3797
3798         /* deliver only exact match when indicated */
3799         if (likely(!deliver_exact)) {
3800                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3801                                        &ptype_base[ntohs(type) &
3802                                                    PTYPE_HASH_MASK]);
3803         }
3804
3805         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3806                                &orig_dev->ptype_specific);
3807
3808         if (unlikely(skb->dev != orig_dev)) {
3809                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3810                                        &skb->dev->ptype_specific);
3811         }
3812
3813         if (pt_prev) {
3814                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3815                         goto drop;
3816                 else
3817                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3818         } else {
3819 drop:
3820                 atomic_long_inc(&skb->dev->rx_dropped);
3821                 kfree_skb(skb);
3822                 /* Jamal, now you will not be able to escape explaining
3823                  * to me how you were going to use this. :-)
3824                  */
3825                 ret = NET_RX_DROP;
3826         }
3827
3828 out:
3829         return ret;
3830 }
3831
3832 static int __netif_receive_skb(struct sk_buff *skb)
3833 {
3834         int ret;
3835
3836         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3837                 unsigned long pflags = current->flags;
3838
3839                 /*
3840                  * PFMEMALLOC skbs are special, they should
3841                  * - be delivered to SOCK_MEMALLOC sockets only
3842                  * - stay away from userspace
3843                  * - have bounded memory usage
3844                  *
3845                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3846                  * context down to all allocation sites.
3847                  */
3848                 current->flags |= PF_MEMALLOC;
3849                 ret = __netif_receive_skb_core(skb, true);
3850                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3851         } else
3852                 ret = __netif_receive_skb_core(skb, false);
3853
3854         return ret;
3855 }
3856
3857 static int netif_receive_skb_internal(struct sk_buff *skb)
3858 {
3859         int ret;
3860
3861         net_timestamp_check(netdev_tstamp_prequeue, skb);
3862
3863         if (skb_defer_rx_timestamp(skb))
3864                 return NET_RX_SUCCESS;
3865
3866         rcu_read_lock();
3867
3868 #ifdef CONFIG_RPS
3869         if (static_key_false(&rps_needed)) {
3870                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3871                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
3872
3873                 if (cpu >= 0) {
3874                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3875                         rcu_read_unlock();
3876                         return ret;
3877                 }
3878         }
3879 #endif
3880         ret = __netif_receive_skb(skb);
3881         rcu_read_unlock();
3882         return ret;
3883 }
3884
3885 /**
3886  *      netif_receive_skb - process receive buffer from network
3887  *      @skb: buffer to process
3888  *
3889  *      netif_receive_skb() is the main receive data processing function.
3890  *      It always succeeds. The buffer may be dropped during processing
3891  *      for congestion control or by the protocol layers.
3892  *
3893  *      This function may only be called from softirq context and interrupts
3894  *      should be enabled.
3895  *
3896  *      Return values (usually ignored):
3897  *      NET_RX_SUCCESS: no congestion
3898  *      NET_RX_DROP: packet was dropped
3899  */
3900 int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
3901 {
3902         trace_netif_receive_skb_entry(skb);
3903
3904         return netif_receive_skb_internal(skb);
3905 }
3906 EXPORT_SYMBOL(netif_receive_skb_sk);
3907
3908 /* Network device is going away, flush any packets still pending
3909  * Called with irqs disabled.
3910  */
3911 static void flush_backlog(void *arg)
3912 {
3913         struct net_device *dev = arg;
3914         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3915         struct sk_buff *skb, *tmp;
3916
3917         rps_lock(sd);
3918         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3919                 if (skb->dev == dev) {
3920                         __skb_unlink(skb, &sd->input_pkt_queue);
3921                         __skb_queue_tail(&sd->tofree_queue, skb);
3922                         input_queue_head_incr(sd);
3923                 }
3924         }
3925         rps_unlock(sd);
3926
3927         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3928                 if (skb->dev == dev) {
3929                         __skb_unlink(skb, &sd->process_queue);
3930                         __skb_queue_tail(&sd->tofree_queue, skb);
3931                         input_queue_head_incr(sd);
3932                 }
3933         }
3934
3935         if (!skb_queue_empty(&sd->tofree_queue))
3936                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
3937 }
3938
3939 static int napi_gro_complete(struct sk_buff *skb)
3940 {
3941         struct packet_offload *ptype;
3942         __be16 type = skb->protocol;
3943         struct list_head *head = &offload_base;
3944         int err = -ENOENT;
3945
3946         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3947
3948         if (NAPI_GRO_CB(skb)->count == 1) {
3949                 skb_shinfo(skb)->gso_size = 0;
3950                 goto out;
3951         }
3952
3953         rcu_read_lock();
3954         list_for_each_entry_rcu(ptype, head, list) {
3955                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3956                         continue;
3957
3958                 err = ptype->callbacks.gro_complete(skb, 0);
3959                 break;
3960         }
3961         rcu_read_unlock();
3962
3963         if (err) {
3964                 WARN_ON(&ptype->list == head);
3965                 kfree_skb(skb);
3966                 return NET_RX_SUCCESS;
3967         }
3968
3969 out:
3970         return netif_receive_skb_internal(skb);
3971 }
3972
3973 /* napi->gro_list contains packets ordered by age,
3974  * youngest packets at the head of it.
3975  * Complete skbs in reverse order to reduce latencies.
3976  */
3977 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3978 {
3979         struct sk_buff *skb, *prev = NULL;
3980
3981         /* scan list and build reverse chain */
3982         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3983                 skb->prev = prev;
3984                 prev = skb;
3985         }
3986
3987         for (skb = prev; skb; skb = prev) {
3988                 skb->next = NULL;
3989
3990                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3991                         return;
3992
3993                 prev = skb->prev;
3994                 napi_gro_complete(skb);
3995                 napi->gro_count--;
3996         }
3997
3998         napi->gro_list = NULL;
3999 }
4000 EXPORT_SYMBOL(napi_gro_flush);
4001
4002 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4003 {
4004         struct sk_buff *p;
4005         unsigned int maclen = skb->dev->hard_header_len;
4006         u32 hash = skb_get_hash_raw(skb);
4007
4008         for (p = napi->gro_list; p; p = p->next) {
4009                 unsigned long diffs;
4010
4011                 NAPI_GRO_CB(p)->flush = 0;
4012
4013                 if (hash != skb_get_hash_raw(p)) {
4014                         NAPI_GRO_CB(p)->same_flow = 0;
4015                         continue;
4016                 }
4017
4018                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4019                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4020                 if (maclen == ETH_HLEN)
4021                         diffs |= compare_ether_header(skb_mac_header(p),
4022                                                       skb_mac_header(skb));
4023                 else if (!diffs)
4024                         diffs = memcmp(skb_mac_header(p),
4025                                        skb_mac_header(skb),
4026                                        maclen);
4027                 NAPI_GRO_CB(p)->same_flow = !diffs;
4028         }
4029 }
4030
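/* Prime the GRO frag0 fast path: if the skb has no linear payload (the MAC
 * header ends exactly at skb->tail) and its first fragment is not in
 * highmem, point NAPI_GRO_CB(skb)->frag0 at that fragment so gro_receive
 * handlers can look at headers without pulling them into the linear area.
 */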
4031 static void skb_gro_reset_offset(struct sk_buff *skb)
4032 {
4033         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4034         const skb_frag_t *frag0 = &pinfo->frags[0];
4035
4036         NAPI_GRO_CB(skb)->data_offset = 0;
4037         NAPI_GRO_CB(skb)->frag0 = NULL;
4038         NAPI_GRO_CB(skb)->frag0_len = 0;
4039
4040         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4041             pinfo->nr_frags &&
4042             !PageHighMem(skb_frag_page(frag0))) {
4043                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4044                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4045         }
4046 }
4047
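/* Copy @grow bytes from the frag0 fast-path area into the skb's linear
 * data, shrinking (and possibly dropping) the first fragment, for headers
 * that GRO inspected via frag0 but that must end up in skb->data.
 */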
4048 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4049 {
4050         struct skb_shared_info *pinfo = skb_shinfo(skb);
4051
4052         BUG_ON(skb->end - skb->tail < grow);
4053
4054         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4055
4056         skb->data_len -= grow;
4057         skb->tail += grow;
4058
4059         pinfo->frags[0].page_offset += grow;
4060         skb_frag_size_sub(&pinfo->frags[0], grow);
4061
4062         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4063                 skb_frag_unref(skb, 0);
4064                 memmove(pinfo->frags, pinfo->frags + 1,
4065                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4066         }
4067 }
4068
4069 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4070 {
4071         struct sk_buff **pp = NULL;
4072         struct packet_offload *ptype;
4073         __be16 type = skb->protocol;
4074         struct list_head *head = &offload_base;
4075         int same_flow;
4076         enum gro_result ret;
4077         int grow;
4078
4079         if (!(skb->dev->features & NETIF_F_GRO))
4080                 goto normal;
4081
4082         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4083                 goto normal;
4084
4085         gro_list_prepare(napi, skb);
4086
4087         rcu_read_lock();
4088         list_for_each_entry_rcu(ptype, head, list) {
4089                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4090                         continue;
4091
4092                 skb_set_network_header(skb, skb_gro_offset(skb));
4093                 skb_reset_mac_len(skb);
4094                 NAPI_GRO_CB(skb)->same_flow = 0;
4095                 NAPI_GRO_CB(skb)->flush = 0;
4096                 NAPI_GRO_CB(skb)->free = 0;
4097                 NAPI_GRO_CB(skb)->udp_mark = 0;
4098                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4099
4100                 /* Setup for GRO checksum validation */
4101                 switch (skb->ip_summed) {
4102                 case CHECKSUM_COMPLETE:
4103                         NAPI_GRO_CB(skb)->csum = skb->csum;
4104                         NAPI_GRO_CB(skb)->csum_valid = 1;
4105                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4106                         break;
4107                 case CHECKSUM_UNNECESSARY:
4108                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4109                         NAPI_GRO_CB(skb)->csum_valid = 0;
4110                         break;
4111                 default:
4112                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4113                         NAPI_GRO_CB(skb)->csum_valid = 0;
4114                 }
4115
4116                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4117                 break;
4118         }
4119         rcu_read_unlock();
4120
4121         if (&ptype->list == head)
4122                 goto normal;
4123
4124         same_flow = NAPI_GRO_CB(skb)->same_flow;
4125         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4126
4127         if (pp) {
4128                 struct sk_buff *nskb = *pp;
4129
4130                 *pp = nskb->next;
4131                 nskb->next = NULL;
4132                 napi_gro_complete(nskb);
4133                 napi->gro_count--;
4134         }
4135
4136         if (same_flow)
4137                 goto ok;
4138
4139         if (NAPI_GRO_CB(skb)->flush)
4140                 goto normal;
4141
4142         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4143                 struct sk_buff *nskb = napi->gro_list;
4144
4145                 /* locate the end of the list to select the 'oldest' flow */
4146                 while (nskb->next) {
4147                         pp = &nskb->next;
4148                         nskb = *pp;
4149                 }
4150                 *pp = NULL;
4151                 nskb->next = NULL;
4152                 napi_gro_complete(nskb);
4153         } else {
4154                 napi->gro_count++;
4155         }
4156         NAPI_GRO_CB(skb)->count = 1;
4157         NAPI_GRO_CB(skb)->age = jiffies;
4158         NAPI_GRO_CB(skb)->last = skb;
4159         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4160         skb->next = napi->gro_list;
4161         napi->gro_list = skb;
4162         ret = GRO_HELD;
4163
4164 pull:
4165         grow = skb_gro_offset(skb) - skb_headlen(skb);
4166         if (grow > 0)
4167                 gro_pull_from_frag0(skb, grow);
4168 ok:
4169         return ret;
4170
4171 normal:
4172         ret = GRO_NORMAL;
4173         goto pull;
4174 }
4175
4176 struct packet_offload *gro_find_receive_by_type(__be16 type)
4177 {
4178         struct list_head *offload_head = &offload_base;
4179         struct packet_offload *ptype;
4180
4181         list_for_each_entry_rcu(ptype, offload_head, list) {
4182                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4183                         continue;
4184                 return ptype;
4185         }
4186         return NULL;
4187 }
4188 EXPORT_SYMBOL(gro_find_receive_by_type);
4189
4190 struct packet_offload *gro_find_complete_by_type(__be16 type)
4191 {
4192         struct list_head *offload_head = &offload_base;
4193         struct packet_offload *ptype;
4194
4195         list_for_each_entry_rcu(ptype, offload_head, list) {
4196                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4197                         continue;
4198                 return ptype;
4199         }
4200         return NULL;
4201 }
4202 EXPORT_SYMBOL(gro_find_complete_by_type);
4203
4204 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4205 {
4206         switch (ret) {
4207         case GRO_NORMAL:
4208                 if (netif_receive_skb_internal(skb))
4209                         ret = GRO_DROP;
4210                 break;
4211
4212         case GRO_DROP:
4213                 kfree_skb(skb);
4214                 break;
4215
4216         case GRO_MERGED_FREE:
4217                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4218                         kmem_cache_free(skbuff_head_cache, skb);
4219                 else
4220                         __kfree_skb(skb);
4221                 break;
4222
4223         case GRO_HELD:
4224         case GRO_MERGED:
4225                 break;
4226         }
4227
4228         return ret;
4229 }
4230
4231 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4232 {
4233         trace_napi_gro_receive_entry(skb);
4234
4235         skb_gro_reset_offset(skb);
4236
4237         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4238 }
4239 EXPORT_SYMBOL(napi_gro_receive);
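
/* A minimal sketch of driver usage: inside its NAPI poll routine a driver
 * builds an skb per received frame and hands it to GRO instead of calling
 * netif_receive_skb() directly:
 *
 *	skb->protocol = eth_type_trans(skb, napi->dev);
 *	napi_gro_receive(napi, skb);
 *
 * dev_gro_receive() then either merges the frame into an existing flow on
 * napi->gro_list or lets it continue up the normal receive path.
 */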
4240
4241 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4242 {
4243         if (unlikely(skb->pfmemalloc)) {
4244                 consume_skb(skb);
4245                 return;
4246         }
4247         __skb_pull(skb, skb_headlen(skb));
4248         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4249         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4250         skb->vlan_tci = 0;
4251         skb->dev = napi->dev;
4252         skb->skb_iif = 0;
4253         skb->encapsulation = 0;
4254         skb_shinfo(skb)->gso_type = 0;
4255         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4256
4257         napi->skb = skb;
4258 }
4259
4260 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4261 {
4262         struct sk_buff *skb = napi->skb;
4263
4264         if (!skb) {
4265                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4266                 napi->skb = skb;
4267         }
4268         return skb;
4269 }
4270 EXPORT_SYMBOL(napi_get_frags);
4271
4272 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4273                                       struct sk_buff *skb,
4274                                       gro_result_t ret)
4275 {
4276         switch (ret) {
4277         case GRO_NORMAL:
4278         case GRO_HELD:
4279                 __skb_push(skb, ETH_HLEN);
4280                 skb->protocol = eth_type_trans(skb, skb->dev);
4281                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4282                         ret = GRO_DROP;
4283                 break;
4284
4285         case GRO_DROP:
4286         case GRO_MERGED_FREE:
4287                 napi_reuse_skb(napi, skb);
4288                 break;
4289
4290         case GRO_MERGED:
4291                 break;
4292         }
4293
4294         return ret;
4295 }
4296
4297 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4298  * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4299  * we copy the Ethernet header into skb->data to have a common layout.
4300  */
4301 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4302 {
4303         struct sk_buff *skb = napi->skb;
4304         const struct ethhdr *eth;
4305         unsigned int hlen = sizeof(*eth);
4306
4307         napi->skb = NULL;
4308
4309         skb_reset_mac_header(skb);
4310         skb_gro_reset_offset(skb);
4311
4312         eth = skb_gro_header_fast(skb, 0);
4313         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4314                 eth = skb_gro_header_slow(skb, hlen, 0);
4315                 if (unlikely(!eth)) {
4316                         napi_reuse_skb(napi, skb);
4317                         return NULL;
4318                 }
4319         } else {
4320                 gro_pull_from_frag0(skb, hlen);
4321                 NAPI_GRO_CB(skb)->frag0 += hlen;
4322                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4323         }
4324         __skb_pull(skb, hlen);
4325
4326         /*
4327          * This works because the only protocols we care about don't require
4328          * special handling.
4329          * We'll fix it up properly in napi_frags_finish()
4330          */
4331         skb->protocol = eth->h_proto;
4332
4333         return skb;
4334 }
4335
4336 gro_result_t napi_gro_frags(struct napi_struct *napi)
4337 {
4338         struct sk_buff *skb = napi_frags_skb(napi);
4339
4340         if (!skb)
4341                 return GRO_DROP;
4342
4343         trace_napi_gro_frags_entry(skb);
4344
4345         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4346 }
4347 EXPORT_SYMBOL(napi_gro_frags);
4348
4349 /* Compute the checksum from gro_offset and return the folded value
4350  * after adding in any pseudo checksum.
4351  */
4352 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4353 {
4354         __wsum wsum;
4355         __sum16 sum;
4356
4357         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4358
4359         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4360         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4361         if (likely(!sum)) {
4362                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4363                     !skb->csum_complete_sw)
4364                         netdev_rx_csum_fault(skb->dev);
4365         }
4366
4367         NAPI_GRO_CB(skb)->csum = wsum;
4368         NAPI_GRO_CB(skb)->csum_valid = 1;
4369
4370         return sum;
4371 }
4372 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4373
4374 /*
4375  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4376  * Note: called with local irq disabled, but exits with local irq enabled.
4377  */
4378 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4379 {
4380 #ifdef CONFIG_RPS
4381         struct softnet_data *remsd = sd->rps_ipi_list;
4382
4383         if (remsd) {
4384                 sd->rps_ipi_list = NULL;
4385
4386                 local_irq_enable();
4387                 preempt_check_resched_rt();
4388
4389                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4390                 while (remsd) {
4391                         struct softnet_data *next = remsd->rps_ipi_next;
4392
4393                         if (cpu_online(remsd->cpu))
4394                                 smp_call_function_single_async(remsd->cpu,
4395                                                            &remsd->csd);
4396                         remsd = next;
4397                 }
4398         } else
4399 #endif
4400                 local_irq_enable();
4401         preempt_check_resched_rt();
4402 }
4403
4404 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4405 {
4406 #ifdef CONFIG_RPS
4407         return sd->rps_ipi_list != NULL;
4408 #else
4409         return false;
4410 #endif
4411 }
4412
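/* NAPI poll handler for the per-CPU backlog device: drain up to @quota
 * packets from the queues filled by enqueue_to_backlog() and feed each one
 * to __netif_receive_skb().
 */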
4413 static int process_backlog(struct napi_struct *napi, int quota)
4414 {
4415         int work = 0;
4416         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4417
4418         /* Check if we have pending IPIs; it's better to send them now
4419          * rather than waiting for net_rx_action() to end.
4420          */
4421         if (sd_has_rps_ipi_waiting(sd)) {
4422                 local_irq_disable();
4423                 net_rps_action_and_irq_enable(sd);
4424         }
4425
4426         napi->weight = weight_p;
4427         local_irq_disable();
4428         while (1) {
4429                 struct sk_buff *skb;
4430
4431                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4432                         rcu_read_lock();
4433                         local_irq_enable();
4434                         __netif_receive_skb(skb);
4435                         rcu_read_unlock();
4436                         local_irq_disable();
4437                         input_queue_head_incr(sd);
4438                         if (++work >= quota) {
4439                                 local_irq_enable();
4440                                 return work;
4441                         }
4442                 }
4443
4444                 rps_lock(sd);
4445                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4446                         /*
4447                          * Inline a custom version of __napi_complete().
4448                          * Only the current CPU owns and manipulates this napi,
4449                          * and NAPI_STATE_SCHED is the only possible flag set
4450                          * on backlog.
4451                          * We can use a plain write instead of clear_bit(),
4452                          * and we don't need an smp_mb() memory barrier.
4453                          */
4454                         napi->state = 0;
4455                         rps_unlock(sd);
4456
4457                         break;
4458                 }
4459
4460                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4461                                            &sd->process_queue);
4462                 rps_unlock(sd);
4463         }
4464         local_irq_enable();
4465
4466         return work;
4467 }
4468
4469 /**
4470  * __napi_schedule - schedule for receive
4471  * @n: entry to schedule
4472  *
4473  * The entry's receive function will be scheduled to run.
4474  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4475  */
4476 void __napi_schedule(struct napi_struct *n)
4477 {
4478         unsigned long flags;
4479
4480         local_irq_save(flags);
4481         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4482         local_irq_restore(flags);
4483         preempt_check_resched_rt();
4484 }
4485 EXPORT_SYMBOL(__napi_schedule);
4486
4487 /**
4488  * __napi_schedule_irqoff - schedule for receive
4489  * @n: entry to schedule
4490  *
4491  * Variant of __napi_schedule() assuming hard irqs are masked
4492  */
4493 void __napi_schedule_irqoff(struct napi_struct *n)
4494 {
4495         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4496 }
4497 EXPORT_SYMBOL(__napi_schedule_irqoff);
4498
4499 void __napi_complete(struct napi_struct *n)
4500 {
4501         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4502
4503         list_del_init(&n->poll_list);
4504         smp_mb__before_atomic();
4505         clear_bit(NAPI_STATE_SCHED, &n->state);
4506 }
4507 EXPORT_SYMBOL(__napi_complete);
4508
4509 void napi_complete_done(struct napi_struct *n, int work_done)
4510 {
4511         unsigned long flags;
4512
4513         /*
4514          * don't let napi dequeue from the cpu poll list
4515          * just in case it's running on a different cpu
4516          */
4517         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4518                 return;
4519
4520         if (n->gro_list) {
4521                 unsigned long timeout = 0;
4522
4523                 if (work_done)
4524                         timeout = n->dev->gro_flush_timeout;
4525
4526                 if (timeout)
4527                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4528                                       HRTIMER_MODE_REL_PINNED);
4529                 else
4530                         napi_gro_flush(n, false);
4531         }
4532         if (likely(list_empty(&n->poll_list))) {
4533                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4534         } else {
4535                 /* If n->poll_list is not empty, we need to mask irqs */
4536                 local_irq_save(flags);
4537                 __napi_complete(n);
4538                 local_irq_restore(flags);
4539         }
4540 }
4541 EXPORT_SYMBOL(napi_complete_done);
4542
4543 /* must be called under rcu_read_lock(), as we don't take a reference */
4544 struct napi_struct *napi_by_id(unsigned int napi_id)
4545 {
4546         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4547         struct napi_struct *napi;
4548
4549         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4550                 if (napi->napi_id == napi_id)
4551                         return napi;
4552
4553         return NULL;
4554 }
4555 EXPORT_SYMBOL_GPL(napi_by_id);
4556
4557 void napi_hash_add(struct napi_struct *napi)
4558 {
4559         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4560
4561                 spin_lock(&napi_hash_lock);
4562
4563                 /* 0 is not a valid id; we also skip an id that is already taken.
4564                  * We expect both events to be extremely rare.
4565                  */
4566                 napi->napi_id = 0;
4567                 while (!napi->napi_id) {
4568                         napi->napi_id = ++napi_gen_id;
4569                         if (napi_by_id(napi->napi_id))
4570                                 napi->napi_id = 0;
4571                 }
4572
4573                 hlist_add_head_rcu(&napi->napi_hash_node,
4574                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4575
4576                 spin_unlock(&napi_hash_lock);
4577         }
4578 }
4579 EXPORT_SYMBOL_GPL(napi_hash_add);
4580
4581 /* Warning: the caller is responsible for making sure an RCU grace period
4582  * has elapsed before freeing the memory containing @napi
4583  */
4584 void napi_hash_del(struct napi_struct *napi)
4585 {
4586         spin_lock(&napi_hash_lock);
4587
4588         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4589                 hlist_del_rcu(&napi->napi_hash_node);
4590
4591         spin_unlock(&napi_hash_lock);
4592 }
4593 EXPORT_SYMBOL_GPL(napi_hash_del);
4594
4595 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4596 {
4597         struct napi_struct *napi;
4598
4599         napi = container_of(timer, struct napi_struct, timer);
4600         if (napi->gro_list)
4601                 napi_schedule(napi);
4602
4603         return HRTIMER_NORESTART;
4604 }
4605
4606 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4607                     int (*poll)(struct napi_struct *, int), int weight)
4608 {
4609         INIT_LIST_HEAD(&napi->poll_list);
4610         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4611         napi->timer.function = napi_watchdog;
4612         napi->gro_count = 0;
4613         napi->gro_list = NULL;
4614         napi->skb = NULL;
4615         napi->poll = poll;
4616         if (weight > NAPI_POLL_WEIGHT)
4617                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4618                             weight, dev->name);
4619         napi->weight = weight;
4620         list_add(&napi->dev_list, &dev->napi_list);
4621         napi->dev = dev;
4622 #ifdef CONFIG_NETPOLL
4623         spin_lock_init(&napi->poll_lock);
4624         napi->poll_owner = -1;
4625 #endif
4626         set_bit(NAPI_STATE_SCHED, &napi->state);
4627 }
4628 EXPORT_SYMBOL(netif_napi_add);
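
/* A minimal sketch of the usual driver life cycle around this API (the
 * my_* and priv names are hypothetical): a driver calls
 *
 *	netif_napi_add(dev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *
 * at setup time and napi_enable(&priv->napi) when the interface comes up;
 * its interrupt handler calls napi_schedule(&priv->napi), and my_poll()
 * calls napi_complete() once it has used less than its budget.
 * netif_napi_del() below undoes the registration.
 */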
4629
4630 void napi_disable(struct napi_struct *n)
4631 {
4632         might_sleep();
4633         set_bit(NAPI_STATE_DISABLE, &n->state);
4634
4635         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4636                 msleep(1);
4637
4638         hrtimer_cancel(&n->timer);
4639
4640         clear_bit(NAPI_STATE_DISABLE, &n->state);
4641 }
4642 EXPORT_SYMBOL(napi_disable);
4643
4644 void netif_napi_del(struct napi_struct *napi)
4645 {
4646         list_del_init(&napi->dev_list);
4647         napi_free_frags(napi);
4648
4649         kfree_skb_list(napi->gro_list);
4650         napi->gro_list = NULL;
4651         napi->gro_count = 0;
4652 }
4653 EXPORT_SYMBOL(netif_napi_del);
4654
4655 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4656 {
4657         void *have;
4658         int work, weight;
4659
4660         list_del_init(&n->poll_list);
4661
4662         have = netpoll_poll_lock(n);
4663
4664         weight = n->weight;
4665
4666         /* This NAPI_STATE_SCHED test is for avoiding a race
4667          * with netpoll's poll_napi().  Only the entity which
4668          * obtains the lock and sees NAPI_STATE_SCHED set will
4669          * actually make the ->poll() call.  Therefore we avoid
4670          * accidentally calling ->poll() when NAPI is not scheduled.
4671          */
4672         work = 0;
4673         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4674                 work = n->poll(n, weight);
4675                 trace_napi_poll(n);
4676         }
4677
4678         WARN_ON_ONCE(work > weight);
4679
4680         if (likely(work < weight))
4681                 goto out_unlock;
4682
4683         /* Drivers must not modify the NAPI state if they
4684          * consume the entire weight.  In such cases this code
4685          * still "owns" the NAPI instance and therefore can
4686          * move the instance around on the list at will.
4687          */
4688         if (unlikely(napi_disable_pending(n))) {
4689                 napi_complete(n);
4690                 goto out_unlock;
4691         }
4692
4693         if (n->gro_list) {
4694                 /* Flush packets that are too old.
4695                  * If HZ < 1000, flush all packets.
4696                  */
4697                 napi_gro_flush(n, HZ >= 1000);
4698         }
4699
4700         /* Some drivers may have called napi_schedule
4701          * prior to exhausting their budget.
4702          */
4703         if (unlikely(!list_empty(&n->poll_list))) {
4704                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4705                              n->dev ? n->dev->name : "backlog");
4706                 goto out_unlock;
4707         }
4708
4709         list_add_tail(&n->poll_list, repoll);
4710
4711 out_unlock:
4712         netpoll_poll_unlock(have);
4713
4714         return work;
4715 }
4716
4717 static void net_rx_action(struct softirq_action *h)
4718 {
4719         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4720         unsigned long time_limit = jiffies + 2;
4721         int budget = netdev_budget;
4722         LIST_HEAD(list);
4723         LIST_HEAD(repoll);
4724
4725         local_irq_disable();
4726         list_splice_init(&sd->poll_list, &list);
4727         local_irq_enable();
4728
4729         for (;;) {
4730                 struct napi_struct *n;
4731
4732                 if (list_empty(&list)) {
4733                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4734                                 return;
4735                         break;
4736                 }
4737
4738                 n = list_first_entry(&list, struct napi_struct, poll_list);
4739                 budget -= napi_poll(n, &repoll);
4740
4741                 /* If the softirq window is exhausted then punt.
4742                  * Allow this to run for 2 jiffies, which allows
4743                  * an average latency of 1.5/HZ.
4744                  */
4745                 if (unlikely(budget <= 0 ||
4746                              time_after_eq(jiffies, time_limit))) {
4747                         sd->time_squeeze++;
4748                         break;
4749                 }
4750         }
4751
4752         local_irq_disable();
4753
4754         list_splice_tail_init(&sd->poll_list, &list);
4755         list_splice_tail(&repoll, &list);
4756         list_splice(&list, &sd->poll_list);
4757         if (!list_empty(&sd->poll_list))
4758                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4759
4760         net_rps_action_and_irq_enable(sd);
4761 }
4762
4763 struct netdev_adjacent {
4764         struct net_device *dev;
4765
4766         /* upper master flag, there can only be one master device per list */
4767         bool master;
4768
4769         /* counter for the number of times this device was added to us */
4770         u16 ref_nr;
4771
4772         /* private field for the users */
4773         void *private;
4774
4775         struct list_head list;
4776         struct rcu_head rcu;
4777 };
4778
4779 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4780                                                  struct net_device *adj_dev,
4781                                                  struct list_head *adj_list)
4782 {
4783         struct netdev_adjacent *adj;
4784
4785         list_for_each_entry(adj, adj_list, list) {
4786                 if (adj->dev == adj_dev)
4787                         return adj;
4788         }
4789         return NULL;
4790 }
4791
4792 /**
4793  * netdev_has_upper_dev - Check if device is linked to an upper device
4794  * @dev: device
4795  * @upper_dev: upper device to check
4796  *
4797  * Find out if a device is linked to the specified upper device and return true
4798  * in case it is. Note that this checks only the immediate upper device,
4799  * not the complete stack of upper devices. The caller must hold the RTNL lock.
4800  */
4801 bool netdev_has_upper_dev(struct net_device *dev,
4802                           struct net_device *upper_dev)
4803 {
4804         ASSERT_RTNL();
4805
4806         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4807 }
4808 EXPORT_SYMBOL(netdev_has_upper_dev);
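
/* Example (illustrative sketch only; "dev" and "bond_dev" stand for any two
 * net_device pointers the caller already holds): the check must run under
 * RTNL.
 *
 *      rtnl_lock();
 *      if (netdev_has_upper_dev(dev, bond_dev))
 *              netdev_info(dev, "already linked under %s\n", bond_dev->name);
 *      rtnl_unlock();
 */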
4809
4810 /**
4811  * netdev_has_any_upper_dev - Check if device is linked to some device
4812  * @dev: device
4813  *
4814  * Find out if a device is linked to an upper device and return true in case
4815  * it is. The caller must hold the RTNL lock.
4816  */
4817 static bool netdev_has_any_upper_dev(struct net_device *dev)
4818 {
4819         ASSERT_RTNL();
4820
4821         return !list_empty(&dev->all_adj_list.upper);
4822 }
4823
4824 /**
4825  * netdev_master_upper_dev_get - Get master upper device
4826  * @dev: device
4827  *
4828  * Find a master upper device and return a pointer to it, or NULL if
4829  * there is none. The caller must hold the RTNL lock.
4830  */
4831 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4832 {
4833         struct netdev_adjacent *upper;
4834
4835         ASSERT_RTNL();
4836
4837         if (list_empty(&dev->adj_list.upper))
4838                 return NULL;
4839
4840         upper = list_first_entry(&dev->adj_list.upper,
4841                                  struct netdev_adjacent, list);
4842         if (likely(upper->master))
4843                 return upper->dev;
4844         return NULL;
4845 }
4846 EXPORT_SYMBOL(netdev_master_upper_dev_get);
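
/* Example (illustrative sketch only; "slave_dev" is a hypothetical device):
 * looking up the master (e.g. a bond or bridge) under RTNL.
 *
 *      struct net_device *master;
 *
 *      ASSERT_RTNL();
 *      master = netdev_master_upper_dev_get(slave_dev);
 *      if (master)
 *              netdev_info(slave_dev, "master is %s\n", master->name);
 */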
4847
4848 void *netdev_adjacent_get_private(struct list_head *adj_list)
4849 {
4850         struct netdev_adjacent *adj;
4851
4852         adj = list_entry(adj_list, struct netdev_adjacent, list);
4853
4854         return adj->private;
4855 }
4856 EXPORT_SYMBOL(netdev_adjacent_get_private);
4857
4858 /**
4859  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4860  * @dev: device
4861  * @iter: list_head ** of the current position
4862  *
4863  * Gets the next device from the dev's upper list, starting from iter
4864  * position. The caller must hold RCU read lock.
4865  */
4866 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4867                                                  struct list_head **iter)
4868 {
4869         struct netdev_adjacent *upper;
4870
4871         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4872
4873         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4874
4875         if (&upper->list == &dev->adj_list.upper)
4876                 return NULL;
4877
4878         *iter = &upper->list;
4879
4880         return upper->dev;
4881 }
4882 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4883
4884 /**
4885  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4886  * @dev: device
4887  * @iter: list_head ** of the current position
4888  *
4889  * Gets the next device from the dev's upper list, starting from iter
4890  * position. The caller must hold RCU read lock.
4891  */
4892 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4893                                                      struct list_head **iter)
4894 {
4895         struct netdev_adjacent *upper;
4896
4897         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4898
4899         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4900
4901         if (&upper->list == &dev->all_adj_list.upper)
4902                 return NULL;
4903
4904         *iter = &upper->list;
4905
4906         return upper->dev;
4907 }
4908 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
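
/* Example (illustrative sketch only): this iterator is normally driven
 * through the netdev_for_each_all_upper_dev_rcu() wrapper, but it can be
 * called directly under rcu_read_lock(), starting from the list head.
 *
 *      struct list_head *iter = &dev->all_adj_list.upper;
 *      struct net_device *upper;
 *
 *      rcu_read_lock();
 *      while ((upper = netdev_all_upper_get_next_dev_rcu(dev, &iter)))
 *              pr_debug("upper of %s: %s\n", dev->name, upper->name);
 *      rcu_read_unlock();
 */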
4909
4910 /**
4911  * netdev_lower_get_next_private - Get the next ->private from the
4912  *                                 lower neighbour list
4913  * @dev: device
4914  * @iter: list_head ** of the current position
4915  *
4916  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4917  * list, starting from iter position. The caller must either hold the
4918  * RTNL lock or its own locking that guarantees that the neighbour lower
4919  * list will remain unchanged.
4920  */
4921 void *netdev_lower_get_next_private(struct net_device *dev,
4922                                     struct list_head **iter)
4923 {
4924         struct netdev_adjacent *lower;
4925
4926         lower = list_entry(*iter, struct netdev_adjacent, list);
4927
4928         if (&lower->list == &dev->adj_list.lower)
4929                 return NULL;
4930
4931         *iter = lower->list.next;
4932
4933         return lower->private;
4934 }
4935 EXPORT_SYMBOL(netdev_lower_get_next_private);
4936
4937 /**
4938  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4939  *                                     lower neighbour list, RCU
4940  *                                     variant
4941  * @dev: device
4942  * @iter: list_head ** of the current position
4943  *
4944  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4945  * list, starting from iter position. The caller must hold RCU read lock.
4946  */
4947 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4948                                         struct list_head **iter)
4949 {
4950         struct netdev_adjacent *lower;
4951
4952         WARN_ON_ONCE(!rcu_read_lock_held());
4953
4954         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4955
4956         if (&lower->list == &dev->adj_list.lower)
4957                 return NULL;
4958
4959         *iter = &lower->list;
4960
4961         return lower->private;
4962 }
4963 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4964
4965 /**
4966  * netdev_lower_get_next - Get the next device from the lower neighbour
4967  *                         list
4968  * @dev: device
4969  * @iter: list_head ** of the current position
4970  *
4971  * Gets the next netdev_adjacent from the dev's lower neighbour
4972  * list, starting from iter position. The caller must hold the RTNL lock or
4973  * its own locking that guarantees that the neighbour lower
4974  * list will remain unchanged.
4975  */
4976 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4977 {
4978         struct netdev_adjacent *lower;
4979
4980         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4981
4982         if (&lower->list == &dev->adj_list.lower)
4983                 return NULL;
4984
4985         *iter = &lower->list;
4986
4987         return lower->dev;
4988 }
4989 EXPORT_SYMBOL(netdev_lower_get_next);
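
/* Example (illustrative sketch only): this iterator backs the
 * netdev_for_each_lower_dev() macro, used for instance by
 * dev_get_nest_level() further below.
 *
 *      struct net_device *lower;
 *      struct list_head *iter;
 *
 *      ASSERT_RTNL();
 *      netdev_for_each_lower_dev(dev, lower, iter)
 *              pr_debug("lower of %s: %s\n", dev->name, lower->name);
 */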
4990
4991 /**
4992  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4993  *                                     lower neighbour list, RCU
4994  *                                     variant
4995  * @dev: device
4996  *
4997  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4998  * list. The caller must hold RCU read lock.
4999  */
5000 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5001 {
5002         struct netdev_adjacent *lower;
5003
5004         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5005                         struct netdev_adjacent, list);
5006         if (lower)
5007                 return lower->private;
5008         return NULL;
5009 }
5010 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5011
5012 /**
5013  * netdev_master_upper_dev_get_rcu - Get master upper device
5014  * @dev: device
5015  *
5016  * Find a master upper device and return a pointer to it, or NULL if
5017  * there is none. The caller must hold the RCU read lock.
5018  */
5019 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5020 {
5021         struct netdev_adjacent *upper;
5022
5023         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5024                                        struct netdev_adjacent, list);
5025         if (upper && likely(upper->master))
5026                 return upper->dev;
5027         return NULL;
5028 }
5029 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
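
/* Example (illustrative sketch only): on hot paths the RCU variant avoids
 * taking RTNL.
 *
 *      struct net_device *master;
 *
 *      rcu_read_lock();
 *      master = netdev_master_upper_dev_get_rcu(dev);
 *      if (master)
 *              pr_debug("%s: master is %s\n", dev->name, master->name);
 *      rcu_read_unlock();
 */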
5030
5031 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5032                               struct net_device *adj_dev,
5033                               struct list_head *dev_list)
5034 {
5035         char linkname[IFNAMSIZ+7];
5036         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5037                 "upper_%s" : "lower_%s", adj_dev->name);
5038         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5039                                  linkname);
5040 }
5041 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5042                                char *name,
5043                                struct list_head *dev_list)
5044 {
5045         char linkname[IFNAMSIZ+7];
5046         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5047                 "upper_%s" : "lower_%s", name);
5048         sysfs_remove_link(&(dev->dev.kobj), linkname);
5049 }
5050
5051 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5052                                                  struct net_device *adj_dev,
5053                                                  struct list_head *dev_list)
5054 {
5055         return (dev_list == &dev->adj_list.upper ||
5056                 dev_list == &dev->adj_list.lower) &&
5057                 net_eq(dev_net(dev), dev_net(adj_dev));
5058 }
5059
5060 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5061                                         struct net_device *adj_dev,
5062                                         struct list_head *dev_list,
5063                                         void *private, bool master)
5064 {
5065         struct netdev_adjacent *adj;
5066         int ret;
5067
5068         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5069
5070         if (adj) {
5071                 adj->ref_nr++;
5072                 return 0;
5073         }
5074
5075         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5076         if (!adj)
5077                 return -ENOMEM;
5078
5079         adj->dev = adj_dev;
5080         adj->master = master;
5081         adj->ref_nr = 1;
5082         adj->private = private;
5083         dev_hold(adj_dev);
5084
5085         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5086                  adj_dev->name, dev->name, adj_dev->name);
5087
5088         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5089                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5090                 if (ret)
5091                         goto free_adj;
5092         }
5093
5094         /* Ensure that master link is always the first item in list. */
5095         if (master) {
5096                 ret = sysfs_create_link(&(dev->dev.kobj),
5097                                         &(adj_dev->dev.kobj), "master");
5098                 if (ret)
5099                         goto remove_symlinks;
5100
5101                 list_add_rcu(&adj->list, dev_list);
5102         } else {
5103                 list_add_tail_rcu(&adj->list, dev_list);
5104         }
5105
5106         return 0;
5107
5108 remove_symlinks:
5109         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5110                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5111 free_adj:
5112         kfree(adj);
5113         dev_put(adj_dev);
5114
5115         return ret;
5116 }
5117
5118 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5119                                          struct net_device *adj_dev,
5120                                          struct list_head *dev_list)
5121 {
5122         struct netdev_adjacent *adj;
5123
5124         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5125
5126         if (!adj) {
5127                 pr_err("tried to remove device %s from %s\n",
5128                        dev->name, adj_dev->name);
5129                 BUG();
5130         }
5131
5132         if (adj->ref_nr > 1) {
5133                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5134                          adj->ref_nr-1);
5135                 adj->ref_nr--;
5136                 return;
5137         }
5138
5139         if (adj->master)
5140                 sysfs_remove_link(&(dev->dev.kobj), "master");
5141
5142         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5143                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5144
5145         list_del_rcu(&adj->list);
5146         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5147                  adj_dev->name, dev->name, adj_dev->name);
5148         dev_put(adj_dev);
5149         kfree_rcu(adj, rcu);
5150 }
5151
5152 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5153                                             struct net_device *upper_dev,
5154                                             struct list_head *up_list,
5155                                             struct list_head *down_list,
5156                                             void *private, bool master)
5157 {
5158         int ret;
5159
5160         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5161                                            master);
5162         if (ret)
5163                 return ret;
5164
5165         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5166                                            false);
5167         if (ret) {
5168                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5169                 return ret;
5170         }
5171
5172         return 0;
5173 }
5174
5175 static int __netdev_adjacent_dev_link(struct net_device *dev,
5176                                       struct net_device *upper_dev)
5177 {
5178         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5179                                                 &dev->all_adj_list.upper,
5180                                                 &upper_dev->all_adj_list.lower,
5181                                                 NULL, false);
5182 }
5183
5184 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5185                                                struct net_device *upper_dev,
5186                                                struct list_head *up_list,
5187                                                struct list_head *down_list)
5188 {
5189         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5190         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5191 }
5192
5193 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5194                                          struct net_device *upper_dev)
5195 {
5196         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5197                                            &dev->all_adj_list.upper,
5198                                            &upper_dev->all_adj_list.lower);
5199 }
5200
5201 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5202                                                 struct net_device *upper_dev,
5203                                                 void *private, bool master)
5204 {
5205         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5206
5207         if (ret)
5208                 return ret;
5209
5210         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5211                                                &dev->adj_list.upper,
5212                                                &upper_dev->adj_list.lower,
5213                                                private, master);
5214         if (ret) {
5215                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5216                 return ret;
5217         }
5218
5219         return 0;
5220 }
5221
5222 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5223                                                    struct net_device *upper_dev)
5224 {
5225         __netdev_adjacent_dev_unlink(dev, upper_dev);
5226         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5227                                            &dev->adj_list.upper,
5228                                            &upper_dev->adj_list.lower);
5229 }
5230
5231 static int __netdev_upper_dev_link(struct net_device *dev,
5232                                    struct net_device *upper_dev, bool master,
5233                                    void *private)
5234 {
5235         struct netdev_adjacent *i, *j, *to_i, *to_j;
5236         int ret = 0;
5237
5238         ASSERT_RTNL();
5239
5240         if (dev == upper_dev)
5241                 return -EBUSY;
5242
5243         /* To prevent loops, check if dev is not upper device to upper_dev. */
5244         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5245                 return -EBUSY;
5246
5247         if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
5248                 return -EEXIST;
5249
5250         if (master && netdev_master_upper_dev_get(dev))
5251                 return -EBUSY;
5252
5253         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5254                                                    master);
5255         if (ret)
5256                 return ret;
5257
5258         /* Now that we linked these devs, make all the upper_dev's
5259          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5260          * vice versa, and don't forget the devices themselves. All of these
5261          * links are non-neighbours.
5262          */
5263         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5264                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5265                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5266                                  i->dev->name, j->dev->name);
5267                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5268                         if (ret)
5269                                 goto rollback_mesh;
5270                 }
5271         }
5272
5273         /* add dev to every upper_dev's upper device */
5274         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5275                 pr_debug("linking %s's upper device %s with %s\n",
5276                          upper_dev->name, i->dev->name, dev->name);
5277                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5278                 if (ret)
5279                         goto rollback_upper_mesh;
5280         }
5281
5282         /* add upper_dev to every dev's lower device */
5283         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5284                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5285                          i->dev->name, upper_dev->name);
5286                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5287                 if (ret)
5288                         goto rollback_lower_mesh;
5289         }
5290
5291         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5292         return 0;
5293
5294 rollback_lower_mesh:
5295         to_i = i;
5296         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5297                 if (i == to_i)
5298                         break;
5299                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5300         }
5301
5302         i = NULL;
5303
5304 rollback_upper_mesh:
5305         to_i = i;
5306         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5307                 if (i == to_i)
5308                         break;
5309                 __netdev_adjacent_dev_unlink(dev, i->dev);
5310         }
5311
5312         i = j = NULL;
5313
5314 rollback_mesh:
5315         to_i = i;
5316         to_j = j;
5317         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5318                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5319                         if (i == to_i && j == to_j)
5320                                 break;
5321                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5322                 }
5323                 if (i == to_i)
5324                         break;
5325         }
5326
5327         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5328
5329         return ret;
5330 }
5331
5332 /**
5333  * netdev_upper_dev_link - Add a link to the upper device
5334  * @dev: device
5335  * @upper_dev: new upper device
5336  *
5337  * Adds a link to a device which is upper to this one. The caller must hold
5338  * the RTNL lock. On a failure a negative errno code is returned.
5339  * On success the reference counts are adjusted and the function
5340  * returns zero.
5341  */
5342 int netdev_upper_dev_link(struct net_device *dev,
5343                           struct net_device *upper_dev)
5344 {
5345         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5346 }
5347 EXPORT_SYMBOL(netdev_upper_dev_link);
5348
5349 /**
5350  * netdev_master_upper_dev_link - Add a master link to the upper device
5351  * @dev: device
5352  * @upper_dev: new upper device
5353  *
5354  * Adds a link to a device which is upper to this one. In this case, only
5355  * one master upper device can be linked, although other non-master devices
5356  * might be linked as well. The caller must hold the RTNL lock.
5357  * On a failure a negative errno code is returned. On success the reference
5358  * counts are adjusted and the function returns zero.
5359  */
5360 int netdev_master_upper_dev_link(struct net_device *dev,
5361                                  struct net_device *upper_dev)
5362 {
5363         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5364 }
5365 EXPORT_SYMBOL(netdev_master_upper_dev_link);
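
/* Example (illustrative sketch only; "bond_dev" and "slave_dev" are
 * hypothetical devices): a bonding-style driver pairs these calls under
 * RTNL, and undoes the link with netdev_upper_dev_unlink() on release.
 *
 *      int err;
 *
 *      ASSERT_RTNL();
 *      err = netdev_master_upper_dev_link(slave_dev, bond_dev);
 *      if (err)
 *              return err;
 *      ...
 *      netdev_upper_dev_unlink(slave_dev, bond_dev);
 */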
5366
5367 int netdev_master_upper_dev_link_private(struct net_device *dev,
5368                                          struct net_device *upper_dev,
5369                                          void *private)
5370 {
5371         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5372 }
5373 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5374
5375 /**
5376  * netdev_upper_dev_unlink - Removes a link to upper device
5377  * @dev: device
5378  * @upper_dev: upper device to remove
5379  *
5380  * Removes a link to device which is upper to this one. The caller must hold
5381  * the RTNL lock.
5382  */
5383 void netdev_upper_dev_unlink(struct net_device *dev,
5384                              struct net_device *upper_dev)
5385 {
5386         struct netdev_adjacent *i, *j;
5387         ASSERT_RTNL();
5388
5389         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5390
5391         /* Here is the tricky part. We must remove all dev's lower
5392          * devices from all upper_dev's upper devices and vice
5393          * versa, to maintain the graph relationship.
5394          */
5395         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5396                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5397                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5398
5399         /* also remove the devices themselves from the lower/upper device
5400          * lists
5401          */
5402         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5403                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5404
5405         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5406                 __netdev_adjacent_dev_unlink(dev, i->dev);
5407
5408         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5409 }
5410 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5411
5412 /**
5413  * netdev_bonding_info_change - Dispatch event about slave change
5414  * @dev: device
5415  * @bonding_info: info to dispatch
5416  *
5417  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5418  * The caller must hold the RTNL lock.
5419  */
5420 void netdev_bonding_info_change(struct net_device *dev,
5421                                 struct netdev_bonding_info *bonding_info)
5422 {
5423         struct netdev_notifier_bonding_info     info;
5424
5425         memcpy(&info.bonding_info, bonding_info,
5426                sizeof(struct netdev_bonding_info));
5427         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5428                                       &info.info);
5429 }
5430 EXPORT_SYMBOL(netdev_bonding_info_change);
5431
5432 static void netdev_adjacent_add_links(struct net_device *dev)
5433 {
5434         struct netdev_adjacent *iter;
5435
5436         struct net *net = dev_net(dev);
5437
5438         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5439                 if (!net_eq(net, dev_net(iter->dev)))
5440                         continue;
5441                 netdev_adjacent_sysfs_add(iter->dev, dev,
5442                                           &iter->dev->adj_list.lower);
5443                 netdev_adjacent_sysfs_add(dev, iter->dev,
5444                                           &dev->adj_list.upper);
5445         }
5446
5447         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5448                 if (!net_eq(net, dev_net(iter->dev)))
5449                         continue;
5450                 netdev_adjacent_sysfs_add(iter->dev, dev,
5451                                           &iter->dev->adj_list.upper);
5452                 netdev_adjacent_sysfs_add(dev, iter->dev,
5453                                           &dev->adj_list.lower);
5454         }
5455 }
5456
5457 static void netdev_adjacent_del_links(struct net_device *dev)
5458 {
5459         struct netdev_adjacent *iter;
5460
5461         struct net *net = dev_net(dev);
5462
5463         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5464                 if (!net_eq(net, dev_net(iter->dev)))
5465                         continue;
5466                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5467                                           &iter->dev->adj_list.lower);
5468                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5469                                           &dev->adj_list.upper);
5470         }
5471
5472         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5473                 if (!net_eq(net, dev_net(iter->dev)))
5474                         continue;
5475                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5476                                           &iter->dev->adj_list.upper);
5477                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5478                                           &dev->adj_list.lower);
5479         }
5480 }
5481
5482 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5483 {
5484         struct netdev_adjacent *iter;
5485
5486         struct net *net = dev_net(dev);
5487
5488         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5489                 if (!net_eq(net, dev_net(iter->dev)))
5490                         continue;
5491                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5492                                           &iter->dev->adj_list.lower);
5493                 netdev_adjacent_sysfs_add(iter->dev, dev,
5494                                           &iter->dev->adj_list.lower);
5495         }
5496
5497         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5498                 if (!net_eq(net, dev_net(iter->dev)))
5499                         continue;
5500                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5501                                           &iter->dev->adj_list.upper);
5502                 netdev_adjacent_sysfs_add(iter->dev, dev,
5503                                           &iter->dev->adj_list.upper);
5504         }
5505 }
5506
5507 void *netdev_lower_dev_get_private(struct net_device *dev,
5508                                    struct net_device *lower_dev)
5509 {
5510         struct netdev_adjacent *lower;
5511
5512         if (!lower_dev)
5513                 return NULL;
5514         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5515         if (!lower)
5516                 return NULL;
5517
5518         return lower->private;
5519 }
5520 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5521
5522
5523 int dev_get_nest_level(struct net_device *dev,
5524                        bool (*type_check)(struct net_device *dev))
5525 {
5526         struct net_device *lower = NULL;
5527         struct list_head *iter;
5528         int max_nest = -1;
5529         int nest;
5530
5531         ASSERT_RTNL();
5532
5533         netdev_for_each_lower_dev(dev, lower, iter) {
5534                 nest = dev_get_nest_level(lower, type_check);
5535                 if (max_nest < nest)
5536                         max_nest = nest;
5537         }
5538
5539         if (type_check(dev))
5540                 max_nest++;
5541
5542         return max_nest;
5543 }
5544 EXPORT_SYMBOL(dev_get_nest_level);
5545
5546 static void dev_change_rx_flags(struct net_device *dev, int flags)
5547 {
5548         const struct net_device_ops *ops = dev->netdev_ops;
5549
5550         if (ops->ndo_change_rx_flags)
5551                 ops->ndo_change_rx_flags(dev, flags);
5552 }
5553
5554 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5555 {
5556         unsigned int old_flags = dev->flags;
5557         kuid_t uid;
5558         kgid_t gid;
5559
5560         ASSERT_RTNL();
5561
5562         dev->flags |= IFF_PROMISC;
5563         dev->promiscuity += inc;
5564         if (dev->promiscuity == 0) {
5565                 /*
5566                  * Avoid overflow.
5567                  * If inc causes overflow, leave promiscuity untouched and return an error.
5568                  */
5569                 if (inc < 0)
5570                         dev->flags &= ~IFF_PROMISC;
5571                 else {
5572                         dev->promiscuity -= inc;
5573                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5574                                 dev->name);
5575                         return -EOVERFLOW;
5576                 }
5577         }
5578         if (dev->flags != old_flags) {
5579                 pr_info("device %s %s promiscuous mode\n",
5580                         dev->name,
5581                         dev->flags & IFF_PROMISC ? "entered" : "left");
5582                 if (audit_enabled) {
5583                         current_uid_gid(&uid, &gid);
5584                         audit_log(current->audit_context, GFP_ATOMIC,
5585                                 AUDIT_ANOM_PROMISCUOUS,
5586                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5587                                 dev->name, (dev->flags & IFF_PROMISC),
5588                                 (old_flags & IFF_PROMISC),
5589                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5590                                 from_kuid(&init_user_ns, uid),
5591                                 from_kgid(&init_user_ns, gid),
5592                                 audit_get_sessionid(current));
5593                 }
5594
5595                 dev_change_rx_flags(dev, IFF_PROMISC);
5596         }
5597         if (notify)
5598                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5599         return 0;
5600 }
5601
5602 /**
5603  *      dev_set_promiscuity     - update promiscuity count on a device
5604  *      @dev: device
5605  *      @inc: modifier
5606  *
5607  *      Add or remove promiscuity from a device. While the count in the device
5608  *      remains above zero the interface remains promiscuous. Once it hits zero
5609  *      the device reverts to normal filtering operation. A negative @inc
5610  *      value is used to drop promiscuity on the device.
5611  *      Return 0 if successful or a negative errno code on error.
5612  */
5613 int dev_set_promiscuity(struct net_device *dev, int inc)
5614 {
5615         unsigned int old_flags = dev->flags;
5616         int err;
5617
5618         err = __dev_set_promiscuity(dev, inc, true);
5619         if (err < 0)
5620                 return err;
5621         if (dev->flags != old_flags)
5622                 dev_set_rx_mode(dev);
5623         return err;
5624 }
5625 EXPORT_SYMBOL(dev_set_promiscuity);
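
/* Example (illustrative sketch only): a capture-style user bumps the
 * promiscuity count while active and drops it again when done, under RTNL.
 *
 *      int err;
 *
 *      rtnl_lock();
 *      err = dev_set_promiscuity(dev, 1);
 *      rtnl_unlock();
 *      ...
 *      rtnl_lock();
 *      dev_set_promiscuity(dev, -1);
 *      rtnl_unlock();
 */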
5626
5627 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5628 {
5629         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5630
5631         ASSERT_RTNL();
5632
5633         dev->flags |= IFF_ALLMULTI;
5634         dev->allmulti += inc;
5635         if (dev->allmulti == 0) {
5636                 /*
5637                  * Avoid overflow.
5638                  * If inc causes overflow, leave allmulti untouched and return an error.
5639                  */
5640                 if (inc < 0)
5641                         dev->flags &= ~IFF_ALLMULTI;
5642                 else {
5643                         dev->allmulti -= inc;
5644                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5645                                 dev->name);
5646                         return -EOVERFLOW;
5647                 }
5648         }
5649         if (dev->flags ^ old_flags) {
5650                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5651                 dev_set_rx_mode(dev);
5652                 if (notify)
5653                         __dev_notify_flags(dev, old_flags,
5654                                            dev->gflags ^ old_gflags);
5655         }
5656         return 0;
5657 }
5658
5659 /**
5660  *      dev_set_allmulti        - update allmulti count on a device
5661  *      @dev: device
5662  *      @inc: modifier
5663  *
5664  *      Add or remove reception of all multicast frames to a device. While the
5665  *      count in the device remains above zero the interface remains listening
5666  *      for all multicast frames. Once it hits zero the device reverts to normal
5667  *      filtering operation. A negative @inc value is used to drop the counter
5668  *      when releasing a resource needing all multicasts.
5669  *      Return 0 if successful or a negative errno code on error.
5670  */
5671
5672 int dev_set_allmulti(struct net_device *dev, int inc)
5673 {
5674         return __dev_set_allmulti(dev, inc, true);
5675 }
5676 EXPORT_SYMBOL(dev_set_allmulti);
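
/* Example (illustrative sketch only): mirrors the promiscuity pattern above
 * for a user that needs every multicast frame, e.g. a multicast router.
 *
 *      int err;
 *
 *      rtnl_lock();
 *      err = dev_set_allmulti(dev, 1);
 *      rtnl_unlock();
 *      ...
 *      rtnl_lock();
 *      dev_set_allmulti(dev, -1);
 *      rtnl_unlock();
 */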
5677
5678 /*
5679  *      Upload unicast and multicast address lists to device and
5680  *      configure RX filtering. When the device doesn't support unicast
5681  *      filtering it is put in promiscuous mode while unicast addresses
5682  *      are present.
5683  */
5684 void __dev_set_rx_mode(struct net_device *dev)
5685 {
5686         const struct net_device_ops *ops = dev->netdev_ops;
5687
5688         /* dev_open will call this function so the list will stay sane. */
5689         if (!(dev->flags&IFF_UP))
5690                 return;
5691
5692         if (!netif_device_present(dev))
5693                 return;
5694
5695         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5696                 /* Unicast address changes may only happen under the rtnl,
5697                  * therefore calling __dev_set_promiscuity here is safe.
5698                  */
5699                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5700                         __dev_set_promiscuity(dev, 1, false);
5701                         dev->uc_promisc = true;
5702                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5703                         __dev_set_promiscuity(dev, -1, false);
5704                         dev->uc_promisc = false;
5705                 }
5706         }
5707
5708         if (ops->ndo_set_rx_mode)
5709                 ops->ndo_set_rx_mode(dev);
5710 }
5711
5712 void dev_set_rx_mode(struct net_device *dev)
5713 {
5714         netif_addr_lock_bh(dev);
5715         __dev_set_rx_mode(dev);
5716         netif_addr_unlock_bh(dev);
5717 }
5718
5719 /**
5720  *      dev_get_flags - get flags reported to userspace
5721  *      @dev: device
5722  *
5723  *      Get the combination of flag bits exported through APIs to userspace.
5724  */
5725 unsigned int dev_get_flags(const struct net_device *dev)
5726 {
5727         unsigned int flags;
5728
5729         flags = (dev->flags & ~(IFF_PROMISC |
5730                                 IFF_ALLMULTI |
5731                                 IFF_RUNNING |
5732                                 IFF_LOWER_UP |
5733                                 IFF_DORMANT)) |
5734                 (dev->gflags & (IFF_PROMISC |
5735                                 IFF_ALLMULTI));
5736
5737         if (netif_running(dev)) {
5738                 if (netif_oper_up(dev))
5739                         flags |= IFF_RUNNING;
5740                 if (netif_carrier_ok(dev))
5741                         flags |= IFF_LOWER_UP;
5742                 if (netif_dormant(dev))
5743                         flags |= IFF_DORMANT;
5744         }
5745
5746         return flags;
5747 }
5748 EXPORT_SYMBOL(dev_get_flags);
5749
5750 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5751 {
5752         unsigned int old_flags = dev->flags;
5753         int ret;
5754
5755         ASSERT_RTNL();
5756
5757         /*
5758          *      Set the flags on our device.
5759          */
5760
5761         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5762                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5763                                IFF_AUTOMEDIA)) |
5764                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5765                                     IFF_ALLMULTI));
5766
5767         /*
5768          *      Load in the correct multicast list now the flags have changed.
5769          */
5770
5771         if ((old_flags ^ flags) & IFF_MULTICAST)
5772                 dev_change_rx_flags(dev, IFF_MULTICAST);
5773
5774         dev_set_rx_mode(dev);
5775
5776         /*
5777          *      Have we downed the interface. We handle IFF_UP ourselves
5778          *      Have we downed the interface? We handle IFF_UP ourselves
5779          *      setting it.
5780          */
5781
5782         ret = 0;
5783         if ((old_flags ^ flags) & IFF_UP)
5784                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5785
5786         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5787                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5788                 unsigned int old_flags = dev->flags;
5789
5790                 dev->gflags ^= IFF_PROMISC;
5791
5792                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5793                         if (dev->flags != old_flags)
5794                                 dev_set_rx_mode(dev);
5795         }
5796
5797         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5798            is important. Some (broken) drivers set IFF_PROMISC when
5799            IFF_ALLMULTI is requested, without asking us and without reporting it.
5800          */
5801         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5802                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5803
5804                 dev->gflags ^= IFF_ALLMULTI;
5805                 __dev_set_allmulti(dev, inc, false);
5806         }
5807
5808         return ret;
5809 }
5810
5811 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5812                         unsigned int gchanges)
5813 {
5814         unsigned int changes = dev->flags ^ old_flags;
5815
5816         if (gchanges)
5817                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5818
5819         if (changes & IFF_UP) {
5820                 if (dev->flags & IFF_UP)
5821                         call_netdevice_notifiers(NETDEV_UP, dev);
5822                 else
5823                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5824         }
5825
5826         if (dev->flags & IFF_UP &&
5827             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5828                 struct netdev_notifier_change_info change_info;
5829
5830                 change_info.flags_changed = changes;
5831                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5832                                               &change_info.info);
5833         }
5834 }
5835
5836 /**
5837  *      dev_change_flags - change device settings
5838  *      @dev: device
5839  *      @flags: device state flags
5840  *
5841  *      Change settings on a device based on the supplied state flags. The flags are
5842  *      in the userspace exported format.
5843  */
5844 int dev_change_flags(struct net_device *dev, unsigned int flags)
5845 {
5846         int ret;
5847         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5848
5849         ret = __dev_change_flags(dev, flags);
5850         if (ret < 0)
5851                 return ret;
5852
5853         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5854         __dev_notify_flags(dev, old_flags, changes);
5855         return ret;
5856 }
5857 EXPORT_SYMBOL(dev_change_flags);
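
/* Example (illustrative sketch only): bringing an interface up from kernel
 * code follows the same path as the SIOCSIFFLAGS ioctl, under RTNL.
 *
 *      unsigned int flags;
 *      int err;
 *
 *      rtnl_lock();
 *      flags = dev_get_flags(dev);
 *      err = dev_change_flags(dev, flags | IFF_UP);
 *      rtnl_unlock();
 */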
5858
5859 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5860 {
5861         const struct net_device_ops *ops = dev->netdev_ops;
5862
5863         if (ops->ndo_change_mtu)
5864                 return ops->ndo_change_mtu(dev, new_mtu);
5865
5866         dev->mtu = new_mtu;
5867         return 0;
5868 }
5869
5870 /**
5871  *      dev_set_mtu - Change maximum transfer unit
5872  *      @dev: device
5873  *      @new_mtu: new transfer unit
5874  *
5875  *      Change the maximum transfer size of the network device.
5876  */
5877 int dev_set_mtu(struct net_device *dev, int new_mtu)
5878 {
5879         int err, orig_mtu;
5880
5881         if (new_mtu == dev->mtu)
5882                 return 0;
5883
5884         /*      MTU must not be negative.        */
5885         if (new_mtu < 0)
5886                 return -EINVAL;
5887
5888         if (!netif_device_present(dev))
5889                 return -ENODEV;
5890
5891         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5892         err = notifier_to_errno(err);
5893         if (err)
5894                 return err;
5895
5896         orig_mtu = dev->mtu;
5897         err = __dev_set_mtu(dev, new_mtu);
5898
5899         if (!err) {
5900                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5901                 err = notifier_to_errno(err);
5902                 if (err) {
5903                         /* setting mtu back and notifying everyone again,
5904                          * so that they have a chance to revert changes.
5905                          */
5906                         __dev_set_mtu(dev, orig_mtu);
5907                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5908                 }
5909         }
5910         return err;
5911 }
5912 EXPORT_SYMBOL(dev_set_mtu);
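
/* Example (illustrative sketch only): changing the MTU from kernel code
 * goes through the same notifier-aware path as the SIOCSIFMTU ioctl.
 *
 *      int err;
 *
 *      rtnl_lock();
 *      err = dev_set_mtu(dev, 9000);
 *      if (err)
 *              netdev_err(dev, "failed to set MTU: %d\n", err);
 *      rtnl_unlock();
 */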
5913
5914 /**
5915  *      dev_set_group - Change group this device belongs to
5916  *      @dev: device
5917  *      @new_group: group this device should belong to
5918  */
5919 void dev_set_group(struct net_device *dev, int new_group)
5920 {
5921         dev->group = new_group;
5922 }
5923 EXPORT_SYMBOL(dev_set_group);
5924
5925 /**
5926  *      dev_set_mac_address - Change Media Access Control Address
5927  *      @dev: device
5928  *      @sa: new address
5929  *
5930  *      Change the hardware (MAC) address of the device
5931  */
5932 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5933 {
5934         const struct net_device_ops *ops = dev->netdev_ops;
5935         int err;
5936
5937         if (!ops->ndo_set_mac_address)
5938                 return -EOPNOTSUPP;
5939         if (sa->sa_family != dev->type)
5940                 return -EINVAL;
5941         if (!netif_device_present(dev))
5942                 return -ENODEV;
5943         err = ops->ndo_set_mac_address(dev, sa);
5944         if (err)
5945                 return err;
5946         dev->addr_assign_type = NET_ADDR_SET;
5947         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5948         add_device_randomness(dev->dev_addr, dev->addr_len);
5949         return 0;
5950 }
5951 EXPORT_SYMBOL(dev_set_mac_address);
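
/* Example (illustrative sketch only; "new_mac" is a hypothetical buffer of
 * dev->addr_len bytes): the address is passed as a struct sockaddr whose
 * sa_family must match dev->type.
 *
 *      struct sockaddr sa;
 *      int err;
 *
 *      sa.sa_family = dev->type;
 *      memcpy(sa.sa_data, new_mac, dev->addr_len);
 *
 *      rtnl_lock();
 *      err = dev_set_mac_address(dev, &sa);
 *      rtnl_unlock();
 */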
5952
5953 /**
5954  *      dev_change_carrier - Change device carrier
5955  *      @dev: device
5956  *      @new_carrier: new value
5957  *
5958  *      Change device carrier
5959  */
5960 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5961 {
5962         const struct net_device_ops *ops = dev->netdev_ops;
5963
5964         if (!ops->ndo_change_carrier)
5965                 return -EOPNOTSUPP;
5966         if (!netif_device_present(dev))
5967                 return -ENODEV;
5968         return ops->ndo_change_carrier(dev, new_carrier);
5969 }
5970 EXPORT_SYMBOL(dev_change_carrier);
5971
5972 /**
5973  *      dev_get_phys_port_id - Get device physical port ID
5974  *      @dev: device
5975  *      @ppid: port ID
5976  *
5977  *      Get device physical port ID
5978  */
5979 int dev_get_phys_port_id(struct net_device *dev,
5980                          struct netdev_phys_item_id *ppid)
5981 {
5982         const struct net_device_ops *ops = dev->netdev_ops;
5983
5984         if (!ops->ndo_get_phys_port_id)
5985                 return -EOPNOTSUPP;
5986         return ops->ndo_get_phys_port_id(dev, ppid);
5987 }
5988 EXPORT_SYMBOL(dev_get_phys_port_id);
5989
5990 /**
5991  *      dev_get_phys_port_name - Get device physical port name
5992  *      @dev: device
5993  *      @name: port name
5994  *
5995  *      Get device physical port name
5996  */
5997 int dev_get_phys_port_name(struct net_device *dev,
5998                            char *name, size_t len)
5999 {
6000         const struct net_device_ops *ops = dev->netdev_ops;
6001
6002         if (!ops->ndo_get_phys_port_name)
6003                 return -EOPNOTSUPP;
6004         return ops->ndo_get_phys_port_name(dev, name, len);
6005 }
6006 EXPORT_SYMBOL(dev_get_phys_port_name);
6007
6008 /**
6009  *      dev_new_index   -       allocate an ifindex
6010  *      @net: the applicable net namespace
6011  *
6012  *      Returns a suitable unique value for a new device interface
6013  *      number.  The caller must hold the rtnl semaphore or the
6014  *      dev_base_lock to be sure it remains unique.
6015  */
6016 static int dev_new_index(struct net *net)
6017 {
6018         int ifindex = net->ifindex;
6019         for (;;) {
6020                 if (++ifindex <= 0)
6021                         ifindex = 1;
6022                 if (!__dev_get_by_index(net, ifindex))
6023                         return net->ifindex = ifindex;
6024         }
6025 }
6026
6027 /* Delayed registration/unregistration */
6028 static LIST_HEAD(net_todo_list);
6029 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6030
6031 static void net_set_todo(struct net_device *dev)
6032 {
6033         list_add_tail(&dev->todo_list, &net_todo_list);
6034         dev_net(dev)->dev_unreg_count++;
6035 }
6036
6037 static void rollback_registered_many(struct list_head *head)
6038 {
6039         struct net_device *dev, *tmp;
6040         LIST_HEAD(close_head);
6041
6042         BUG_ON(dev_boot_phase);
6043         ASSERT_RTNL();
6044
6045         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6046                 /* Some devices call this without having registered,
6047                  * as part of initialization unwind. Remove those
6048                  * devices and proceed with the remaining ones.
6049                  */
6050                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6051                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6052                                  dev->name, dev);
6053
6054                         WARN_ON(1);
6055                         list_del(&dev->unreg_list);
6056                         continue;
6057                 }
6058                 dev->dismantle = true;
6059                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6060         }
6061
6062         /* If device is running, close it first. */
6063         list_for_each_entry(dev, head, unreg_list)
6064                 list_add_tail(&dev->close_list, &close_head);
6065         dev_close_many(&close_head, true);
6066
6067         list_for_each_entry(dev, head, unreg_list) {
6068                 /* And unlink it from device chain. */
6069                 unlist_netdevice(dev);
6070
6071                 dev->reg_state = NETREG_UNREGISTERING;
6072                 on_each_cpu(flush_backlog, dev, 1);
6073         }
6074
6075         synchronize_net();
6076
6077         list_for_each_entry(dev, head, unreg_list) {
6078                 struct sk_buff *skb = NULL;
6079
6080                 /* Shutdown queueing discipline. */
6081                 dev_shutdown(dev);
6082
6083
6084                 /* Notify protocols that we are about to destroy
6085                    this device. They should clean up all of their state.
6086                 */
6087                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6088
6089                 if (!dev->rtnl_link_ops ||
6090                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6091                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6092                                                      GFP_KERNEL);
6093
6094                 /*
6095                  *      Flush the unicast and multicast chains
6096                  */
6097                 dev_uc_flush(dev);
6098                 dev_mc_flush(dev);
6099
6100                 if (dev->netdev_ops->ndo_uninit)
6101                         dev->netdev_ops->ndo_uninit(dev);
6102
6103                 if (skb)
6104                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6105
6106                 /* The notifier chain MUST detach all upper devices from us. */
6107                 WARN_ON(netdev_has_any_upper_dev(dev));
6108
6109                 /* Remove entries from kobject tree */
6110                 netdev_unregister_kobject(dev);
6111 #ifdef CONFIG_XPS
6112                 /* Remove XPS queueing entries */
6113                 netif_reset_xps_queues_gt(dev, 0);
6114 #endif
6115         }
6116
6117         synchronize_net();
6118
6119         list_for_each_entry(dev, head, unreg_list)
6120                 dev_put(dev);
6121 }
6122
6123 static void rollback_registered(struct net_device *dev)
6124 {
6125         LIST_HEAD(single);
6126
6127         list_add(&dev->unreg_list, &single);
6128         rollback_registered_many(&single);
6129         list_del(&single);
6130 }
6131
6132 static netdev_features_t netdev_fix_features(struct net_device *dev,
6133         netdev_features_t features)
6134 {
6135         /* Fix illegal checksum combinations */
6136         if ((features & NETIF_F_HW_CSUM) &&
6137             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6138                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6139                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6140         }
6141
6142         /* TSO requires that SG is present as well. */
6143         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6144                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6145                 features &= ~NETIF_F_ALL_TSO;
6146         }
6147
6148         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6149                                         !(features & NETIF_F_IP_CSUM)) {
6150                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6151                 features &= ~NETIF_F_TSO;
6152                 features &= ~NETIF_F_TSO_ECN;
6153         }
6154
6155         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6156                                          !(features & NETIF_F_IPV6_CSUM)) {
6157                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6158                 features &= ~NETIF_F_TSO6;
6159         }
6160
6161         /* TSO ECN requires that TSO is present as well. */
6162         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6163                 features &= ~NETIF_F_TSO_ECN;
6164
6165         /* Software GSO depends on SG. */
6166         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6167                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6168                 features &= ~NETIF_F_GSO;
6169         }
6170
6171         /* UFO needs SG and checksumming */
6172         if (features & NETIF_F_UFO) {
6173                 /* maybe split UFO into V4 and V6? */
6174                 if (!((features & NETIF_F_GEN_CSUM) ||
6175                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6176                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6177                         netdev_dbg(dev,
6178                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6179                         features &= ~NETIF_F_UFO;
6180                 }
6181
6182                 if (!(features & NETIF_F_SG)) {
6183                         netdev_dbg(dev,
6184                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6185                         features &= ~NETIF_F_UFO;
6186                 }
6187         }
6188
6189 #ifdef CONFIG_NET_RX_BUSY_POLL
6190         if (dev->netdev_ops->ndo_busy_poll)
6191                 features |= NETIF_F_BUSY_POLL;
6192         else
6193 #endif
6194                 features &= ~NETIF_F_BUSY_POLL;
6195
6196         return features;
6197 }
6198
6199 int __netdev_update_features(struct net_device *dev)
6200 {
6201         netdev_features_t features;
6202         int err = 0;
6203
6204         ASSERT_RTNL();
6205
6206         features = netdev_get_wanted_features(dev);
6207
6208         if (dev->netdev_ops->ndo_fix_features)
6209                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6210
6211         /* driver might be less strict about feature dependencies */
6212         features = netdev_fix_features(dev, features);
6213
6214         if (dev->features == features)
6215                 return 0;
6216
6217         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6218                 &dev->features, &features);
6219
6220         if (dev->netdev_ops->ndo_set_features)
6221                 err = dev->netdev_ops->ndo_set_features(dev, features);
6222
6223         if (unlikely(err < 0)) {
6224                 netdev_err(dev,
6225                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6226                         err, &features, &dev->features);
6227                 return -1;
6228         }
6229
6230         if (!err)
6231                 dev->features = features;
6232
6233         return 1;
6234 }
6235
6236 /**
6237  *      netdev_update_features - recalculate device features
6238  *      @dev: the device to check
6239  *
6240  *      Recalculate dev->features set and send notifications if it
6241  *      has changed. Should be called after driver or hardware dependent
6242  *      conditions might have changed that influence the features.
6243  */
6244 void netdev_update_features(struct net_device *dev)
6245 {
6246         if (__netdev_update_features(dev))
6247                 netdev_features_change(dev);
6248 }
6249 EXPORT_SYMBOL(netdev_update_features);
6250
6251 /**
6252  *      netdev_change_features - recalculate device features
6253  *      @dev: the device to check
6254  *
6255  *      Recalculate dev->features set and send notifications even
6256  *      if they have not changed. Should be called instead of
6257  *      netdev_update_features() if also dev->vlan_features might
6258  *      have changed to allow the changes to be propagated to stacked
6259  *      VLAN devices.
6260  */
6261 void netdev_change_features(struct net_device *dev)
6262 {
6263         __netdev_update_features(dev);
6264         netdev_features_change(dev);
6265 }
6266 EXPORT_SYMBOL(netdev_change_features);
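
/* Usage sketch (compiled out): a driver whose offload capabilities change at
 * run time adjusts dev->hw_features under RTNL and lets the core recompute
 * dev->features via the helpers above.  The foo_* name and the particular
 * feature bit chosen here are hypothetical.
 */
#if 0
static void foo_csum_offload_broken(struct net_device *dev)
{
        rtnl_lock();
        /* hardware checksumming can no longer be trusted */
        dev->hw_features &= ~NETIF_F_HW_CSUM;
        netdev_update_features(dev);    /* re-runs ndo_fix_features() etc. */
        rtnl_unlock();
}
#endif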
6267
6268 /**
6269  *      netif_stacked_transfer_operstate -      transfer operstate
6270  *      @rootdev: the root or lower level device to transfer state from
6271  *      @dev: the device to transfer operstate to
6272  *
6273  *      Transfer operational state from root to device. This is normally
6274  *      called when a stacking relationship exists between the root
6275  *      device and the device (a leaf device).
6276  */
6277 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6278                                         struct net_device *dev)
6279 {
6280         if (rootdev->operstate == IF_OPER_DORMANT)
6281                 netif_dormant_on(dev);
6282         else
6283                 netif_dormant_off(dev);
6284
6285         if (netif_carrier_ok(rootdev)) {
6286                 if (!netif_carrier_ok(dev))
6287                         netif_carrier_on(dev);
6288         } else {
6289                 if (netif_carrier_ok(dev))
6290                         netif_carrier_off(dev);
6291         }
6292 }
6293 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
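
/* Usage sketch (compiled out): an upper device stacked on "lower" (a VLAN or
 * macvlan style setup) re-syncs its carrier/dormant state from the lower
 * device, typically whenever the lower device reports a change.  The foo_*
 * name is hypothetical.
 */
#if 0
static void foo_lower_changed(struct net_device *upper,
                              struct net_device *lower)
{
        /* carrier and dormant state follow the lower device */
        netif_stacked_transfer_operstate(lower, upper);
}
#endif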
6294
6295 #ifdef CONFIG_SYSFS
6296 static int netif_alloc_rx_queues(struct net_device *dev)
6297 {
6298         unsigned int i, count = dev->num_rx_queues;
6299         struct netdev_rx_queue *rx;
6300         size_t sz = count * sizeof(*rx);
6301
6302         BUG_ON(count < 1);
6303
6304         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6305         if (!rx) {
6306                 rx = vzalloc(sz);
6307                 if (!rx)
6308                         return -ENOMEM;
6309         }
6310         dev->_rx = rx;
6311
6312         for (i = 0; i < count; i++)
6313                 rx[i].dev = dev;
6314         return 0;
6315 }
6316 #endif
6317
6318 static void netdev_init_one_queue(struct net_device *dev,
6319                                   struct netdev_queue *queue, void *_unused)
6320 {
6321         /* Initialize queue lock */
6322         spin_lock_init(&queue->_xmit_lock);
6323         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6324         queue->xmit_lock_owner = -1;
6325         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6326         queue->dev = dev;
6327 #ifdef CONFIG_BQL
6328         dql_init(&queue->dql, HZ);
6329 #endif
6330 }
6331
6332 static void netif_free_tx_queues(struct net_device *dev)
6333 {
6334         kvfree(dev->_tx);
6335 }
6336
6337 static int netif_alloc_netdev_queues(struct net_device *dev)
6338 {
6339         unsigned int count = dev->num_tx_queues;
6340         struct netdev_queue *tx;
6341         size_t sz = count * sizeof(*tx);
6342
6343         if (count < 1 || count > 0xffff)
6344                 return -EINVAL;
6345
6346         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6347         if (!tx) {
6348                 tx = vzalloc(sz);
6349                 if (!tx)
6350                         return -ENOMEM;
6351         }
6352         dev->_tx = tx;
6353
6354         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6355         spin_lock_init(&dev->tx_global_lock);
6356
6357         return 0;
6358 }
6359
6360 /**
6361  *      register_netdevice      - register a network device
6362  *      @dev: device to register
6363  *
6364  *      Take a completed network device structure and add it to the kernel
6365  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6366  *      chain. 0 is returned on success. A negative errno code is returned
6367  *      on a failure to set up the device, or if the name is a duplicate.
6368  *
6369  *      Callers must hold the rtnl semaphore. You may want
6370  *      register_netdev() instead of this.
6371  *
6372  *      BUGS:
6373  *      The locking appears insufficient to guarantee two parallel registers
6374  *      will not get the same name.
6375  */
6376
6377 int register_netdevice(struct net_device *dev)
6378 {
6379         int ret;
6380         struct net *net = dev_net(dev);
6381
6382         BUG_ON(dev_boot_phase);
6383         ASSERT_RTNL();
6384
6385         might_sleep();
6386
6387         /* When net_device's are persistent, this will be fatal. */
6388         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6389         BUG_ON(!net);
6390
6391         spin_lock_init(&dev->addr_list_lock);
6392         netdev_set_addr_lockdep_class(dev);
6393
6394         ret = dev_get_valid_name(net, dev, dev->name);
6395         if (ret < 0)
6396                 goto out;
6397
6398         /* Init, if this function is available */
6399         if (dev->netdev_ops->ndo_init) {
6400                 ret = dev->netdev_ops->ndo_init(dev);
6401                 if (ret) {
6402                         if (ret > 0)
6403                                 ret = -EIO;
6404                         goto out;
6405                 }
6406         }
6407
6408         if (((dev->hw_features | dev->features) &
6409              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6410             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6411              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6412                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6413                 ret = -EINVAL;
6414                 goto err_uninit;
6415         }
6416
6417         ret = -EBUSY;
6418         if (!dev->ifindex)
6419                 dev->ifindex = dev_new_index(net);
6420         else if (__dev_get_by_index(net, dev->ifindex))
6421                 goto err_uninit;
6422
6423         /* Transfer changeable features to wanted_features and enable
6424          * software offloads (GSO and GRO).
6425          */
6426         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6427         dev->features |= NETIF_F_SOFT_FEATURES;
6428         dev->wanted_features = dev->features & dev->hw_features;
6429
6430         if (!(dev->flags & IFF_LOOPBACK)) {
6431                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6432         }
6433
6434         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6435          */
6436         dev->vlan_features |= NETIF_F_HIGHDMA;
6437
6438         /* Make NETIF_F_SG inheritable to tunnel devices.
6439          */
6440         dev->hw_enc_features |= NETIF_F_SG;
6441
6442         /* Make NETIF_F_SG inheritable to MPLS.
6443          */
6444         dev->mpls_features |= NETIF_F_SG;
6445
6446         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6447         ret = notifier_to_errno(ret);
6448         if (ret)
6449                 goto err_uninit;
6450
6451         ret = netdev_register_kobject(dev);
6452         if (ret)
6453                 goto err_uninit;
6454         dev->reg_state = NETREG_REGISTERED;
6455
6456         __netdev_update_features(dev);
6457
6458         /*
6459          *      Default initial state at registration is that the
6460          *      device is present.
6461          */
6462
6463         set_bit(__LINK_STATE_PRESENT, &dev->state);
6464
6465         linkwatch_init_dev(dev);
6466
6467         dev_init_scheduler(dev);
6468         dev_hold(dev);
6469         list_netdevice(dev);
6470         add_device_randomness(dev->dev_addr, dev->addr_len);
6471
6472         /* If the device has a permanent device address, the driver
6473          * should set dev_addr, and addr_assign_type should be left at
6474          * NET_ADDR_PERM (the default value).
6475          */
6476         if (dev->addr_assign_type == NET_ADDR_PERM)
6477                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6478
6479         /* Notify protocols, that a new device appeared. */
6480         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6481         ret = notifier_to_errno(ret);
6482         if (ret) {
6483                 rollback_registered(dev);
6484                 dev->reg_state = NETREG_UNREGISTERED;
6485         }
6486         /*
6487          *      Prevent userspace races by waiting until the network
6488          *      device is fully setup before sending notifications.
6489          */
6490         if (!dev->rtnl_link_ops ||
6491             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6492                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6493
6494 out:
6495         return ret;
6496
6497 err_uninit:
6498         if (dev->netdev_ops->ndo_uninit)
6499                 dev->netdev_ops->ndo_uninit(dev);
6500         goto out;
6501 }
6502 EXPORT_SYMBOL(register_netdevice);
6503
6504 /**
6505  *      init_dummy_netdev       - init a dummy network device for NAPI
6506  *      @dev: device to init
6507  *
6508  *      This takes a network device structure and initializes the minimum
6509  *      amount of fields so it can be used to schedule NAPI polls without
6510  *      registering a full-blown interface. This is to be used by drivers
6511  *      that need to tie several hardware interfaces to a single NAPI
6512  *      poll scheduler due to HW limitations.
6513  */
6514 int init_dummy_netdev(struct net_device *dev)
6515 {
6516         /* Clear everything. Note we don't initialize spinlocks
6517          * as they aren't supposed to be taken by any of the
6518          * NAPI code and this dummy netdev is supposed to be
6519          * only ever used for NAPI polls
6520          */
6521         memset(dev, 0, sizeof(struct net_device));
6522
6523         /* make sure we BUG if trying to hit standard
6524          * register/unregister code path
6525          */
6526         dev->reg_state = NETREG_DUMMY;
6527
6528         /* NAPI wants this */
6529         INIT_LIST_HEAD(&dev->napi_list);
6530
6531         /* a dummy interface is started by default */
6532         set_bit(__LINK_STATE_PRESENT, &dev->state);
6533         set_bit(__LINK_STATE_START, &dev->state);
6534
6535         /* Note: We don't allocate pcpu_refcnt for dummy devices,
6536          * because users of this 'device' don't need to change
6537          * its refcount.
6538          */
6539
6540         return 0;
6541 }
6542 EXPORT_SYMBOL_GPL(init_dummy_netdev);
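
/* Usage sketch (compiled out): a driver that multiplexes several hardware
 * channels onto one NAPI context embeds a dummy netdev purely as a NAPI
 * anchor; it is never registered nor freed through the normal paths.  The
 * foo_* names and the adapter layout are hypothetical.
 */
#if 0
struct foo_adapter {
        struct net_device napi_dev;     /* dummy, never registered */
        struct napi_struct napi;
};

static int foo_poll(struct napi_struct *napi, int budget)
{
        int done = 0;

        /* ... process up to @budget packets, counting them in done ... */
        if (done < budget)
                napi_complete(napi);
        return done;
}

static void foo_napi_init(struct foo_adapter *adapter)
{
        init_dummy_netdev(&adapter->napi_dev);
        netif_napi_add(&adapter->napi_dev, &adapter->napi, foo_poll,
                       NAPI_POLL_WEIGHT);
        napi_enable(&adapter->napi);
}
#endif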
6543
6544
6545 /**
6546  *      register_netdev - register a network device
6547  *      @dev: device to register
6548  *
6549  *      Take a completed network device structure and add it to the kernel
6550  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6551  *      chain. 0 is returned on success. A negative errno code is returned
6552  *      on a failure to set up the device, or if the name is a duplicate.
6553  *
6554  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6555  *      and expands the device name if you passed a format string to
6556  *      alloc_netdev.
6557  */
6558 int register_netdev(struct net_device *dev)
6559 {
6560         int err;
6561
6562         rtnl_lock();
6563         err = register_netdevice(dev);
6564         rtnl_unlock();
6565         return err;
6566 }
6567 EXPORT_SYMBOL(register_netdev);
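
/* Usage sketch (compiled out): the usual driver-side registration sequence.
 * alloc_etherdev() picks an "eth%d" style name, register_netdev() takes the
 * rtnl semaphore itself, and the error path must free the half-initialized
 * device.  The foo_* names and the private struct are hypothetical.
 */
#if 0
struct foo_priv {
        void __iomem *regs;
};

static int foo_create(struct device *parent)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct foo_priv));
        if (!dev)
                return -ENOMEM;

        SET_NETDEV_DEV(dev, parent);    /* parent device for sysfs/udev */
        /* dev->netdev_ops, dev->dev_addr, feature flags, ... go here */

        err = register_netdev(dev);
        if (err) {
                free_netdev(dev);
                return err;
        }
        return 0;
}
#endif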
6568
6569 int netdev_refcnt_read(const struct net_device *dev)
6570 {
6571         int i, refcnt = 0;
6572
6573         for_each_possible_cpu(i)
6574                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6575         return refcnt;
6576 }
6577 EXPORT_SYMBOL(netdev_refcnt_read);
6578
6579 /**
6580  * netdev_wait_allrefs - wait until all references are gone.
6581  * @dev: target net_device
6582  *
6583  * This is called when unregistering network devices.
6584  *
6585  * Any protocol or device that holds a reference should register
6586  * for netdevice notification, and clean up and put back the
6587  * reference if it receives an UNREGISTER event.
6588  * We can get stuck here if buggy protocols don't correctly
6589  * call dev_put.
6590  */
6591 static void netdev_wait_allrefs(struct net_device *dev)
6592 {
6593         unsigned long rebroadcast_time, warning_time;
6594         int refcnt;
6595
6596         linkwatch_forget_dev(dev);
6597
6598         rebroadcast_time = warning_time = jiffies;
6599         refcnt = netdev_refcnt_read(dev);
6600
6601         while (refcnt != 0) {
6602                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6603                         rtnl_lock();
6604
6605                         /* Rebroadcast unregister notification */
6606                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6607
6608                         __rtnl_unlock();
6609                         rcu_barrier();
6610                         rtnl_lock();
6611
6612                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6613                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6614                                      &dev->state)) {
6615                                 /* We must not have linkwatch events
6616                                  * pending on unregister. If this
6617                                  * happens, we simply run the queue
6618                                  * unscheduled, resulting in a noop
6619                                  * for this device.
6620                                  */
6621                                 linkwatch_run_queue();
6622                         }
6623
6624                         __rtnl_unlock();
6625
6626                         rebroadcast_time = jiffies;
6627                 }
6628
6629                 msleep(250);
6630
6631                 refcnt = netdev_refcnt_read(dev);
6632
6633                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6634                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6635                                  dev->name, refcnt);
6636                         warning_time = jiffies;
6637                 }
6638         }
6639 }
6640
6641 /* The sequence is:
6642  *
6643  *      rtnl_lock();
6644  *      ...
6645  *      register_netdevice(x1);
6646  *      register_netdevice(x2);
6647  *      ...
6648  *      unregister_netdevice(y1);
6649  *      unregister_netdevice(y2);
6650  *      ...
6651  *      rtnl_unlock();
6652  *      free_netdev(y1);
6653  *      free_netdev(y2);
6654  *
6655  * We are invoked by rtnl_unlock().
6656  * This allows us to deal with problems:
6657  * 1) We can delete sysfs objects which invoke hotplug
6658  *    without deadlocking with linkwatch via keventd.
6659  * 2) Since we run with the RTNL semaphore not held, we can sleep
6660  *    safely in order to wait for the netdev refcnt to drop to zero.
6661  *
6662  * We must not return until all unregister events added during
6663  * the interval the lock was held have been completed.
6664  */
6665 void netdev_run_todo(void)
6666 {
6667         struct list_head list;
6668
6669         /* Snapshot list, allow later requests */
6670         list_replace_init(&net_todo_list, &list);
6671
6672         __rtnl_unlock();
6673
6674
6675         /* Wait for rcu callbacks to finish before next phase */
6676         if (!list_empty(&list))
6677                 rcu_barrier();
6678
6679         while (!list_empty(&list)) {
6680                 struct net_device *dev
6681                         = list_first_entry(&list, struct net_device, todo_list);
6682                 list_del(&dev->todo_list);
6683
6684                 rtnl_lock();
6685                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6686                 __rtnl_unlock();
6687
6688                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6689                         pr_err("network todo '%s' but state %d\n",
6690                                dev->name, dev->reg_state);
6691                         dump_stack();
6692                         continue;
6693                 }
6694
6695                 dev->reg_state = NETREG_UNREGISTERED;
6696
6697                 netdev_wait_allrefs(dev);
6698
6699                 /* paranoia */
6700                 BUG_ON(netdev_refcnt_read(dev));
6701                 BUG_ON(!list_empty(&dev->ptype_all));
6702                 BUG_ON(!list_empty(&dev->ptype_specific));
6703                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6704                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6705                 WARN_ON(dev->dn_ptr);
6706
6707                 if (dev->destructor)
6708                         dev->destructor(dev);
6709
6710                 /* Report a network device has been unregistered */
6711                 rtnl_lock();
6712                 dev_net(dev)->dev_unreg_count--;
6713                 __rtnl_unlock();
6714                 wake_up(&netdev_unregistering_wq);
6715
6716                 /* Free network device */
6717                 kobject_put(&dev->dev.kobj);
6718         }
6719 }
6720
6721 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6722  * fields in the same order, with only the type differing.
6723  */
6724 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6725                              const struct net_device_stats *netdev_stats)
6726 {
6727 #if BITS_PER_LONG == 64
6728         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6729         memcpy(stats64, netdev_stats, sizeof(*stats64));
6730 #else
6731         size_t i, n = sizeof(*stats64) / sizeof(u64);
6732         const unsigned long *src = (const unsigned long *)netdev_stats;
6733         u64 *dst = (u64 *)stats64;
6734
6735         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6736                      sizeof(*stats64) / sizeof(u64));
6737         for (i = 0; i < n; i++)
6738                 dst[i] = src[i];
6739 #endif
6740 }
6741 EXPORT_SYMBOL(netdev_stats_to_stats64);
6742
6743 /**
6744  *      dev_get_stats   - get network device statistics
6745  *      @dev: device to get statistics from
6746  *      @storage: place to store stats
6747  *
6748  *      Get network statistics from device. Return @storage.
6749  *      The device driver may provide its own method by setting
6750  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6751  *      otherwise the internal statistics structure is used.
6752  */
6753 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6754                                         struct rtnl_link_stats64 *storage)
6755 {
6756         const struct net_device_ops *ops = dev->netdev_ops;
6757
6758         if (ops->ndo_get_stats64) {
6759                 memset(storage, 0, sizeof(*storage));
6760                 ops->ndo_get_stats64(dev, storage);
6761         } else if (ops->ndo_get_stats) {
6762                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6763         } else {
6764                 netdev_stats_to_stats64(storage, &dev->stats);
6765         }
6766         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6767         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6768         return storage;
6769 }
6770 EXPORT_SYMBOL(dev_get_stats);
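
/* Usage sketch (compiled out): reading device counters through the unified
 * interface above; @storage may live on the caller's stack and is filled
 * completely whichever of the three sources is used.  The foo_* name is
 * hypothetical.
 */
#if 0
static u64 foo_rx_packets(struct net_device *dev)
{
        struct rtnl_link_stats64 storage;

        dev_get_stats(dev, &storage);
        return storage.rx_packets;
}
#endif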
6771
6772 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6773 {
6774         struct netdev_queue *queue = dev_ingress_queue(dev);
6775
6776 #ifdef CONFIG_NET_CLS_ACT
6777         if (queue)
6778                 return queue;
6779         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6780         if (!queue)
6781                 return NULL;
6782         netdev_init_one_queue(dev, queue, NULL);
6783         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
6784         queue->qdisc_sleeping = &noop_qdisc;
6785         rcu_assign_pointer(dev->ingress_queue, queue);
6786 #endif
6787         return queue;
6788 }
6789
6790 static const struct ethtool_ops default_ethtool_ops;
6791
6792 void netdev_set_default_ethtool_ops(struct net_device *dev,
6793                                     const struct ethtool_ops *ops)
6794 {
6795         if (dev->ethtool_ops == &default_ethtool_ops)
6796                 dev->ethtool_ops = ops;
6797 }
6798 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6799
6800 void netdev_freemem(struct net_device *dev)
6801 {
6802         char *addr = (char *)dev - dev->padded;
6803
6804         kvfree(addr);
6805 }
6806
6807 /**
6808  *      alloc_netdev_mqs - allocate network device
6809  *      @sizeof_priv:           size of private data to allocate space for
6810  *      @name:                  device name format string
6811  *      @name_assign_type:      origin of device name
6812  *      @setup:                 callback to initialize device
6813  *      @txqs:                  the number of TX subqueues to allocate
6814  *      @rxqs:                  the number of RX subqueues to allocate
6815  *
6816  *      Allocates a struct net_device with private data area for driver use
6817  *      and performs basic initialization.  Also allocates subqueue structs
6818  *      for each queue on the device.
6819  */
6820 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6821                 unsigned char name_assign_type,
6822                 void (*setup)(struct net_device *),
6823                 unsigned int txqs, unsigned int rxqs)
6824 {
6825         struct net_device *dev;
6826         size_t alloc_size;
6827         struct net_device *p;
6828
6829         BUG_ON(strlen(name) >= sizeof(dev->name));
6830
6831         if (txqs < 1) {
6832                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6833                 return NULL;
6834         }
6835
6836 #ifdef CONFIG_SYSFS
6837         if (rxqs < 1) {
6838                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6839                 return NULL;
6840         }
6841 #endif
6842
6843         alloc_size = sizeof(struct net_device);
6844         if (sizeof_priv) {
6845                 /* ensure 32-byte alignment of private area */
6846                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6847                 alloc_size += sizeof_priv;
6848         }
6849         /* ensure 32-byte alignment of whole construct */
6850         alloc_size += NETDEV_ALIGN - 1;
6851
6852         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6853         if (!p)
6854                 p = vzalloc(alloc_size);
6855         if (!p)
6856                 return NULL;
6857
6858         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6859         dev->padded = (char *)dev - (char *)p;
6860
6861         dev->pcpu_refcnt = alloc_percpu(int);
6862         if (!dev->pcpu_refcnt)
6863                 goto free_dev;
6864
6865         if (dev_addr_init(dev))
6866                 goto free_pcpu;
6867
6868         dev_mc_init(dev);
6869         dev_uc_init(dev);
6870
6871         dev_net_set(dev, &init_net);
6872
6873         dev->gso_max_size = GSO_MAX_SIZE;
6874         dev->gso_max_segs = GSO_MAX_SEGS;
6875         dev->gso_min_segs = 0;
6876
6877         INIT_LIST_HEAD(&dev->napi_list);
6878         INIT_LIST_HEAD(&dev->unreg_list);
6879         INIT_LIST_HEAD(&dev->close_list);
6880         INIT_LIST_HEAD(&dev->link_watch_list);
6881         INIT_LIST_HEAD(&dev->adj_list.upper);
6882         INIT_LIST_HEAD(&dev->adj_list.lower);
6883         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6884         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6885         INIT_LIST_HEAD(&dev->ptype_all);
6886         INIT_LIST_HEAD(&dev->ptype_specific);
6887         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6888         setup(dev);
6889
6890         dev->num_tx_queues = txqs;
6891         dev->real_num_tx_queues = txqs;
6892         if (netif_alloc_netdev_queues(dev))
6893                 goto free_all;
6894
6895 #ifdef CONFIG_SYSFS
6896         dev->num_rx_queues = rxqs;
6897         dev->real_num_rx_queues = rxqs;
6898         if (netif_alloc_rx_queues(dev))
6899                 goto free_all;
6900 #endif
6901
6902         strcpy(dev->name, name);
6903         dev->name_assign_type = name_assign_type;
6904         dev->group = INIT_NETDEV_GROUP;
6905         if (!dev->ethtool_ops)
6906                 dev->ethtool_ops = &default_ethtool_ops;
6907         return dev;
6908
6909 free_all:
6910         free_netdev(dev);
6911         return NULL;
6912
6913 free_pcpu:
6914         free_percpu(dev->pcpu_refcnt);
6915 free_dev:
6916         netdev_freemem(dev);
6917         return NULL;
6918 }
6919 EXPORT_SYMBOL(alloc_netdev_mqs);
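
/* Usage sketch (compiled out): allocating a multiqueue device directly with
 * alloc_netdev_mqs() (the alloc_netdev()/alloc_etherdev_mq() helpers wrap
 * this call).  The foo_* names, the private size and the queue counts are
 * hypothetical.
 */
#if 0
static void foo_setup(struct net_device *dev)
{
        dev->flags = IFF_NOARP;
        /* dev->netdev_ops, dev->mtu, feature flags, ... go here */
}

static struct net_device *foo_alloc(void)
{
        /* 256 bytes of private data, 8 TX queues, 8 RX queues */
        return alloc_netdev_mqs(256, "foo%d", NET_NAME_UNKNOWN,
                                foo_setup, 8, 8);
}
#endif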
6920
6921 /**
6922  *      free_netdev - free network device
6923  *      @dev: device
6924  *
6925  *      This function does the last stage of destroying an allocated device
6926  *      interface. The reference to the device object is released.
6927  *      If this is the last reference then it will be freed.
6928  */
6929 void free_netdev(struct net_device *dev)
6930 {
6931         struct napi_struct *p, *n;
6932
6933         netif_free_tx_queues(dev);
6934 #ifdef CONFIG_SYSFS
6935         kvfree(dev->_rx);
6936 #endif
6937
6938         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6939
6940         /* Flush device addresses */
6941         dev_addr_flush(dev);
6942
6943         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6944                 netif_napi_del(p);
6945
6946         free_percpu(dev->pcpu_refcnt);
6947         dev->pcpu_refcnt = NULL;
6948
6949         /*  Compatibility with error handling in drivers */
6950         if (dev->reg_state == NETREG_UNINITIALIZED) {
6951                 netdev_freemem(dev);
6952                 return;
6953         }
6954
6955         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6956         dev->reg_state = NETREG_RELEASED;
6957
6958         /* will free via device release */
6959         put_device(&dev->dev);
6960 }
6961 EXPORT_SYMBOL(free_netdev);
6962
6963 /**
6964  *      synchronize_net -  Synchronize with packet receive processing
6965  *
6966  *      Wait for packets currently being received to be done.
6967  *      Does not block later packets from starting.
6968  */
6969 void synchronize_net(void)
6970 {
6971         might_sleep();
6972         if (rtnl_is_locked())
6973                 synchronize_rcu_expedited();
6974         else
6975                 synchronize_rcu();
6976 }
6977 EXPORT_SYMBOL(synchronize_net);
6978
6979 /**
6980  *      unregister_netdevice_queue - remove device from the kernel
6981  *      @dev: device
6982  *      @head: list
6983  *
6984  *      This function shuts down a device interface and removes it
6985  *      from the kernel tables.
6986  *      If @head is not NULL, the device is queued to be unregistered later.
6987  *
6988  *      Callers must hold the rtnl semaphore.  You may want
6989  *      unregister_netdev() instead of this.
6990  */
6991
6992 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6993 {
6994         ASSERT_RTNL();
6995
6996         if (head) {
6997                 list_move_tail(&dev->unreg_list, head);
6998         } else {
6999                 rollback_registered(dev);
7000                 /* Finish processing unregister after unlock */
7001                 net_set_todo(dev);
7002         }
7003 }
7004 EXPORT_SYMBOL(unregister_netdevice_queue);
7005
7006 /**
7007  *      unregister_netdevice_many - unregister many devices
7008  *      @head: list of devices
7009  *
7010  *  Note: As most callers use a stack-allocated list_head,
7011  *  we force a list_del() to make sure the stack won't be corrupted later.
7012  */
7013 void unregister_netdevice_many(struct list_head *head)
7014 {
7015         struct net_device *dev;
7016
7017         if (!list_empty(head)) {
7018                 rollback_registered_many(head);
7019                 list_for_each_entry(dev, head, unreg_list)
7020                         net_set_todo(dev);
7021                 list_del(head);
7022         }
7023 }
7024 EXPORT_SYMBOL(unregister_netdevice_many);
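
/* Usage sketch (compiled out): batching several unregistrations so they
 * share one RCU grace period, the same pattern rtnl_link_ops->dellink()
 * implementations and default_device_exit_batch() below use.  The foo_*
 * name and the way the victim devices are collected are hypothetical.
 */
#if 0
static void foo_destroy_all(struct net_device *devs[], int n)
{
        LIST_HEAD(kill_list);
        int i;

        rtnl_lock();
        for (i = 0; i < n; i++)
                unregister_netdevice_queue(devs[i], &kill_list);
        unregister_netdevice_many(&kill_list);
        rtnl_unlock();
}
#endif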
7025
7026 /**
7027  *      unregister_netdev - remove device from the kernel
7028  *      @dev: device
7029  *
7030  *      This function shuts down a device interface and removes it
7031  *      from the kernel tables.
7032  *
7033  *      This is just a wrapper for unregister_netdevice that takes
7034  *      the rtnl semaphore.  In general you want to use this and not
7035  *      unregister_netdevice.
7036  */
7037 void unregister_netdev(struct net_device *dev)
7038 {
7039         rtnl_lock();
7040         unregister_netdevice(dev);
7041         rtnl_unlock();
7042 }
7043 EXPORT_SYMBOL(unregister_netdev);
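
/* Usage sketch (compiled out): the usual driver teardown order.  By the time
 * unregister_netdev() returns, all outstanding references have been dropped
 * (see netdev_run_todo() above), so free_netdev() is safe.  The foo_* name
 * is hypothetical.
 */
#if 0
static void foo_destroy(struct net_device *dev)
{
        unregister_netdev(dev);         /* takes and releases RTNL itself */
        free_netdev(dev);
}
#endif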
7044
7045 /**
7046  *      dev_change_net_namespace - move device to a different network namespace
7047  *      @dev: device
7048  *      @net: network namespace
7049  *      @pat: If not NULL name pattern to try if the current device name
7050  *            is already taken in the destination network namespace.
7051  *
7052  *      This function shuts down a device interface and moves it
7053  *      to a new network namespace. On success 0 is returned, on
7054  *      a failure a negative errno code is returned.
7055  *
7056  *      Callers must hold the rtnl semaphore.
7057  */
7058
7059 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7060 {
7061         int err;
7062
7063         ASSERT_RTNL();
7064
7065         /* Don't allow namespace local devices to be moved. */
7066         err = -EINVAL;
7067         if (dev->features & NETIF_F_NETNS_LOCAL)
7068                 goto out;
7069
7070         /* Ensure the device has been registered */
7071         if (dev->reg_state != NETREG_REGISTERED)
7072                 goto out;
7073
7074         /* Get out if there is nothing to do */
7075         err = 0;
7076         if (net_eq(dev_net(dev), net))
7077                 goto out;
7078
7079         /* Pick the destination device name, and ensure
7080          * we can use it in the destination network namespace.
7081          */
7082         err = -EEXIST;
7083         if (__dev_get_by_name(net, dev->name)) {
7084                 /* We get here if we can't use the current device name */
7085                 if (!pat)
7086                         goto out;
7087                 if (dev_get_valid_name(net, dev, pat) < 0)
7088                         goto out;
7089         }
7090
7091         /*
7092          * And now a mini version of register_netdevice()/unregister_netdevice().
7093          */
7094
7095         /* If device is running close it first. */
7096         dev_close(dev);
7097
7098         /* And unlink it from device chain */
7099         err = -ENODEV;
7100         unlist_netdevice(dev);
7101
7102         synchronize_net();
7103
7104         /* Shutdown queueing discipline. */
7105         dev_shutdown(dev);
7106
7107         /* Notify protocols that we are about to destroy
7108            this device. They should clean up all of their state.
7109
7110            Note that dev->reg_state stays at NETREG_REGISTERED.
7111            This is wanted because this way 8021q and macvlan know
7112            the device is just moving and can keep their slaves up.
7113         */
7114         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7115         rcu_barrier();
7116         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7117         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7118
7119         /*
7120          *      Flush the unicast and multicast chains
7121          */
7122         dev_uc_flush(dev);
7123         dev_mc_flush(dev);
7124
7125         /* Send a netdev-removed uevent to the old namespace */
7126         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7127         netdev_adjacent_del_links(dev);
7128
7129         /* Actually switch the network namespace */
7130         dev_net_set(dev, net);
7131
7132         /* If there is an ifindex conflict assign a new one */
7133         if (__dev_get_by_index(net, dev->ifindex))
7134                 dev->ifindex = dev_new_index(net);
7135
7136         /* Send a netdev-add uevent to the new namespace */
7137         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7138         netdev_adjacent_add_links(dev);
7139
7140         /* Fixup kobjects */
7141         err = device_rename(&dev->dev, dev->name);
7142         WARN_ON(err);
7143
7144         /* Add the device back in the hashes */
7145         list_netdevice(dev);
7146
7147         /* Notify protocols, that a new device appeared. */
7148         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7149
7150         /*
7151          *      Prevent userspace races by waiting until the network
7152          *      device is fully setup before sending notifications.
7153          */
7154         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7155
7156         synchronize_net();
7157         err = 0;
7158 out:
7159         return err;
7160 }
7161 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
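
/* Usage sketch (compiled out): moving a device into an already-looked-up
 * network namespace, falling back to a "dev%d" name on collision.  How @net
 * was obtained (by fd, by pid, ...) is left out; the foo_* name is
 * hypothetical.
 */
#if 0
static int foo_move_to_ns(struct net_device *dev, struct net *net)
{
        int err;

        rtnl_lock();
        err = dev_change_net_namespace(dev, net, "dev%d");
        rtnl_unlock();
        return err;
}
#endif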
7162
7163 static int dev_cpu_callback(struct notifier_block *nfb,
7164                             unsigned long action,
7165                             void *ocpu)
7166 {
7167         struct sk_buff **list_skb;
7168         struct sk_buff *skb;
7169         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7170         struct softnet_data *sd, *oldsd;
7171
7172         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7173                 return NOTIFY_OK;
7174
7175         local_irq_disable();
7176         cpu = smp_processor_id();
7177         sd = &per_cpu(softnet_data, cpu);
7178         oldsd = &per_cpu(softnet_data, oldcpu);
7179
7180         /* Find end of our completion_queue. */
7181         list_skb = &sd->completion_queue;
7182         while (*list_skb)
7183                 list_skb = &(*list_skb)->next;
7184         /* Append completion queue from offline CPU. */
7185         *list_skb = oldsd->completion_queue;
7186         oldsd->completion_queue = NULL;
7187
7188         /* Append output queue from offline CPU. */
7189         if (oldsd->output_queue) {
7190                 *sd->output_queue_tailp = oldsd->output_queue;
7191                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7192                 oldsd->output_queue = NULL;
7193                 oldsd->output_queue_tailp = &oldsd->output_queue;
7194         }
7195         /* Append NAPI poll list from offline CPU, with one exception:
7196          * process_backlog() must be called by cpu owning percpu backlog.
7197          * We properly handle process_queue & input_pkt_queue later.
7198          */
7199         while (!list_empty(&oldsd->poll_list)) {
7200                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7201                                                             struct napi_struct,
7202                                                             poll_list);
7203
7204                 list_del_init(&napi->poll_list);
7205                 if (napi->poll == process_backlog)
7206                         napi->state = 0;
7207                 else
7208                         ____napi_schedule(sd, napi);
7209         }
7210
7211         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7212         local_irq_enable();
7213         preempt_check_resched_rt();
7214
7215         /* Process offline CPU's input_pkt_queue */
7216         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7217                 netif_rx_ni(skb);
7218                 input_queue_head_incr(oldsd);
7219         }
7220         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
7221                 netif_rx_ni(skb);
7222                 input_queue_head_incr(oldsd);
7223         }
7224         while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
7225                 kfree_skb(skb);
7226         }
7227
7228         return NOTIFY_OK;
7229 }
7230
7231
7232 /**
7233  *      netdev_increment_features - increment feature set by one
7234  *      @all: current feature set
7235  *      @one: new feature set
7236  *      @mask: mask feature set
7237  *
7238  *      Computes a new feature set after adding a device with feature set
7239  *      @one to the master device with current feature set @all.  Will not
7240  *      enable anything that is off in @mask. Returns the new feature set.
7241  */
7242 netdev_features_t netdev_increment_features(netdev_features_t all,
7243         netdev_features_t one, netdev_features_t mask)
7244 {
7245         if (mask & NETIF_F_GEN_CSUM)
7246                 mask |= NETIF_F_ALL_CSUM;
7247         mask |= NETIF_F_VLAN_CHALLENGED;
7248
7249         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7250         all &= one | ~NETIF_F_ALL_FOR_ALL;
7251
7252         /* If one device supports hw checksumming, set for all. */
7253         if (all & NETIF_F_GEN_CSUM)
7254                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7255
7256         return all;
7257 }
7258 EXPORT_SYMBOL(netdev_increment_features);
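
/* Usage sketch (compiled out): how an aggregating driver (bonding/team/bridge
 * style) might fold its lower devices' feature sets together with the helper
 * above.  The foo_* names, the slave list layout and the mask chosen here
 * are hypothetical.
 */
#if 0
struct foo_slave {
        struct list_head list;
        struct net_device *dev;
};

static netdev_features_t foo_compute_features(struct list_head *slaves)
{
        netdev_features_t mask = NETIF_F_SG | NETIF_F_ALL_CSUM |
                                 NETIF_F_HIGHDMA | NETIF_F_ALL_TSO;
        /* NETIF_F_ONE_FOR_ALL bits are OR-ed in per slave below, so clear
         * them from the starting set; NETIF_F_ALL_FOR_ALL bits must be
         * offered by every slave to survive.
         */
        netdev_features_t all = mask & ~NETIF_F_ONE_FOR_ALL;
        struct foo_slave *s;

        list_for_each_entry(s, slaves, list)
                all = netdev_increment_features(all, s->dev->features, mask);

        return all;
}
#endif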
7259
7260 static struct hlist_head * __net_init netdev_create_hash(void)
7261 {
7262         int i;
7263         struct hlist_head *hash;
7264
7265         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7266         if (hash != NULL)
7267                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7268                         INIT_HLIST_HEAD(&hash[i]);
7269
7270         return hash;
7271 }
7272
7273 /* Initialize per network namespace state */
7274 static int __net_init netdev_init(struct net *net)
7275 {
7276         if (net != &init_net)
7277                 INIT_LIST_HEAD(&net->dev_base_head);
7278
7279         net->dev_name_head = netdev_create_hash();
7280         if (net->dev_name_head == NULL)
7281                 goto err_name;
7282
7283         net->dev_index_head = netdev_create_hash();
7284         if (net->dev_index_head == NULL)
7285                 goto err_idx;
7286
7287         return 0;
7288
7289 err_idx:
7290         kfree(net->dev_name_head);
7291 err_name:
7292         return -ENOMEM;
7293 }
7294
7295 /**
7296  *      netdev_drivername - network driver for the device
7297  *      @dev: network device
7298  *
7299  *      Determine network driver for device.
7300  */
7301 const char *netdev_drivername(const struct net_device *dev)
7302 {
7303         const struct device_driver *driver;
7304         const struct device *parent;
7305         const char *empty = "";
7306
7307         parent = dev->dev.parent;
7308         if (!parent)
7309                 return empty;
7310
7311         driver = parent->driver;
7312         if (driver && driver->name)
7313                 return driver->name;
7314         return empty;
7315 }
7316
7317 static void __netdev_printk(const char *level, const struct net_device *dev,
7318                             struct va_format *vaf)
7319 {
7320         if (dev && dev->dev.parent) {
7321                 dev_printk_emit(level[1] - '0',
7322                                 dev->dev.parent,
7323                                 "%s %s %s%s: %pV",
7324                                 dev_driver_string(dev->dev.parent),
7325                                 dev_name(dev->dev.parent),
7326                                 netdev_name(dev), netdev_reg_state(dev),
7327                                 vaf);
7328         } else if (dev) {
7329                 printk("%s%s%s: %pV",
7330                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7331         } else {
7332                 printk("%s(NULL net_device): %pV", level, vaf);
7333         }
7334 }
7335
7336 void netdev_printk(const char *level, const struct net_device *dev,
7337                    const char *format, ...)
7338 {
7339         struct va_format vaf;
7340         va_list args;
7341
7342         va_start(args, format);
7343
7344         vaf.fmt = format;
7345         vaf.va = &args;
7346
7347         __netdev_printk(level, dev, &vaf);
7348
7349         va_end(args);
7350 }
7351 EXPORT_SYMBOL(netdev_printk);
7352
7353 #define define_netdev_printk_level(func, level)                 \
7354 void func(const struct net_device *dev, const char *fmt, ...)   \
7355 {                                                               \
7356         struct va_format vaf;                                   \
7357         va_list args;                                           \
7358                                                                 \
7359         va_start(args, fmt);                                    \
7360                                                                 \
7361         vaf.fmt = fmt;                                          \
7362         vaf.va = &args;                                         \
7363                                                                 \
7364         __netdev_printk(level, dev, &vaf);                      \
7365                                                                 \
7366         va_end(args);                                           \
7367 }                                                               \
7368 EXPORT_SYMBOL(func);
7369
7370 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7371 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7372 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7373 define_netdev_printk_level(netdev_err, KERN_ERR);
7374 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7375 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7376 define_netdev_printk_level(netdev_info, KERN_INFO);
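
/* Usage sketch (compiled out): the level-specific wrappers generated above
 * are used like dev_err()/dev_info(), but prefix the message with the netdev
 * name and, while the device is not fully registered, its registration
 * state.  The foo_* name is hypothetical.
 */
#if 0
static void foo_report_link(struct net_device *dev, bool up)
{
        if (up)
                netdev_info(dev, "link up\n");
        else
                netdev_warn(dev, "link down\n");
}
#endif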
7377
7378 static void __net_exit netdev_exit(struct net *net)
7379 {
7380         kfree(net->dev_name_head);
7381         kfree(net->dev_index_head);
7382 }
7383
7384 static struct pernet_operations __net_initdata netdev_net_ops = {
7385         .init = netdev_init,
7386         .exit = netdev_exit,
7387 };
7388
7389 static void __net_exit default_device_exit(struct net *net)
7390 {
7391         struct net_device *dev, *aux;
7392         /*
7393          * Push all migratable network devices back to the
7394          * initial network namespace
7395          */
7396         rtnl_lock();
7397         for_each_netdev_safe(net, dev, aux) {
7398                 int err;
7399                 char fb_name[IFNAMSIZ];
7400
7401                 /* Ignore unmoveable devices (i.e. loopback) */
7402                 if (dev->features & NETIF_F_NETNS_LOCAL)
7403                         continue;
7404
7405                 /* Leave virtual devices for the generic cleanup */
7406                 if (dev->rtnl_link_ops)
7407                         continue;
7408
7409                 /* Push remaining network devices to init_net */
7410                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7411                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7412                 if (err) {
7413                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7414                                  __func__, dev->name, err);
7415                         BUG();
7416                 }
7417         }
7418         rtnl_unlock();
7419 }
7420
7421 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7422 {
7423         /* Return with the rtnl_lock held when there are no network
7424          * devices unregistering in any network namespace in net_list.
7425          */
7426         struct net *net;
7427         bool unregistering;
7428         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7429
7430         add_wait_queue(&netdev_unregistering_wq, &wait);
7431         for (;;) {
7432                 unregistering = false;
7433                 rtnl_lock();
7434                 list_for_each_entry(net, net_list, exit_list) {
7435                         if (net->dev_unreg_count > 0) {
7436                                 unregistering = true;
7437                                 break;
7438                         }
7439                 }
7440                 if (!unregistering)
7441                         break;
7442                 __rtnl_unlock();
7443
7444                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7445         }
7446         remove_wait_queue(&netdev_unregistering_wq, &wait);
7447 }
7448
7449 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7450 {
7451         /* At exit all network devices must be removed from a network
7452          * namespace.  Do this in the reverse order of registration.
7453          * Do this across as many network namespaces as possible to
7454          * improve batching efficiency.
7455          */
7456         struct net_device *dev;
7457         struct net *net;
7458         LIST_HEAD(dev_kill_list);
7459
7460         /* To prevent network device cleanup code from dereferencing
7461          * loopback devices or network devices that have been freed,
7462          * wait here for all pending unregistrations to complete
7463          * before unregistering the loopback device and allowing the
7464          * network namespace to be freed.
7465          *
7466          * The netdev todo list containing all network devices
7467          * unregistrations that happen in default_device_exit_batch
7468          * will run in the rtnl_unlock() at the end of
7469          * default_device_exit_batch.
7470          */
7471         rtnl_lock_unregistering(net_list);
7472         list_for_each_entry(net, net_list, exit_list) {
7473                 for_each_netdev_reverse(net, dev) {
7474                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7475                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7476                         else
7477                                 unregister_netdevice_queue(dev, &dev_kill_list);
7478                 }
7479         }
7480         unregister_netdevice_many(&dev_kill_list);
7481         rtnl_unlock();
7482 }
7483
7484 static struct pernet_operations __net_initdata default_device_ops = {
7485         .exit = default_device_exit,
7486         .exit_batch = default_device_exit_batch,
7487 };
7488
7489 /*
7490  *      Initialize the DEV module. At boot time this walks the device list and
7491  *      unhooks any devices that fail to initialise (normally hardware not
7492  *      present) and leaves us with a valid list of present and active devices.
7493  *
7494  */
7495
7496 /*
7497  *       This is called single threaded during boot, so no need
7498  *       to take the rtnl semaphore.
7499  */
7500 static int __init net_dev_init(void)
7501 {
7502         int i, rc = -ENOMEM;
7503
7504         BUG_ON(!dev_boot_phase);
7505
7506         if (dev_proc_init())
7507                 goto out;
7508
7509         if (netdev_kobject_init())
7510                 goto out;
7511
7512         INIT_LIST_HEAD(&ptype_all);
7513         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7514                 INIT_LIST_HEAD(&ptype_base[i]);
7515
7516         INIT_LIST_HEAD(&offload_base);
7517
7518         if (register_pernet_subsys(&netdev_net_ops))
7519                 goto out;
7520
7521         /*
7522          *      Initialise the packet receive queues.
7523          */
7524
7525         for_each_possible_cpu(i) {
7526                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7527
7528                 skb_queue_head_init_raw(&sd->input_pkt_queue);
7529                 skb_queue_head_init_raw(&sd->process_queue);
7530                 skb_queue_head_init_raw(&sd->tofree_queue);
7531                 INIT_LIST_HEAD(&sd->poll_list);
7532                 sd->output_queue_tailp = &sd->output_queue;
7533 #ifdef CONFIG_RPS
7534                 sd->csd.func = rps_trigger_softirq;
7535                 sd->csd.info = sd;
7536                 sd->cpu = i;
7537 #endif
7538
7539                 sd->backlog.poll = process_backlog;
7540                 sd->backlog.weight = weight_p;
7541         }
7542
7543         dev_boot_phase = 0;
7544
7545         /* The loopback device is special: if any other network device
7546          * is present in a network namespace, the loopback device must
7547          * be present too. Since we now dynamically allocate and free the
7548          * loopback device, ensure this invariant is maintained by
7549          * keeping the loopback device as the first device on the
7550          * list of network devices, so that the loopback device
7551          * is the first device that appears and the last network device
7552          * that disappears.
7553          */
7554         if (register_pernet_device(&loopback_net_ops))
7555                 goto out;
7556
7557         if (register_pernet_device(&default_device_ops))
7558                 goto out;
7559
7560         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7561         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7562
7563         hotcpu_notifier(dev_cpu_callback, 0);
7564         dst_init();
7565         rc = 0;
7566 out:
7567         return rc;
7568 }
7569
7570 subsys_initcall(net_dev_init);