These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] kernel/net/core/dev.c
1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/dst_metadata.h>
103 #include <net/pkt_sched.h>
104 #include <net/checksum.h>
105 #include <net/xfrm.h>
106 #include <linux/highmem.h>
107 #include <linux/init.h>
108 #include <linux/module.h>
109 #include <linux/netpoll.h>
110 #include <linux/rcupdate.h>
111 #include <linux/delay.h>
112 #include <net/iw_handler.h>
113 #include <asm/current.h>
114 #include <linux/audit.h>
115 #include <linux/dmaengine.h>
116 #include <linux/err.h>
117 #include <linux/ctype.h>
118 #include <linux/if_arp.h>
119 #include <linux/if_vlan.h>
120 #include <linux/ip.h>
121 #include <net/ip.h>
122 #include <net/mpls.h>
123 #include <linux/ipv6.h>
124 #include <linux/in.h>
125 #include <linux/jhash.h>
126 #include <linux/random.h>
127 #include <trace/events/napi.h>
128 #include <trace/events/net.h>
129 #include <trace/events/skb.h>
130 #include <linux/pci.h>
131 #include <linux/inetdevice.h>
132 #include <linux/cpu_rmap.h>
133 #include <linux/static_key.h>
134 #include <linux/hashtable.h>
135 #include <linux/vmalloc.h>
136 #include <linux/if_macvlan.h>
137 #include <linux/errqueue.h>
138 #include <linux/hrtimer.h>
139 #include <linux/netfilter_ingress.h>
140
141 #include "net-sysfs.h"
142
143 /* Instead of increasing this, you should create a hash table. */
144 #define MAX_GRO_SKBS 8
145
146 /* This should be increased if a protocol with a bigger head is added. */
147 #define GRO_MAX_HEAD (MAX_HEADER + 128)
148
149 static DEFINE_SPINLOCK(ptype_lock);
150 static DEFINE_SPINLOCK(offload_lock);
151 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
152 struct list_head ptype_all __read_mostly;       /* Taps */
153 static struct list_head offload_base __read_mostly;
154
155 static int netif_rx_internal(struct sk_buff *skb);
156 static int call_netdevice_notifiers_info(unsigned long val,
157                                          struct net_device *dev,
158                                          struct netdev_notifier_info *info);
159
160 /*
161  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
162  * semaphore.
163  *
164  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
165  *
166  * Writers must hold the rtnl semaphore while they loop through the
167  * dev_base_head list, and hold dev_base_lock for writing when they do the
168  * actual updates.  This allows pure readers to access the list even
169  * while a writer is preparing to update it.
170  *
171  * To put it another way, dev_base_lock is held for writing only to
172  * protect against pure readers; the rtnl semaphore provides the
173  * protection against other writers.
174  *
175  * See, for example usages, register_netdevice() and
176  * unregister_netdevice(), which must be called with the rtnl
177  * semaphore held.
178  */
179 DEFINE_RWLOCK(dev_base_lock);
180 EXPORT_SYMBOL(dev_base_lock);
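/*
 * Editorial example (not part of the original source): the pure-reader
 * pattern described in the comment above.  A reader may walk the device
 * list under either dev_base_lock or rcu_read_lock(); writers additionally
 * hold the RTNL semaphore.  The helper name example_count_devices() is
 * hypothetical.
 */
static int example_count_devices(struct net *net)
{
        struct net_device *dev;
        int count = 0;

        read_lock(&dev_base_lock);
        for_each_netdev(net, dev)
                count++;
        read_unlock(&dev_base_lock);

        return count;
}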
181
182 /* protects napi_hash addition/deletion and napi_gen_id */
183 static DEFINE_SPINLOCK(napi_hash_lock);
184
185 static unsigned int napi_gen_id;
186 static DEFINE_HASHTABLE(napi_hash, 8);
187
188 static seqcount_t devnet_rename_seq;
189 static DEFINE_MUTEX(devnet_rename_mutex);
190
191 static inline void dev_base_seq_inc(struct net *net)
192 {
193         while (++net->dev_base_seq == 0);
194 }
195
196 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
197 {
198         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
199
200         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
201 }
202
203 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
204 {
205         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
206 }
207
208 static inline void rps_lock(struct softnet_data *sd)
209 {
210 #ifdef CONFIG_RPS
211         raw_spin_lock(&sd->input_pkt_queue.raw_lock);
212 #endif
213 }
214
215 static inline void rps_unlock(struct softnet_data *sd)
216 {
217 #ifdef CONFIG_RPS
218         raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
219 #endif
220 }
221
222 /* Device list insertion */
223 static void list_netdevice(struct net_device *dev)
224 {
225         struct net *net = dev_net(dev);
226
227         ASSERT_RTNL();
228
229         write_lock_bh(&dev_base_lock);
230         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
231         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
232         hlist_add_head_rcu(&dev->index_hlist,
233                            dev_index_hash(net, dev->ifindex));
234         write_unlock_bh(&dev_base_lock);
235
236         dev_base_seq_inc(net);
237 }
238
239 /* Device list removal
240  * caller must respect a RCU grace period before freeing/reusing dev
241  */
242 static void unlist_netdevice(struct net_device *dev)
243 {
244         ASSERT_RTNL();
245
246         /* Unlink dev from the device chain */
247         write_lock_bh(&dev_base_lock);
248         list_del_rcu(&dev->dev_list);
249         hlist_del_rcu(&dev->name_hlist);
250         hlist_del_rcu(&dev->index_hlist);
251         write_unlock_bh(&dev_base_lock);
252
253         dev_base_seq_inc(dev_net(dev));
254 }
255
256 /*
257  *      Our notifier list
258  */
259
260 static RAW_NOTIFIER_HEAD(netdev_chain);
261
262 /*
263  *      Device drivers call our routines to queue packets here. We empty the
264  *      queue in the local softnet handler.
265  */
266
267 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
268 EXPORT_PER_CPU_SYMBOL(softnet_data);
269
270 #ifdef CONFIG_LOCKDEP
271 /*
272  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
273  * according to dev->type
274  */
275 static const unsigned short netdev_lock_type[] =
276         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
277          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
278          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
279          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
280          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
281          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
282          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
283          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
284          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
285          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
286          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
287          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
288          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
289          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
290          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
291
292 static const char *const netdev_lock_name[] =
293         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
294          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
295          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
296          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
297          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
298          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
299          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
300          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
301          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
302          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
303          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
304          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
305          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
306          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
307          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
308
309 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
310 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
311
312 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
313 {
314         int i;
315
316         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
317                 if (netdev_lock_type[i] == dev_type)
318                         return i;
319         /* the last key is used by default */
320         return ARRAY_SIZE(netdev_lock_type) - 1;
321 }
322
323 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
324                                                  unsigned short dev_type)
325 {
326         int i;
327
328         i = netdev_lock_pos(dev_type);
329         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
330                                    netdev_lock_name[i]);
331 }
332
333 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
334 {
335         int i;
336
337         i = netdev_lock_pos(dev->type);
338         lockdep_set_class_and_name(&dev->addr_list_lock,
339                                    &netdev_addr_lock_key[i],
340                                    netdev_lock_name[i]);
341 }
342 #else
343 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
344                                                  unsigned short dev_type)
345 {
346 }
347 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
348 {
349 }
350 #endif
351
352 /*******************************************************************************
353
354                 Protocol management and registration routines
355
356 *******************************************************************************/
357
358 /*
359  *      Add a protocol ID to the list. Now that the input handler is
360  *      smarter we can dispense with all the messy stuff that used to be
361  *      here.
362  *
363  *      BEWARE!!! Protocol handlers, mangling input packets,
364  *      MUST BE last in hash buckets and checking protocol handlers
365  *      MUST start from promiscuous ptype_all chain in net_bh.
366  *      It is true now, do not change it.
367  *      Explanation follows: if protocol handler, mangling packet, will
368  *      be the first on list, it is not able to sense, that packet
369  *      is cloned and should be copied-on-write, so that it will
370  *      change it and subsequent readers will get broken packet.
371  *                                                      --ANK (980803)
372  */
373
374 static inline struct list_head *ptype_head(const struct packet_type *pt)
375 {
376         if (pt->type == htons(ETH_P_ALL))
377                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
378         else
379                 return pt->dev ? &pt->dev->ptype_specific :
380                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
381 }
382
383 /**
384  *      dev_add_pack - add packet handler
385  *      @pt: packet type declaration
386  *
387  *      Add a protocol handler to the networking stack. The passed &packet_type
388  *      is linked into kernel lists and may not be freed until it has been
389  *      removed from the kernel lists.
390  *
391  *      This call does not sleep, therefore it cannot guarantee that all
392  *      CPUs that are in the middle of receiving packets will see the new
393  *      packet type (until the next packet is received).
394  */
395
396 void dev_add_pack(struct packet_type *pt)
397 {
398         struct list_head *head = ptype_head(pt);
399
400         spin_lock(&ptype_lock);
401         list_add_rcu(&pt->list, head);
402         spin_unlock(&ptype_lock);
403 }
404 EXPORT_SYMBOL(dev_add_pack);
405
406 /**
407  *      __dev_remove_pack        - remove packet handler
408  *      @pt: packet type declaration
409  *
410  *      Remove a protocol handler that was previously added to the kernel
411  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
412  *      from the kernel lists and can be freed or reused once this function
413  *      returns.
414  *
415  *      The packet type might still be in use by receivers
416  *      and must not be freed until after all the CPUs have gone
417  *      through a quiescent state.
418  */
419 void __dev_remove_pack(struct packet_type *pt)
420 {
421         struct list_head *head = ptype_head(pt);
422         struct packet_type *pt1;
423
424         spin_lock(&ptype_lock);
425
426         list_for_each_entry(pt1, head, list) {
427                 if (pt == pt1) {
428                         list_del_rcu(&pt->list);
429                         goto out;
430                 }
431         }
432
433         pr_warn("dev_remove_pack: %p not found\n", pt);
434 out:
435         spin_unlock(&ptype_lock);
436 }
437 EXPORT_SYMBOL(__dev_remove_pack);
438
439 /**
440  *      dev_remove_pack  - remove packet handler
441  *      @pt: packet type declaration
442  *
443  *      Remove a protocol handler that was previously added to the kernel
444  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
445  *      from the kernel lists and can be freed or reused once this function
446  *      returns.
447  *
448  *      This call sleeps to guarantee that no CPU is looking at the packet
449  *      type after return.
450  */
451 void dev_remove_pack(struct packet_type *pt)
452 {
453         __dev_remove_pack(pt);
454
455         synchronize_net();
456 }
457 EXPORT_SYMBOL(dev_remove_pack);
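/*
 * Editorial example (not in the original file): how a module typically
 * registers a tap for all protocols with dev_add_pack() and removes it
 * again with dev_remove_pack().  The names example_tap and example_tap_rcv
 * are hypothetical.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        /* The handler owns a reference to the skb and must release it. */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
        .type = cpu_to_be16(ETH_P_ALL),         /* match every protocol */
        .func = example_tap_rcv,
};

/* Register with dev_add_pack(&example_tap); tear down with
 * dev_remove_pack(&example_tap), which sleeps as documented above.
 */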
458
459
460 /**
461  *      dev_add_offload - register offload handlers
462  *      @po: protocol offload declaration
463  *
464  *      Add protocol offload handlers to the networking stack. The passed
465  *      &packet_offload is linked into kernel lists and may not be freed until
466  *      it has been removed from the kernel lists.
467  *
468  *      This call does not sleep, therefore it cannot guarantee that all
469  *      CPUs that are in the middle of receiving packets will see the new
470  *      offload handlers (until the next packet is received).
471  */
472 void dev_add_offload(struct packet_offload *po)
473 {
474         struct packet_offload *elem;
475
476         spin_lock(&offload_lock);
477         list_for_each_entry(elem, &offload_base, list) {
478                 if (po->priority < elem->priority)
479                         break;
480         }
481         list_add_rcu(&po->list, elem->list.prev);
482         spin_unlock(&offload_lock);
483 }
484 EXPORT_SYMBOL(dev_add_offload);
485
486 /**
487  *      __dev_remove_offload     - remove offload handler
488  *      @po: packet offload declaration
489  *
490  *      Remove a protocol offload handler that was previously added to the
491  *      kernel offload handlers by dev_add_offload(). The passed &packet_offload
492  *      is removed from the kernel lists and can be freed or reused once this
493  *      function returns.
494  *
495  *      The packet type might still be in use by receivers
496  *      and must not be freed until after all the CPUs have gone
497  *      through a quiescent state.
498  */
499 static void __dev_remove_offload(struct packet_offload *po)
500 {
501         struct list_head *head = &offload_base;
502         struct packet_offload *po1;
503
504         spin_lock(&offload_lock);
505
506         list_for_each_entry(po1, head, list) {
507                 if (po == po1) {
508                         list_del_rcu(&po->list);
509                         goto out;
510                 }
511         }
512
513         pr_warn("dev_remove_offload: %p not found\n", po);
514 out:
515         spin_unlock(&offload_lock);
516 }
517
518 /**
519  *      dev_remove_offload       - remove packet offload handler
520  *      @po: packet offload declaration
521  *
522  *      Remove a packet offload handler that was previously added to the kernel
523  *      offload handlers by dev_add_offload(). The passed &packet_offload is
524  *      removed from the kernel lists and can be freed or reused once this
525  *      function returns.
526  *
527  *      This call sleeps to guarantee that no CPU is looking at the packet
528  *      type after return.
529  */
530 void dev_remove_offload(struct packet_offload *po)
531 {
532         __dev_remove_offload(po);
533
534         synchronize_net();
535 }
536 EXPORT_SYMBOL(dev_remove_offload);
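/*
 * Editorial note: protocol stacks use this pair to plug their GSO/GRO
 * callbacks into the core.  IPv4, for instance, fills a struct
 * packet_offload with .type = cpu_to_be16(ETH_P_IP) and its
 * offload_callbacks, then hands it to dev_add_offload() at init time
 * (see inet_init() in net/ipv4/af_inet.c).
 */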
537
538 /******************************************************************************
539
540                       Device Boot-time Settings Routines
541
542 *******************************************************************************/
543
544 /* Boot time configuration table */
545 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
546
547 /**
548  *      netdev_boot_setup_add   - add new setup entry
549  *      @name: name of the device
550  *      @map: configured settings for the device
551  *
552  *      Adds a new setup entry to the dev_boot_setup list.  The function
553  *      returns 0 on error and 1 on success.  This is a generic routine for
554  *      all netdevices.
555  */
556 static int netdev_boot_setup_add(char *name, struct ifmap *map)
557 {
558         struct netdev_boot_setup *s;
559         int i;
560
561         s = dev_boot_setup;
562         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
563                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
564                         memset(s[i].name, 0, sizeof(s[i].name));
565                         strlcpy(s[i].name, name, IFNAMSIZ);
566                         memcpy(&s[i].map, map, sizeof(s[i].map));
567                         break;
568                 }
569         }
570
571         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
572 }
573
574 /**
575  *      netdev_boot_setup_check - check boot time settings
576  *      @dev: the netdevice
577  *
578  *      Check boot time settings for the device.
579  *      Any settings found are applied to the device so that they can be
580  *      used later during device probing.
581  *      Returns 1 if settings were found, 0 otherwise.
582  */
583 int netdev_boot_setup_check(struct net_device *dev)
584 {
585         struct netdev_boot_setup *s = dev_boot_setup;
586         int i;
587
588         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
589                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
590                     !strcmp(dev->name, s[i].name)) {
591                         dev->irq        = s[i].map.irq;
592                         dev->base_addr  = s[i].map.base_addr;
593                         dev->mem_start  = s[i].map.mem_start;
594                         dev->mem_end    = s[i].map.mem_end;
595                         return 1;
596                 }
597         }
598         return 0;
599 }
600 EXPORT_SYMBOL(netdev_boot_setup_check);
601
602
603 /**
604  *      netdev_boot_base        - get address from boot time settings
605  *      @prefix: prefix for network device
606  *      @unit: id for network device
607  *
608  *      Check boot time settings for the base address of the device.
609  *      Any settings found are applied to the device so that they can be
610  *      used later during device probing.
611  *      Returns 0 if no settings are found.
612  */
613 unsigned long netdev_boot_base(const char *prefix, int unit)
614 {
615         const struct netdev_boot_setup *s = dev_boot_setup;
616         char name[IFNAMSIZ];
617         int i;
618
619         sprintf(name, "%s%d", prefix, unit);
620
621         /*
622          * If the device is already registered then return a base of 1
623          * to indicate not to probe for this interface
624          */
625         if (__dev_get_by_name(&init_net, name))
626                 return 1;
627
628         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
629                 if (!strcmp(name, s[i].name))
630                         return s[i].map.base_addr;
631         return 0;
632 }
633
634 /*
635  * Saves the settings configured at boot time for any netdevice.
636  */
637 int __init netdev_boot_setup(char *str)
638 {
639         int ints[5];
640         struct ifmap map;
641
642         str = get_options(str, ARRAY_SIZE(ints), ints);
643         if (!str || !*str)
644                 return 0;
645
646         /* Save settings */
647         memset(&map, 0, sizeof(map));
648         if (ints[0] > 0)
649                 map.irq = ints[1];
650         if (ints[0] > 1)
651                 map.base_addr = ints[2];
652         if (ints[0] > 2)
653                 map.mem_start = ints[3];
654         if (ints[0] > 3)
655                 map.mem_end = ints[4];
656
657         /* Add new entry to the list */
658         return netdev_boot_setup_add(str, &map);
659 }
660
661 __setup("netdev=", netdev_boot_setup);
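/*
 * Editorial note: the command line option parsed above follows the classic
 * format
 *
 *      netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>
 *
 * e.g. "netdev=5,0x300,0,0,eth0" records IRQ 5 and I/O base 0x300 for eth0
 * so that netdev_boot_setup_check() can apply them when the driver probes.
 * The values shown are illustrative only.
 */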
662
663 /*******************************************************************************
664
665                             Device Interface Subroutines
666
667 *******************************************************************************/
668
669 /**
670  *      dev_get_iflink  - get 'iflink' value of an interface
671  *      @dev: targeted interface
672  *
673  *      Indicates the ifindex the interface is linked to.
674  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
675  */
676
677 int dev_get_iflink(const struct net_device *dev)
678 {
679         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
680                 return dev->netdev_ops->ndo_get_iflink(dev);
681
682         return dev->ifindex;
683 }
684 EXPORT_SYMBOL(dev_get_iflink);
685
686 /**
687  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
688  *      @dev: targeted interface
689  *      @skb: The packet.
690  *
691  *      For better visibility of tunnel traffic, OVS needs to retrieve
692  *      egress tunnel information for a packet. The following API allows
693  *      the caller to get this info.
694  */
695 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
696 {
697         struct ip_tunnel_info *info;
698
699         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
700                 return -EINVAL;
701
702         info = skb_tunnel_info_unclone(skb);
703         if (!info)
704                 return -ENOMEM;
705         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
706                 return -EINVAL;
707
708         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
709 }
710 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
711
712 /**
713  *      __dev_get_by_name       - find a device by its name
714  *      @net: the applicable net namespace
715  *      @name: name to find
716  *
717  *      Find an interface by name. Must be called under the RTNL semaphore
718  *      or @dev_base_lock. If the name is found a pointer to the device
719  *      is returned. If the name is not found then %NULL is returned. The
720  *      reference counters are not incremented so the caller must be
721  *      careful with locks.
722  */
723
724 struct net_device *__dev_get_by_name(struct net *net, const char *name)
725 {
726         struct net_device *dev;
727         struct hlist_head *head = dev_name_hash(net, name);
728
729         hlist_for_each_entry(dev, head, name_hlist)
730                 if (!strncmp(dev->name, name, IFNAMSIZ))
731                         return dev;
732
733         return NULL;
734 }
735 EXPORT_SYMBOL(__dev_get_by_name);
736
737 /**
738  *      dev_get_by_name_rcu     - find a device by its name
739  *      @net: the applicable net namespace
740  *      @name: name to find
741  *
742  *      Find an interface by name.
743  *      If the name is found a pointer to the device is returned.
744  *      If the name is not found then %NULL is returned.
745  *      The reference counters are not incremented so the caller must be
746  *      careful with locks. The caller must hold RCU lock.
747  */
748
749 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
750 {
751         struct net_device *dev;
752         struct hlist_head *head = dev_name_hash(net, name);
753
754         hlist_for_each_entry_rcu(dev, head, name_hlist)
755                 if (!strncmp(dev->name, name, IFNAMSIZ))
756                         return dev;
757
758         return NULL;
759 }
760 EXPORT_SYMBOL(dev_get_by_name_rcu);
761
762 /**
763  *      dev_get_by_name         - find a device by its name
764  *      @net: the applicable net namespace
765  *      @name: name to find
766  *
767  *      Find an interface by name. This can be called from any
768  *      context and does its own locking. The returned handle has
769  *      the usage count incremented and the caller must use dev_put() to
770  *      release it when it is no longer needed. %NULL is returned if no
771  *      matching device is found.
772  */
773
774 struct net_device *dev_get_by_name(struct net *net, const char *name)
775 {
776         struct net_device *dev;
777
778         rcu_read_lock();
779         dev = dev_get_by_name_rcu(net, name);
780         if (dev)
781                 dev_hold(dev);
782         rcu_read_unlock();
783         return dev;
784 }
785 EXPORT_SYMBOL(dev_get_by_name);
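/*
 * Editorial example (hypothetical helper): a typical refcounted lookup.
 * The reference taken by dev_get_by_name() must be dropped with dev_put()
 * once the caller is done with the device.
 */
static bool example_dev_is_running(struct net *net, const char *name)
{
        struct net_device *dev;
        bool running;

        dev = dev_get_by_name(net, name);
        if (!dev)
                return false;

        running = netif_running(dev);
        dev_put(dev);
        return running;
}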
786
787 /**
788  *      __dev_get_by_index - find a device by its ifindex
789  *      @net: the applicable net namespace
790  *      @ifindex: index of device
791  *
792  *      Search for an interface by index. Returns a pointer to the device,
793  *      or %NULL if it is not found. The device has not
794  *      had its reference counter increased so the caller must be careful
795  *      about locking. The caller must hold either the RTNL semaphore
796  *      or @dev_base_lock.
797  */
798
799 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
800 {
801         struct net_device *dev;
802         struct hlist_head *head = dev_index_hash(net, ifindex);
803
804         hlist_for_each_entry(dev, head, index_hlist)
805                 if (dev->ifindex == ifindex)
806                         return dev;
807
808         return NULL;
809 }
810 EXPORT_SYMBOL(__dev_get_by_index);
811
812 /**
813  *      dev_get_by_index_rcu - find a device by its ifindex
814  *      @net: the applicable net namespace
815  *      @ifindex: index of device
816  *
817  *      Search for an interface by index. Returns a pointer to the device,
818  *      or %NULL if it is not found. The device has not
819  *      had its reference counter increased so the caller must be careful
820  *      about locking. The caller must hold RCU lock.
821  */
822
823 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
824 {
825         struct net_device *dev;
826         struct hlist_head *head = dev_index_hash(net, ifindex);
827
828         hlist_for_each_entry_rcu(dev, head, index_hlist)
829                 if (dev->ifindex == ifindex)
830                         return dev;
831
832         return NULL;
833 }
834 EXPORT_SYMBOL(dev_get_by_index_rcu);
835
836
837 /**
838  *      dev_get_by_index - find a device by its ifindex
839  *      @net: the applicable net namespace
840  *      @ifindex: index of device
841  *
842  *      Search for an interface by index. Returns a pointer to the device,
843  *      or NULL if it is not found. The device returned has
844  *      had a reference added and the pointer is safe until the user calls
845  *      dev_put to indicate they have finished with it.
846  */
847
848 struct net_device *dev_get_by_index(struct net *net, int ifindex)
849 {
850         struct net_device *dev;
851
852         rcu_read_lock();
853         dev = dev_get_by_index_rcu(net, ifindex);
854         if (dev)
855                 dev_hold(dev);
856         rcu_read_unlock();
857         return dev;
858 }
859 EXPORT_SYMBOL(dev_get_by_index);
860
861 /**
862  *      netdev_get_name - get a netdevice name, knowing its ifindex.
863  *      @net: network namespace
864  *      @name: a pointer to the buffer where the name will be stored.
865  *      @ifindex: the ifindex of the interface to get the name from.
866  *
867  *      The use of raw_seqcount_begin() and a brief lock/unlock of
868  *      devnet_rename_mutex before retrying is required as we want to give
869  *      the writers a chance to complete when CONFIG_PREEMPT is not set.
870  */
871 int netdev_get_name(struct net *net, char *name, int ifindex)
872 {
873         struct net_device *dev;
874         unsigned int seq;
875
876 retry:
877         seq = raw_seqcount_begin(&devnet_rename_seq);
878         rcu_read_lock();
879         dev = dev_get_by_index_rcu(net, ifindex);
880         if (!dev) {
881                 rcu_read_unlock();
882                 return -ENODEV;
883         }
884
885         strcpy(name, dev->name);
886         rcu_read_unlock();
887         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
888                 mutex_lock(&devnet_rename_mutex);
889                 mutex_unlock(&devnet_rename_mutex);
890                 goto retry;
891         }
892
893         return 0;
894 }
895
896 /**
897  *      dev_getbyhwaddr_rcu - find a device by its hardware address
898  *      @net: the applicable net namespace
899  *      @type: media type of device
900  *      @ha: hardware address
901  *
902  *      Search for an interface by MAC address. Returns a pointer to the
903  *      device, or NULL if it is not found.
904  *      The caller must hold RCU or RTNL.
905  *      The returned device has not had its ref count increased
906  *      and the caller must therefore be careful about locking.
907  *
908  */
909
910 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
911                                        const char *ha)
912 {
913         struct net_device *dev;
914
915         for_each_netdev_rcu(net, dev)
916                 if (dev->type == type &&
917                     !memcmp(dev->dev_addr, ha, dev->addr_len))
918                         return dev;
919
920         return NULL;
921 }
922 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
923
924 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
925 {
926         struct net_device *dev;
927
928         ASSERT_RTNL();
929         for_each_netdev(net, dev)
930                 if (dev->type == type)
931                         return dev;
932
933         return NULL;
934 }
935 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
936
937 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
938 {
939         struct net_device *dev, *ret = NULL;
940
941         rcu_read_lock();
942         for_each_netdev_rcu(net, dev)
943                 if (dev->type == type) {
944                         dev_hold(dev);
945                         ret = dev;
946                         break;
947                 }
948         rcu_read_unlock();
949         return ret;
950 }
951 EXPORT_SYMBOL(dev_getfirstbyhwtype);
952
953 /**
954  *      __dev_get_by_flags - find any device with given flags
955  *      @net: the applicable net namespace
956  *      @if_flags: IFF_* values
957  *      @mask: bitmask of bits in if_flags to check
958  *
959  *      Search for any interface with the given flags. Returns a pointer to
960  *      the first matching device, or NULL if none is found. Must be called
961  *      inside rtnl_lock(), and the result's refcount is unchanged.
962  */
963
964 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
965                                       unsigned short mask)
966 {
967         struct net_device *dev, *ret;
968
969         ASSERT_RTNL();
970
971         ret = NULL;
972         for_each_netdev(net, dev) {
973                 if (((dev->flags ^ if_flags) & mask) == 0) {
974                         ret = dev;
975                         break;
976                 }
977         }
978         return ret;
979 }
980 EXPORT_SYMBOL(__dev_get_by_flags);
981
982 /**
983  *      dev_valid_name - check if name is okay for network device
984  *      @name: name string
985  *
986  *      Network device names need to be valid file names
987  *      to allow sysfs to work.  We also disallow any kind of
988  *      whitespace.
989  */
990 bool dev_valid_name(const char *name)
991 {
992         if (*name == '\0')
993                 return false;
994         if (strlen(name) >= IFNAMSIZ)
995                 return false;
996         if (!strcmp(name, ".") || !strcmp(name, ".."))
997                 return false;
998
999         while (*name) {
1000                 if (*name == '/' || *name == ':' || isspace(*name))
1001                         return false;
1002                 name++;
1003         }
1004         return true;
1005 }
1006 EXPORT_SYMBOL(dev_valid_name);
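/*
 * Editorial note: under the rules above, "eth0" and "wlan-5" are valid,
 * while "", ".", "..", "eth 0", "a/b", "x:y" and any name of IFNAMSIZ
 * (16) or more characters are rejected.
 */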
1007
1008 /**
1009  *      __dev_alloc_name - allocate a name for a device
1010  *      @net: network namespace to allocate the device name in
1011  *      @name: name format string
1012  *      @buf:  scratch buffer and result name string
1013  *
1014  *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
1015  *      id. It scans the list of devices to build up a free map, then chooses
1016  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1017  *      while allocating the name and adding the device in order to avoid
1018  *      duplicates.
1019  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1020  *      Returns the number of the unit assigned or a negative errno code.
1021  */
1022
1023 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1024 {
1025         int i = 0;
1026         const char *p;
1027         const int max_netdevices = 8*PAGE_SIZE;
1028         unsigned long *inuse;
1029         struct net_device *d;
1030
1031         p = strnchr(name, IFNAMSIZ-1, '%');
1032         if (p) {
1033                 /*
1034                  * Verify the string as this thing may have come from
1035                  * the user.  There must be exactly one "%d" and no other "%"
1036                  * characters.
1037                  */
1038                 if (p[1] != 'd' || strchr(p + 2, '%'))
1039                         return -EINVAL;
1040
1041                 /* Use one page as a bit array of possible slots */
1042                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1043                 if (!inuse)
1044                         return -ENOMEM;
1045
1046                 for_each_netdev(net, d) {
1047                         if (!sscanf(d->name, name, &i))
1048                                 continue;
1049                         if (i < 0 || i >= max_netdevices)
1050                                 continue;
1051
1052                         /*  avoid cases where sscanf is not exact inverse of printf */
1053                         snprintf(buf, IFNAMSIZ, name, i);
1054                         if (!strncmp(buf, d->name, IFNAMSIZ))
1055                                 set_bit(i, inuse);
1056                 }
1057
1058                 i = find_first_zero_bit(inuse, max_netdevices);
1059                 free_page((unsigned long) inuse);
1060         }
1061
1062         if (buf != name)
1063                 snprintf(buf, IFNAMSIZ, name, i);
1064         if (!__dev_get_by_name(net, buf))
1065                 return i;
1066
1067         /* It is possible to run out of possible slots
1068          * when the name is long and there isn't enough space left
1069          * for the digits, or if all bits are used.
1070          */
1071         return -ENFILE;
1072 }
1073
1074 /**
1075  *      dev_alloc_name - allocate a name for a device
1076  *      @dev: device
1077  *      @name: name format string
1078  *
1079  *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
1080  *      id. It scans the list of devices to build up a free map, then chooses
1081  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1082  *      while allocating the name and adding the device in order to avoid
1083  *      duplicates.
1084  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1085  *      Returns the number of the unit assigned or a negative errno code.
1086  */
1087
1088 int dev_alloc_name(struct net_device *dev, const char *name)
1089 {
1090         char buf[IFNAMSIZ];
1091         struct net *net;
1092         int ret;
1093
1094         BUG_ON(!dev_net(dev));
1095         net = dev_net(dev);
1096         ret = __dev_alloc_name(net, name, buf);
1097         if (ret >= 0)
1098                 strlcpy(dev->name, buf, IFNAMSIZ);
1099         return ret;
1100 }
1101 EXPORT_SYMBOL(dev_alloc_name);
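/*
 * Editorial example: a driver registering devices named "foo%d" would
 * typically pick the unit number before registration, e.g.
 *
 *      err = dev_alloc_name(dev, "foo%d");     (chooses foo0, foo1, ...)
 *      if (err < 0)
 *              goto fail;
 *
 * register_netdevice() performs the same expansion internally via
 * dev_get_valid_name(), so many drivers simply pass the format string in
 * dev->name.  "foo%d" and the error label are illustrative.
 */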
1102
1103 static int dev_alloc_name_ns(struct net *net,
1104                              struct net_device *dev,
1105                              const char *name)
1106 {
1107         char buf[IFNAMSIZ];
1108         int ret;
1109
1110         ret = __dev_alloc_name(net, name, buf);
1111         if (ret >= 0)
1112                 strlcpy(dev->name, buf, IFNAMSIZ);
1113         return ret;
1114 }
1115
1116 static int dev_get_valid_name(struct net *net,
1117                               struct net_device *dev,
1118                               const char *name)
1119 {
1120         BUG_ON(!net);
1121
1122         if (!dev_valid_name(name))
1123                 return -EINVAL;
1124
1125         if (strchr(name, '%'))
1126                 return dev_alloc_name_ns(net, dev, name);
1127         else if (__dev_get_by_name(net, name))
1128                 return -EEXIST;
1129         else if (dev->name != name)
1130                 strlcpy(dev->name, name, IFNAMSIZ);
1131
1132         return 0;
1133 }
1134
1135 /**
1136  *      dev_change_name - change name of a device
1137  *      @dev: device
1138  *      @newname: name (or format string) must be at least IFNAMSIZ
1139  *
1140  *      Change the name of a device; a format string such as "eth%d"
1141  *      can be passed for wildcarding.
1142  */
1143 int dev_change_name(struct net_device *dev, const char *newname)
1144 {
1145         unsigned char old_assign_type;
1146         char oldname[IFNAMSIZ];
1147         int err = 0;
1148         int ret;
1149         struct net *net;
1150
1151         ASSERT_RTNL();
1152         BUG_ON(!dev_net(dev));
1153
1154         net = dev_net(dev);
1155         if (dev->flags & IFF_UP)
1156                 return -EBUSY;
1157
1158         mutex_lock(&devnet_rename_mutex);
1159         __raw_write_seqcount_begin(&devnet_rename_seq);
1160
1161         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1162                 goto outunlock;
1163
1164         memcpy(oldname, dev->name, IFNAMSIZ);
1165
1166         err = dev_get_valid_name(net, dev, newname);
1167         if (err < 0)
1168                 goto outunlock;
1169
1170         if (oldname[0] && !strchr(oldname, '%'))
1171                 netdev_info(dev, "renamed from %s\n", oldname);
1172
1173         old_assign_type = dev->name_assign_type;
1174         dev->name_assign_type = NET_NAME_RENAMED;
1175
1176 rollback:
1177         ret = device_rename(&dev->dev, dev->name);
1178         if (ret) {
1179                 memcpy(dev->name, oldname, IFNAMSIZ);
1180                 dev->name_assign_type = old_assign_type;
1181                 err = ret;
1182                 goto outunlock;
1183         }
1184
1185         __raw_write_seqcount_end(&devnet_rename_seq);
1186         mutex_unlock(&devnet_rename_mutex);
1187
1188         netdev_adjacent_rename_links(dev, oldname);
1189
1190         write_lock_bh(&dev_base_lock);
1191         hlist_del_rcu(&dev->name_hlist);
1192         write_unlock_bh(&dev_base_lock);
1193
1194         synchronize_rcu();
1195
1196         write_lock_bh(&dev_base_lock);
1197         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1198         write_unlock_bh(&dev_base_lock);
1199
1200         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1201         ret = notifier_to_errno(ret);
1202
1203         if (ret) {
1204                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1205                 if (err >= 0) {
1206                         err = ret;
1207                         mutex_lock(&devnet_rename_mutex);
1208                         __raw_write_seqcount_begin(&devnet_rename_seq);
1209                         memcpy(dev->name, oldname, IFNAMSIZ);
1210                         memcpy(oldname, newname, IFNAMSIZ);
1211                         dev->name_assign_type = old_assign_type;
1212                         old_assign_type = NET_NAME_RENAMED;
1213                         goto rollback;
1214                 } else {
1215                         pr_err("%s: name change rollback failed: %d\n",
1216                                dev->name, ret);
1217                 }
1218         }
1219
1220         return err;
1221
1222 outunlock:
1223         __raw_write_seqcount_end(&devnet_rename_seq);
1224         mutex_unlock(&devnet_rename_mutex);
1225         return err;
1226 }
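/*
 * Editorial example: dev_change_name() must run under RTNL, e.g.
 *
 *      rtnl_lock();
 *      err = dev_change_name(dev, "uplink%d");
 *      rtnl_unlock();
 *
 * A "%d" in the new name is expanded exactly as in dev_alloc_name();
 * "uplink%d" is an illustrative name.
 */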
1227
1228 /**
1229  *      dev_set_alias - change ifalias of a device
1230  *      @dev: device
1231  *      @alias: name up to IFALIASZ
1232  *      @len: limit of bytes to copy from @alias
1233  *
1234  *      Set the ifalias for a device.
1235  */
1236 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1237 {
1238         char *new_ifalias;
1239
1240         ASSERT_RTNL();
1241
1242         if (len >= IFALIASZ)
1243                 return -EINVAL;
1244
1245         if (!len) {
1246                 kfree(dev->ifalias);
1247                 dev->ifalias = NULL;
1248                 return 0;
1249         }
1250
1251         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1252         if (!new_ifalias)
1253                 return -ENOMEM;
1254         dev->ifalias = new_ifalias;
1255
1256         strlcpy(dev->ifalias, alias, len+1);
1257         return len;
1258 }
1259
1260
1261 /**
1262  *      netdev_features_change - device changes features
1263  *      @dev: device to cause notification
1264  *
1265  *      Called to indicate a device has changed features.
1266  */
1267 void netdev_features_change(struct net_device *dev)
1268 {
1269         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1270 }
1271 EXPORT_SYMBOL(netdev_features_change);
1272
1273 /**
1274  *      netdev_state_change - device changes state
1275  *      @dev: device to cause notification
1276  *
1277  *      Called to indicate a device has changed state. This function calls
1278  *      the notifier chains for netdev_chain and sends a NEWLINK message
1279  *      to the routing socket.
1280  */
1281 void netdev_state_change(struct net_device *dev)
1282 {
1283         if (dev->flags & IFF_UP) {
1284                 struct netdev_notifier_change_info change_info;
1285
1286                 change_info.flags_changed = 0;
1287                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1288                                               &change_info.info);
1289                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1290         }
1291 }
1292 EXPORT_SYMBOL(netdev_state_change);
1293
1294 /**
1295  *      netdev_notify_peers - notify network peers about existence of @dev
1296  *      @dev: network device
1297  *
1298  * Generate traffic such that interested network peers are aware of
1299  * @dev, such as by generating a gratuitous ARP. This may be used when
1300  * a device wants to inform the rest of the network about some sort of
1301  * reconfiguration such as a failover event or virtual machine
1302  * migration.
1303  */
1304 void netdev_notify_peers(struct net_device *dev)
1305 {
1306         rtnl_lock();
1307         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1308         rtnl_unlock();
1309 }
1310 EXPORT_SYMBOL(netdev_notify_peers);
1311
1312 static int __dev_open(struct net_device *dev)
1313 {
1314         const struct net_device_ops *ops = dev->netdev_ops;
1315         int ret;
1316
1317         ASSERT_RTNL();
1318
1319         if (!netif_device_present(dev))
1320                 return -ENODEV;
1321
1322         /* Block netpoll from trying to do any rx path servicing.
1323          * If we don't do this there is a chance ndo_poll_controller
1324          * or ndo_poll may be running while we open the device
1325          */
1326         netpoll_poll_disable(dev);
1327
1328         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1329         ret = notifier_to_errno(ret);
1330         if (ret)
1331                 return ret;
1332
1333         set_bit(__LINK_STATE_START, &dev->state);
1334
1335         if (ops->ndo_validate_addr)
1336                 ret = ops->ndo_validate_addr(dev);
1337
1338         if (!ret && ops->ndo_open)
1339                 ret = ops->ndo_open(dev);
1340
1341         netpoll_poll_enable(dev);
1342
1343         if (ret)
1344                 clear_bit(__LINK_STATE_START, &dev->state);
1345         else {
1346                 dev->flags |= IFF_UP;
1347                 dev_set_rx_mode(dev);
1348                 dev_activate(dev);
1349                 add_device_randomness(dev->dev_addr, dev->addr_len);
1350         }
1351
1352         return ret;
1353 }
1354
1355 /**
1356  *      dev_open        - prepare an interface for use.
1357  *      @dev:   device to open
1358  *
1359  *      Takes a device from down to up state. The device's private open
1360  *      function is invoked and then the multicast lists are loaded. Finally
1361  *      the device is moved into the up state and a %NETDEV_UP message is
1362  *      sent to the netdev notifier chain.
1363  *
1364  *      Calling this function on an active interface is a nop. On a failure
1365  *      a negative errno code is returned.
1366  */
1367 int dev_open(struct net_device *dev)
1368 {
1369         int ret;
1370
1371         if (dev->flags & IFF_UP)
1372                 return 0;
1373
1374         ret = __dev_open(dev);
1375         if (ret < 0)
1376                 return ret;
1377
1378         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1379         call_netdevice_notifiers(NETDEV_UP, dev);
1380
1381         return ret;
1382 }
1383 EXPORT_SYMBOL(dev_open);
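/*
 * Editorial example (hypothetical helper): bringing an interface up from
 * kernel code.  dev_open() must be called with the RTNL semaphore held.
 */
static int example_bring_up(struct net *net, const char *name)
{
        struct net_device *dev;
        int err;

        rtnl_lock();
        dev = __dev_get_by_name(net, name);
        err = dev ? dev_open(dev) : -ENODEV;
        rtnl_unlock();

        return err;
}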
1384
1385 static int __dev_close_many(struct list_head *head)
1386 {
1387         struct net_device *dev;
1388
1389         ASSERT_RTNL();
1390         might_sleep();
1391
1392         list_for_each_entry(dev, head, close_list) {
1393                 /* Temporarily disable netpoll until the interface is down */
1394                 netpoll_poll_disable(dev);
1395
1396                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1397
1398                 clear_bit(__LINK_STATE_START, &dev->state);
1399
1400                 /* Synchronize to the scheduled poll. We cannot touch the poll
1401                  * list; it may even be on a different CPU, so just clear netif_running().
1402                  *
1403                  * dev->stop() will invoke napi_disable() on all of its
1404                  * napi_struct instances on this device.
1405                  */
1406                 smp_mb__after_atomic(); /* Commit netif_running(). */
1407         }
1408
1409         dev_deactivate_many(head);
1410
1411         list_for_each_entry(dev, head, close_list) {
1412                 const struct net_device_ops *ops = dev->netdev_ops;
1413
1414                 /*
1415                  *      Call the device-specific close. This cannot fail and is
1416                  *      only done while the device is UP.
1417                  *
1418                  *      We allow it to be called even after a DETACH hot-plug
1419                  *      event.
1420                  */
1421                 if (ops->ndo_stop)
1422                         ops->ndo_stop(dev);
1423
1424                 dev->flags &= ~IFF_UP;
1425                 netpoll_poll_enable(dev);
1426         }
1427
1428         return 0;
1429 }
1430
1431 static int __dev_close(struct net_device *dev)
1432 {
1433         int retval;
1434         LIST_HEAD(single);
1435
1436         list_add(&dev->close_list, &single);
1437         retval = __dev_close_many(&single);
1438         list_del(&single);
1439
1440         return retval;
1441 }
1442
1443 int dev_close_many(struct list_head *head, bool unlink)
1444 {
1445         struct net_device *dev, *tmp;
1446
1447         /* Remove the devices that don't need to be closed */
1448         list_for_each_entry_safe(dev, tmp, head, close_list)
1449                 if (!(dev->flags & IFF_UP))
1450                         list_del_init(&dev->close_list);
1451
1452         __dev_close_many(head);
1453
1454         list_for_each_entry_safe(dev, tmp, head, close_list) {
1455                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1456                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1457                 if (unlink)
1458                         list_del_init(&dev->close_list);
1459         }
1460
1461         return 0;
1462 }
1463 EXPORT_SYMBOL(dev_close_many);
1464
1465 /**
1466  *      dev_close - shutdown an interface.
1467  *      @dev: device to shutdown
1468  *
1469  *      This function moves an active device into down state. A
1470  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1471  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1472  *      chain.
1473  */
1474 int dev_close(struct net_device *dev)
1475 {
1476         if (dev->flags & IFF_UP) {
1477                 LIST_HEAD(single);
1478
1479                 list_add(&dev->close_list, &single);
1480                 dev_close_many(&single, true);
1481                 list_del(&single);
1482         }
1483         return 0;
1484 }
1485 EXPORT_SYMBOL(dev_close);
1486
1487
1488 /**
1489  *      dev_disable_lro - disable Large Receive Offload on a device
1490  *      @dev: device
1491  *
1492  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1493  *      called under RTNL.  This is needed if received packets may be
1494  *      forwarded to another interface.
1495  */
1496 void dev_disable_lro(struct net_device *dev)
1497 {
1498         struct net_device *lower_dev;
1499         struct list_head *iter;
1500
1501         dev->wanted_features &= ~NETIF_F_LRO;
1502         netdev_update_features(dev);
1503
1504         if (unlikely(dev->features & NETIF_F_LRO))
1505                 netdev_WARN(dev, "failed to disable LRO!\n");
1506
1507         netdev_for_each_lower_dev(dev, lower_dev, iter)
1508                 dev_disable_lro(lower_dev);
1509 }
1510 EXPORT_SYMBOL(dev_disable_lro);
1511
1512 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1513                                    struct net_device *dev)
1514 {
1515         struct netdev_notifier_info info;
1516
1517         netdev_notifier_info_init(&info, dev);
1518         return nb->notifier_call(nb, val, &info);
1519 }
1520
1521 static int dev_boot_phase = 1;
1522
1523 /**
1524  *      register_netdevice_notifier - register a network notifier block
1525  *      @nb: notifier
1526  *
1527  *      Register a notifier to be called when network device events occur.
1528  *      The notifier passed is linked into the kernel structures and must
1529  *      not be reused until it has been unregistered. A negative errno code
1530  *      is returned on a failure.
1531  *
1532  *      When registered, all registration and up events are replayed
1533  *      to the new notifier to allow the notifier to have a race-free
1534  *      view of the network device list.
1535  */
1536
1537 int register_netdevice_notifier(struct notifier_block *nb)
1538 {
1539         struct net_device *dev;
1540         struct net_device *last;
1541         struct net *net;
1542         int err;
1543
1544         rtnl_lock();
1545         err = raw_notifier_chain_register(&netdev_chain, nb);
1546         if (err)
1547                 goto unlock;
1548         if (dev_boot_phase)
1549                 goto unlock;
1550         for_each_net(net) {
1551                 for_each_netdev(net, dev) {
1552                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1553                         err = notifier_to_errno(err);
1554                         if (err)
1555                                 goto rollback;
1556
1557                         if (!(dev->flags & IFF_UP))
1558                                 continue;
1559
1560                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1561                 }
1562         }
1563
1564 unlock:
1565         rtnl_unlock();
1566         return err;
1567
1568 rollback:
1569         last = dev;
1570         for_each_net(net) {
1571                 for_each_netdev(net, dev) {
1572                         if (dev == last)
1573                                 goto outroll;
1574
1575                         if (dev->flags & IFF_UP) {
1576                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1577                                                         dev);
1578                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1579                         }
1580                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1581                 }
1582         }
1583
1584 outroll:
1585         raw_notifier_chain_unregister(&netdev_chain, nb);
1586         goto unlock;
1587 }
1588 EXPORT_SYMBOL(register_netdevice_notifier);
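
/* Usage sketch (illustrative, not part of the original source): a minimal
 * notifier that logs NETDEV_UP events.  The example_* names are hypothetical;
 * a module would call register_netdevice_notifier() from its init path and
 * unregister_netdevice_notifier() from its exit path.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UP)
		pr_info("%s is up\n", dev->name);
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};
/* module init: register_netdevice_notifier(&example_netdev_nb);   */
/* module exit: unregister_netdevice_notifier(&example_netdev_nb); */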
1589
1590 /**
1591  *      unregister_netdevice_notifier - unregister a network notifier block
1592  *      @nb: notifier
1593  *
1594  *      Unregister a notifier previously registered by
1595  *      register_netdevice_notifier(). The notifier is unlinked from the
1596  *      kernel structures and may then be reused. A negative errno code
1597  *      is returned on a failure.
1598  *
1599  *      After unregistering, unregister and down device events are synthesized
1600  *      for all devices on the device list and sent to the removed notifier,
1601  *      removing the need for special-case cleanup code.
1602  */
1603
1604 int unregister_netdevice_notifier(struct notifier_block *nb)
1605 {
1606         struct net_device *dev;
1607         struct net *net;
1608         int err;
1609
1610         rtnl_lock();
1611         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1612         if (err)
1613                 goto unlock;
1614
1615         for_each_net(net) {
1616                 for_each_netdev(net, dev) {
1617                         if (dev->flags & IFF_UP) {
1618                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1619                                                         dev);
1620                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1621                         }
1622                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1623                 }
1624         }
1625 unlock:
1626         rtnl_unlock();
1627         return err;
1628 }
1629 EXPORT_SYMBOL(unregister_netdevice_notifier);
1630
1631 /**
1632  *      call_netdevice_notifiers_info - call all network notifier blocks
1633  *      @val: value passed unmodified to notifier function
1634  *      @dev: net_device pointer passed unmodified to notifier function
1635  *      @info: notifier information data
1636  *
1637  *      Call all network notifier blocks.  Parameters and return value
1638  *      are as for raw_notifier_call_chain().
1639  */
1640
1641 static int call_netdevice_notifiers_info(unsigned long val,
1642                                          struct net_device *dev,
1643                                          struct netdev_notifier_info *info)
1644 {
1645         ASSERT_RTNL();
1646         netdev_notifier_info_init(info, dev);
1647         return raw_notifier_call_chain(&netdev_chain, val, info);
1648 }
1649
1650 /**
1651  *      call_netdevice_notifiers - call all network notifier blocks
1652  *      @val: value passed unmodified to notifier function
1653  *      @dev: net_device pointer passed unmodified to notifier function
1654  *
1655  *      Call all network notifier blocks.  Parameters and return value
1656  *      are as for raw_notifier_call_chain().
1657  */
1658
1659 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1660 {
1661         struct netdev_notifier_info info;
1662
1663         return call_netdevice_notifiers_info(val, dev, &info);
1664 }
1665 EXPORT_SYMBOL(call_netdevice_notifiers);
1666
1667 #ifdef CONFIG_NET_INGRESS
1668 static struct static_key ingress_needed __read_mostly;
1669
1670 void net_inc_ingress_queue(void)
1671 {
1672         static_key_slow_inc(&ingress_needed);
1673 }
1674 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1675
1676 void net_dec_ingress_queue(void)
1677 {
1678         static_key_slow_dec(&ingress_needed);
1679 }
1680 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1681 #endif
1682
1683 static struct static_key netstamp_needed __read_mostly;
1684 #ifdef HAVE_JUMP_LABEL
1685 /* We are not allowed to call static_key_slow_dec() from irq context.
1686  * If net_disable_timestamp() is called from irq context, defer the
1687  * static_key_slow_dec() calls.
1688  */
1689 static atomic_t netstamp_needed_deferred;
1690 #endif
1691
1692 void net_enable_timestamp(void)
1693 {
1694 #ifdef HAVE_JUMP_LABEL
1695         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1696
1697         if (deferred) {
1698                 while (--deferred)
1699                         static_key_slow_dec(&netstamp_needed);
1700                 return;
1701         }
1702 #endif
1703         static_key_slow_inc(&netstamp_needed);
1704 }
1705 EXPORT_SYMBOL(net_enable_timestamp);
1706
1707 void net_disable_timestamp(void)
1708 {
1709 #ifdef HAVE_JUMP_LABEL
1710         if (in_interrupt()) {
1711                 atomic_inc(&netstamp_needed_deferred);
1712                 return;
1713         }
1714 #endif
1715         static_key_slow_dec(&netstamp_needed);
1716 }
1717 EXPORT_SYMBOL(net_disable_timestamp);
1718
1719 static inline void net_timestamp_set(struct sk_buff *skb)
1720 {
1721         skb->tstamp.tv64 = 0;
1722         if (static_key_false(&netstamp_needed))
1723                 __net_timestamp(skb);
1724 }
1725
1726 #define net_timestamp_check(COND, SKB)                  \
1727         if (static_key_false(&netstamp_needed)) {               \
1728                 if ((COND) && !(SKB)->tstamp.tv64)      \
1729                         __net_timestamp(SKB);           \
1730         }                                               \
1731
1732 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1733 {
1734         unsigned int len;
1735
1736         if (!(dev->flags & IFF_UP))
1737                 return false;
1738
1739         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1740         if (skb->len <= len)
1741                 return true;
1742
1743         /* if TSO is enabled, we don't care about the length as the packet
1744          * could be forwarded without being segmented first
1745          */
1746         if (skb_is_gso(skb))
1747                 return true;
1748
1749         return false;
1750 }
1751 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1752
1753 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1754 {
1755         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1756             unlikely(!is_skb_forwardable(dev, skb))) {
1757                 atomic_long_inc(&dev->rx_dropped);
1758                 kfree_skb(skb);
1759                 return NET_RX_DROP;
1760         }
1761
1762         skb_scrub_packet(skb, true);
1763         skb->priority = 0;
1764         skb->protocol = eth_type_trans(skb, dev);
1765         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1766
1767         return 0;
1768 }
1769 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1770
1771 /**
1772  * dev_forward_skb - loopback an skb to another netif
1773  *
1774  * @dev: destination network device
1775  * @skb: buffer to forward
1776  *
1777  * return values:
1778  *      NET_RX_SUCCESS  (no congestion)
1779  *      NET_RX_DROP     (packet was dropped, but freed)
1780  *
1781  * dev_forward_skb can be used for injecting an skb from the
1782  * start_xmit function of one device into the receive queue
1783  * of another device.
1784  *
1785  * The receiving device may be in another namespace, so
1786  * we have to clear all information in the skb that could
1787  * impact namespace isolation.
1788  */
1789 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1790 {
1791         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1792 }
1793 EXPORT_SYMBOL_GPL(dev_forward_skb);
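
/* Usage sketch (illustrative, not part of the original source): a veth-style
 * driver looping a transmitted skb into the receive path of its peer device.
 * "peer" is a hypothetical pointer owned by the calling driver.
 */
static netdev_tx_t example_xmit_to_peer(struct sk_buff *skb,
					struct net_device *peer)
{
	/* dev_forward_skb() always consumes the skb, even on NET_RX_DROP */
	dev_forward_skb(peer, skb);
	return NETDEV_TX_OK;
}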
1794
1795 static inline int deliver_skb(struct sk_buff *skb,
1796                               struct packet_type *pt_prev,
1797                               struct net_device *orig_dev)
1798 {
1799         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1800                 return -ENOMEM;
1801         atomic_inc(&skb->users);
1802         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1803 }
1804
1805 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1806                                           struct packet_type **pt,
1807                                           struct net_device *orig_dev,
1808                                           __be16 type,
1809                                           struct list_head *ptype_list)
1810 {
1811         struct packet_type *ptype, *pt_prev = *pt;
1812
1813         list_for_each_entry_rcu(ptype, ptype_list, list) {
1814                 if (ptype->type != type)
1815                         continue;
1816                 if (pt_prev)
1817                         deliver_skb(skb, pt_prev, orig_dev);
1818                 pt_prev = ptype;
1819         }
1820         *pt = pt_prev;
1821 }
1822
1823 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1824 {
1825         if (!ptype->af_packet_priv || !skb->sk)
1826                 return false;
1827
1828         if (ptype->id_match)
1829                 return ptype->id_match(ptype, skb->sk);
1830         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1831                 return true;
1832
1833         return false;
1834 }
1835
1836 /*
1837  *      Support routine. Sends outgoing frames to any network
1838  *      taps currently in use.
1839  */
1840
1841 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1842 {
1843         struct packet_type *ptype;
1844         struct sk_buff *skb2 = NULL;
1845         struct packet_type *pt_prev = NULL;
1846         struct list_head *ptype_list = &ptype_all;
1847
1848         rcu_read_lock();
1849 again:
1850         list_for_each_entry_rcu(ptype, ptype_list, list) {
1851                 /* Never send packets back to the socket
1852                  * they originated from - MvS (miquels@drinkel.ow.org)
1853                  */
1854                 if (skb_loop_sk(ptype, skb))
1855                         continue;
1856
1857                 if (pt_prev) {
1858                         deliver_skb(skb2, pt_prev, skb->dev);
1859                         pt_prev = ptype;
1860                         continue;
1861                 }
1862
1863                 /* need to clone skb, done only once */
1864                 skb2 = skb_clone(skb, GFP_ATOMIC);
1865                 if (!skb2)
1866                         goto out_unlock;
1867
1868                 net_timestamp_set(skb2);
1869
1870                 /* skb->nh should be correctly
1871                  * set by the sender, so the bounds check below is
1872                  * just protection against buggy protocols.
1873                  */
1874                 skb_reset_mac_header(skb2);
1875
1876                 if (skb_network_header(skb2) < skb2->data ||
1877                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1878                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1879                                              ntohs(skb2->protocol),
1880                                              dev->name);
1881                         skb_reset_network_header(skb2);
1882                 }
1883
1884                 skb2->transport_header = skb2->network_header;
1885                 skb2->pkt_type = PACKET_OUTGOING;
1886                 pt_prev = ptype;
1887         }
1888
1889         if (ptype_list == &ptype_all) {
1890                 ptype_list = &dev->ptype_all;
1891                 goto again;
1892         }
1893 out_unlock:
1894         if (pt_prev)
1895                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1896         rcu_read_unlock();
1897 }
1898
1899 /**
1900  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1901  * @dev: Network device
1902  * @txq: number of queues available
1903  *
1904  * If real_num_tx_queues is changed the tc mappings may no longer be
1905  * valid. To resolve this verify the tc mapping remains valid and if
1906  * not NULL the mapping. With no priorities mapping to this
1907  * not, zero the mapping. With no priorities mapping to this
1908  * offset/count pair it will no longer be used. In the worst case, if TC0
1909  * is invalid, nothing can be done, so disable priority mappings. It is
1910  * calling netif_set_real_num_tx_queues.
1911  */
1912 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1913 {
1914         int i;
1915         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1916
1917         /* If TC0 is invalidated disable TC mapping */
1918         if (tc->offset + tc->count > txq) {
1919                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1920                 dev->num_tc = 0;
1921                 return;
1922         }
1923
1924         /* Invalidated prio-to-tc mappings are reset to TC0 */
1925         for (i = 1; i < TC_BITMASK + 1; i++) {
1926                 int q = netdev_get_prio_tc_map(dev, i);
1927
1928                 tc = &dev->tc_to_txq[q];
1929                 if (tc->offset + tc->count > txq) {
1930                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1931                                 i, q);
1932                         netdev_set_prio_tc_map(dev, i, 0);
1933                 }
1934         }
1935 }
1936
1937 #ifdef CONFIG_XPS
1938 static DEFINE_MUTEX(xps_map_mutex);
1939 #define xmap_dereference(P)             \
1940         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1941
1942 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1943                                         int cpu, u16 index)
1944 {
1945         struct xps_map *map = NULL;
1946         int pos;
1947
1948         if (dev_maps)
1949                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1950
1951         for (pos = 0; map && pos < map->len; pos++) {
1952                 if (map->queues[pos] == index) {
1953                         if (map->len > 1) {
1954                                 map->queues[pos] = map->queues[--map->len];
1955                         } else {
1956                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1957                                 kfree_rcu(map, rcu);
1958                                 map = NULL;
1959                         }
1960                         break;
1961                 }
1962         }
1963
1964         return map;
1965 }
1966
1967 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1968 {
1969         struct xps_dev_maps *dev_maps;
1970         int cpu, i;
1971         bool active = false;
1972
1973         mutex_lock(&xps_map_mutex);
1974         dev_maps = xmap_dereference(dev->xps_maps);
1975
1976         if (!dev_maps)
1977                 goto out_no_maps;
1978
1979         for_each_possible_cpu(cpu) {
1980                 for (i = index; i < dev->num_tx_queues; i++) {
1981                         if (!remove_xps_queue(dev_maps, cpu, i))
1982                                 break;
1983                 }
1984                 if (i == dev->num_tx_queues)
1985                         active = true;
1986         }
1987
1988         if (!active) {
1989                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1990                 kfree_rcu(dev_maps, rcu);
1991         }
1992
1993         for (i = index; i < dev->num_tx_queues; i++)
1994                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1995                                              NUMA_NO_NODE);
1996
1997 out_no_maps:
1998         mutex_unlock(&xps_map_mutex);
1999 }
2000
2001 static struct xps_map *expand_xps_map(struct xps_map *map,
2002                                       int cpu, u16 index)
2003 {
2004         struct xps_map *new_map;
2005         int alloc_len = XPS_MIN_MAP_ALLOC;
2006         int i, pos;
2007
2008         for (pos = 0; map && pos < map->len; pos++) {
2009                 if (map->queues[pos] != index)
2010                         continue;
2011                 return map;
2012         }
2013
2014         /* Need to add queue to this CPU's existing map */
2015         if (map) {
2016                 if (pos < map->alloc_len)
2017                         return map;
2018
2019                 alloc_len = map->alloc_len * 2;
2020         }
2021
2022         /* Need to allocate a new map to store the queue in this CPU's map */
2023         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2024                                cpu_to_node(cpu));
2025         if (!new_map)
2026                 return NULL;
2027
2028         for (i = 0; i < pos; i++)
2029                 new_map->queues[i] = map->queues[i];
2030         new_map->alloc_len = alloc_len;
2031         new_map->len = pos;
2032
2033         return new_map;
2034 }
2035
2036 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2037                         u16 index)
2038 {
2039         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2040         struct xps_map *map, *new_map;
2041         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2042         int cpu, numa_node_id = -2;
2043         bool active = false;
2044
2045         mutex_lock(&xps_map_mutex);
2046
2047         dev_maps = xmap_dereference(dev->xps_maps);
2048
2049         /* allocate memory for queue storage */
2050         for_each_online_cpu(cpu) {
2051                 if (!cpumask_test_cpu(cpu, mask))
2052                         continue;
2053
2054                 if (!new_dev_maps)
2055                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2056                 if (!new_dev_maps) {
2057                         mutex_unlock(&xps_map_mutex);
2058                         return -ENOMEM;
2059                 }
2060
2061                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2062                                  NULL;
2063
2064                 map = expand_xps_map(map, cpu, index);
2065                 if (!map)
2066                         goto error;
2067
2068                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2069         }
2070
2071         if (!new_dev_maps)
2072                 goto out_no_new_maps;
2073
2074         for_each_possible_cpu(cpu) {
2075                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2076                         /* add queue to CPU maps */
2077                         int pos = 0;
2078
2079                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2080                         while ((pos < map->len) && (map->queues[pos] != index))
2081                                 pos++;
2082
2083                         if (pos == map->len)
2084                                 map->queues[map->len++] = index;
2085 #ifdef CONFIG_NUMA
2086                         if (numa_node_id == -2)
2087                                 numa_node_id = cpu_to_node(cpu);
2088                         else if (numa_node_id != cpu_to_node(cpu))
2089                                 numa_node_id = -1;
2090 #endif
2091                 } else if (dev_maps) {
2092                         /* fill in the new device map from the old device map */
2093                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2094                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2095                 }
2096
2097         }
2098
2099         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2100
2101         /* Cleanup old maps */
2102         if (dev_maps) {
2103                 for_each_possible_cpu(cpu) {
2104                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2105                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2106                         if (map && map != new_map)
2107                                 kfree_rcu(map, rcu);
2108                 }
2109
2110                 kfree_rcu(dev_maps, rcu);
2111         }
2112
2113         dev_maps = new_dev_maps;
2114         active = true;
2115
2116 out_no_new_maps:
2117         /* update Tx queue numa node */
2118         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2119                                      (numa_node_id >= 0) ? numa_node_id :
2120                                      NUMA_NO_NODE);
2121
2122         if (!dev_maps)
2123                 goto out_no_maps;
2124
2125         /* removes queue from unused CPUs */
2126         for_each_possible_cpu(cpu) {
2127                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2128                         continue;
2129
2130                 if (remove_xps_queue(dev_maps, cpu, index))
2131                         active = true;
2132         }
2133
2134         /* free map if not active */
2135         if (!active) {
2136                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2137                 kfree_rcu(dev_maps, rcu);
2138         }
2139
2140 out_no_maps:
2141         mutex_unlock(&xps_map_mutex);
2142
2143         return 0;
2144 error:
2145         /* remove any maps that we added */
2146         for_each_possible_cpu(cpu) {
2147                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2148                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2149                                  NULL;
2150                 if (new_map && new_map != map)
2151                         kfree(new_map);
2152         }
2153
2154         mutex_unlock(&xps_map_mutex);
2155
2156         kfree(new_dev_maps);
2157         return -ENOMEM;
2158 }
2159 EXPORT_SYMBOL(netif_set_xps_queue);
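
/* Usage sketch (illustrative, not part of the original source): pin Tx queue
 * "qid" of a multiqueue device to a single CPU, as a driver might do after
 * allocating its Tx rings.  The example_* name is hypothetical.
 */
static int example_pin_tx_queue(struct net_device *dev, u16 qid, int cpu)
{
	cpumask_var_t mask;
	int err;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	cpumask_set_cpu(cpu, mask);
	err = netif_set_xps_queue(dev, mask, qid);
	free_cpumask_var(mask);
	return err;
}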
2160
2161 #endif
2162 /*
2163  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2164  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2165  */
2166 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2167 {
2168         int rc;
2169
2170         if (txq < 1 || txq > dev->num_tx_queues)
2171                 return -EINVAL;
2172
2173         if (dev->reg_state == NETREG_REGISTERED ||
2174             dev->reg_state == NETREG_UNREGISTERING) {
2175                 ASSERT_RTNL();
2176
2177                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2178                                                   txq);
2179                 if (rc)
2180                         return rc;
2181
2182                 if (dev->num_tc)
2183                         netif_setup_tc(dev, txq);
2184
2185                 if (txq < dev->real_num_tx_queues) {
2186                         qdisc_reset_all_tx_gt(dev, txq);
2187 #ifdef CONFIG_XPS
2188                         netif_reset_xps_queues_gt(dev, txq);
2189 #endif
2190                 }
2191         }
2192
2193         dev->real_num_tx_queues = txq;
2194         return 0;
2195 }
2196 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
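
/* Usage sketch (illustrative, not part of the original source): shrinking or
 * growing the number of Tx queues in use, e.g. on an ethtool channel change.
 * Once the device is registered this must run under RTNL.
 */
static int example_set_tx_channels(struct net_device *dev, unsigned int txq)
{
	ASSERT_RTNL();
	/* qdiscs and XPS maps of queues >= txq are reset on a shrink */
	return netif_set_real_num_tx_queues(dev, txq);
}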
2197
2198 #ifdef CONFIG_SYSFS
2199 /**
2200  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2201  *      @dev: Network device
2202  *      @rxq: Actual number of RX queues
2203  *
2204  *      This must be called either with the rtnl_lock held or before
2205  *      registration of the net device.  Returns 0 on success, or a
2206  *      negative error code.  If called before registration, it always
2207  *      succeeds.
2208  */
2209 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2210 {
2211         int rc;
2212
2213         if (rxq < 1 || rxq > dev->num_rx_queues)
2214                 return -EINVAL;
2215
2216         if (dev->reg_state == NETREG_REGISTERED) {
2217                 ASSERT_RTNL();
2218
2219                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2220                                                   rxq);
2221                 if (rc)
2222                         return rc;
2223         }
2224
2225         dev->real_num_rx_queues = rxq;
2226         return 0;
2227 }
2228 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2229 #endif
2230
2231 /**
2232  * netif_get_num_default_rss_queues - default number of RSS queues
2233  *
2234  * This routine should set an upper limit on the number of RSS queues
2235  * used by default by multiqueue devices.
2236  */
2237 int netif_get_num_default_rss_queues(void)
2238 {
2239         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2240 }
2241 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
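
/* Usage sketch (illustrative, not part of the original source): size the Rx
 * side from the default RSS upper bound and publish the count before (or,
 * under RTNL, after) registration.  "max_hw_rings" is a hypothetical hardware
 * limit supplied by the driver.
 */
static int example_setup_rx_queues(struct net_device *dev,
				   unsigned int max_hw_rings)
{
	unsigned int rxq = min_t(unsigned int, max_hw_rings,
				 netif_get_num_default_rss_queues());

	return netif_set_real_num_rx_queues(dev, rxq);
}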
2242
2243 static inline void __netif_reschedule(struct Qdisc *q)
2244 {
2245         struct softnet_data *sd;
2246         unsigned long flags;
2247
2248         local_irq_save(flags);
2249         sd = this_cpu_ptr(&softnet_data);
2250         q->next_sched = NULL;
2251         *sd->output_queue_tailp = q;
2252         sd->output_queue_tailp = &q->next_sched;
2253         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2254         local_irq_restore(flags);
2255         preempt_check_resched_rt();
2256 }
2257
2258 void __netif_schedule(struct Qdisc *q)
2259 {
2260         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2261                 __netif_reschedule(q);
2262 }
2263 EXPORT_SYMBOL(__netif_schedule);
2264
2265 struct dev_kfree_skb_cb {
2266         enum skb_free_reason reason;
2267 };
2268
2269 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2270 {
2271         return (struct dev_kfree_skb_cb *)skb->cb;
2272 }
2273
2274 void netif_schedule_queue(struct netdev_queue *txq)
2275 {
2276         rcu_read_lock();
2277         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2278                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2279
2280                 __netif_schedule(q);
2281         }
2282         rcu_read_unlock();
2283 }
2284 EXPORT_SYMBOL(netif_schedule_queue);
2285
2286 /**
2287  *      netif_wake_subqueue - allow sending packets on subqueue
2288  *      @dev: network device
2289  *      @queue_index: sub queue index
2290  *
2291  * Resume individual transmit queue of a device with multiple transmit queues.
2292  */
2293 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2294 {
2295         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2296
2297         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2298                 struct Qdisc *q;
2299
2300                 rcu_read_lock();
2301                 q = rcu_dereference(txq->qdisc);
2302                 __netif_schedule(q);
2303                 rcu_read_unlock();
2304         }
2305 }
2306 EXPORT_SYMBOL(netif_wake_subqueue);
2307
2308 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2309 {
2310         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2311                 struct Qdisc *q;
2312
2313                 rcu_read_lock();
2314                 q = rcu_dereference(dev_queue->qdisc);
2315                 __netif_schedule(q);
2316                 rcu_read_unlock();
2317         }
2318 }
2319 EXPORT_SYMBOL(netif_tx_wake_queue);
2320
2321 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2322 {
2323         unsigned long flags;
2324
2325         if (likely(atomic_read(&skb->users) == 1)) {
2326                 smp_rmb();
2327                 atomic_set(&skb->users, 0);
2328         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2329                 return;
2330         }
2331         get_kfree_skb_cb(skb)->reason = reason;
2332         local_irq_save(flags);
2333         skb->next = __this_cpu_read(softnet_data.completion_queue);
2334         __this_cpu_write(softnet_data.completion_queue, skb);
2335         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2336         local_irq_restore(flags);
2337         preempt_check_resched_rt();
2338 }
2339 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2340
2341 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2342 {
2343         if (in_irq() || irqs_disabled())
2344                 __dev_kfree_skb_irq(skb, reason);
2345         else
2346                 dev_kfree_skb(skb);
2347 }
2348 EXPORT_SYMBOL(__dev_kfree_skb_any);
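
/* Usage sketch (illustrative, not part of the original source): freeing
 * completed Tx skbs from a context that may be hardirq.  dev_kfree_skb_any()
 * and dev_consume_skb_any() wrap __dev_kfree_skb_any() and pick the deferred
 * irq-safe free automatically.
 */
static void example_tx_clean_one(struct sk_buff *skb, bool was_sent)
{
	if (was_sent)
		dev_consume_skb_any(skb);	/* normal completion, not a drop */
	else
		dev_kfree_skb_any(skb);		/* traced/accounted as dropped */
}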
2349
2350
2351 /**
2352  * netif_device_detach - mark device as removed
2353  * @dev: network device
2354  *
2355  * Mark device as removed from system and therefore no longer available.
2356  */
2357 void netif_device_detach(struct net_device *dev)
2358 {
2359         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2360             netif_running(dev)) {
2361                 netif_tx_stop_all_queues(dev);
2362         }
2363 }
2364 EXPORT_SYMBOL(netif_device_detach);
2365
2366 /**
2367  * netif_device_attach - mark device as attached
2368  * @dev: network device
2369  *
2370  * Mark device as attached from system and restart if needed.
2371  */
2372 void netif_device_attach(struct net_device *dev)
2373 {
2374         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2375             netif_running(dev)) {
2376                 netif_tx_wake_all_queues(dev);
2377                 __netdev_watchdog_up(dev);
2378         }
2379 }
2380 EXPORT_SYMBOL(netif_device_attach);
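
/* Usage sketch (illustrative, not part of the original source): the usual
 * suspend/resume pairing in a driver.  The example_hw_* hooks are
 * hypothetical placeholders for real hardware power management.
 */
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all Tx queues if running */
	/* example_hw_power_down(dev); */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* example_hw_power_up(dev); */
	netif_device_attach(dev);	/* restarts queues and the watchdog */
	return 0;
}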
2381
2382 /*
2383  * Returns a Tx hash based on the given packet descriptor and a Tx queue
2384  * count to be used as a distribution range.
2385  */
2386 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2387                   unsigned int num_tx_queues)
2388 {
2389         u32 hash;
2390         u16 qoffset = 0;
2391         u16 qcount = num_tx_queues;
2392
2393         if (skb_rx_queue_recorded(skb)) {
2394                 hash = skb_get_rx_queue(skb);
2395                 while (unlikely(hash >= num_tx_queues))
2396                         hash -= num_tx_queues;
2397                 return hash;
2398         }
2399
2400         if (dev->num_tc) {
2401                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2402                 qoffset = dev->tc_to_txq[tc].offset;
2403                 qcount = dev->tc_to_txq[tc].count;
2404         }
2405
2406         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2407 }
2408 EXPORT_SYMBOL(__skb_tx_hash);
2409
2410 static void skb_warn_bad_offload(const struct sk_buff *skb)
2411 {
2412         static const netdev_features_t null_features = 0;
2413         struct net_device *dev = skb->dev;
2414         const char *name = "";
2415
2416         if (!net_ratelimit())
2417                 return;
2418
2419         if (dev) {
2420                 if (dev->dev.parent)
2421                         name = dev_driver_string(dev->dev.parent);
2422                 else
2423                         name = netdev_name(dev);
2424         }
2425         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2426              "gso_type=%d ip_summed=%d\n",
2427              name, dev ? &dev->features : &null_features,
2428              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2429              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2430              skb_shinfo(skb)->gso_type, skb->ip_summed);
2431 }
2432
2433 /*
2434  * Invalidate hardware checksum when packet is to be mangled, and
2435  * complete checksum manually on outgoing path.
2436  */
2437 int skb_checksum_help(struct sk_buff *skb)
2438 {
2439         __wsum csum;
2440         int ret = 0, offset;
2441
2442         if (skb->ip_summed == CHECKSUM_COMPLETE)
2443                 goto out_set_summed;
2444
2445         if (unlikely(skb_shinfo(skb)->gso_size)) {
2446                 skb_warn_bad_offload(skb);
2447                 return -EINVAL;
2448         }
2449
2450         /* Before computing a checksum, we should make sure no frag could
2451          * be modified by an external entity: the checksum could be wrong.
2452          */
2453         if (skb_has_shared_frag(skb)) {
2454                 ret = __skb_linearize(skb);
2455                 if (ret)
2456                         goto out;
2457         }
2458
2459         offset = skb_checksum_start_offset(skb);
2460         BUG_ON(offset >= skb_headlen(skb));
2461         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2462
2463         offset += skb->csum_offset;
2464         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2465
2466         if (skb_cloned(skb) &&
2467             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2468                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2469                 if (ret)
2470                         goto out;
2471         }
2472
2473         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2474 out_set_summed:
2475         skb->ip_summed = CHECKSUM_NONE;
2476 out:
2477         return ret;
2478 }
2479 EXPORT_SYMBOL(skb_checksum_help);
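
/* Usage sketch (illustrative, not part of the original source): a driver xmit
 * path falling back to software checksumming when the hardware cannot offload
 * this particular packet.  "hw_can_csum" is a hypothetical capability check.
 */
static int example_tx_checksum(struct sk_buff *skb, bool hw_can_csum)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
		return skb_checksum_help(skb);	/* fills in the checksum field */
	return 0;
}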
2480
2481 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2482 {
2483         __be16 type = skb->protocol;
2484
2485         /* Tunnel gso handlers can set protocol to ethernet. */
2486         if (type == htons(ETH_P_TEB)) {
2487                 struct ethhdr *eth;
2488
2489                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2490                         return 0;
2491
2492                 eth = (struct ethhdr *)skb_mac_header(skb);
2493                 type = eth->h_proto;
2494         }
2495
2496         return __vlan_get_protocol(skb, type, depth);
2497 }
2498
2499 /**
2500  *      skb_mac_gso_segment - mac layer segmentation handler.
2501  *      @skb: buffer to segment
2502  *      @features: features for the output path (see dev->features)
2503  */
2504 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2505                                     netdev_features_t features)
2506 {
2507         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2508         struct packet_offload *ptype;
2509         int vlan_depth = skb->mac_len;
2510         __be16 type = skb_network_protocol(skb, &vlan_depth);
2511
2512         if (unlikely(!type))
2513                 return ERR_PTR(-EINVAL);
2514
2515         __skb_pull(skb, vlan_depth);
2516
2517         rcu_read_lock();
2518         list_for_each_entry_rcu(ptype, &offload_base, list) {
2519                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2520                         segs = ptype->callbacks.gso_segment(skb, features);
2521                         break;
2522                 }
2523         }
2524         rcu_read_unlock();
2525
2526         __skb_push(skb, skb->data - skb_mac_header(skb));
2527
2528         return segs;
2529 }
2530 EXPORT_SYMBOL(skb_mac_gso_segment);
2531
2532
2533 /* openvswitch calls this on rx path, so we need a different check.
2534  */
2535 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2536 {
2537         if (tx_path)
2538                 return skb->ip_summed != CHECKSUM_PARTIAL;
2539         else
2540                 return skb->ip_summed == CHECKSUM_NONE;
2541 }
2542
2543 /**
2544  *      __skb_gso_segment - Perform segmentation on skb.
2545  *      @skb: buffer to segment
2546  *      @features: features for the output path (see dev->features)
2547  *      @tx_path: whether it is called in TX path
2548  *
2549  *      This function segments the given skb and returns a list of segments.
2550  *
2551  *      It may return NULL if the skb requires no segmentation.  This is
2552  *      only possible when GSO is used for verifying header integrity.
2553  *
2554  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2555  */
2556 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2557                                   netdev_features_t features, bool tx_path)
2558 {
2559         if (unlikely(skb_needs_check(skb, tx_path))) {
2560                 int err;
2561
2562                 skb_warn_bad_offload(skb);
2563
2564                 err = skb_cow_head(skb, 0);
2565                 if (err < 0)
2566                         return ERR_PTR(err);
2567         }
2568
2569         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2570                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2571
2572         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2573         SKB_GSO_CB(skb)->encap_level = 0;
2574
2575         skb_reset_mac_header(skb);
2576         skb_reset_mac_len(skb);
2577
2578         return skb_mac_gso_segment(skb, features);
2579 }
2580 EXPORT_SYMBOL(__skb_gso_segment);
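
/* Usage sketch (illustrative, not part of the original source): segmenting an
 * oversized skb and walking the resulting list, roughly what the Tx path does
 * via validate_xmit_skb().  "xmit_one" is a caller-supplied per-frame sender.
 */
static int example_segment_and_send(struct sk_buff *skb,
				    netdev_features_t features,
				    void (*xmit_one)(struct sk_buff *))
{
	struct sk_buff *segs, *next;

	segs = skb_gso_segment(skb, features);	/* tx_path wrapper of __skb_gso_segment */
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs) {
		xmit_one(skb);			/* no segmentation was needed */
		return 0;
	}

	consume_skb(skb);			/* original skb replaced by segs */
	for (; segs; segs = next) {
		next = segs->next;
		segs->next = NULL;
		xmit_one(segs);
	}
	return 0;
}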
2581
2582 /* Take action when hardware reception checksum errors are detected. */
2583 #ifdef CONFIG_BUG
2584 void netdev_rx_csum_fault(struct net_device *dev)
2585 {
2586         if (net_ratelimit()) {
2587                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2588                 dump_stack();
2589         }
2590 }
2591 EXPORT_SYMBOL(netdev_rx_csum_fault);
2592 #endif
2593
2594 /* Actually, we should eliminate this check as soon as we know that:
2595  * 1. An IOMMU is present and allows mapping all the memory.
2596  * 2. No high memory really exists on this machine.
2597  */
2598
2599 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2600 {
2601 #ifdef CONFIG_HIGHMEM
2602         int i;
2603         if (!(dev->features & NETIF_F_HIGHDMA)) {
2604                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2605                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2606                         if (PageHighMem(skb_frag_page(frag)))
2607                                 return 1;
2608                 }
2609         }
2610
2611         if (PCI_DMA_BUS_IS_PHYS) {
2612                 struct device *pdev = dev->dev.parent;
2613
2614                 if (!pdev)
2615                         return 0;
2616                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2617                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2618                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2619                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2620                                 return 1;
2621                 }
2622         }
2623 #endif
2624         return 0;
2625 }
2626
2627 /* For an MPLS offload request, verify we are testing hardware MPLS features
2628  * instead of the standard features for the netdev.
2629  */
2630 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2631 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2632                                            netdev_features_t features,
2633                                            __be16 type)
2634 {
2635         if (eth_p_mpls(type))
2636                 features &= skb->dev->mpls_features;
2637
2638         return features;
2639 }
2640 #else
2641 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2642                                            netdev_features_t features,
2643                                            __be16 type)
2644 {
2645         return features;
2646 }
2647 #endif
2648
2649 static netdev_features_t harmonize_features(struct sk_buff *skb,
2650         netdev_features_t features)
2651 {
2652         int tmp;
2653         __be16 type;
2654
2655         type = skb_network_protocol(skb, &tmp);
2656         features = net_mpls_features(skb, features, type);
2657
2658         if (skb->ip_summed != CHECKSUM_NONE &&
2659             !can_checksum_protocol(features, type)) {
2660                 features &= ~NETIF_F_ALL_CSUM;
2661         } else if (illegal_highdma(skb->dev, skb)) {
2662                 features &= ~NETIF_F_SG;
2663         }
2664
2665         return features;
2666 }
2667
2668 netdev_features_t passthru_features_check(struct sk_buff *skb,
2669                                           struct net_device *dev,
2670                                           netdev_features_t features)
2671 {
2672         return features;
2673 }
2674 EXPORT_SYMBOL(passthru_features_check);
2675
2676 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2677                                              struct net_device *dev,
2678                                              netdev_features_t features)
2679 {
2680         return vlan_features_check(skb, features);
2681 }
2682
2683 netdev_features_t netif_skb_features(struct sk_buff *skb)
2684 {
2685         struct net_device *dev = skb->dev;
2686         netdev_features_t features = dev->features;
2687         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2688
2689         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2690                 features &= ~NETIF_F_GSO_MASK;
2691
2692         /* For an encapsulation offload request, verify we are testing
2693          * hardware encapsulation features instead of the standard
2694          * features for the netdev.
2695          */
2696         if (skb->encapsulation)
2697                 features &= dev->hw_enc_features;
2698
2699         if (skb_vlan_tagged(skb))
2700                 features = netdev_intersect_features(features,
2701                                                      dev->vlan_features |
2702                                                      NETIF_F_HW_VLAN_CTAG_TX |
2703                                                      NETIF_F_HW_VLAN_STAG_TX);
2704
2705         if (dev->netdev_ops->ndo_features_check)
2706                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2707                                                                 features);
2708         else
2709                 features &= dflt_features_check(skb, dev, features);
2710
2711         return harmonize_features(skb, features);
2712 }
2713 EXPORT_SYMBOL(netif_skb_features);
2714
2715 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2716                     struct netdev_queue *txq, bool more)
2717 {
2718         unsigned int len;
2719         int rc;
2720
2721         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2722                 dev_queue_xmit_nit(skb, dev);
2723
2724         len = skb->len;
2725         trace_net_dev_start_xmit(skb, dev);
2726         rc = netdev_start_xmit(skb, dev, txq, more);
2727         trace_net_dev_xmit(skb, rc, dev, len);
2728
2729         return rc;
2730 }
2731
2732 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2733                                     struct netdev_queue *txq, int *ret)
2734 {
2735         struct sk_buff *skb = first;
2736         int rc = NETDEV_TX_OK;
2737
2738         while (skb) {
2739                 struct sk_buff *next = skb->next;
2740
2741                 skb->next = NULL;
2742                 rc = xmit_one(skb, dev, txq, next != NULL);
2743                 if (unlikely(!dev_xmit_complete(rc))) {
2744                         skb->next = next;
2745                         goto out;
2746                 }
2747
2748                 skb = next;
2749                 if (netif_xmit_stopped(txq) && skb) {
2750                         rc = NETDEV_TX_BUSY;
2751                         break;
2752                 }
2753         }
2754
2755 out:
2756         *ret = rc;
2757         return skb;
2758 }
2759
2760 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2761                                           netdev_features_t features)
2762 {
2763         if (skb_vlan_tag_present(skb) &&
2764             !vlan_hw_offload_capable(features, skb->vlan_proto))
2765                 skb = __vlan_hwaccel_push_inside(skb);
2766         return skb;
2767 }
2768
2769 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2770 {
2771         netdev_features_t features;
2772
2773         if (skb->next)
2774                 return skb;
2775
2776         features = netif_skb_features(skb);
2777         skb = validate_xmit_vlan(skb, features);
2778         if (unlikely(!skb))
2779                 goto out_null;
2780
2781         if (netif_needs_gso(skb, features)) {
2782                 struct sk_buff *segs;
2783
2784                 segs = skb_gso_segment(skb, features);
2785                 if (IS_ERR(segs)) {
2786                         goto out_kfree_skb;
2787                 } else if (segs) {
2788                         consume_skb(skb);
2789                         skb = segs;
2790                 }
2791         } else {
2792                 if (skb_needs_linearize(skb, features) &&
2793                     __skb_linearize(skb))
2794                         goto out_kfree_skb;
2795
2796                 /* If packet is not checksummed and device does not
2797                  * support checksumming for this protocol, complete
2798                  * checksumming here.
2799                  */
2800                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2801                         if (skb->encapsulation)
2802                                 skb_set_inner_transport_header(skb,
2803                                                                skb_checksum_start_offset(skb));
2804                         else
2805                                 skb_set_transport_header(skb,
2806                                                          skb_checksum_start_offset(skb));
2807                         if (!(features & NETIF_F_ALL_CSUM) &&
2808                             skb_checksum_help(skb))
2809                                 goto out_kfree_skb;
2810                 }
2811         }
2812
2813         return skb;
2814
2815 out_kfree_skb:
2816         kfree_skb(skb);
2817 out_null:
2818         return NULL;
2819 }
2820
2821 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2822 {
2823         struct sk_buff *next, *head = NULL, *tail;
2824
2825         for (; skb != NULL; skb = next) {
2826                 next = skb->next;
2827                 skb->next = NULL;
2828
2829                 /* in case skb won't be segmented, point to itself */
2830                 skb->prev = skb;
2831
2832                 skb = validate_xmit_skb(skb, dev);
2833                 if (!skb)
2834                         continue;
2835
2836                 if (!head)
2837                         head = skb;
2838                 else
2839                         tail->next = skb;
2840                 /* If skb was segmented, skb->prev points to
2841                  * the last segment. If not, it still contains skb.
2842                  */
2843                 tail = skb->prev;
2844         }
2845         return head;
2846 }
2847
2848 static void qdisc_pkt_len_init(struct sk_buff *skb)
2849 {
2850         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2851
2852         qdisc_skb_cb(skb)->pkt_len = skb->len;
2853
2854         /* To get more precise estimation of bytes sent on wire,
2855          * we add to pkt_len the headers size of all segments
2856          * we add to pkt_len the header size of all segments
2857         if (shinfo->gso_size)  {
2858                 unsigned int hdr_len;
2859                 u16 gso_segs = shinfo->gso_segs;
2860
2861                 /* mac layer + network layer */
2862                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2863
2864                 /* + transport layer */
2865                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2866                         hdr_len += tcp_hdrlen(skb);
2867                 else
2868                         hdr_len += sizeof(struct udphdr);
2869
2870                 if (shinfo->gso_type & SKB_GSO_DODGY)
2871                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2872                                                 shinfo->gso_size);
2873
2874                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2875         }
2876 }
2877
2878 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2879                                  struct net_device *dev,
2880                                  struct netdev_queue *txq)
2881 {
2882         spinlock_t *root_lock = qdisc_lock(q);
2883         bool contended;
2884         int rc;
2885
2886         qdisc_pkt_len_init(skb);
2887         qdisc_calculate_pkt_len(skb, q);
2888         /*
2889          * Heuristic to force contended enqueues to serialize on a
2890          * separate lock before trying to get qdisc main lock.
2891          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2892          * often and dequeue packets faster.
2893          */
2894 #ifdef CONFIG_PREEMPT_RT_FULL
2895         contended = true;
2896 #else
2897         contended = qdisc_is_running(q);
2898 #endif
2899         if (unlikely(contended))
2900                 spin_lock(&q->busylock);
2901
2902         spin_lock(root_lock);
2903         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2904                 kfree_skb(skb);
2905                 rc = NET_XMIT_DROP;
2906         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2907                    qdisc_run_begin(q)) {
2908                 /*
2909                  * This is a work-conserving queue; there are no old skbs
2910                  * waiting to be sent out; and the qdisc is not running -
2911                  * xmit the skb directly.
2912                  */
2913
2914                 qdisc_bstats_update(q, skb);
2915
2916                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2917                         if (unlikely(contended)) {
2918                                 spin_unlock(&q->busylock);
2919                                 contended = false;
2920                         }
2921                         __qdisc_run(q);
2922                 } else
2923                         qdisc_run_end(q);
2924
2925                 rc = NET_XMIT_SUCCESS;
2926         } else {
2927                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2928                 if (qdisc_run_begin(q)) {
2929                         if (unlikely(contended)) {
2930                                 spin_unlock(&q->busylock);
2931                                 contended = false;
2932                         }
2933                         __qdisc_run(q);
2934                 }
2935         }
2936         spin_unlock(root_lock);
2937         if (unlikely(contended))
2938                 spin_unlock(&q->busylock);
2939         return rc;
2940 }
2941
2942 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2943 static void skb_update_prio(struct sk_buff *skb)
2944 {
2945         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2946
2947         if (!skb->priority && skb->sk && map) {
2948                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2949
2950                 if (prioidx < map->priomap_len)
2951                         skb->priority = map->priomap[prioidx];
2952         }
2953 }
2954 #else
2955 #define skb_update_prio(skb)
2956 #endif
2957
2958 #ifdef CONFIG_PREEMPT_RT_FULL
2959
2960 static inline int xmit_rec_read(void)
2961 {
2962        return current->xmit_recursion;
2963 }
2964
2965 static inline void xmit_rec_inc(void)
2966 {
2967        current->xmit_recursion++;
2968 }
2969
2970 static inline void xmit_rec_dec(void)
2971 {
2972        current->xmit_recursion--;
2973 }
2974
2975 #else
2976
2977 DEFINE_PER_CPU(int, xmit_recursion);
2978 EXPORT_SYMBOL(xmit_recursion);
2979
2980 static inline int xmit_rec_read(void)
2981 {
2982         return __this_cpu_read(xmit_recursion);
2983 }
2984
2985 static inline void xmit_rec_inc(void)
2986 {
2987         __this_cpu_inc(xmit_recursion);
2988 }
2989
2990 static inline void xmit_rec_dec(void)
2991 {
2992         __this_cpu_dec(xmit_recursion);
2993 }
2994 #endif
2995
2996 #define RECURSION_LIMIT 10
2997
2998 /**
2999  *      dev_loopback_xmit - loop back @skb
3000  *      @net: network namespace this loopback is happening in
3001  *      @sk:  sk needed to be a netfilter okfn
3002  *      @skb: buffer to transmit
3003  */
3004 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3005 {
3006         skb_reset_mac_header(skb);
3007         __skb_pull(skb, skb_network_offset(skb));
3008         skb->pkt_type = PACKET_LOOPBACK;
3009         skb->ip_summed = CHECKSUM_UNNECESSARY;
3010         WARN_ON(!skb_dst(skb));
3011         skb_dst_force(skb);
3012         netif_rx_ni(skb);
3013         return 0;
3014 }
3015 EXPORT_SYMBOL(dev_loopback_xmit);
3016
3017 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3018 {
3019 #ifdef CONFIG_XPS
3020         struct xps_dev_maps *dev_maps;
3021         struct xps_map *map;
3022         int queue_index = -1;
3023
3024         rcu_read_lock();
3025         dev_maps = rcu_dereference(dev->xps_maps);
3026         if (dev_maps) {
3027                 map = rcu_dereference(
3028                     dev_maps->cpu_map[skb->sender_cpu - 1]);
3029                 if (map) {
3030                         if (map->len == 1)
3031                                 queue_index = map->queues[0];
3032                         else
3033                                 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3034                                                                            map->len)];
3035                         if (unlikely(queue_index >= dev->real_num_tx_queues))
3036                                 queue_index = -1;
3037                 }
3038         }
3039         rcu_read_unlock();
3040
3041         return queue_index;
3042 #else
3043         return -1;
3044 #endif
3045 }
3046
3047 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3048 {
3049         struct sock *sk = skb->sk;
3050         int queue_index = sk_tx_queue_get(sk);
3051
3052         if (queue_index < 0 || skb->ooo_okay ||
3053             queue_index >= dev->real_num_tx_queues) {
3054                 int new_index = get_xps_queue(dev, skb);
3055                 if (new_index < 0)
3056                         new_index = skb_tx_hash(dev, skb);
3057
3058                 if (queue_index != new_index && sk &&
3059                     sk_fullsock(sk) &&
3060                     rcu_access_pointer(sk->sk_dst_cache))
3061                         sk_tx_queue_set(sk, new_index);
3062
3063                 queue_index = new_index;
3064         }
3065
3066         return queue_index;
3067 }
3068
3069 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3070                                     struct sk_buff *skb,
3071                                     void *accel_priv)
3072 {
3073         int queue_index = 0;
3074
3075 #ifdef CONFIG_XPS
3076         if (skb->sender_cpu == 0)
3077                 skb->sender_cpu = raw_smp_processor_id() + 1;
3078 #endif
3079
3080         if (dev->real_num_tx_queues != 1) {
3081                 const struct net_device_ops *ops = dev->netdev_ops;
3082                 if (ops->ndo_select_queue)
3083                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3084                                                             __netdev_pick_tx);
3085                 else
3086                         queue_index = __netdev_pick_tx(dev, skb);
3087
3088                 if (!accel_priv)
3089                         queue_index = netdev_cap_txqueue(dev, queue_index);
3090         }
3091
3092         skb_set_queue_mapping(skb, queue_index);
3093         return netdev_get_tx_queue(dev, queue_index);
3094 }
3095
3096 /**
3097  *      __dev_queue_xmit - transmit a buffer
3098  *      @skb: buffer to transmit
3099  *      @accel_priv: private data used for L2 forwarding offload
3100  *
3101  *      Queue a buffer for transmission to a network device. The caller must
3102  *      have set the device and priority and built the buffer before calling
3103  *      this function. The function can be called from an interrupt.
3104  *
3105  *      A negative errno code is returned on a failure. A success does not
3106  *      guarantee the frame will be transmitted as it may be dropped due
3107  *      to congestion or traffic shaping.
3108  *
3109  * -----------------------------------------------------------------------------------
3110  *      Note that this function can also return errors from the queue
3111  *      disciplines, including NET_XMIT_DROP, which is a positive value.
3112  *      So errors can be positive as well.
3113  *
3114  *      Regardless of the return value, the skb is consumed, so it is currently
3115  *      difficult to retry a failed send through this function.  (You can bump
3116  *      the refcount before sending to hold a reference for a retry if you are careful.)
3117  *
3118  *      When calling this function, interrupts MUST be enabled.  This is because
3119  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3120  *          --BLG
3121  */
3122 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3123 {
3124         struct net_device *dev = skb->dev;
3125         struct netdev_queue *txq;
3126         struct Qdisc *q;
3127         int rc = -ENOMEM;
3128
3129         skb_reset_mac_header(skb);
3130
3131         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3132                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3133
3134         /* Disable soft irqs for various locks below. Also
3135          * stops preemption for RCU.
3136          */
3137         rcu_read_lock_bh();
3138
3139         skb_update_prio(skb);
3140
3141         /* If the device/qdisc doesn't need skb->dst, release it right now
3142          * while it's hot in this CPU's cache.
3143          */
3144         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3145                 skb_dst_drop(skb);
3146         else
3147                 skb_dst_force(skb);
3148
3149 #ifdef CONFIG_NET_SWITCHDEV
3150         /* Don't forward if offload device already forwarded */
3151         if (skb->offload_fwd_mark &&
3152             skb->offload_fwd_mark == dev->offload_fwd_mark) {
3153                 consume_skb(skb);
3154                 rc = NET_XMIT_SUCCESS;
3155                 goto out;
3156         }
3157 #endif
3158
3159         txq = netdev_pick_tx(dev, skb, accel_priv);
3160         q = rcu_dereference_bh(txq->qdisc);
3161
3162 #ifdef CONFIG_NET_CLS_ACT
3163         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3164 #endif
3165         trace_net_dev_queue(skb);
3166         if (q->enqueue) {
3167                 rc = __dev_xmit_skb(skb, q, dev, txq);
3168                 goto out;
3169         }
3170
3171         /* The device has no queue. This is the common case for software
3172            devices: loopback and all sorts of tunnels...
3173 
3174            Really, it is unlikely that netif_tx_lock protection is necessary
3175            here.  (e.g. loopback and IP tunnels are clean, ignoring the
3176            statistics counters.)
3177            However, it is possible that they rely on the protection
3178            we provide here.
3179 
3180            So check for that and take the lock; it is not prone to deadlocks.
3181            (Or simply shoot the noqueue qdisc, which is even simpler. 8))
3182          */
3183         if (dev->flags & IFF_UP) {
3184                 int cpu = smp_processor_id(); /* ok because BHs are off */
3185
3186                 if (txq->xmit_lock_owner != cpu) {
3187
3188                         if (xmit_rec_read() > RECURSION_LIMIT)
3189                                 goto recursion_alert;
3190
3191                         skb = validate_xmit_skb(skb, dev);
3192                         if (!skb)
3193                                 goto drop;
3194
3195                         HARD_TX_LOCK(dev, txq, cpu);
3196
3197                         if (!netif_xmit_stopped(txq)) {
3198                                 xmit_rec_inc();
3199                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3200                                 xmit_rec_dec();
3201                                 if (dev_xmit_complete(rc)) {
3202                                         HARD_TX_UNLOCK(dev, txq);
3203                                         goto out;
3204                                 }
3205                         }
3206                         HARD_TX_UNLOCK(dev, txq);
3207                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3208                                              dev->name);
3209                 } else {
3210                         /* Recursion detected! This is possible,
3211                          * unfortunately.
3212                          */
3213 recursion_alert:
3214                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3215                                              dev->name);
3216                 }
3217         }
3218
3219         rc = -ENETDOWN;
3220 drop:
3221         rcu_read_unlock_bh();
3222
3223         atomic_long_inc(&dev->tx_dropped);
3224         kfree_skb_list(skb);
3225         return rc;
3226 out:
3227         rcu_read_unlock_bh();
3228         return rc;
3229 }
3230
3231 int dev_queue_xmit(struct sk_buff *skb)
3232 {
3233         return __dev_queue_xmit(skb, NULL);
3234 }
3235 EXPORT_SYMBOL(dev_queue_xmit);
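/*
 * Editor's sketch (not part of the original file): a minimal caller of
 * dev_queue_xmit(), illustrating the contract described above: look up the
 * device, build the complete frame (including the link-layer header), then
 * hand it off.  The interface name, the ETH_P_802_EX1 ethertype and the
 * broadcast destination are arbitrary choices for illustration only.
 */
static int __maybe_unused example_xmit_raw(struct net *net, const char *ifname,
                                           const void *payload, unsigned int len)
{
        struct net_device *dev = dev_get_by_name(net, ifname);
        struct sk_buff *skb;
        int rc = -ENODEV;

        if (!dev)
                return rc;

        rc = -ENOMEM;
        skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_KERNEL);
        if (!skb)
                goto out;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        memcpy(skb_put(skb, len), payload, len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_802_EX1);
        if (dev_hard_header(skb, dev, ETH_P_802_EX1, dev->broadcast,
                            NULL, skb->len) < 0) {
                kfree_skb(skb);
                rc = -EINVAL;
                goto out;
        }

        /* The skb is consumed whatever the outcome; see the note above. */
        rc = dev_queue_xmit(skb);
out:
        dev_put(dev);
        return rc;
}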
3236
3237 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3238 {
3239         return __dev_queue_xmit(skb, accel_priv);
3240 }
3241 EXPORT_SYMBOL(dev_queue_xmit_accel);
3242
3243
3244 /*=======================================================================
3245                         Receiver routines
3246   =======================================================================*/
3247
3248 int netdev_max_backlog __read_mostly = 1000;
3249 EXPORT_SYMBOL(netdev_max_backlog);
3250
3251 int netdev_tstamp_prequeue __read_mostly = 1;
3252 int netdev_budget __read_mostly = 300;
3253 int weight_p __read_mostly = 64;            /* old backlog weight */
3254
3255 /* Called with irq disabled */
3256 static inline void ____napi_schedule(struct softnet_data *sd,
3257                                      struct napi_struct *napi)
3258 {
3259         list_add_tail(&napi->poll_list, &sd->poll_list);
3260         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3261 }
3262
3263 #ifdef CONFIG_RPS
3264
3265 /* One global table that all flow-based protocols share. */
3266 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3267 EXPORT_SYMBOL(rps_sock_flow_table);
3268 u32 rps_cpu_mask __read_mostly;
3269 EXPORT_SYMBOL(rps_cpu_mask);
3270
3271 struct static_key rps_needed __read_mostly;
3272
3273 static struct rps_dev_flow *
3274 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3275             struct rps_dev_flow *rflow, u16 next_cpu)
3276 {
3277         if (next_cpu < nr_cpu_ids) {
3278 #ifdef CONFIG_RFS_ACCEL
3279                 struct netdev_rx_queue *rxqueue;
3280                 struct rps_dev_flow_table *flow_table;
3281                 struct rps_dev_flow *old_rflow;
3282                 u32 flow_id;
3283                 u16 rxq_index;
3284                 int rc;
3285
3286                 /* Should we steer this flow to a different hardware queue? */
3287                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3288                     !(dev->features & NETIF_F_NTUPLE))
3289                         goto out;
3290                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3291                 if (rxq_index == skb_get_rx_queue(skb))
3292                         goto out;
3293
3294                 rxqueue = dev->_rx + rxq_index;
3295                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3296                 if (!flow_table)
3297                         goto out;
3298                 flow_id = skb_get_hash(skb) & flow_table->mask;
3299                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3300                                                         rxq_index, flow_id);
3301                 if (rc < 0)
3302                         goto out;
3303                 old_rflow = rflow;
3304                 rflow = &flow_table->flows[flow_id];
3305                 rflow->filter = rc;
3306                 if (old_rflow->filter == rflow->filter)
3307                         old_rflow->filter = RPS_NO_FILTER;
3308         out:
3309 #endif
3310                 rflow->last_qtail =
3311                         per_cpu(softnet_data, next_cpu).input_queue_head;
3312         }
3313
3314         rflow->cpu = next_cpu;
3315         return rflow;
3316 }
3317
3318 /*
3319  * get_rps_cpu is called from netif_receive_skb and returns the target
3320  * CPU from the RPS map of the receiving queue for a given skb.
3321  * rcu_read_lock must be held on entry.
3322  */
3323 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3324                        struct rps_dev_flow **rflowp)
3325 {
3326         const struct rps_sock_flow_table *sock_flow_table;
3327         struct netdev_rx_queue *rxqueue = dev->_rx;
3328         struct rps_dev_flow_table *flow_table;
3329         struct rps_map *map;
3330         int cpu = -1;
3331         u32 tcpu;
3332         u32 hash;
3333
3334         if (skb_rx_queue_recorded(skb)) {
3335                 u16 index = skb_get_rx_queue(skb);
3336
3337                 if (unlikely(index >= dev->real_num_rx_queues)) {
3338                         WARN_ONCE(dev->real_num_rx_queues > 1,
3339                                   "%s received packet on queue %u, but number "
3340                                   "of RX queues is %u\n",
3341                                   dev->name, index, dev->real_num_rx_queues);
3342                         goto done;
3343                 }
3344                 rxqueue += index;
3345         }
3346
3347         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3348
3349         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3350         map = rcu_dereference(rxqueue->rps_map);
3351         if (!flow_table && !map)
3352                 goto done;
3353
3354         skb_reset_network_header(skb);
3355         hash = skb_get_hash(skb);
3356         if (!hash)
3357                 goto done;
3358
3359         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3360         if (flow_table && sock_flow_table) {
3361                 struct rps_dev_flow *rflow;
3362                 u32 next_cpu;
3363                 u32 ident;
3364
3365                 /* First, check the global flow table for a match */
3366                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3367                 if ((ident ^ hash) & ~rps_cpu_mask)
3368                         goto try_rps;
3369
3370                 next_cpu = ident & rps_cpu_mask;
3371
3372                 /* OK, now we know there is a match,
3373                  * we can look at the local (per receive queue) flow table
3374                  */
3375                 rflow = &flow_table->flows[hash & flow_table->mask];
3376                 tcpu = rflow->cpu;
3377
3378                 /*
3379                  * If the desired CPU (where last recvmsg was done) is
3380                  * different from current CPU (one in the rx-queue flow
3381                  * table entry), switch if one of the following holds:
3382                  *   - Current CPU is unset (>= nr_cpu_ids).
3383                  *   - Current CPU is offline.
3384                  *   - The current CPU's queue tail has advanced beyond the
3385                  *     last packet that was enqueued using this table entry.
3386                  *     This guarantees that all previous packets for the flow
3387                  *     have been dequeued, thus preserving in order delivery.
3388                  */
3389                 if (unlikely(tcpu != next_cpu) &&
3390                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3391                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3392                       rflow->last_qtail)) >= 0)) {
3393                         tcpu = next_cpu;
3394                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3395                 }
3396
3397                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3398                         *rflowp = rflow;
3399                         cpu = tcpu;
3400                         goto done;
3401                 }
3402         }
3403
3404 try_rps:
3405
3406         if (map) {
3407                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3408                 if (cpu_online(tcpu)) {
3409                         cpu = tcpu;
3410                         goto done;
3411                 }
3412         }
3413
3414 done:
3415         return cpu;
3416 }
3417
3418 #ifdef CONFIG_RFS_ACCEL
3419
3420 /**
3421  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3422  * @dev: Device on which the filter was set
3423  * @rxq_index: RX queue index
3424  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3425  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3426  *
3427  * Drivers that implement ndo_rx_flow_steer() should periodically call
3428  * this function for each installed filter and remove the filters for
3429  * which it returns %true.
3430  */
3431 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3432                          u32 flow_id, u16 filter_id)
3433 {
3434         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3435         struct rps_dev_flow_table *flow_table;
3436         struct rps_dev_flow *rflow;
3437         bool expire = true;
3438         unsigned int cpu;
3439
3440         rcu_read_lock();
3441         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3442         if (flow_table && flow_id <= flow_table->mask) {
3443                 rflow = &flow_table->flows[flow_id];
3444                 cpu = ACCESS_ONCE(rflow->cpu);
3445                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3446                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3447                            rflow->last_qtail) <
3448                      (int)(10 * flow_table->mask)))
3449                         expire = false;
3450         }
3451         rcu_read_unlock();
3452         return expire;
3453 }
3454 EXPORT_SYMBOL(rps_may_expire_flow);
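/*
 * Editor's sketch (not part of the original file): the periodic expiry scan
 * a driver implementing ndo_rx_flow_steer() is expected to run, e.g. from a
 * delayed work item.  "struct example_filter" and the remove_hw_filter
 * callback are hypothetical driver state standing in for the device-specific
 * filter table and removal primitive.
 */
struct example_filter {
        u32 flow_id;            /* flow_id passed to ndo_rx_flow_steer() */
        u16 filter_id;          /* value returned by ndo_rx_flow_steer() */
        bool in_use;
};

static void __maybe_unused
example_expire_filters(struct net_device *dev, u16 rxq_index,
                       struct example_filter *tbl, unsigned int n,
                       void (*remove_hw_filter)(u16 filter_id))
{
        unsigned int i;

        for (i = 0; i < n; i++) {
                if (!tbl[i].in_use)
                        continue;
                if (rps_may_expire_flow(dev, rxq_index, tbl[i].flow_id,
                                        tbl[i].filter_id)) {
                        remove_hw_filter(tbl[i].filter_id);
                        tbl[i].in_use = false;
                }
        }
}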
3455
3456 #endif /* CONFIG_RFS_ACCEL */
3457
3458 /* Called from hardirq (IPI) context */
3459 static void rps_trigger_softirq(void *data)
3460 {
3461         struct softnet_data *sd = data;
3462
3463         ____napi_schedule(sd, &sd->backlog);
3464         sd->received_rps++;
3465 }
3466
3467 #endif /* CONFIG_RPS */
3468
3469 /*
3470  * Check whether this softnet_data structure belongs to another CPU.
3471  * If so, queue it on our IPI list and return 1.
3472  * If not, return 0.
3473  */
3474 static int rps_ipi_queued(struct softnet_data *sd)
3475 {
3476 #ifdef CONFIG_RPS
3477         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3478
3479         if (sd != mysd) {
3480                 sd->rps_ipi_next = mysd->rps_ipi_list;
3481                 mysd->rps_ipi_list = sd;
3482
3483                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3484                 return 1;
3485         }
3486 #endif /* CONFIG_RPS */
3487         return 0;
3488 }
3489
3490 #ifdef CONFIG_NET_FLOW_LIMIT
3491 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3492 #endif
3493
3494 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3495 {
3496 #ifdef CONFIG_NET_FLOW_LIMIT
3497         struct sd_flow_limit *fl;
3498         struct softnet_data *sd;
3499         unsigned int old_flow, new_flow;
3500
3501         if (qlen < (netdev_max_backlog >> 1))
3502                 return false;
3503
3504         sd = this_cpu_ptr(&softnet_data);
3505
3506         rcu_read_lock();
3507         fl = rcu_dereference(sd->flow_limit);
3508         if (fl) {
3509                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3510                 old_flow = fl->history[fl->history_head];
3511                 fl->history[fl->history_head] = new_flow;
3512
3513                 fl->history_head++;
3514                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3515
3516                 if (likely(fl->buckets[old_flow]))
3517                         fl->buckets[old_flow]--;
3518
3519                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3520                         fl->count++;
3521                         rcu_read_unlock();
3522                         return true;
3523                 }
3524         }
3525         rcu_read_unlock();
3526 #endif
3527         return false;
3528 }
3529
3530 /*
3531  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3532  * queue (may be a remote CPU queue).
3533  */
3534 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3535                               unsigned int *qtail)
3536 {
3537         struct softnet_data *sd;
3538         unsigned long flags;
3539         unsigned int qlen;
3540
3541         sd = &per_cpu(softnet_data, cpu);
3542
3543         local_irq_save(flags);
3544
3545         rps_lock(sd);
3546         if (!netif_running(skb->dev))
3547                 goto drop;
3548         qlen = skb_queue_len(&sd->input_pkt_queue);
3549         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3550                 if (qlen) {
3551 enqueue:
3552                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3553                         input_queue_tail_incr_save(sd, qtail);
3554                         rps_unlock(sd);
3555                         local_irq_restore(flags);
3556                         return NET_RX_SUCCESS;
3557                 }
3558
3559                 /* Schedule NAPI for the backlog device.
3560                  * We can use a non-atomic operation since we own the queue lock.
3561                  */
3562                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3563                         if (!rps_ipi_queued(sd))
3564                                 ____napi_schedule(sd, &sd->backlog);
3565                 }
3566                 goto enqueue;
3567         }
3568
3569 drop:
3570         sd->dropped++;
3571         rps_unlock(sd);
3572
3573         local_irq_restore(flags);
3574         preempt_check_resched_rt();
3575
3576         atomic_long_inc(&skb->dev->rx_dropped);
3577         kfree_skb(skb);
3578         return NET_RX_DROP;
3579 }
3580
3581 static int netif_rx_internal(struct sk_buff *skb)
3582 {
3583         int ret;
3584
3585         net_timestamp_check(netdev_tstamp_prequeue, skb);
3586
3587         trace_netif_rx(skb);
3588 #ifdef CONFIG_RPS
3589         if (static_key_false(&rps_needed)) {
3590                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3591                 int cpu;
3592
3593                 migrate_disable();
3594                 rcu_read_lock();
3595
3596                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3597                 if (cpu < 0)
3598                         cpu = smp_processor_id();
3599
3600                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3601
3602                 rcu_read_unlock();
3603                 migrate_enable();
3604         } else
3605 #endif
3606         {
3607                 unsigned int qtail;
3608                 ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
3609                 put_cpu_light();
3610         }
3611         return ret;
3612 }
3613
3614 /**
3615  *      netif_rx        -       post buffer to the network code
3616  *      @skb: buffer to post
3617  *
3618  *      This function receives a packet from a device driver and queues it for
3619  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3620  *      may be dropped during processing for congestion control or by the
3621  *      protocol layers.
3622  *
3623  *      return values:
3624  *      NET_RX_SUCCESS  (no congestion)
3625  *      NET_RX_DROP     (packet was dropped)
3626  *
3627  */
3628
3629 int netif_rx(struct sk_buff *skb)
3630 {
3631         trace_netif_rx_entry(skb);
3632
3633         return netif_rx_internal(skb);
3634 }
3635 EXPORT_SYMBOL(netif_rx);
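/*
 * Editor's sketch (not part of the original file): the receive pattern a
 * simple non-NAPI driver is expected to follow before calling netif_rx().
 * The frame data and length are assumed to come from hypothetical
 * device-specific code; everything else is standard kernel API.
 */
static int __maybe_unused example_rx_frame(struct net_device *dev,
                                           const void *data, unsigned int len)
{
        struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

        if (!skb) {
                dev->stats.rx_dropped++;
                return NET_RX_DROP;
        }
        memcpy(skb_put(skb, len), data, len);
        skb->protocol = eth_type_trans(skb, dev);

        dev->stats.rx_packets++;
        dev->stats.rx_bytes += len;

        return netif_rx(skb);
}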
3636
3637 int netif_rx_ni(struct sk_buff *skb)
3638 {
3639         int err;
3640
3641         trace_netif_rx_ni_entry(skb);
3642
3643         local_bh_disable();
3644         err = netif_rx_internal(skb);
3645         local_bh_enable();
3646
3647         return err;
3648 }
3649 EXPORT_SYMBOL(netif_rx_ni);
3650
3651 #ifdef CONFIG_PREEMPT_RT_FULL
3652 /*
3653  * RT runs ksoftirqd as a real time thread and the root_lock is a
3654  * "sleeping spinlock". If the trylock fails then we can go into an
3655  * infinite loop when ksoftirqd preempted the task which actually
3656  * holds the lock, because we requeue q and raise NET_TX softirq
3657  * causing ksoftirqd to loop forever.
3658  *
3659  * It's safe to use spin_lock on RT here as softirqs run in thread
3660  * context and cannot deadlock against the thread which is holding
3661  * root_lock.
3662  *
3663  * On !RT the trylock might fail, but there we bail out from the
3664  * softirq loop after 10 attempts which we can't do on RT. And the
3665  * task holding root_lock cannot be preempted, so the only downside of
3666  * that trylock is that we need 10 loops to decide that we should have
3667  * given up in the first one :)
3668  */
3669 static inline int take_root_lock(spinlock_t *lock)
3670 {
3671         spin_lock(lock);
3672         return 1;
3673 }
3674 #else
3675 static inline int take_root_lock(spinlock_t *lock)
3676 {
3677         return spin_trylock(lock);
3678 }
3679 #endif
3680
3681 static void net_tx_action(struct softirq_action *h)
3682 {
3683         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3684
3685         if (sd->completion_queue) {
3686                 struct sk_buff *clist;
3687
3688                 local_irq_disable();
3689                 clist = sd->completion_queue;
3690                 sd->completion_queue = NULL;
3691                 local_irq_enable();
3692
3693                 while (clist) {
3694                         struct sk_buff *skb = clist;
3695                         clist = clist->next;
3696
3697                         WARN_ON(atomic_read(&skb->users));
3698                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3699                                 trace_consume_skb(skb);
3700                         else
3701                                 trace_kfree_skb(skb, net_tx_action);
3702                         __kfree_skb(skb);
3703                 }
3704         }
3705
3706         if (sd->output_queue) {
3707                 struct Qdisc *head;
3708
3709                 local_irq_disable();
3710                 head = sd->output_queue;
3711                 sd->output_queue = NULL;
3712                 sd->output_queue_tailp = &sd->output_queue;
3713                 local_irq_enable();
3714
3715                 while (head) {
3716                         struct Qdisc *q = head;
3717                         spinlock_t *root_lock;
3718
3719                         head = head->next_sched;
3720
3721                         root_lock = qdisc_lock(q);
3722                         if (take_root_lock(root_lock)) {
3723                                 smp_mb__before_atomic();
3724                                 clear_bit(__QDISC_STATE_SCHED,
3725                                           &q->state);
3726                                 qdisc_run(q);
3727                                 spin_unlock(root_lock);
3728                         } else {
3729                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3730                                               &q->state)) {
3731                                         __netif_reschedule(q);
3732                                 } else {
3733                                         smp_mb__before_atomic();
3734                                         clear_bit(__QDISC_STATE_SCHED,
3735                                                   &q->state);
3736                                 }
3737                         }
3738                 }
3739         }
3740 }
3741
3742 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3743     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3744 /* This hook is defined here for ATM LANE */
3745 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3746                              unsigned char *addr) __read_mostly;
3747 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3748 #endif
3749
3750 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3751                                          struct packet_type **pt_prev,
3752                                          int *ret, struct net_device *orig_dev)
3753 {
3754 #ifdef CONFIG_NET_CLS_ACT
3755         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3756         struct tcf_result cl_res;
3757
3758         /* If there's at least one ingress present somewhere (so
3759          * we get here via enabled static key), remaining devices
3760          * that are not configured with an ingress qdisc will bail
3761          * out here.
3762          */
3763         if (!cl)
3764                 return skb;
3765         if (*pt_prev) {
3766                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3767                 *pt_prev = NULL;
3768         }
3769
3770         qdisc_skb_cb(skb)->pkt_len = skb->len;
3771         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3772         qdisc_bstats_cpu_update(cl->q, skb);
3773
3774         switch (tc_classify(skb, cl, &cl_res, false)) {
3775         case TC_ACT_OK:
3776         case TC_ACT_RECLASSIFY:
3777                 skb->tc_index = TC_H_MIN(cl_res.classid);
3778                 break;
3779         case TC_ACT_SHOT:
3780                 qdisc_qstats_cpu_drop(cl->q);
3781         case TC_ACT_STOLEN:
3782         case TC_ACT_QUEUED:
3783                 kfree_skb(skb);
3784                 return NULL;
3785         case TC_ACT_REDIRECT:
3786                 /* skb_mac_header check was done by cls/act_bpf, so
3787                  * we can safely push the L2 header back before
3788                  * redirecting to another netdev
3789                  */
3790                 __skb_push(skb, skb->mac_len);
3791                 skb_do_redirect(skb);
3792                 return NULL;
3793         default:
3794                 break;
3795         }
3796 #endif /* CONFIG_NET_CLS_ACT */
3797         return skb;
3798 }
3799
3800 /**
3801  *      netdev_rx_handler_register - register receive handler
3802  *      @dev: device to register a handler for
3803  *      @rx_handler: receive handler to register
3804  *      @rx_handler_data: data pointer that is used by rx handler
3805  *
3806  *      Register a receive handler for a device. This handler will then be
3807  *      called from __netif_receive_skb. A negative errno code is returned
3808  *      on a failure.
3809  *
3810  *      The caller must hold the rtnl_mutex.
3811  *
3812  *      For a general description of rx_handler, see enum rx_handler_result.
3813  */
3814 int netdev_rx_handler_register(struct net_device *dev,
3815                                rx_handler_func_t *rx_handler,
3816                                void *rx_handler_data)
3817 {
3818         ASSERT_RTNL();
3819
3820         if (dev->rx_handler)
3821                 return -EBUSY;
3822
3823         /* Note: rx_handler_data must be set before rx_handler */
3824         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3825         rcu_assign_pointer(dev->rx_handler, rx_handler);
3826
3827         return 0;
3828 }
3829 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3830
3831 /**
3832  *      netdev_rx_handler_unregister - unregister receive handler
3833  *      @dev: device to unregister a handler from
3834  *
3835  *      Unregister a receive handler from a device.
3836  *
3837  *      The caller must hold the rtnl_mutex.
3838  */
3839 void netdev_rx_handler_unregister(struct net_device *dev)
3840 {
3841
3842         ASSERT_RTNL();
3843         RCU_INIT_POINTER(dev->rx_handler, NULL);
3844         /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3845          * section is guaranteed to see a non-NULL rx_handler_data
3846          * as well.
3847          */
3848         synchronize_net();
3849         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3850 }
3851 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
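/*
 * Editor's sketch (not part of the original file): the register/unregister
 * pattern expected of an aggregating driver (bridge, bonding, team, ...).
 * "struct example_port" and example_handle_frame() are hypothetical; a real
 * handler would steer the skb to its master device instead of simply
 * passing it on.
 */
struct example_port {
        struct net_device *dev;
};

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct example_port *port = rcu_dereference(skb->dev->rx_handler_data);

        (void)port;             /* a real handler would use port->... here */
        return RX_HANDLER_PASS;
}

static int __maybe_unused example_attach_port(struct example_port *port)
{
        /* caller holds rtnl_mutex, as required by the functions above */
        ASSERT_RTNL();
        return netdev_rx_handler_register(port->dev, example_handle_frame, port);
}

static void __maybe_unused example_detach_port(struct example_port *port)
{
        ASSERT_RTNL();
        netdev_rx_handler_unregister(port->dev);
}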
3852
3853 /*
3854  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3855  * the special handling of PFMEMALLOC skbs.
3856  */
3857 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3858 {
3859         switch (skb->protocol) {
3860         case htons(ETH_P_ARP):
3861         case htons(ETH_P_IP):
3862         case htons(ETH_P_IPV6):
3863         case htons(ETH_P_8021Q):
3864         case htons(ETH_P_8021AD):
3865                 return true;
3866         default:
3867                 return false;
3868         }
3869 }
3870
3871 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3872                              int *ret, struct net_device *orig_dev)
3873 {
3874 #ifdef CONFIG_NETFILTER_INGRESS
3875         if (nf_hook_ingress_active(skb)) {
3876                 if (*pt_prev) {
3877                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
3878                         *pt_prev = NULL;
3879                 }
3880
3881                 return nf_hook_ingress(skb);
3882         }
3883 #endif /* CONFIG_NETFILTER_INGRESS */
3884         return 0;
3885 }
3886
3887 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3888 {
3889         struct packet_type *ptype, *pt_prev;
3890         rx_handler_func_t *rx_handler;
3891         struct net_device *orig_dev;
3892         bool deliver_exact = false;
3893         int ret = NET_RX_DROP;
3894         __be16 type;
3895
3896         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3897
3898         trace_netif_receive_skb(skb);
3899
3900         orig_dev = skb->dev;
3901
3902         skb_reset_network_header(skb);
3903         if (!skb_transport_header_was_set(skb))
3904                 skb_reset_transport_header(skb);
3905         skb_reset_mac_len(skb);
3906
3907         pt_prev = NULL;
3908
3909 another_round:
3910         skb->skb_iif = skb->dev->ifindex;
3911
3912         __this_cpu_inc(softnet_data.processed);
3913
3914         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3915             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3916                 skb = skb_vlan_untag(skb);
3917                 if (unlikely(!skb))
3918                         goto out;
3919         }
3920
3921 #ifdef CONFIG_NET_CLS_ACT
3922         if (skb->tc_verd & TC_NCLS) {
3923                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3924                 goto ncls;
3925         }
3926 #endif
3927
3928         if (pfmemalloc)
3929                 goto skip_taps;
3930
3931         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3932                 if (pt_prev)
3933                         ret = deliver_skb(skb, pt_prev, orig_dev);
3934                 pt_prev = ptype;
3935         }
3936
3937         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3938                 if (pt_prev)
3939                         ret = deliver_skb(skb, pt_prev, orig_dev);
3940                 pt_prev = ptype;
3941         }
3942
3943 skip_taps:
3944 #ifdef CONFIG_NET_INGRESS
3945         if (static_key_false(&ingress_needed)) {
3946                 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3947                 if (!skb)
3948                         goto out;
3949
3950                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
3951                         goto out;
3952         }
3953 #endif
3954 #ifdef CONFIG_NET_CLS_ACT
3955         skb->tc_verd = 0;
3956 ncls:
3957 #endif
3958         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3959                 goto drop;
3960
3961         if (skb_vlan_tag_present(skb)) {
3962                 if (pt_prev) {
3963                         ret = deliver_skb(skb, pt_prev, orig_dev);
3964                         pt_prev = NULL;
3965                 }
3966                 if (vlan_do_receive(&skb))
3967                         goto another_round;
3968                 else if (unlikely(!skb))
3969                         goto out;
3970         }
3971
3972         rx_handler = rcu_dereference(skb->dev->rx_handler);
3973         if (rx_handler) {
3974                 if (pt_prev) {
3975                         ret = deliver_skb(skb, pt_prev, orig_dev);
3976                         pt_prev = NULL;
3977                 }
3978                 switch (rx_handler(&skb)) {
3979                 case RX_HANDLER_CONSUMED:
3980                         ret = NET_RX_SUCCESS;
3981                         goto out;
3982                 case RX_HANDLER_ANOTHER:
3983                         goto another_round;
3984                 case RX_HANDLER_EXACT:
3985                         deliver_exact = true;
3986                 case RX_HANDLER_PASS:
3987                         break;
3988                 default:
3989                         BUG();
3990                 }
3991         }
3992
3993         if (unlikely(skb_vlan_tag_present(skb))) {
3994                 if (skb_vlan_tag_get_id(skb))
3995                         skb->pkt_type = PACKET_OTHERHOST;
3996                 /* Note: we might in the future use the prio bits
3997                  * and set skb->priority as in vlan_do_receive().
3998                  * For the time being, just ignore the Priority Code Point.
3999                  */
4000                 skb->vlan_tci = 0;
4001         }
4002
4003         type = skb->protocol;
4004
4005         /* deliver only exact match when indicated */
4006         if (likely(!deliver_exact)) {
4007                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4008                                        &ptype_base[ntohs(type) &
4009                                                    PTYPE_HASH_MASK]);
4010         }
4011
4012         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4013                                &orig_dev->ptype_specific);
4014
4015         if (unlikely(skb->dev != orig_dev)) {
4016                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4017                                        &skb->dev->ptype_specific);
4018         }
4019
4020         if (pt_prev) {
4021                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4022                         goto drop;
4023                 else
4024                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4025         } else {
4026 drop:
4027                 atomic_long_inc(&skb->dev->rx_dropped);
4028                 kfree_skb(skb);
4029                 /* Jamal, now you will not be able to escape explaining
4030                  * to me how you were going to use this. :-)
4031                  */
4032                 ret = NET_RX_DROP;
4033         }
4034
4035 out:
4036         return ret;
4037 }
4038
4039 static int __netif_receive_skb(struct sk_buff *skb)
4040 {
4041         int ret;
4042
4043         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4044                 unsigned long pflags = current->flags;
4045
4046                 /*
4047                  * PFMEMALLOC skbs are special, they should
4048                  * - be delivered to SOCK_MEMALLOC sockets only
4049                  * - stay away from userspace
4050                  * - have bounded memory usage
4051                  *
4052                  * Use PF_MEMALLOC as this saves us from propagating the allocation
4053                  * context down to all allocation sites.
4054                  */
4055                 current->flags |= PF_MEMALLOC;
4056                 ret = __netif_receive_skb_core(skb, true);
4057                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4058         } else
4059                 ret = __netif_receive_skb_core(skb, false);
4060
4061         return ret;
4062 }
4063
4064 static int netif_receive_skb_internal(struct sk_buff *skb)
4065 {
4066         int ret;
4067
4068         net_timestamp_check(netdev_tstamp_prequeue, skb);
4069
4070         if (skb_defer_rx_timestamp(skb))
4071                 return NET_RX_SUCCESS;
4072
4073         rcu_read_lock();
4074
4075 #ifdef CONFIG_RPS
4076         if (static_key_false(&rps_needed)) {
4077                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4078                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4079
4080                 if (cpu >= 0) {
4081                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4082                         rcu_read_unlock();
4083                         return ret;
4084                 }
4085         }
4086 #endif
4087         ret = __netif_receive_skb(skb);
4088         rcu_read_unlock();
4089         return ret;
4090 }
4091
4092 /**
4093  *      netif_receive_skb - process receive buffer from network
4094  *      @skb: buffer to process
4095  *
4096  *      netif_receive_skb() is the main receive data processing function.
4097  *      It always succeeds. The buffer may be dropped during processing
4098  *      for congestion control or by the protocol layers.
4099  *
4100  *      This function may only be called from softirq context and interrupts
4101  *      should be enabled.
4102  *
4103  *      Return values (usually ignored):
4104  *      NET_RX_SUCCESS: no congestion
4105  *      NET_RX_DROP: packet was dropped
4106  */
4107 int netif_receive_skb(struct sk_buff *skb)
4108 {
4109         trace_netif_receive_skb_entry(skb);
4110
4111         return netif_receive_skb_internal(skb);
4112 }
4113 EXPORT_SYMBOL(netif_receive_skb);
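/*
 * Editor's sketch (not part of the original file): unlike netif_rx(),
 * netif_receive_skb() delivers the packet synchronously and must be called
 * from softirq context, typically a driver's NAPI poll routine.  A
 * hypothetical helper showing the minimal per-skb setup:
 */
static void __maybe_unused example_deliver_from_poll(struct napi_struct *napi,
                                                     struct sk_buff *skb)
{
        skb->protocol = eth_type_trans(skb, napi->dev);
        netif_receive_skb(skb);
}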
4114
4115 /* Network device is going away, flush any packets still pending
4116  * Called with irqs disabled.
4117  */
4118 static void flush_backlog(void *arg)
4119 {
4120         struct net_device *dev = arg;
4121         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4122         struct sk_buff *skb, *tmp;
4123
4124         rps_lock(sd);
4125         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4126                 if (skb->dev == dev) {
4127                         __skb_unlink(skb, &sd->input_pkt_queue);
4128                         __skb_queue_tail(&sd->tofree_queue, skb);
4129                         input_queue_head_incr(sd);
4130                 }
4131         }
4132         rps_unlock(sd);
4133
4134         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4135                 if (skb->dev == dev) {
4136                         __skb_unlink(skb, &sd->process_queue);
4137                         __skb_queue_tail(&sd->tofree_queue, skb);
4138                         input_queue_head_incr(sd);
4139                 }
4140         }
4141
4142         if (!skb_queue_empty(&sd->tofree_queue))
4143                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
4144 }
4145
4146 static int napi_gro_complete(struct sk_buff *skb)
4147 {
4148         struct packet_offload *ptype;
4149         __be16 type = skb->protocol;
4150         struct list_head *head = &offload_base;
4151         int err = -ENOENT;
4152
4153         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4154
4155         if (NAPI_GRO_CB(skb)->count == 1) {
4156                 skb_shinfo(skb)->gso_size = 0;
4157                 goto out;
4158         }
4159
4160         rcu_read_lock();
4161         list_for_each_entry_rcu(ptype, head, list) {
4162                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4163                         continue;
4164
4165                 err = ptype->callbacks.gro_complete(skb, 0);
4166                 break;
4167         }
4168         rcu_read_unlock();
4169
4170         if (err) {
4171                 WARN_ON(&ptype->list == head);
4172                 kfree_skb(skb);
4173                 return NET_RX_SUCCESS;
4174         }
4175
4176 out:
4177         return netif_receive_skb_internal(skb);
4178 }
4179
4180 /* napi->gro_list contains packets ordered by age, with the
4181  * youngest packets at its head.
4182  * Complete skbs in reverse order to reduce latencies.
4183  */
4184 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4185 {
4186         struct sk_buff *skb, *prev = NULL;
4187
4188         /* scan list and build reverse chain */
4189         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4190                 skb->prev = prev;
4191                 prev = skb;
4192         }
4193
4194         for (skb = prev; skb; skb = prev) {
4195                 skb->next = NULL;
4196
4197                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4198                         return;
4199
4200                 prev = skb->prev;
4201                 napi_gro_complete(skb);
4202                 napi->gro_count--;
4203         }
4204
4205         napi->gro_list = NULL;
4206 }
4207 EXPORT_SYMBOL(napi_gro_flush);
4208
4209 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4210 {
4211         struct sk_buff *p;
4212         unsigned int maclen = skb->dev->hard_header_len;
4213         u32 hash = skb_get_hash_raw(skb);
4214
4215         for (p = napi->gro_list; p; p = p->next) {
4216                 unsigned long diffs;
4217
4218                 NAPI_GRO_CB(p)->flush = 0;
4219
4220                 if (hash != skb_get_hash_raw(p)) {
4221                         NAPI_GRO_CB(p)->same_flow = 0;
4222                         continue;
4223                 }
4224
4225                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4226                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4227                 diffs |= skb_metadata_dst_cmp(p, skb);
4228                 if (maclen == ETH_HLEN)
4229                         diffs |= compare_ether_header(skb_mac_header(p),
4230                                                       skb_mac_header(skb));
4231                 else if (!diffs)
4232                         diffs = memcmp(skb_mac_header(p),
4233                                        skb_mac_header(skb),
4234                                        maclen);
4235                 NAPI_GRO_CB(p)->same_flow = !diffs;
4236         }
4237 }
4238
4239 static void skb_gro_reset_offset(struct sk_buff *skb)
4240 {
4241         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4242         const skb_frag_t *frag0 = &pinfo->frags[0];
4243
4244         NAPI_GRO_CB(skb)->data_offset = 0;
4245         NAPI_GRO_CB(skb)->frag0 = NULL;
4246         NAPI_GRO_CB(skb)->frag0_len = 0;
4247
4248         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4249             pinfo->nr_frags &&
4250             !PageHighMem(skb_frag_page(frag0))) {
4251                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4252                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4253         }
4254 }
4255
4256 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4257 {
4258         struct skb_shared_info *pinfo = skb_shinfo(skb);
4259
4260         BUG_ON(skb->end - skb->tail < grow);
4261
4262         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4263
4264         skb->data_len -= grow;
4265         skb->tail += grow;
4266
4267         pinfo->frags[0].page_offset += grow;
4268         skb_frag_size_sub(&pinfo->frags[0], grow);
4269
4270         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4271                 skb_frag_unref(skb, 0);
4272                 memmove(pinfo->frags, pinfo->frags + 1,
4273                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4274         }
4275 }
4276
4277 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4278 {
4279         struct sk_buff **pp = NULL;
4280         struct packet_offload *ptype;
4281         __be16 type = skb->protocol;
4282         struct list_head *head = &offload_base;
4283         int same_flow;
4284         enum gro_result ret;
4285         int grow;
4286
4287         if (!(skb->dev->features & NETIF_F_GRO))
4288                 goto normal;
4289
4290         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4291                 goto normal;
4292
4293         gro_list_prepare(napi, skb);
4294
4295         rcu_read_lock();
4296         list_for_each_entry_rcu(ptype, head, list) {
4297                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4298                         continue;
4299
4300                 skb_set_network_header(skb, skb_gro_offset(skb));
4301                 skb_reset_mac_len(skb);
4302                 NAPI_GRO_CB(skb)->same_flow = 0;
4303                 NAPI_GRO_CB(skb)->flush = 0;
4304                 NAPI_GRO_CB(skb)->free = 0;
4305                 NAPI_GRO_CB(skb)->udp_mark = 0;
4306                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4307
4308                 /* Setup for GRO checksum validation */
4309                 switch (skb->ip_summed) {
4310                 case CHECKSUM_COMPLETE:
4311                         NAPI_GRO_CB(skb)->csum = skb->csum;
4312                         NAPI_GRO_CB(skb)->csum_valid = 1;
4313                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4314                         break;
4315                 case CHECKSUM_UNNECESSARY:
4316                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4317                         NAPI_GRO_CB(skb)->csum_valid = 0;
4318                         break;
4319                 default:
4320                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4321                         NAPI_GRO_CB(skb)->csum_valid = 0;
4322                 }
4323
4324                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4325                 break;
4326         }
4327         rcu_read_unlock();
4328
4329         if (&ptype->list == head)
4330                 goto normal;
4331
4332         same_flow = NAPI_GRO_CB(skb)->same_flow;
4333         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4334
4335         if (pp) {
4336                 struct sk_buff *nskb = *pp;
4337
4338                 *pp = nskb->next;
4339                 nskb->next = NULL;
4340                 napi_gro_complete(nskb);
4341                 napi->gro_count--;
4342         }
4343
4344         if (same_flow)
4345                 goto ok;
4346
4347         if (NAPI_GRO_CB(skb)->flush)
4348                 goto normal;
4349
4350         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4351                 struct sk_buff *nskb = napi->gro_list;
4352
4353                 /* locate the end of the list to select the 'oldest' flow */
4354                 while (nskb->next) {
4355                         pp = &nskb->next;
4356                         nskb = *pp;
4357                 }
4358                 *pp = NULL;
4359                 nskb->next = NULL;
4360                 napi_gro_complete(nskb);
4361         } else {
4362                 napi->gro_count++;
4363         }
4364         NAPI_GRO_CB(skb)->count = 1;
4365         NAPI_GRO_CB(skb)->age = jiffies;
4366         NAPI_GRO_CB(skb)->last = skb;
4367         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4368         skb->next = napi->gro_list;
4369         napi->gro_list = skb;
4370         ret = GRO_HELD;
4371
4372 pull:
4373         grow = skb_gro_offset(skb) - skb_headlen(skb);
4374         if (grow > 0)
4375                 gro_pull_from_frag0(skb, grow);
4376 ok:
4377         return ret;
4378
4379 normal:
4380         ret = GRO_NORMAL;
4381         goto pull;
4382 }
4383
4384 struct packet_offload *gro_find_receive_by_type(__be16 type)
4385 {
4386         struct list_head *offload_head = &offload_base;
4387         struct packet_offload *ptype;
4388
4389         list_for_each_entry_rcu(ptype, offload_head, list) {
4390                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4391                         continue;
4392                 return ptype;
4393         }
4394         return NULL;
4395 }
4396 EXPORT_SYMBOL(gro_find_receive_by_type);
4397
4398 struct packet_offload *gro_find_complete_by_type(__be16 type)
4399 {
4400         struct list_head *offload_head = &offload_base;
4401         struct packet_offload *ptype;
4402
4403         list_for_each_entry_rcu(ptype, offload_head, list) {
4404                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4405                         continue;
4406                 return ptype;
4407         }
4408         return NULL;
4409 }
4410 EXPORT_SYMBOL(gro_find_complete_by_type);
4411
4412 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4413 {
4414         switch (ret) {
4415         case GRO_NORMAL:
4416                 if (netif_receive_skb_internal(skb))
4417                         ret = GRO_DROP;
4418                 break;
4419
4420         case GRO_DROP:
4421                 kfree_skb(skb);
4422                 break;
4423
4424         case GRO_MERGED_FREE:
4425                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4426                         skb_dst_drop(skb);
4427                         kmem_cache_free(skbuff_head_cache, skb);
4428                 } else {
4429                         __kfree_skb(skb);
4430                 }
4431                 break;
4432
4433         case GRO_HELD:
4434         case GRO_MERGED:
4435                 break;
4436         }
4437
4438         return ret;
4439 }
4440
4441 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4442 {
4443         trace_napi_gro_receive_entry(skb);
4444
4445         skb_gro_reset_offset(skb);
4446
4447         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4448 }
4449 EXPORT_SYMBOL(napi_gro_receive);
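/*
 * Editor's sketch (not part of the original file): the canonical NAPI poll
 * loop built around napi_gro_receive().  example_hw_next_skb() is a
 * hypothetical stand-in for pulling one completed frame off the device's
 * RX ring; budget handling and completion follow the usual NAPI contract.
 */
static int __maybe_unused
example_napi_poll(struct napi_struct *napi, int budget,
                  struct sk_buff *(*example_hw_next_skb)(struct napi_struct *))
{
        int work = 0;

        while (work < budget) {
                struct sk_buff *skb = example_hw_next_skb(napi);

                if (!skb)
                        break;
                skb->protocol = eth_type_trans(skb, napi->dev);
                napi_gro_receive(napi, skb);
                work++;
        }

        if (work < budget)
                napi_complete_done(napi, work);

        return work;
}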
4450
4451 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4452 {
4453         if (unlikely(skb->pfmemalloc)) {
4454                 consume_skb(skb);
4455                 return;
4456         }
4457         __skb_pull(skb, skb_headlen(skb));
4458         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4459         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4460         skb->vlan_tci = 0;
4461         skb->dev = napi->dev;
4462         skb->skb_iif = 0;
4463         skb->encapsulation = 0;
4464         skb_shinfo(skb)->gso_type = 0;
4465         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4466
4467         napi->skb = skb;
4468 }
4469
4470 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4471 {
4472         struct sk_buff *skb = napi->skb;
4473
4474         if (!skb) {
4475                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4476                 napi->skb = skb;
4477         }
4478         return skb;
4479 }
4480 EXPORT_SYMBOL(napi_get_frags);
4481
4482 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4483                                       struct sk_buff *skb,
4484                                       gro_result_t ret)
4485 {
4486         switch (ret) {
4487         case GRO_NORMAL:
4488         case GRO_HELD:
4489                 __skb_push(skb, ETH_HLEN);
4490                 skb->protocol = eth_type_trans(skb, skb->dev);
4491                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4492                         ret = GRO_DROP;
4493                 break;
4494
4495         case GRO_DROP:
4496         case GRO_MERGED_FREE:
4497                 napi_reuse_skb(napi, skb);
4498                 break;
4499
4500         case GRO_MERGED:
4501                 break;
4502         }
4503
4504         return ret;
4505 }
4506
4507 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4508  * Drivers may call both napi_gro_frags() and napi_gro_receive(), so
4509  * we copy the ethernet header into skb->data to have a common layout.
4510  */
4511 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4512 {
4513         struct sk_buff *skb = napi->skb;
4514         const struct ethhdr *eth;
4515         unsigned int hlen = sizeof(*eth);
4516
4517         napi->skb = NULL;
4518
4519         skb_reset_mac_header(skb);
4520         skb_gro_reset_offset(skb);
4521
4522         eth = skb_gro_header_fast(skb, 0);
4523         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4524                 eth = skb_gro_header_slow(skb, hlen, 0);
4525                 if (unlikely(!eth)) {
4526                         napi_reuse_skb(napi, skb);
4527                         return NULL;
4528                 }
4529         } else {
4530                 gro_pull_from_frag0(skb, hlen);
4531                 NAPI_GRO_CB(skb)->frag0 += hlen;
4532                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4533         }
4534         __skb_pull(skb, hlen);
4535
4536         /*
4537          * This works because the only protocols we care about don't require
4538          * special handling.
4539          * We'll fix it up properly in napi_frags_finish()
4540          */
4541         skb->protocol = eth->h_proto;
4542
4543         return skb;
4544 }
4545
4546 gro_result_t napi_gro_frags(struct napi_struct *napi)
4547 {
4548         struct sk_buff *skb = napi_frags_skb(napi);
4549
4550         if (!skb)
4551                 return GRO_DROP;
4552
4553         trace_napi_gro_frags_entry(skb);
4554
4555         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4556 }
4557 EXPORT_SYMBOL(napi_gro_frags);
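/*
 * Editor's sketch (not part of the original file): the napi_gro_frags()
 * path used by drivers that attach page fragments directly instead of
 * building a linear skb.  The page, offset and length describe one
 * completed RX buffer and are assumed to come from hypothetical driver
 * code.
 */
static gro_result_t __maybe_unused
example_rx_frag(struct napi_struct *napi, struct page *page,
                unsigned int offset, unsigned int len)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (unlikely(!skb))
                return GRO_DROP;

        skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);

        /* napi_gro_frags() pulls and parses the ethernet header itself. */
        return napi_gro_frags(napi);
}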
4558
4559 /* Compute the checksum from gro_offset and return the folded value
4560  * after adding in any pseudo checksum.
4561  */
4562 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4563 {
4564         __wsum wsum;
4565         __sum16 sum;
4566
4567         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4568
4569         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4570         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4571         if (likely(!sum)) {
4572                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4573                     !skb->csum_complete_sw)
4574                         netdev_rx_csum_fault(skb->dev);
4575         }
4576
4577         NAPI_GRO_CB(skb)->csum = wsum;
4578         NAPI_GRO_CB(skb)->csum_valid = 1;
4579
4580         return sum;
4581 }
4582 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4583
4584 /*
4585  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4586  * Note: called with local irqs disabled, but exits with local irqs enabled.
4587  */
4588 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4589 {
4590 #ifdef CONFIG_RPS
4591         struct softnet_data *remsd = sd->rps_ipi_list;
4592
4593         if (remsd) {
4594                 sd->rps_ipi_list = NULL;
4595
4596                 local_irq_enable();
4597                 preempt_check_resched_rt();
4598
4599                 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4600                 while (remsd) {
4601                         struct softnet_data *next = remsd->rps_ipi_next;
4602
4603                         if (cpu_online(remsd->cpu))
4604                                 smp_call_function_single_async(remsd->cpu,
4605                                                            &remsd->csd);
4606                         remsd = next;
4607                 }
4608         } else
4609 #endif
4610                 local_irq_enable();
4611         preempt_check_resched_rt();
4612 }
4613
4614 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4615 {
4616 #ifdef CONFIG_RPS
4617         return sd->rps_ipi_list != NULL;
4618 #else
4619         return false;
4620 #endif
4621 }
4622
4623 static int process_backlog(struct napi_struct *napi, int quota)
4624 {
4625         int work = 0;
4626         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4627
4628         /* Check if we have pending IPIs; it's better to send them now
4629          * rather than waiting for net_rx_action() to end.
4630          */
4631         if (sd_has_rps_ipi_waiting(sd)) {
4632                 local_irq_disable();
4633                 net_rps_action_and_irq_enable(sd);
4634         }
4635
4636         napi->weight = weight_p;
4637         local_irq_disable();
4638         while (1) {
4639                 struct sk_buff *skb;
4640
4641                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4642                         rcu_read_lock();
4643                         local_irq_enable();
4644                         __netif_receive_skb(skb);
4645                         rcu_read_unlock();
4646                         local_irq_disable();
4647                         input_queue_head_incr(sd);
4648                         if (++work >= quota) {
4649                                 local_irq_enable();
4650                                 return work;
4651                         }
4652                 }
4653
4654                 rps_lock(sd);
4655                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4656                         /*
4657                          * Inline a custom version of __napi_complete().
4658                          * Only the current CPU owns and manipulates this napi,
4659                          * and NAPI_STATE_SCHED is the only possible flag set
4660                          * on backlog.
4661                          * We can use a plain write instead of clear_bit(),
4662                          * and we don't need an smp_mb() memory barrier.
4663                          */
4664                         napi->state = 0;
4665                         rps_unlock(sd);
4666
4667                         break;
4668                 }
4669
4670                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4671                                            &sd->process_queue);
4672                 rps_unlock(sd);
4673         }
4674         local_irq_enable();
4675
4676         return work;
4677 }
4678
4679 /**
4680  * __napi_schedule - schedule for receive
4681  * @n: entry to schedule
4682  *
4683  * The entry's receive function will be scheduled to run.
4684  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4685  */
4686 void __napi_schedule(struct napi_struct *n)
4687 {
4688         unsigned long flags;
4689
4690         local_irq_save(flags);
4691         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4692         local_irq_restore(flags);
4693         preempt_check_resched_rt();
4694 }
4695 EXPORT_SYMBOL(__napi_schedule);
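/*
 * Editorial example (not part of the original file): a minimal sketch of how a
 * driver's interrupt handler typically hands work to NAPI using the API above.
 * The 'my_priv' structure and my_disable_rx_irq() helper are hypothetical; only
 * napi_schedule_prep()/__napi_schedule() are real kernel interfaces.  When hard
 * interrupts are known to be masked, __napi_schedule_irqoff() can be used instead.
 */
#if 0	/* illustration only; assumes <linux/interrupt.h> and <linux/netdevice.h> */
static irqreturn_t my_driver_irq(int irq, void *data)
{
	struct my_priv *priv = data;

	if (napi_schedule_prep(&priv->napi)) {
		my_disable_rx_irq(priv);	/* hypothetical: stop further RX interrupts */
		__napi_schedule(&priv->napi);	/* queue priv->napi for net_rx_action() */
	}
	return IRQ_HANDLED;
}
#endif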
4696
4697 /**
4698  * __napi_schedule_irqoff - schedule for receive
4699  * @n: entry to schedule
4700  *
4701  * Variant of __napi_schedule() assuming hard irqs are masked
4702  */
4703 void __napi_schedule_irqoff(struct napi_struct *n)
4704 {
4705         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4706 }
4707 EXPORT_SYMBOL(__napi_schedule_irqoff);
4708
4709 void __napi_complete(struct napi_struct *n)
4710 {
4711         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4712
4713         list_del_init(&n->poll_list);
4714         smp_mb__before_atomic();
4715         clear_bit(NAPI_STATE_SCHED, &n->state);
4716 }
4717 EXPORT_SYMBOL(__napi_complete);
4718
4719 void napi_complete_done(struct napi_struct *n, int work_done)
4720 {
4721         unsigned long flags;
4722
4723         /*
4724          * Don't let napi dequeue from the CPU poll list
4725          * just in case it's running on a different CPU.
4726          */
4727         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4728                 return;
4729
4730         if (n->gro_list) {
4731                 unsigned long timeout = 0;
4732
4733                 if (work_done)
4734                         timeout = n->dev->gro_flush_timeout;
4735
4736                 if (timeout)
4737                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4738                                       HRTIMER_MODE_REL_PINNED);
4739                 else
4740                         napi_gro_flush(n, false);
4741         }
4742         if (likely(list_empty(&n->poll_list))) {
4743                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4744         } else {
4745                 /* If n->poll_list is not empty, we need to mask irqs */
4746                 local_irq_save(flags);
4747                 __napi_complete(n);
4748                 local_irq_restore(flags);
4749         }
4750 }
4751 EXPORT_SYMBOL(napi_complete_done);
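/*
 * Editorial example (not part of the original file): a sketch of the poll
 * routine contract assumed by napi_complete_done() above.  my_clean_rx_ring()
 * and my_enable_rx_irq() are hypothetical driver helpers.  Passing the real
 * work_done lets the timeout path above defer the GRO flush when the device's
 * gro_flush_timeout attribute is non-zero.
 */
#if 0	/* illustration only */
static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_priv *priv = container_of(napi, struct my_priv, napi);
	int work_done = my_clean_rx_ring(priv, budget);	/* hypothetical RX cleanup */

	if (work_done < budget) {
		/* Ring drained: leave polled mode and re-enable interrupts. */
		napi_complete_done(napi, work_done);
		my_enable_rx_irq(priv);			/* hypothetical helper */
	}
	return work_done;
}
#endif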
4752
4753 /* must be called under rcu_read_lock(), as we don't take a reference */
4754 struct napi_struct *napi_by_id(unsigned int napi_id)
4755 {
4756         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4757         struct napi_struct *napi;
4758
4759         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4760                 if (napi->napi_id == napi_id)
4761                         return napi;
4762
4763         return NULL;
4764 }
4765 EXPORT_SYMBOL_GPL(napi_by_id);
4766
4767 void napi_hash_add(struct napi_struct *napi)
4768 {
4769         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4770
4771                 spin_lock(&napi_hash_lock);
4772
4773                 /* 0 is not a valid id; we also skip an id that is already taken.
4774                  * We expect both events to be extremely rare.
4775                  */
4776                 napi->napi_id = 0;
4777                 while (!napi->napi_id) {
4778                         napi->napi_id = ++napi_gen_id;
4779                         if (napi_by_id(napi->napi_id))
4780                                 napi->napi_id = 0;
4781                 }
4782
4783                 hlist_add_head_rcu(&napi->napi_hash_node,
4784                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4785
4786                 spin_unlock(&napi_hash_lock);
4787         }
4788 }
4789 EXPORT_SYMBOL_GPL(napi_hash_add);
4790
4791 /* Warning: the caller is responsible for ensuring an RCU grace period
4792  * has elapsed before freeing the memory containing @napi.
4793  */
4794 void napi_hash_del(struct napi_struct *napi)
4795 {
4796         spin_lock(&napi_hash_lock);
4797
4798         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4799                 hlist_del_rcu(&napi->napi_hash_node);
4800
4801         spin_unlock(&napi_hash_lock);
4802 }
4803 EXPORT_SYMBOL_GPL(napi_hash_del);
4804
4805 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4806 {
4807         struct napi_struct *napi;
4808
4809         napi = container_of(timer, struct napi_struct, timer);
4810         if (napi->gro_list)
4811                 napi_schedule(napi);
4812
4813         return HRTIMER_NORESTART;
4814 }
4815
4816 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4817                     int (*poll)(struct napi_struct *, int), int weight)
4818 {
4819         INIT_LIST_HEAD(&napi->poll_list);
4820         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4821         napi->timer.function = napi_watchdog;
4822         napi->gro_count = 0;
4823         napi->gro_list = NULL;
4824         napi->skb = NULL;
4825         napi->poll = poll;
4826         if (weight > NAPI_POLL_WEIGHT)
4827                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4828                             weight, dev->name);
4829         napi->weight = weight;
4830         list_add(&napi->dev_list, &dev->napi_list);
4831         napi->dev = dev;
4832 #ifdef CONFIG_NETPOLL
4833         spin_lock_init(&napi->poll_lock);
4834         napi->poll_owner = -1;
4835 #endif
4836         set_bit(NAPI_STATE_SCHED, &napi->state);
4837 }
4838 EXPORT_SYMBOL(netif_napi_add);
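/*
 * Editorial example (not part of the original file): typical registration of a
 * NAPI context at probe time with netif_napi_add() above.  NAPI_POLL_WEIGHT is
 * the conventional weight; larger values trigger the pr_err_once() warning.
 * 'my_priv' and my_poll() are hypothetical.
 */
#if 0	/* illustration only */
static int my_setup_napi(struct net_device *dev)
{
	struct my_priv *priv = netdev_priv(dev);

	netif_napi_add(dev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
	napi_enable(&priv->napi);	/* clears the NAPI_STATE_SCHED bit set above */
	return 0;
}
#endif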
4839
4840 void napi_disable(struct napi_struct *n)
4841 {
4842         might_sleep();
4843         set_bit(NAPI_STATE_DISABLE, &n->state);
4844
4845         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4846                 msleep(1);
4847         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
4848                 msleep(1);
4849
4850         hrtimer_cancel(&n->timer);
4851
4852         clear_bit(NAPI_STATE_DISABLE, &n->state);
4853 }
4854 EXPORT_SYMBOL(napi_disable);
4855
4856 void netif_napi_del(struct napi_struct *napi)
4857 {
4858         list_del_init(&napi->dev_list);
4859         napi_free_frags(napi);
4860
4861         kfree_skb_list(napi->gro_list);
4862         napi->gro_list = NULL;
4863         napi->gro_count = 0;
4864 }
4865 EXPORT_SYMBOL(netif_napi_del);
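/*
 * Editorial example (not part of the original file): the matching teardown for
 * the registration sketch above.  napi_disable() may sleep until a running
 * poll finishes, so it must be called from process context; netif_napi_del()
 * then frees any deferred GRO packets and unlinks the context from the device.
 */
#if 0	/* illustration only */
static void my_teardown_napi(struct net_device *dev)
{
	struct my_priv *priv = netdev_priv(dev);

	napi_disable(&priv->napi);	/* may msleep(); process context only */
	netif_napi_del(&priv->napi);
}
#endif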
4866
4867 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4868 {
4869         void *have;
4870         int work, weight;
4871
4872         list_del_init(&n->poll_list);
4873
4874         have = netpoll_poll_lock(n);
4875
4876         weight = n->weight;
4877
4878         /* This NAPI_STATE_SCHED test is for avoiding a race
4879          * with netpoll's poll_napi().  Only the entity which
4880          * obtains the lock and sees NAPI_STATE_SCHED set will
4881          * actually make the ->poll() call.  Therefore we avoid
4882          * accidentally calling ->poll() when NAPI is not scheduled.
4883          */
4884         work = 0;
4885         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4886                 work = n->poll(n, weight);
4887                 trace_napi_poll(n);
4888         }
4889
4890         WARN_ON_ONCE(work > weight);
4891
4892         if (likely(work < weight))
4893                 goto out_unlock;
4894
4895         /* Drivers must not modify the NAPI state if they
4896          * consume the entire weight.  In such cases this code
4897          * still "owns" the NAPI instance and therefore can
4898          * move the instance around on the list at-will.
4899          */
4900         if (unlikely(napi_disable_pending(n))) {
4901                 napi_complete(n);
4902                 goto out_unlock;
4903         }
4904
4905         if (n->gro_list) {
4906                 /* Flush packets that are too old.
4907                  * If HZ < 1000, flush all packets.
4908                  */
4909                 napi_gro_flush(n, HZ >= 1000);
4910         }
4911
4912         /* Some drivers may have called napi_schedule
4913          * prior to exhausting their budget.
4914          */
4915         if (unlikely(!list_empty(&n->poll_list))) {
4916                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4917                              n->dev ? n->dev->name : "backlog");
4918                 goto out_unlock;
4919         }
4920
4921         list_add_tail(&n->poll_list, repoll);
4922
4923 out_unlock:
4924         netpoll_poll_unlock(have);
4925
4926         return work;
4927 }
4928
4929 static void net_rx_action(struct softirq_action *h)
4930 {
4931         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4932         unsigned long time_limit = jiffies + 2;
4933         int budget = netdev_budget;
4934         LIST_HEAD(list);
4935         LIST_HEAD(repoll);
4936
4937         local_irq_disable();
4938         list_splice_init(&sd->poll_list, &list);
4939         local_irq_enable();
4940
4941         for (;;) {
4942                 struct napi_struct *n;
4943
4944                 if (list_empty(&list)) {
4945                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4946                                 return;
4947                         break;
4948                 }
4949
4950                 n = list_first_entry(&list, struct napi_struct, poll_list);
4951                 budget -= napi_poll(n, &repoll);
4952
4953                 /* If softirq window is exhausted then punt.
4954                  * Allow this to run for 2 jiffies, which allows
4955                  * an average latency of 1.5/HZ.
4956                  */
4957                 if (unlikely(budget <= 0 ||
4958                              time_after_eq(jiffies, time_limit))) {
4959                         sd->time_squeeze++;
4960                         break;
4961                 }
4962         }
4963
4964         local_irq_disable();
4965
4966         list_splice_tail_init(&sd->poll_list, &list);
4967         list_splice_tail(&repoll, &list);
4968         list_splice(&list, &sd->poll_list);
4969         if (!list_empty(&sd->poll_list))
4970                 __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
4971
4972         net_rps_action_and_irq_enable(sd);
4973 }
4974
4975 struct netdev_adjacent {
4976         struct net_device *dev;
4977
4978         /* upper master flag; there can only be one master device per list */
4979         bool master;
4980
4981         /* counter for the number of times this device was added to us */
4982         u16 ref_nr;
4983
4984         /* private field for the users */
4985         void *private;
4986
4987         struct list_head list;
4988         struct rcu_head rcu;
4989 };
4990
4991 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
4992                                                  struct list_head *adj_list)
4993 {
4994         struct netdev_adjacent *adj;
4995
4996         list_for_each_entry(adj, adj_list, list) {
4997                 if (adj->dev == adj_dev)
4998                         return adj;
4999         }
5000         return NULL;
5001 }
5002
5003 /**
5004  * netdev_has_upper_dev - Check if device is linked to an upper device
5005  * @dev: device
5006  * @upper_dev: upper device to check
5007  *
5008  * Find out if a device is linked to the specified upper device and return
5009  * true if it is. Note that this checks only the immediate upper device,
5010  * not the complete stack of devices. The caller must hold the RTNL lock.
5011  */
5012 bool netdev_has_upper_dev(struct net_device *dev,
5013                           struct net_device *upper_dev)
5014 {
5015         ASSERT_RTNL();
5016
5017         return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5018 }
5019 EXPORT_SYMBOL(netdev_has_upper_dev);
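/*
 * Editorial example (not part of the original file): querying the adjacency
 * lists maintained below.  netdev_has_upper_dev() checks only the immediate
 * upper device and requires the RTNL lock; the device pointers are placeholders.
 */
#if 0	/* illustration only */
static bool example_is_direct_port_of(struct net_device *port,
				      struct net_device *master)
{
	bool linked;

	rtnl_lock();
	linked = netdev_has_upper_dev(port, master);
	rtnl_unlock();
	return linked;
}
#endif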
5020
5021 /**
5022  * netdev_has_any_upper_dev - Check if device is linked to some device
5023  * @dev: device
5024  *
5025  * Find out if a device is linked to an upper device and return true if
5026  * it is. The caller must hold the RTNL lock.
5027  */
5028 static bool netdev_has_any_upper_dev(struct net_device *dev)
5029 {
5030         ASSERT_RTNL();
5031
5032         return !list_empty(&dev->all_adj_list.upper);
5033 }
5034
5035 /**
5036  * netdev_master_upper_dev_get - Get master upper device
5037  * @dev: device
5038  *
5039  * Find a master upper device and return pointer to it or NULL in case
5040  * it's not there. The caller must hold the RTNL lock.
5041  */
5042 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5043 {
5044         struct netdev_adjacent *upper;
5045
5046         ASSERT_RTNL();
5047
5048         if (list_empty(&dev->adj_list.upper))
5049                 return NULL;
5050
5051         upper = list_first_entry(&dev->adj_list.upper,
5052                                  struct netdev_adjacent, list);
5053         if (likely(upper->master))
5054                 return upper->dev;
5055         return NULL;
5056 }
5057 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5058
5059 void *netdev_adjacent_get_private(struct list_head *adj_list)
5060 {
5061         struct netdev_adjacent *adj;
5062
5063         adj = list_entry(adj_list, struct netdev_adjacent, list);
5064
5065         return adj->private;
5066 }
5067 EXPORT_SYMBOL(netdev_adjacent_get_private);
5068
5069 /**
5070  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5071  * @dev: device
5072  * @iter: list_head ** of the current position
5073  *
5074  * Gets the next device from the dev's upper list, starting from iter
5075  * position. The caller must hold RCU read lock.
5076  */
5077 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5078                                                  struct list_head **iter)
5079 {
5080         struct netdev_adjacent *upper;
5081
5082         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5083
5084         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5085
5086         if (&upper->list == &dev->adj_list.upper)
5087                 return NULL;
5088
5089         *iter = &upper->list;
5090
5091         return upper->dev;
5092 }
5093 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
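/*
 * Editorial example (not part of the original file): walking the immediate
 * upper devices with the iterator above under the RCU read lock.  The
 * netdev_for_each_upper_dev_rcu() helper in netdevice.h wraps this pattern.
 */
#if 0	/* illustration only */
static void example_walk_uppers(struct net_device *dev)
{
	struct net_device *upper;
	struct list_head *iter;

	rcu_read_lock();
	iter = &dev->adj_list.upper;	/* start from the list head */
	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
		pr_debug("%s has upper device %s\n", dev->name, upper->name);
	rcu_read_unlock();
}
#endif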
5094
5095 /**
5096  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5097  * @dev: device
5098  * @iter: list_head ** of the current position
5099  *
5100  * Gets the next device from the dev's upper list, starting from iter
5101  * position. The caller must hold RCU read lock.
5102  */
5103 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5104                                                      struct list_head **iter)
5105 {
5106         struct netdev_adjacent *upper;
5107
5108         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5109
5110         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5111
5112         if (&upper->list == &dev->all_adj_list.upper)
5113                 return NULL;
5114
5115         *iter = &upper->list;
5116
5117         return upper->dev;
5118 }
5119 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5120
5121 /**
5122  * netdev_lower_get_next_private - Get the next ->private from the
5123  *                                 lower neighbour list
5124  * @dev: device
5125  * @iter: list_head ** of the current position
5126  *
5127  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5128  * list, starting from iter position. The caller must hold either hold the
5129  * RTNL lock or its own locking that guarantees that the neighbour lower
5130  * list will remain unchanged.
5131  */
5132 void *netdev_lower_get_next_private(struct net_device *dev,
5133                                     struct list_head **iter)
5134 {
5135         struct netdev_adjacent *lower;
5136
5137         lower = list_entry(*iter, struct netdev_adjacent, list);
5138
5139         if (&lower->list == &dev->adj_list.lower)
5140                 return NULL;
5141
5142         *iter = lower->list.next;
5143
5144         return lower->private;
5145 }
5146 EXPORT_SYMBOL(netdev_lower_get_next_private);
5147
5148 /**
5149  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5150  *                                     lower neighbour list, RCU
5151  *                                     variant
5152  * @dev: device
5153  * @iter: list_head ** of the current position
5154  *
5155  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5156  * list, starting from iter position. The caller must hold RCU read lock.
5157  */
5158 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5159                                         struct list_head **iter)
5160 {
5161         struct netdev_adjacent *lower;
5162
5163         WARN_ON_ONCE(!rcu_read_lock_held());
5164
5165         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5166
5167         if (&lower->list == &dev->adj_list.lower)
5168                 return NULL;
5169
5170         *iter = &lower->list;
5171
5172         return lower->private;
5173 }
5174 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5175
5176 /**
5177  * netdev_lower_get_next - Get the next device from the lower neighbour
5178  *                         list
5179  * @dev: device
5180  * @iter: list_head ** of the current position
5181  *
5182  * Gets the next device from the dev's lower neighbour
5183  * list, starting from iter position. The caller must hold the RTNL lock or
5184  * its own locking that guarantees that the neighbour lower
5185  * list will remain unchanged.
5186  */
5187 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5188 {
5189         struct netdev_adjacent *lower;
5190
5191         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5192
5193         if (&lower->list == &dev->adj_list.lower)
5194                 return NULL;
5195
5196         *iter = &lower->list;
5197
5198         return lower->dev;
5199 }
5200 EXPORT_SYMBOL(netdev_lower_get_next);
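/*
 * Editorial example (not part of the original file): iterating the immediate
 * lower devices via netdev_for_each_lower_dev(), the netdevice.h wrapper around
 * netdev_lower_get_next() above (the same macro is used by dev_get_nest_level()
 * later in this file).  The caller must hold RTNL or otherwise keep the list stable.
 */
#if 0	/* illustration only */
static void example_walk_lowers(struct net_device *dev)
{
	struct net_device *lower;
	struct list_head *iter;

	ASSERT_RTNL();
	netdev_for_each_lower_dev(dev, lower, iter)
		pr_debug("%s has lower device %s\n", dev->name, lower->name);
}
#endif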
5201
5202 /**
5203  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5204  *                                     lower neighbour list, RCU
5205  *                                     variant
5206  * @dev: device
5207  *
5208  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5209  * list. The caller must hold RCU read lock.
5210  */
5211 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5212 {
5213         struct netdev_adjacent *lower;
5214
5215         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5216                         struct netdev_adjacent, list);
5217         if (lower)
5218                 return lower->private;
5219         return NULL;
5220 }
5221 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5222
5223 /**
5224  * netdev_master_upper_dev_get_rcu - Get master upper device
5225  * @dev: device
5226  *
5227  * Find a master upper device and return pointer to it or NULL in case
5228  * it's not there. The caller must hold the RCU read lock.
5229  */
5230 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5231 {
5232         struct netdev_adjacent *upper;
5233
5234         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5235                                        struct netdev_adjacent, list);
5236         if (upper && likely(upper->master))
5237                 return upper->dev;
5238         return NULL;
5239 }
5240 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5241
5242 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5243                               struct net_device *adj_dev,
5244                               struct list_head *dev_list)
5245 {
5246         char linkname[IFNAMSIZ+7];
5247         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5248                 "upper_%s" : "lower_%s", adj_dev->name);
5249         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5250                                  linkname);
5251 }
5252 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5253                                char *name,
5254                                struct list_head *dev_list)
5255 {
5256         char linkname[IFNAMSIZ+7];
5257         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5258                 "upper_%s" : "lower_%s", name);
5259         sysfs_remove_link(&(dev->dev.kobj), linkname);
5260 }
5261
5262 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5263                                                  struct net_device *adj_dev,
5264                                                  struct list_head *dev_list)
5265 {
5266         return (dev_list == &dev->adj_list.upper ||
5267                 dev_list == &dev->adj_list.lower) &&
5268                 net_eq(dev_net(dev), dev_net(adj_dev));
5269 }
5270
5271 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5272                                         struct net_device *adj_dev,
5273                                         struct list_head *dev_list,
5274                                         void *private, bool master)
5275 {
5276         struct netdev_adjacent *adj;
5277         int ret;
5278
5279         adj = __netdev_find_adj(adj_dev, dev_list);
5280
5281         if (adj) {
5282                 adj->ref_nr++;
5283                 return 0;
5284         }
5285
5286         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5287         if (!adj)
5288                 return -ENOMEM;
5289
5290         adj->dev = adj_dev;
5291         adj->master = master;
5292         adj->ref_nr = 1;
5293         adj->private = private;
5294         dev_hold(adj_dev);
5295
5296         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5297                  adj_dev->name, dev->name, adj_dev->name);
5298
5299         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5300                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5301                 if (ret)
5302                         goto free_adj;
5303         }
5304
5305         /* Ensure that master link is always the first item in list. */
5306         if (master) {
5307                 ret = sysfs_create_link(&(dev->dev.kobj),
5308                                         &(adj_dev->dev.kobj), "master");
5309                 if (ret)
5310                         goto remove_symlinks;
5311
5312                 list_add_rcu(&adj->list, dev_list);
5313         } else {
5314                 list_add_tail_rcu(&adj->list, dev_list);
5315         }
5316
5317         return 0;
5318
5319 remove_symlinks:
5320         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5321                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5322 free_adj:
5323         kfree(adj);
5324         dev_put(adj_dev);
5325
5326         return ret;
5327 }
5328
5329 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5330                                          struct net_device *adj_dev,
5331                                          struct list_head *dev_list)
5332 {
5333         struct netdev_adjacent *adj;
5334
5335         adj = __netdev_find_adj(adj_dev, dev_list);
5336
5337         if (!adj) {
5338                 pr_err("tried to remove device %s from %s\n",
5339                        dev->name, adj_dev->name);
5340                 BUG();
5341         }
5342
5343         if (adj->ref_nr > 1) {
5344                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5345                          adj->ref_nr-1);
5346                 adj->ref_nr--;
5347                 return;
5348         }
5349
5350         if (adj->master)
5351                 sysfs_remove_link(&(dev->dev.kobj), "master");
5352
5353         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5354                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5355
5356         list_del_rcu(&adj->list);
5357         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5358                  adj_dev->name, dev->name, adj_dev->name);
5359         dev_put(adj_dev);
5360         kfree_rcu(adj, rcu);
5361 }
5362
5363 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5364                                             struct net_device *upper_dev,
5365                                             struct list_head *up_list,
5366                                             struct list_head *down_list,
5367                                             void *private, bool master)
5368 {
5369         int ret;
5370
5371         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5372                                            master);
5373         if (ret)
5374                 return ret;
5375
5376         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5377                                            false);
5378         if (ret) {
5379                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5380                 return ret;
5381         }
5382
5383         return 0;
5384 }
5385
5386 static int __netdev_adjacent_dev_link(struct net_device *dev,
5387                                       struct net_device *upper_dev)
5388 {
5389         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5390                                                 &dev->all_adj_list.upper,
5391                                                 &upper_dev->all_adj_list.lower,
5392                                                 NULL, false);
5393 }
5394
5395 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5396                                                struct net_device *upper_dev,
5397                                                struct list_head *up_list,
5398                                                struct list_head *down_list)
5399 {
5400         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5401         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5402 }
5403
5404 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5405                                          struct net_device *upper_dev)
5406 {
5407         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5408                                            &dev->all_adj_list.upper,
5409                                            &upper_dev->all_adj_list.lower);
5410 }
5411
5412 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5413                                                 struct net_device *upper_dev,
5414                                                 void *private, bool master)
5415 {
5416         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5417
5418         if (ret)
5419                 return ret;
5420
5421         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5422                                                &dev->adj_list.upper,
5423                                                &upper_dev->adj_list.lower,
5424                                                private, master);
5425         if (ret) {
5426                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5427                 return ret;
5428         }
5429
5430         return 0;
5431 }
5432
5433 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5434                                                    struct net_device *upper_dev)
5435 {
5436         __netdev_adjacent_dev_unlink(dev, upper_dev);
5437         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5438                                            &dev->adj_list.upper,
5439                                            &upper_dev->adj_list.lower);
5440 }
5441
5442 static int __netdev_upper_dev_link(struct net_device *dev,
5443                                    struct net_device *upper_dev, bool master,
5444                                    void *private)
5445 {
5446         struct netdev_notifier_changeupper_info changeupper_info;
5447         struct netdev_adjacent *i, *j, *to_i, *to_j;
5448         int ret = 0;
5449
5450         ASSERT_RTNL();
5451
5452         if (dev == upper_dev)
5453                 return -EBUSY;
5454
5455         /* To prevent loops, check that dev is not already an upper device of upper_dev. */
5456         if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5457                 return -EBUSY;
5458
5459         if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5460                 return -EEXIST;
5461
5462         if (master && netdev_master_upper_dev_get(dev))
5463                 return -EBUSY;
5464
5465         changeupper_info.upper_dev = upper_dev;
5466         changeupper_info.master = master;
5467         changeupper_info.linking = true;
5468
5469         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5470                                             &changeupper_info.info);
5471         ret = notifier_to_errno(ret);
5472         if (ret)
5473                 return ret;
5474
5475         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5476                                                    master);
5477         if (ret)
5478                 return ret;
5479
5480         /* Now that we linked these devs, make all the upper_dev's
5481          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5482          * vice versa, and don't forget the devices themselves. All of these
5483          * links are non-neighbours.
5484          */
5485         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5486                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5487                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5488                                  i->dev->name, j->dev->name);
5489                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5490                         if (ret)
5491                                 goto rollback_mesh;
5492                 }
5493         }
5494
5495         /* add dev to every upper_dev's upper device */
5496         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5497                 pr_debug("linking %s's upper device %s with %s\n",
5498                          upper_dev->name, i->dev->name, dev->name);
5499                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5500                 if (ret)
5501                         goto rollback_upper_mesh;
5502         }
5503
5504         /* add upper_dev to every dev's lower device */
5505         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5506                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5507                          i->dev->name, upper_dev->name);
5508                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5509                 if (ret)
5510                         goto rollback_lower_mesh;
5511         }
5512
5513         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5514                                       &changeupper_info.info);
5515         return 0;
5516
5517 rollback_lower_mesh:
5518         to_i = i;
5519         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5520                 if (i == to_i)
5521                         break;
5522                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5523         }
5524
5525         i = NULL;
5526
5527 rollback_upper_mesh:
5528         to_i = i;
5529         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5530                 if (i == to_i)
5531                         break;
5532                 __netdev_adjacent_dev_unlink(dev, i->dev);
5533         }
5534
5535         i = j = NULL;
5536
5537 rollback_mesh:
5538         to_i = i;
5539         to_j = j;
5540         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5541                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5542                         if (i == to_i && j == to_j)
5543                                 break;
5544                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5545                 }
5546                 if (i == to_i)
5547                         break;
5548         }
5549
5550         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5551
5552         return ret;
5553 }
5554
5555 /**
5556  * netdev_upper_dev_link - Add a link to the upper device
5557  * @dev: device
5558  * @upper_dev: new upper device
5559  *
5560  * Adds a link to a device which is upper to this one. The caller must hold
5561  * the RTNL lock. On a failure a negative errno code is returned.
5562  * On success the reference counts are adjusted and the function
5563  * returns zero.
5564  */
5565 int netdev_upper_dev_link(struct net_device *dev,
5566                           struct net_device *upper_dev)
5567 {
5568         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5569 }
5570 EXPORT_SYMBOL(netdev_upper_dev_link);
5571
5572 /**
5573  * netdev_master_upper_dev_link - Add a master link to the upper device
5574  * @dev: device
5575  * @upper_dev: new upper device
5576  *
5577  * Adds a link to a device which is upper to this one. In this case, only
5578  * one master upper device can be linked, although other non-master devices
5579  * might be linked as well. The caller must hold the RTNL lock.
5580  * On a failure a negative errno code is returned. On success the reference
5581  * counts are adjusted and the function returns zero.
5582  */
5583 int netdev_master_upper_dev_link(struct net_device *dev,
5584                                  struct net_device *upper_dev)
5585 {
5586         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5587 }
5588 EXPORT_SYMBOL(netdev_master_upper_dev_link);
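/*
 * Editorial example (not part of the original file): how a bonding/bridge style
 * driver would enslave a port using the linking API above.  Only one master
 * upper device may exist per device, so -EBUSY and -EEXIST are normal failure
 * modes.  The device pointers and the "driver specific setup" step are placeholders.
 */
#if 0	/* illustration only */
static int example_enslave(struct net_device *master, struct net_device *port)
{
	int err;

	ASSERT_RTNL();					/* linking requires RTNL */
	err = netdev_master_upper_dev_link(port, master);
	if (err)
		return err;

	/* ... driver specific setup; on failure undo the link: */
	/* netdev_upper_dev_unlink(port, master); */
	return 0;
}
#endif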
5589
5590 int netdev_master_upper_dev_link_private(struct net_device *dev,
5591                                          struct net_device *upper_dev,
5592                                          void *private)
5593 {
5594         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5595 }
5596 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5597
5598 /**
5599  * netdev_upper_dev_unlink - Removes a link to upper device
5600  * @dev: device
5601  * @upper_dev: upper device to remove
5602  *
5603  * Removes a link to device which is upper to this one. The caller must hold
5604  * the RTNL lock.
5605  */
5606 void netdev_upper_dev_unlink(struct net_device *dev,
5607                              struct net_device *upper_dev)
5608 {
5609         struct netdev_notifier_changeupper_info changeupper_info;
5610         struct netdev_adjacent *i, *j;
5611         ASSERT_RTNL();
5612
5613         changeupper_info.upper_dev = upper_dev;
5614         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5615         changeupper_info.linking = false;
5616
5617         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5618                                       &changeupper_info.info);
5619
5620         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5621
5622         /* Here is the tricky part. We must remove all dev's lower
5623          * devices from all upper_dev's upper devices and vice
5624          * versa, to maintain the graph relationship.
5625          */
5626         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5627                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5628                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5629
5630         /* also remove the devices themselves from the lower/upper
5631          * device lists
5632          */
5633         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5634                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5635
5636         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5637                 __netdev_adjacent_dev_unlink(dev, i->dev);
5638
5639         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5640                                       &changeupper_info.info);
5641 }
5642 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5643
5644 /**
5645  * netdev_bonding_info_change - Dispatch event about slave change
5646  * @dev: device
5647  * @bonding_info: info to dispatch
5648  *
5649  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5650  * The caller must hold the RTNL lock.
5651  */
5652 void netdev_bonding_info_change(struct net_device *dev,
5653                                 struct netdev_bonding_info *bonding_info)
5654 {
5655         struct netdev_notifier_bonding_info     info;
5656
5657         memcpy(&info.bonding_info, bonding_info,
5658                sizeof(struct netdev_bonding_info));
5659         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5660                                       &info.info);
5661 }
5662 EXPORT_SYMBOL(netdev_bonding_info_change);
5663
5664 static void netdev_adjacent_add_links(struct net_device *dev)
5665 {
5666         struct netdev_adjacent *iter;
5667
5668         struct net *net = dev_net(dev);
5669
5670         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5671                 if (!net_eq(net, dev_net(iter->dev)))
5672                         continue;
5673                 netdev_adjacent_sysfs_add(iter->dev, dev,
5674                                           &iter->dev->adj_list.lower);
5675                 netdev_adjacent_sysfs_add(dev, iter->dev,
5676                                           &dev->adj_list.upper);
5677         }
5678
5679         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5680                 if (!net_eq(net, dev_net(iter->dev)))
5681                         continue;
5682                 netdev_adjacent_sysfs_add(iter->dev, dev,
5683                                           &iter->dev->adj_list.upper);
5684                 netdev_adjacent_sysfs_add(dev, iter->dev,
5685                                           &dev->adj_list.lower);
5686         }
5687 }
5688
5689 static void netdev_adjacent_del_links(struct net_device *dev)
5690 {
5691         struct netdev_adjacent *iter;
5692
5693         struct net *net = dev_net(dev);
5694
5695         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5696                 if (!net_eq(net, dev_net(iter->dev)))
5697                         continue;
5698                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5699                                           &iter->dev->adj_list.lower);
5700                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5701                                           &dev->adj_list.upper);
5702         }
5703
5704         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5705                 if (!net_eq(net, dev_net(iter->dev)))
5706                         continue;
5707                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5708                                           &iter->dev->adj_list.upper);
5709                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5710                                           &dev->adj_list.lower);
5711         }
5712 }
5713
5714 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5715 {
5716         struct netdev_adjacent *iter;
5717
5718         struct net *net = dev_net(dev);
5719
5720         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5721                 if (!net_eq(net, dev_net(iter->dev)))
5722                         continue;
5723                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5724                                           &iter->dev->adj_list.lower);
5725                 netdev_adjacent_sysfs_add(iter->dev, dev,
5726                                           &iter->dev->adj_list.lower);
5727         }
5728
5729         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5730                 if (!net_eq(net, dev_net(iter->dev)))
5731                         continue;
5732                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5733                                           &iter->dev->adj_list.upper);
5734                 netdev_adjacent_sysfs_add(iter->dev, dev,
5735                                           &iter->dev->adj_list.upper);
5736         }
5737 }
5738
5739 void *netdev_lower_dev_get_private(struct net_device *dev,
5740                                    struct net_device *lower_dev)
5741 {
5742         struct netdev_adjacent *lower;
5743
5744         if (!lower_dev)
5745                 return NULL;
5746         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5747         if (!lower)
5748                 return NULL;
5749
5750         return lower->private;
5751 }
5752 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5753
5754
5755 int dev_get_nest_level(struct net_device *dev,
5756                        bool (*type_check)(struct net_device *dev))
5757 {
5758         struct net_device *lower = NULL;
5759         struct list_head *iter;
5760         int max_nest = -1;
5761         int nest;
5762
5763         ASSERT_RTNL();
5764
5765         netdev_for_each_lower_dev(dev, lower, iter) {
5766                 nest = dev_get_nest_level(lower, type_check);
5767                 if (max_nest < nest)
5768                         max_nest = nest;
5769         }
5770
5771         if (type_check(dev))
5772                 max_nest++;
5773
5774         return max_nest;
5775 }
5776 EXPORT_SYMBOL(dev_get_nest_level);
5777
5778 static void dev_change_rx_flags(struct net_device *dev, int flags)
5779 {
5780         const struct net_device_ops *ops = dev->netdev_ops;
5781
5782         if (ops->ndo_change_rx_flags)
5783                 ops->ndo_change_rx_flags(dev, flags);
5784 }
5785
5786 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5787 {
5788         unsigned int old_flags = dev->flags;
5789         kuid_t uid;
5790         kgid_t gid;
5791
5792         ASSERT_RTNL();
5793
5794         dev->flags |= IFF_PROMISC;
5795         dev->promiscuity += inc;
5796         if (dev->promiscuity == 0) {
5797                 /*
5798                  * Avoid overflow.
5799                  * If inc causes overflow, leave promiscuity untouched and return an error.
5800                  */
5801                 if (inc < 0)
5802                         dev->flags &= ~IFF_PROMISC;
5803                 else {
5804                         dev->promiscuity -= inc;
5805                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5806                                 dev->name);
5807                         return -EOVERFLOW;
5808                 }
5809         }
5810         if (dev->flags != old_flags) {
5811                 pr_info("device %s %s promiscuous mode\n",
5812                         dev->name,
5813                         dev->flags & IFF_PROMISC ? "entered" : "left");
5814                 if (audit_enabled) {
5815                         current_uid_gid(&uid, &gid);
5816                         audit_log(current->audit_context, GFP_ATOMIC,
5817                                 AUDIT_ANOM_PROMISCUOUS,
5818                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5819                                 dev->name, (dev->flags & IFF_PROMISC),
5820                                 (old_flags & IFF_PROMISC),
5821                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5822                                 from_kuid(&init_user_ns, uid),
5823                                 from_kgid(&init_user_ns, gid),
5824                                 audit_get_sessionid(current));
5825                 }
5826
5827                 dev_change_rx_flags(dev, IFF_PROMISC);
5828         }
5829         if (notify)
5830                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5831         return 0;
5832 }
5833
5834 /**
5835  *      dev_set_promiscuity     - update promiscuity count on a device
5836  *      @dev: device
5837  *      @inc: modifier
5838  *
5839  *      Add or remove promiscuity from a device. While the count in the device
5840  *      remains above zero the interface remains promiscuous. Once it hits zero
5841  *      the device reverts to normal filtering operation. A negative inc
5842  *      value is used to drop promiscuity on the device.
5843  *      Return 0 if successful or a negative errno code on error.
5844  */
5845 int dev_set_promiscuity(struct net_device *dev, int inc)
5846 {
5847         unsigned int old_flags = dev->flags;
5848         int err;
5849
5850         err = __dev_set_promiscuity(dev, inc, true);
5851         if (err < 0)
5852                 return err;
5853         if (dev->flags != old_flags)
5854                 dev_set_rx_mode(dev);
5855         return err;
5856 }
5857 EXPORT_SYMBOL(dev_set_promiscuity);
5858
5859 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5860 {
5861         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5862
5863         ASSERT_RTNL();
5864
5865         dev->flags |= IFF_ALLMULTI;
5866         dev->allmulti += inc;
5867         if (dev->allmulti == 0) {
5868                 /*
5869                  * Avoid overflow.
5870                  * If inc causes overflow, leave allmulti untouched and return an error.
5871                  */
5872                 if (inc < 0)
5873                         dev->flags &= ~IFF_ALLMULTI;
5874                 else {
5875                         dev->allmulti -= inc;
5876                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5877                                 dev->name);
5878                         return -EOVERFLOW;
5879                 }
5880         }
5881         if (dev->flags ^ old_flags) {
5882                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5883                 dev_set_rx_mode(dev);
5884                 if (notify)
5885                         __dev_notify_flags(dev, old_flags,
5886                                            dev->gflags ^ old_gflags);
5887         }
5888         return 0;
5889 }
5890
5891 /**
5892  *      dev_set_allmulti        - update allmulti count on a device
5893  *      @dev: device
5894  *      @inc: modifier
5895  *
5896  *      Add or remove reception of all multicast frames to a device. While the
5897  *      count in the device remains above zero the interface keeps receiving
5898  *      all multicast frames. Once it hits zero the device reverts to normal
5899  *      filtering operation. A negative @inc value is used to drop the counter
5900  *      when releasing a resource needing all multicasts.
5901  *      Return 0 if successful or a negative errno code on error.
5902  */
5903
5904 int dev_set_allmulti(struct net_device *dev, int inc)
5905 {
5906         return __dev_set_allmulti(dev, inc, true);
5907 }
5908 EXPORT_SYMBOL(dev_set_allmulti);
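/*
 * Editorial example (not part of the original file): the reference-counted
 * usage pattern for dev_set_promiscuity()/dev_set_allmulti() above.  Every +1
 * must eventually be balanced by a -1, and both calls require the RTNL lock.
 */
#if 0	/* illustration only */
static int example_enable_capture(struct net_device *dev)
{
	int err;

	ASSERT_RTNL();
	err = dev_set_promiscuity(dev, 1);	/* receive all frames */
	if (err < 0)
		return err;

	err = dev_set_allmulti(dev, 1);		/* and all multicast frames */
	if (err < 0)
		dev_set_promiscuity(dev, -1);	/* roll back on failure */
	return err;
}
#endif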
5909
5910 /*
5911  *      Upload unicast and multicast address lists to device and
5912  *      configure RX filtering. When the device doesn't support unicast
5913  *      filtering it is put in promiscuous mode while unicast addresses
5914  *      are present.
5915  */
5916 void __dev_set_rx_mode(struct net_device *dev)
5917 {
5918         const struct net_device_ops *ops = dev->netdev_ops;
5919
5920         /* dev_open will call this function so the list will stay sane. */
5921         if (!(dev->flags&IFF_UP))
5922                 return;
5923
5924         if (!netif_device_present(dev))
5925                 return;
5926
5927         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5928                 /* Unicast address changes may only happen under the rtnl,
5929                  * therefore calling __dev_set_promiscuity here is safe.
5930                  */
5931                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5932                         __dev_set_promiscuity(dev, 1, false);
5933                         dev->uc_promisc = true;
5934                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5935                         __dev_set_promiscuity(dev, -1, false);
5936                         dev->uc_promisc = false;
5937                 }
5938         }
5939
5940         if (ops->ndo_set_rx_mode)
5941                 ops->ndo_set_rx_mode(dev);
5942 }
5943
5944 void dev_set_rx_mode(struct net_device *dev)
5945 {
5946         netif_addr_lock_bh(dev);
5947         __dev_set_rx_mode(dev);
5948         netif_addr_unlock_bh(dev);
5949 }
5950
5951 /**
5952  *      dev_get_flags - get flags reported to userspace
5953  *      @dev: device
5954  *
5955  *      Get the combination of flag bits exported through APIs to userspace.
5956  */
5957 unsigned int dev_get_flags(const struct net_device *dev)
5958 {
5959         unsigned int flags;
5960
5961         flags = (dev->flags & ~(IFF_PROMISC |
5962                                 IFF_ALLMULTI |
5963                                 IFF_RUNNING |
5964                                 IFF_LOWER_UP |
5965                                 IFF_DORMANT)) |
5966                 (dev->gflags & (IFF_PROMISC |
5967                                 IFF_ALLMULTI));
5968
5969         if (netif_running(dev)) {
5970                 if (netif_oper_up(dev))
5971                         flags |= IFF_RUNNING;
5972                 if (netif_carrier_ok(dev))
5973                         flags |= IFF_LOWER_UP;
5974                 if (netif_dormant(dev))
5975                         flags |= IFF_DORMANT;
5976         }
5977
5978         return flags;
5979 }
5980 EXPORT_SYMBOL(dev_get_flags);
5981
5982 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5983 {
5984         unsigned int old_flags = dev->flags;
5985         int ret;
5986
5987         ASSERT_RTNL();
5988
5989         /*
5990          *      Set the flags on our device.
5991          */
5992
5993         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5994                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5995                                IFF_AUTOMEDIA)) |
5996                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5997                                     IFF_ALLMULTI));
5998
5999         /*
6000          *      Load in the correct multicast list now that the flags have changed.
6001          */
6002
6003         if ((old_flags ^ flags) & IFF_MULTICAST)
6004                 dev_change_rx_flags(dev, IFF_MULTICAST);
6005
6006         dev_set_rx_mode(dev);
6007
6008         /*
6009          *      Have we downed the interface? We handle IFF_UP ourselves
6010          *      according to user attempts to set it, rather than blindly
6011          *      setting it.
6012          */
6013
6014         ret = 0;
6015         if ((old_flags ^ flags) & IFF_UP)
6016                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6017
6018         if ((flags ^ dev->gflags) & IFF_PROMISC) {
6019                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6020                 unsigned int old_flags = dev->flags;
6021
6022                 dev->gflags ^= IFF_PROMISC;
6023
6024                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6025                         if (dev->flags != old_flags)
6026                                 dev_set_rx_mode(dev);
6027         }
6028
6029         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6030            is important. Some (broken) drivers set IFF_PROMISC when
6031            IFF_ALLMULTI is requested, without asking us and without reporting it.
6032          */
6033         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6034                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6035
6036                 dev->gflags ^= IFF_ALLMULTI;
6037                 __dev_set_allmulti(dev, inc, false);
6038         }
6039
6040         return ret;
6041 }
6042
6043 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6044                         unsigned int gchanges)
6045 {
6046         unsigned int changes = dev->flags ^ old_flags;
6047
6048         if (gchanges)
6049                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6050
6051         if (changes & IFF_UP) {
6052                 if (dev->flags & IFF_UP)
6053                         call_netdevice_notifiers(NETDEV_UP, dev);
6054                 else
6055                         call_netdevice_notifiers(NETDEV_DOWN, dev);
6056         }
6057
6058         if (dev->flags & IFF_UP &&
6059             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6060                 struct netdev_notifier_change_info change_info;
6061
6062                 change_info.flags_changed = changes;
6063                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6064                                               &change_info.info);
6065         }
6066 }
6067
6068 /**
6069  *      dev_change_flags - change device settings
6070  *      @dev: device
6071  *      @flags: device state flags
6072  *
6073  *      Change settings on the device based on the given state flags. The flags are
6074  *      in the userspace exported format.
6075  */
6076 int dev_change_flags(struct net_device *dev, unsigned int flags)
6077 {
6078         int ret;
6079         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6080
6081         ret = __dev_change_flags(dev, flags);
6082         if (ret < 0)
6083                 return ret;
6084
6085         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6086         __dev_notify_flags(dev, old_flags, changes);
6087         return ret;
6088 }
6089 EXPORT_SYMBOL(dev_change_flags);
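/*
 * Editorial example (not part of the original file): bringing an interface up
 * the way the SIOCSIFFLAGS path does, by round-tripping the userspace-visible
 * flags through dev_get_flags() and dev_change_flags() above, under RTNL.
 */
#if 0	/* illustration only */
static int example_set_if_up(struct net_device *dev)
{
	unsigned int flags;

	ASSERT_RTNL();
	flags = dev_get_flags(dev) | IFF_UP;
	return dev_change_flags(dev, flags);	/* triggers __dev_open() if it was down */
}
#endif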
6090
6091 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6092 {
6093         const struct net_device_ops *ops = dev->netdev_ops;
6094
6095         if (ops->ndo_change_mtu)
6096                 return ops->ndo_change_mtu(dev, new_mtu);
6097
6098         dev->mtu = new_mtu;
6099         return 0;
6100 }
6101
6102 /**
6103  *      dev_set_mtu - Change maximum transfer unit
6104  *      @dev: device
6105  *      @new_mtu: new transfer unit
6106  *
6107  *      Change the maximum transfer size of the network device.
6108  */
6109 int dev_set_mtu(struct net_device *dev, int new_mtu)
6110 {
6111         int err, orig_mtu;
6112
6113         if (new_mtu == dev->mtu)
6114                 return 0;
6115
6116         /*      MTU must not be negative.       */
6117         if (new_mtu < 0)
6118                 return -EINVAL;
6119
6120         if (!netif_device_present(dev))
6121                 return -ENODEV;
6122
6123         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6124         err = notifier_to_errno(err);
6125         if (err)
6126                 return err;
6127
6128         orig_mtu = dev->mtu;
6129         err = __dev_set_mtu(dev, new_mtu);
6130
6131         if (!err) {
6132                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6133                 err = notifier_to_errno(err);
6134                 if (err) {
6135                         /* set the MTU back and notify everyone again,
6136                          * so that they have a chance to revert the change.
6137                          */
6138                         __dev_set_mtu(dev, orig_mtu);
6139                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6140                 }
6141         }
6142         return err;
6143 }
6144 EXPORT_SYMBOL(dev_set_mtu);
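
/* Example (a minimal sketch): the usual calling pattern, assuming the caller
 * holds a reference on a hypothetical "dev".  The value 9000 is just an
 * illustrative jumbo-frame MTU.
 *
 *      rtnl_lock();
 *      err = dev_set_mtu(dev, 9000);
 *      rtnl_unlock();
 *
 * On a notifier veto the helper itself restores the original MTU, so the
 * caller only needs to check the return value.
 */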
6145
6146 /**
6147  *      dev_set_group - Change group this device belongs to
6148  *      @dev: device
6149  *      @new_group: group this device should belong to
6150  */
6151 void dev_set_group(struct net_device *dev, int new_group)
6152 {
6153         dev->group = new_group;
6154 }
6155 EXPORT_SYMBOL(dev_set_group);
6156
6157 /**
6158  *      dev_set_mac_address - Change Media Access Control Address
6159  *      @dev: device
6160  *      @sa: new address
6161  *
6162  *      Change the hardware (MAC) address of the device
6163  */
6164 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6165 {
6166         const struct net_device_ops *ops = dev->netdev_ops;
6167         int err;
6168
6169         if (!ops->ndo_set_mac_address)
6170                 return -EOPNOTSUPP;
6171         if (sa->sa_family != dev->type)
6172                 return -EINVAL;
6173         if (!netif_device_present(dev))
6174                 return -ENODEV;
6175         err = ops->ndo_set_mac_address(dev, sa);
6176         if (err)
6177                 return err;
6178         dev->addr_assign_type = NET_ADDR_SET;
6179         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6180         add_device_randomness(dev->dev_addr, dev->addr_len);
6181         return 0;
6182 }
6183 EXPORT_SYMBOL(dev_set_mac_address);
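
/* Example (a minimal sketch): setting a new Ethernet MAC address.  The
 * address is passed as a struct sockaddr whose family must match dev->type;
 * RTNL must be held.  The address bytes below are purely illustrative.
 *
 *      struct sockaddr sa;
 *      static const u8 new_mac[ETH_ALEN] = {
 *              0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
 *
 *      sa.sa_family = dev->type;
 *      memcpy(sa.sa_data, new_mac, ETH_ALEN);
 *      err = dev_set_mac_address(dev, &sa);
 */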
6184
6185 /**
6186  *      dev_change_carrier - Change device carrier
6187  *      @dev: device
6188  *      @new_carrier: new value
6189  *
6190  *      Change device carrier
6191  */
6192 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6193 {
6194         const struct net_device_ops *ops = dev->netdev_ops;
6195
6196         if (!ops->ndo_change_carrier)
6197                 return -EOPNOTSUPP;
6198         if (!netif_device_present(dev))
6199                 return -ENODEV;
6200         return ops->ndo_change_carrier(dev, new_carrier);
6201 }
6202 EXPORT_SYMBOL(dev_change_carrier);
6203
6204 /**
6205  *      dev_get_phys_port_id - Get device physical port ID
6206  *      @dev: device
6207  *      @ppid: port ID
6208  *
6209  *      Get device physical port ID
6210  */
6211 int dev_get_phys_port_id(struct net_device *dev,
6212                          struct netdev_phys_item_id *ppid)
6213 {
6214         const struct net_device_ops *ops = dev->netdev_ops;
6215
6216         if (!ops->ndo_get_phys_port_id)
6217                 return -EOPNOTSUPP;
6218         return ops->ndo_get_phys_port_id(dev, ppid);
6219 }
6220 EXPORT_SYMBOL(dev_get_phys_port_id);
6221
6222 /**
6223  *      dev_get_phys_port_name - Get device physical port name
6224  *      @dev: device
6225  *      @name: port name
 *      @len: limit of bytes to copy to name
6226  *
6227  *      Get device physical port name
6228  */
6229 int dev_get_phys_port_name(struct net_device *dev,
6230                            char *name, size_t len)
6231 {
6232         const struct net_device_ops *ops = dev->netdev_ops;
6233
6234         if (!ops->ndo_get_phys_port_name)
6235                 return -EOPNOTSUPP;
6236         return ops->ndo_get_phys_port_name(dev, name, len);
6237 }
6238 EXPORT_SYMBOL(dev_get_phys_port_name);
6239
6240 /**
6241  *      dev_change_proto_down - update protocol port state information
6242  *      @dev: device
6243  *      @proto_down: new value
6244  *
6245  *      This info can be used by switch drivers to set the phys state of the
6246  *      port.
6247  */
6248 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6249 {
6250         const struct net_device_ops *ops = dev->netdev_ops;
6251
6252         if (!ops->ndo_change_proto_down)
6253                 return -EOPNOTSUPP;
6254         if (!netif_device_present(dev))
6255                 return -ENODEV;
6256         return ops->ndo_change_proto_down(dev, proto_down);
6257 }
6258 EXPORT_SYMBOL(dev_change_proto_down);
6259
6260 /**
6261  *      dev_new_index   -       allocate an ifindex
6262  *      @net: the applicable net namespace
6263  *
6264  *      Returns a suitable unique value for a new device interface
6265  *      number.  The caller must hold the rtnl semaphore or the
6266  *      dev_base_lock to be sure it remains unique.
6267  */
6268 static int dev_new_index(struct net *net)
6269 {
6270         int ifindex = net->ifindex;
6271         for (;;) {
6272                 if (++ifindex <= 0)
6273                         ifindex = 1;
6274                 if (!__dev_get_by_index(net, ifindex))
6275                         return net->ifindex = ifindex;
6276         }
6277 }
6278
6279 /* Delayed registration/unregistration */
6280 static LIST_HEAD(net_todo_list);
6281 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6282
6283 static void net_set_todo(struct net_device *dev)
6284 {
6285         list_add_tail(&dev->todo_list, &net_todo_list);
6286         dev_net(dev)->dev_unreg_count++;
6287 }
6288
6289 static void rollback_registered_many(struct list_head *head)
6290 {
6291         struct net_device *dev, *tmp;
6292         LIST_HEAD(close_head);
6293
6294         BUG_ON(dev_boot_phase);
6295         ASSERT_RTNL();
6296
6297         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6298                 /* Some devices call this without ever having been
6299                  * registered, as part of initialization unwind.
6300                  * Remove those devices and proceed with the rest.
6301                  */
6302                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6303                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6304                                  dev->name, dev);
6305
6306                         WARN_ON(1);
6307                         list_del(&dev->unreg_list);
6308                         continue;
6309                 }
6310                 dev->dismantle = true;
6311                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6312         }
6313
6314         /* If device is running, close it first. */
6315         list_for_each_entry(dev, head, unreg_list)
6316                 list_add_tail(&dev->close_list, &close_head);
6317         dev_close_many(&close_head, true);
6318
6319         list_for_each_entry(dev, head, unreg_list) {
6320                 /* And unlink it from device chain. */
6321                 unlist_netdevice(dev);
6322
6323                 dev->reg_state = NETREG_UNREGISTERING;
6324                 on_each_cpu(flush_backlog, dev, 1);
6325         }
6326
6327         synchronize_net();
6328
6329         list_for_each_entry(dev, head, unreg_list) {
6330                 struct sk_buff *skb = NULL;
6331
6332                 /* Shutdown queueing discipline. */
6333                 dev_shutdown(dev);
6334
6335
6336                 /* Notify protocols that we are about to destroy
6337                    this device. They should clean up all their state.
6338                 */
6339                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6340
6341                 if (!dev->rtnl_link_ops ||
6342                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6343                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6344                                                      GFP_KERNEL);
6345
6346                 /*
6347                  *      Flush the unicast and multicast chains
6348                  */
6349                 dev_uc_flush(dev);
6350                 dev_mc_flush(dev);
6351
6352                 if (dev->netdev_ops->ndo_uninit)
6353                         dev->netdev_ops->ndo_uninit(dev);
6354
6355                 if (skb)
6356                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6357
6358                 /* Notifier chain MUST detach us from all upper devices. */
6359                 WARN_ON(netdev_has_any_upper_dev(dev));
6360
6361                 /* Remove entries from kobject tree */
6362                 netdev_unregister_kobject(dev);
6363 #ifdef CONFIG_XPS
6364                 /* Remove XPS queueing entries */
6365                 netif_reset_xps_queues_gt(dev, 0);
6366 #endif
6367         }
6368
6369         synchronize_net();
6370
6371         list_for_each_entry(dev, head, unreg_list)
6372                 dev_put(dev);
6373 }
6374
6375 static void rollback_registered(struct net_device *dev)
6376 {
6377         LIST_HEAD(single);
6378
6379         list_add(&dev->unreg_list, &single);
6380         rollback_registered_many(&single);
6381         list_del(&single);
6382 }
6383
6384 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6385         struct net_device *upper, netdev_features_t features)
6386 {
6387         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6388         netdev_features_t feature;
6389         int feature_bit;
6390
6391         for_each_netdev_feature(&upper_disables, feature_bit) {
6392                 feature = __NETIF_F_BIT(feature_bit);
6393                 if (!(upper->wanted_features & feature)
6394                     && (features & feature)) {
6395                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6396                                    &feature, upper->name);
6397                         features &= ~feature;
6398                 }
6399         }
6400
6401         return features;
6402 }
6403
6404 static void netdev_sync_lower_features(struct net_device *upper,
6405         struct net_device *lower, netdev_features_t features)
6406 {
6407         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6408         netdev_features_t feature;
6409         int feature_bit;
6410
6411         for_each_netdev_feature(&upper_disables, feature_bit) {
6412                 feature = __NETIF_F_BIT(feature_bit);
6413                 if (!(features & feature) && (lower->features & feature)) {
6414                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6415                                    &feature, lower->name);
6416                         lower->wanted_features &= ~feature;
6417                         netdev_update_features(lower);
6418
6419                         if (unlikely(lower->features & feature))
6420                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6421                                             &feature, lower->name);
6422                 }
6423         }
6424 }
6425
6426 static netdev_features_t netdev_fix_features(struct net_device *dev,
6427         netdev_features_t features)
6428 {
6429         /* Fix illegal checksum combinations */
6430         if ((features & NETIF_F_HW_CSUM) &&
6431             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6432                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6433                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6434         }
6435
6436         /* TSO requires that SG is present as well. */
6437         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6438                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6439                 features &= ~NETIF_F_ALL_TSO;
6440         }
6441
6442         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6443                                         !(features & NETIF_F_IP_CSUM)) {
6444                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6445                 features &= ~NETIF_F_TSO;
6446                 features &= ~NETIF_F_TSO_ECN;
6447         }
6448
6449         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6450                                          !(features & NETIF_F_IPV6_CSUM)) {
6451                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6452                 features &= ~NETIF_F_TSO6;
6453         }
6454
6455         /* TSO ECN requires that TSO is present as well. */
6456         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6457                 features &= ~NETIF_F_TSO_ECN;
6458
6459         /* Software GSO depends on SG. */
6460         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6461                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6462                 features &= ~NETIF_F_GSO;
6463         }
6464
6465         /* UFO needs SG and checksumming */
6466         if (features & NETIF_F_UFO) {
6467                 /* maybe split UFO into V4 and V6? */
6468                 if (!((features & NETIF_F_GEN_CSUM) ||
6469                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6470                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6471                         netdev_dbg(dev,
6472                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6473                         features &= ~NETIF_F_UFO;
6474                 }
6475
6476                 if (!(features & NETIF_F_SG)) {
6477                         netdev_dbg(dev,
6478                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6479                         features &= ~NETIF_F_UFO;
6480                 }
6481         }
6482
6483 #ifdef CONFIG_NET_RX_BUSY_POLL
6484         if (dev->netdev_ops->ndo_busy_poll)
6485                 features |= NETIF_F_BUSY_POLL;
6486         else
6487 #endif
6488                 features &= ~NETIF_F_BUSY_POLL;
6489
6490         return features;
6491 }
6492
6493 int __netdev_update_features(struct net_device *dev)
6494 {
6495         struct net_device *upper, *lower;
6496         netdev_features_t features;
6497         struct list_head *iter;
6498         int err = -1;
6499
6500         ASSERT_RTNL();
6501
6502         features = netdev_get_wanted_features(dev);
6503
6504         if (dev->netdev_ops->ndo_fix_features)
6505                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6506
6507         /* driver might be less strict about feature dependencies */
6508         features = netdev_fix_features(dev, features);
6509
6510         /* some features can't be enabled if they're off on an upper device */
6511         netdev_for_each_upper_dev_rcu(dev, upper, iter)
6512                 features = netdev_sync_upper_features(dev, upper, features);
6513
6514         if (dev->features == features)
6515                 goto sync_lower;
6516
6517         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6518                 &dev->features, &features);
6519
6520         if (dev->netdev_ops->ndo_set_features)
6521                 err = dev->netdev_ops->ndo_set_features(dev, features);
6522         else
6523                 err = 0;
6524
6525         if (unlikely(err < 0)) {
6526                 netdev_err(dev,
6527                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6528                         err, &features, &dev->features);
6529                 /* return non-0 since some features might have changed and
6530                  * it's better to fire a spurious notification than miss it
6531                  */
6532                 return -1;
6533         }
6534
6535 sync_lower:
6536         /* some features must be disabled on lower devices when disabled
6537          * on an upper device (think: bonding master or bridge)
6538          */
6539         netdev_for_each_lower_dev(dev, lower, iter)
6540                 netdev_sync_lower_features(dev, lower, features);
6541
6542         if (!err)
6543                 dev->features = features;
6544
6545         return err < 0 ? 0 : 1;
6546 }
6547
6548 /**
6549  *      netdev_update_features - recalculate device features
6550  *      @dev: the device to check
6551  *
6552  *      Recalculate the dev->features set and send notifications if it
6553  *      has changed. Should be called after driver- or hardware-dependent
6554  *      conditions that influence the features might have changed.
6555  */
6556 void netdev_update_features(struct net_device *dev)
6557 {
6558         if (__netdev_update_features(dev))
6559                 netdev_features_change(dev);
6560 }
6561 EXPORT_SYMBOL(netdev_update_features);
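
/* Example (a minimal sketch): a driver would call this under RTNL after a
 * hardware-dependent condition changes, e.g. when its ndo_fix_features()
 * callback would now report a different feature set:
 *
 *      rtnl_lock();
 *      netdev_update_features(dev);
 *      rtnl_unlock();
 *
 * __netdev_update_features() re-runs the fix-ups above, and a
 * NETDEV_FEAT_CHANGE notification is sent only if dev->features changed.
 */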
6562
6563 /**
6564  *      netdev_change_features - recalculate device features
6565  *      @dev: the device to check
6566  *
6567  *      Recalculate the dev->features set and send notifications even
6568  *      if they have not changed. Should be called instead of
6569  *      netdev_update_features() if dev->vlan_features might also have
6570  *      changed, so that the changes can be propagated to stacked VLAN
6571  *      devices.
6572  */
6573 void netdev_change_features(struct net_device *dev)
6574 {
6575         __netdev_update_features(dev);
6576         netdev_features_change(dev);
6577 }
6578 EXPORT_SYMBOL(netdev_change_features);
6579
6580 /**
6581  *      netif_stacked_transfer_operstate -      transfer operstate
6582  *      @rootdev: the root or lower level device to transfer state from
6583  *      @dev: the device to transfer operstate to
6584  *
6585  *      Transfer operational state from root to device. This is normally
6586  *      called when a stacking relationship exists between the root
6587  *      device and the device (a leaf device).
6588  */
6589 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6590                                         struct net_device *dev)
6591 {
6592         if (rootdev->operstate == IF_OPER_DORMANT)
6593                 netif_dormant_on(dev);
6594         else
6595                 netif_dormant_off(dev);
6596
6597         if (netif_carrier_ok(rootdev)) {
6598                 if (!netif_carrier_ok(dev))
6599                         netif_carrier_on(dev);
6600         } else {
6601                 if (netif_carrier_ok(dev))
6602                         netif_carrier_off(dev);
6603         }
6604 }
6605 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6606
6607 #ifdef CONFIG_SYSFS
6608 static int netif_alloc_rx_queues(struct net_device *dev)
6609 {
6610         unsigned int i, count = dev->num_rx_queues;
6611         struct netdev_rx_queue *rx;
6612         size_t sz = count * sizeof(*rx);
6613
6614         BUG_ON(count < 1);
6615
6616         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6617         if (!rx) {
6618                 rx = vzalloc(sz);
6619                 if (!rx)
6620                         return -ENOMEM;
6621         }
6622         dev->_rx = rx;
6623
6624         for (i = 0; i < count; i++)
6625                 rx[i].dev = dev;
6626         return 0;
6627 }
6628 #endif
6629
6630 static void netdev_init_one_queue(struct net_device *dev,
6631                                   struct netdev_queue *queue, void *_unused)
6632 {
6633         /* Initialize queue lock */
6634         spin_lock_init(&queue->_xmit_lock);
6635         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6636         queue->xmit_lock_owner = -1;
6637         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6638         queue->dev = dev;
6639 #ifdef CONFIG_BQL
6640         dql_init(&queue->dql, HZ);
6641 #endif
6642 }
6643
6644 static void netif_free_tx_queues(struct net_device *dev)
6645 {
6646         kvfree(dev->_tx);
6647 }
6648
6649 static int netif_alloc_netdev_queues(struct net_device *dev)
6650 {
6651         unsigned int count = dev->num_tx_queues;
6652         struct netdev_queue *tx;
6653         size_t sz = count * sizeof(*tx);
6654
6655         if (count < 1 || count > 0xffff)
6656                 return -EINVAL;
6657
6658         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6659         if (!tx) {
6660                 tx = vzalloc(sz);
6661                 if (!tx)
6662                         return -ENOMEM;
6663         }
6664         dev->_tx = tx;
6665
6666         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6667         spin_lock_init(&dev->tx_global_lock);
6668
6669         return 0;
6670 }
6671
6672 void netif_tx_stop_all_queues(struct net_device *dev)
6673 {
6674         unsigned int i;
6675
6676         for (i = 0; i < dev->num_tx_queues; i++) {
6677                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6678                 netif_tx_stop_queue(txq);
6679         }
6680 }
6681 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6682
6683 /**
6684  *      register_netdevice      - register a network device
6685  *      @dev: device to register
6686  *
6687  *      Take a completed network device structure and add it to the kernel
6688  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6689  *      chain. 0 is returned on success. A negative errno code is returned
6690  *      on a failure to set up the device, or if the name is a duplicate.
6691  *
6692  *      Callers must hold the rtnl semaphore. You may want
6693  *      register_netdev() instead of this.
6694  *
6695  *      BUGS:
6696  *      The locking appears insufficient to guarantee two parallel registers
6697  *      will not get the same name.
6698  */
6699
6700 int register_netdevice(struct net_device *dev)
6701 {
6702         int ret;
6703         struct net *net = dev_net(dev);
6704
6705         BUG_ON(dev_boot_phase);
6706         ASSERT_RTNL();
6707
6708         might_sleep();
6709
6710         /* When net_devices are persistent, this will be fatal. */
6711         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6712         BUG_ON(!net);
6713
6714         spin_lock_init(&dev->addr_list_lock);
6715         netdev_set_addr_lockdep_class(dev);
6716
6717         ret = dev_get_valid_name(net, dev, dev->name);
6718         if (ret < 0)
6719                 goto out;
6720
6721         /* Init, if this function is available */
6722         if (dev->netdev_ops->ndo_init) {
6723                 ret = dev->netdev_ops->ndo_init(dev);
6724                 if (ret) {
6725                         if (ret > 0)
6726                                 ret = -EIO;
6727                         goto out;
6728                 }
6729         }
6730
6731         if (((dev->hw_features | dev->features) &
6732              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6733             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6734              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6735                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6736                 ret = -EINVAL;
6737                 goto err_uninit;
6738         }
6739
6740         ret = -EBUSY;
6741         if (!dev->ifindex)
6742                 dev->ifindex = dev_new_index(net);
6743         else if (__dev_get_by_index(net, dev->ifindex))
6744                 goto err_uninit;
6745
6746         /* Transfer changeable features to wanted_features and enable
6747          * software offloads (GSO and GRO).
6748          */
6749         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6750         dev->features |= NETIF_F_SOFT_FEATURES;
6751         dev->wanted_features = dev->features & dev->hw_features;
6752
6753         if (!(dev->flags & IFF_LOOPBACK)) {
6754                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6755         }
6756
6757         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6758          */
6759         dev->vlan_features |= NETIF_F_HIGHDMA;
6760
6761         /* Make NETIF_F_SG inheritable to tunnel devices.
6762          */
6763         dev->hw_enc_features |= NETIF_F_SG;
6764
6765         /* Make NETIF_F_SG inheritable to MPLS.
6766          */
6767         dev->mpls_features |= NETIF_F_SG;
6768
6769         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6770         ret = notifier_to_errno(ret);
6771         if (ret)
6772                 goto err_uninit;
6773
6774         ret = netdev_register_kobject(dev);
6775         if (ret)
6776                 goto err_uninit;
6777         dev->reg_state = NETREG_REGISTERED;
6778
6779         __netdev_update_features(dev);
6780
6781         /*
6782          *      Default initial state at registry is that the
6783          *      The default initial state at registration is that the
6784          */
6785
6786         set_bit(__LINK_STATE_PRESENT, &dev->state);
6787
6788         linkwatch_init_dev(dev);
6789
6790         dev_init_scheduler(dev);
6791         dev_hold(dev);
6792         list_netdevice(dev);
6793         add_device_randomness(dev->dev_addr, dev->addr_len);
6794
6795         /* If the device has a permanent hardware address, the driver
6796          * should have set dev_addr, and addr_assign_type should be
6797          * NET_ADDR_PERM (the default value).
6798          */
6799         if (dev->addr_assign_type == NET_ADDR_PERM)
6800                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6801
6802         /* Notify protocols, that a new device appeared. */
6803         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6804         ret = notifier_to_errno(ret);
6805         if (ret) {
6806                 rollback_registered(dev);
6807                 dev->reg_state = NETREG_UNREGISTERED;
6808         }
6809         /*
6810          *      Prevent userspace races by waiting until the network
6811          *      device is fully setup before sending notifications.
6812          */
6813         if (!dev->rtnl_link_ops ||
6814             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6815                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6816
6817 out:
6818         return ret;
6819
6820 err_uninit:
6821         if (dev->netdev_ops->ndo_uninit)
6822                 dev->netdev_ops->ndo_uninit(dev);
6823         goto out;
6824 }
6825 EXPORT_SYMBOL(register_netdevice);
6826
6827 /**
6828  *      init_dummy_netdev       - init a dummy network device for NAPI
6829  *      @dev: device to init
6830  *
6831  *      This takes a network device structure and initializes the minimum
6832  *      number of fields so it can be used to schedule NAPI polls without
6833  *      registering a full blown interface. This is to be used by drivers
6834  *      that need to tie several hardware interfaces to a single NAPI
6835  *      poll scheduler due to HW limitations.
6836  */
6837 int init_dummy_netdev(struct net_device *dev)
6838 {
6839         /* Clear everything. Note we don't initialize spinlocks
6840          * as they aren't supposed to be taken by any of the
6841          * NAPI code and this dummy netdev is supposed to be
6842          * only ever used for NAPI polls.
6843          */
6844         memset(dev, 0, sizeof(struct net_device));
6845
6846         /* make sure we BUG if trying to hit standard
6847          * register/unregister code path
6848          */
6849         dev->reg_state = NETREG_DUMMY;
6850
6851         /* NAPI wants this */
6852         INIT_LIST_HEAD(&dev->napi_list);
6853
6854         /* a dummy interface is started by default */
6855         set_bit(__LINK_STATE_PRESENT, &dev->state);
6856         set_bit(__LINK_STATE_START, &dev->state);
6857
6858         /* Note: We don't allocate pcpu_refcnt for dummy devices,
6859          * because users of this 'device' don't need to change
6860          * its refcount.
6861          */
6862
6863         return 0;
6864 }
6865 EXPORT_SYMBOL_GPL(init_dummy_netdev);
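
/* Example (a minimal sketch): the usual pattern in a driver that needs a
 * NAPI context without a real interface.  "priv" and "my_poll" are
 * hypothetical driver-local names.
 *
 *      init_dummy_netdev(&priv->napi_dev);
 *      netif_napi_add(&priv->napi_dev, &priv->napi, my_poll, 64);
 *      napi_enable(&priv->napi);
 */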
6866
6867
6868 /**
6869  *      register_netdev - register a network device
6870  *      @dev: device to register
6871  *
6872  *      Take a completed network device structure and add it to the kernel
6873  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6874  *      chain. 0 is returned on success. A negative errno code is returned
6875  *      on a failure to set up the device, or if the name is a duplicate.
6876  *
6877  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6878  *      and expands the device name if you passed a format string to
6879  *      alloc_netdev.
6880  */
6881 int register_netdev(struct net_device *dev)
6882 {
6883         int err;
6884
6885         rtnl_lock();
6886         err = register_netdevice(dev);
6887         rtnl_unlock();
6888         return err;
6889 }
6890 EXPORT_SYMBOL(register_netdev);
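
/* Example (a minimal sketch): how an Ethernet driver's probe path commonly
 * uses this.  "struct my_priv" and "my_netdev_ops" are hypothetical.
 *
 *      dev = alloc_etherdev(sizeof(struct my_priv));
 *      if (!dev)
 *              return -ENOMEM;
 *      dev->netdev_ops = &my_netdev_ops;
 *      err = register_netdev(dev);
 *      if (err) {
 *              free_netdev(dev);
 *              return err;
 *      }
 */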
6891
6892 int netdev_refcnt_read(const struct net_device *dev)
6893 {
6894         int i, refcnt = 0;
6895
6896         for_each_possible_cpu(i)
6897                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6898         return refcnt;
6899 }
6900 EXPORT_SYMBOL(netdev_refcnt_read);
6901
6902 /**
6903  * netdev_wait_allrefs - wait until all references are gone.
6904  * @dev: target net_device
6905  *
6906  * This is called when unregistering network devices.
6907  *
6908  * Any protocol or device that holds a reference should register
6909  * for netdevice notification, and clean up and put back the
6910  * reference if they receive an UNREGISTER event.
6911  * We can get stuck here if buggy protocols don't correctly
6912  * call dev_put.
6913  */
6914 static void netdev_wait_allrefs(struct net_device *dev)
6915 {
6916         unsigned long rebroadcast_time, warning_time;
6917         int refcnt;
6918
6919         linkwatch_forget_dev(dev);
6920
6921         rebroadcast_time = warning_time = jiffies;
6922         refcnt = netdev_refcnt_read(dev);
6923
6924         while (refcnt != 0) {
6925                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6926                         rtnl_lock();
6927
6928                         /* Rebroadcast unregister notification */
6929                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6930
6931                         __rtnl_unlock();
6932                         rcu_barrier();
6933                         rtnl_lock();
6934
6935                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6936                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6937                                      &dev->state)) {
6938                                 /* We must not have linkwatch events
6939                                  * pending on unregister. If this
6940                                  * happens, we simply run the queue
6941                                  * unscheduled, resulting in a noop
6942                                  * for this device.
6943                                  */
6944                                 linkwatch_run_queue();
6945                         }
6946
6947                         __rtnl_unlock();
6948
6949                         rebroadcast_time = jiffies;
6950                 }
6951
6952                 msleep(250);
6953
6954                 refcnt = netdev_refcnt_read(dev);
6955
6956                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6957                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6958                                  dev->name, refcnt);
6959                         warning_time = jiffies;
6960                 }
6961         }
6962 }
6963
6964 /* The sequence is:
6965  *
6966  *      rtnl_lock();
6967  *      ...
6968  *      register_netdevice(x1);
6969  *      register_netdevice(x2);
6970  *      ...
6971  *      unregister_netdevice(y1);
6972  *      unregister_netdevice(y2);
6973  *      ...
6974  *      rtnl_unlock();
6975  *      free_netdev(y1);
6976  *      free_netdev(y2);
6977  *
6978  * We are invoked by rtnl_unlock().
6979  * This allows us to deal with problems:
6980  * 1) We can delete sysfs objects which invoke hotplug
6981  *    without deadlocking with linkwatch via keventd.
6982  * 2) Since we run with the RTNL semaphore not held, we can sleep
6983  *    safely in order to wait for the netdev refcnt to drop to zero.
6984  *
6985  * We must not return until all unregister events added during
6986  * the interval the lock was held have been completed.
6987  */
6988 void netdev_run_todo(void)
6989 {
6990         struct list_head list;
6991
6992         /* Snapshot list, allow later requests */
6993         list_replace_init(&net_todo_list, &list);
6994
6995         __rtnl_unlock();
6996
6997
6998         /* Wait for rcu callbacks to finish before next phase */
6999         if (!list_empty(&list))
7000                 rcu_barrier();
7001
7002         while (!list_empty(&list)) {
7003                 struct net_device *dev
7004                         = list_first_entry(&list, struct net_device, todo_list);
7005                 list_del(&dev->todo_list);
7006
7007                 rtnl_lock();
7008                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7009                 __rtnl_unlock();
7010
7011                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7012                         pr_err("network todo '%s' but state %d\n",
7013                                dev->name, dev->reg_state);
7014                         dump_stack();
7015                         continue;
7016                 }
7017
7018                 dev->reg_state = NETREG_UNREGISTERED;
7019
7020                 netdev_wait_allrefs(dev);
7021
7022                 /* paranoia */
7023                 BUG_ON(netdev_refcnt_read(dev));
7024                 BUG_ON(!list_empty(&dev->ptype_all));
7025                 BUG_ON(!list_empty(&dev->ptype_specific));
7026                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7027                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7028                 WARN_ON(dev->dn_ptr);
7029
7030                 if (dev->destructor)
7031                         dev->destructor(dev);
7032
7033                 /* Report a network device has been unregistered */
7034                 rtnl_lock();
7035                 dev_net(dev)->dev_unreg_count--;
7036                 __rtnl_unlock();
7037                 wake_up(&netdev_unregistering_wq);
7038
7039                 /* Free network device */
7040                 kobject_put(&dev->dev.kobj);
7041         }
7042 }
7043
7044 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
7045  * fields in the same order, with only the type differing.
7046  */
7047 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7048                              const struct net_device_stats *netdev_stats)
7049 {
7050 #if BITS_PER_LONG == 64
7051         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
7052         memcpy(stats64, netdev_stats, sizeof(*stats64));
7053 #else
7054         size_t i, n = sizeof(*stats64) / sizeof(u64);
7055         const unsigned long *src = (const unsigned long *)netdev_stats;
7056         u64 *dst = (u64 *)stats64;
7057
7058         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
7059                      sizeof(*stats64) / sizeof(u64));
7060         for (i = 0; i < n; i++)
7061                 dst[i] = src[i];
7062 #endif
7063 }
7064 EXPORT_SYMBOL(netdev_stats_to_stats64);
7065
7066 /**
7067  *      dev_get_stats   - get network device statistics
7068  *      @dev: device to get statistics from
7069  *      @storage: place to store stats
7070  *
7071  *      Get network statistics from device. Return @storage.
7072  *      The device driver may provide its own method by setting
7073  *      dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
7074  *      otherwise the internal statistics structure is used.
7075  */
7076 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7077                                         struct rtnl_link_stats64 *storage)
7078 {
7079         const struct net_device_ops *ops = dev->netdev_ops;
7080
7081         if (ops->ndo_get_stats64) {
7082                 memset(storage, 0, sizeof(*storage));
7083                 ops->ndo_get_stats64(dev, storage);
7084         } else if (ops->ndo_get_stats) {
7085                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7086         } else {
7087                 netdev_stats_to_stats64(storage, &dev->stats);
7088         }
7089         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7090         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7091         return storage;
7092 }
7093 EXPORT_SYMBOL(dev_get_stats);
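
/* Example (a minimal sketch): taking a snapshot of a device's counters into
 * caller-provided storage.
 *
 *      struct rtnl_link_stats64 stats;
 *
 *      dev_get_stats(dev, &stats);
 *      pr_info("%s: %llu packets received\n", dev->name, stats.rx_packets);
 */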
7094
7095 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7096 {
7097         struct netdev_queue *queue = dev_ingress_queue(dev);
7098
7099 #ifdef CONFIG_NET_CLS_ACT
7100         if (queue)
7101                 return queue;
7102         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7103         if (!queue)
7104                 return NULL;
7105         netdev_init_one_queue(dev, queue, NULL);
7106         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7107         queue->qdisc_sleeping = &noop_qdisc;
7108         rcu_assign_pointer(dev->ingress_queue, queue);
7109 #endif
7110         return queue;
7111 }
7112
7113 static const struct ethtool_ops default_ethtool_ops;
7114
7115 void netdev_set_default_ethtool_ops(struct net_device *dev,
7116                                     const struct ethtool_ops *ops)
7117 {
7118         if (dev->ethtool_ops == &default_ethtool_ops)
7119                 dev->ethtool_ops = ops;
7120 }
7121 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7122
7123 void netdev_freemem(struct net_device *dev)
7124 {
7125         char *addr = (char *)dev - dev->padded;
7126
7127         kvfree(addr);
7128 }
7129
7130 /**
7131  *      alloc_netdev_mqs - allocate network device
7132  *      @sizeof_priv:           size of private data to allocate space for
7133  *      @name:                  device name format string
7134  *      @name_assign_type:      origin of device name
7135  *      @setup:                 callback to initialize device
7136  *      @txqs:                  the number of TX subqueues to allocate
7137  *      @rxqs:                  the number of RX subqueues to allocate
7138  *
7139  *      Allocates a struct net_device with private data area for driver use
7140  *      and performs basic initialization.  Also allocates subqueue structs
7141  *      for each queue on the device.
7142  */
7143 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7144                 unsigned char name_assign_type,
7145                 void (*setup)(struct net_device *),
7146                 unsigned int txqs, unsigned int rxqs)
7147 {
7148         struct net_device *dev;
7149         size_t alloc_size;
7150         struct net_device *p;
7151
7152         BUG_ON(strlen(name) >= sizeof(dev->name));
7153
7154         if (txqs < 1) {
7155                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7156                 return NULL;
7157         }
7158
7159 #ifdef CONFIG_SYSFS
7160         if (rxqs < 1) {
7161                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7162                 return NULL;
7163         }
7164 #endif
7165
7166         alloc_size = sizeof(struct net_device);
7167         if (sizeof_priv) {
7168                 /* ensure 32-byte alignment of private area */
7169                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7170                 alloc_size += sizeof_priv;
7171         }
7172         /* ensure 32-byte alignment of whole construct */
7173         alloc_size += NETDEV_ALIGN - 1;
7174
7175         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7176         if (!p)
7177                 p = vzalloc(alloc_size);
7178         if (!p)
7179                 return NULL;
7180
7181         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7182         dev->padded = (char *)dev - (char *)p;
7183
7184         dev->pcpu_refcnt = alloc_percpu(int);
7185         if (!dev->pcpu_refcnt)
7186                 goto free_dev;
7187
7188         if (dev_addr_init(dev))
7189                 goto free_pcpu;
7190
7191         dev_mc_init(dev);
7192         dev_uc_init(dev);
7193
7194         dev_net_set(dev, &init_net);
7195
7196         dev->gso_max_size = GSO_MAX_SIZE;
7197         dev->gso_max_segs = GSO_MAX_SEGS;
7198         dev->gso_min_segs = 0;
7199
7200         INIT_LIST_HEAD(&dev->napi_list);
7201         INIT_LIST_HEAD(&dev->unreg_list);
7202         INIT_LIST_HEAD(&dev->close_list);
7203         INIT_LIST_HEAD(&dev->link_watch_list);
7204         INIT_LIST_HEAD(&dev->adj_list.upper);
7205         INIT_LIST_HEAD(&dev->adj_list.lower);
7206         INIT_LIST_HEAD(&dev->all_adj_list.upper);
7207         INIT_LIST_HEAD(&dev->all_adj_list.lower);
7208         INIT_LIST_HEAD(&dev->ptype_all);
7209         INIT_LIST_HEAD(&dev->ptype_specific);
7210         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7211         setup(dev);
7212
7213         if (!dev->tx_queue_len) {
7214                 dev->priv_flags |= IFF_NO_QUEUE;
7215                 dev->tx_queue_len = 1;
7216         }
7217
7218         dev->num_tx_queues = txqs;
7219         dev->real_num_tx_queues = txqs;
7220         if (netif_alloc_netdev_queues(dev))
7221                 goto free_all;
7222
7223 #ifdef CONFIG_SYSFS
7224         dev->num_rx_queues = rxqs;
7225         dev->real_num_rx_queues = rxqs;
7226         if (netif_alloc_rx_queues(dev))
7227                 goto free_all;
7228 #endif
7229
7230         strcpy(dev->name, name);
7231         dev->name_assign_type = name_assign_type;
7232         dev->group = INIT_NETDEV_GROUP;
7233         if (!dev->ethtool_ops)
7234                 dev->ethtool_ops = &default_ethtool_ops;
7235
7236         nf_hook_ingress_init(dev);
7237
7238         return dev;
7239
7240 free_all:
7241         free_netdev(dev);
7242         return NULL;
7243
7244 free_pcpu:
7245         free_percpu(dev->pcpu_refcnt);
7246 free_dev:
7247         netdev_freemem(dev);
7248         return NULL;
7249 }
7250 EXPORT_SYMBOL(alloc_netdev_mqs);
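
/* Example (a minimal sketch): allocating a multi-queue Ethernet-style device
 * with a hypothetical private struct and four TX/RX queue pairs, using the
 * stock ether_setup() initializer:
 *
 *      dev = alloc_netdev_mqs(sizeof(struct my_priv), "myeth%d",
 *                             NET_NAME_UNKNOWN, ether_setup, 4, 4);
 *      if (!dev)
 *              return -ENOMEM;
 *
 * Most drivers go through the alloc_etherdev()/alloc_etherdev_mq() wrappers
 * rather than calling this directly.
 */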
7251
7252 /**
7253  *      free_netdev - free network device
7254  *      @dev: device
7255  *
7256  *      This function does the last stage of destroying an allocated device
7257  *      interface. The reference to the device object is released.
7258  *      If this is the last reference then it will be freed.
7259  */
7260 void free_netdev(struct net_device *dev)
7261 {
7262         struct napi_struct *p, *n;
7263
7264         netif_free_tx_queues(dev);
7265 #ifdef CONFIG_SYSFS
7266         kvfree(dev->_rx);
7267 #endif
7268
7269         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7270
7271         /* Flush device addresses */
7272         dev_addr_flush(dev);
7273
7274         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7275                 netif_napi_del(p);
7276
7277         free_percpu(dev->pcpu_refcnt);
7278         dev->pcpu_refcnt = NULL;
7279
7280         /*  Compatibility with error handling in drivers */
7281         if (dev->reg_state == NETREG_UNINITIALIZED) {
7282                 netdev_freemem(dev);
7283                 return;
7284         }
7285
7286         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7287         dev->reg_state = NETREG_RELEASED;
7288
7289         /* will free via device release */
7290         put_device(&dev->dev);
7291 }
7292 EXPORT_SYMBOL(free_netdev);
7293
7294 /**
7295  *      synchronize_net -  Synchronize with packet receive processing
7296  *
7297  *      Wait for packets currently being received to be done.
7298  *      Does not block later packets from starting.
7299  */
7300 void synchronize_net(void)
7301 {
7302         might_sleep();
7303         if (rtnl_is_locked() && !IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
7304                 synchronize_rcu_expedited();
7305         else
7306                 synchronize_rcu();
7307 }
7308 EXPORT_SYMBOL(synchronize_net);
7309
7310 /**
7311  *      unregister_netdevice_queue - remove device from the kernel
7312  *      @dev: device
7313  *      @head: list
7314  *
7315  *      This function shuts down a device interface and removes it
7316  *      from the kernel tables.
7317  *      If head is not NULL, the device is queued to be unregistered later.
7318  *
7319  *      Callers must hold the rtnl semaphore.  You may want
7320  *      unregister_netdev() instead of this.
7321  */
7322
7323 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7324 {
7325         ASSERT_RTNL();
7326
7327         if (head) {
7328                 list_move_tail(&dev->unreg_list, head);
7329         } else {
7330                 rollback_registered(dev);
7331                 /* Finish processing unregister after unlock */
7332                 net_set_todo(dev);
7333         }
7334 }
7335 EXPORT_SYMBOL(unregister_netdevice_queue);
7336
7337 /**
7338  *      unregister_netdevice_many - unregister many devices
7339  *      @head: list of devices
7340  *
7341  *  Note: As most callers use a stack-allocated list_head,
7342  *  we force a list_del() to make sure the stack won't be corrupted later.
7343  */
7344 void unregister_netdevice_many(struct list_head *head)
7345 {
7346         struct net_device *dev;
7347
7348         if (!list_empty(head)) {
7349                 rollback_registered_many(head);
7350                 list_for_each_entry(dev, head, unreg_list)
7351                         net_set_todo(dev);
7352                 list_del(head);
7353         }
7354 }
7355 EXPORT_SYMBOL(unregister_netdevice_many);
7356
7357 /**
7358  *      unregister_netdev - remove device from the kernel
7359  *      @dev: device
7360  *
7361  *      This function shuts down a device interface and removes it
7362  *      from the kernel tables.
7363  *
7364  *      This is just a wrapper for unregister_netdevice that takes
7365  *      the rtnl semaphore.  In general you want to use this and not
7366  *      unregister_netdevice.
7367  */
7368 void unregister_netdev(struct net_device *dev)
7369 {
7370         rtnl_lock();
7371         unregister_netdevice(dev);
7372         rtnl_unlock();
7373 }
7374 EXPORT_SYMBOL(unregister_netdev);
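
/* Example (a minimal sketch): the matching teardown order in a driver's
 * remove path, mirroring the register_netdev() example above.
 *
 *      unregister_netdev(dev);
 *      free_netdev(dev);
 *
 * unregister_netdev() only returns once netdev_run_todo() has finished the
 * unregister, so free_netdev() is safe immediately afterwards.
 */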
7375
7376 /**
7377  *      dev_change_net_namespace - move device to a different network namespace
7378  *      @dev: device
7379  *      @net: network namespace
7380  *      @pat: If not NULL name pattern to try if the current device name
7381  *            is already taken in the destination network namespace.
7382  *
7383  *      This function shuts down a device interface and moves it
7384  *      to a new network namespace. On success 0 is returned, on
7385  *      a failure a negative errno code is returned.
7386  *
7387  *      Callers must hold the rtnl semaphore.
7388  */
7389
7390 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7391 {
7392         int err;
7393
7394         ASSERT_RTNL();
7395
7396         /* Don't allow namespace local devices to be moved. */
7397         err = -EINVAL;
7398         if (dev->features & NETIF_F_NETNS_LOCAL)
7399                 goto out;
7400
7401         /* Ensure the device has been registered */
7402         if (dev->reg_state != NETREG_REGISTERED)
7403                 goto out;
7404
7405         /* Get out if there is nothing to do */
7406         err = 0;
7407         if (net_eq(dev_net(dev), net))
7408                 goto out;
7409
7410         /* Pick the destination device name, and ensure
7411          * we can use it in the destination network namespace.
7412          */
7413         err = -EEXIST;
7414         if (__dev_get_by_name(net, dev->name)) {
7415                 /* We get here if we can't use the current device name */
7416                 if (!pat)
7417                         goto out;
7418                 if (dev_get_valid_name(net, dev, pat) < 0)
7419                         goto out;
7420         }
7421
7422         /*
7423          * And now a mini version of register_netdevice and unregister_netdevice.
7424          */
7425
7426         /* If the device is running, close it first. */
7427         dev_close(dev);
7428
7429         /* And unlink it from device chain */
7430         err = -ENODEV;
7431         unlist_netdevice(dev);
7432
7433         synchronize_net();
7434
7435         /* Shutdown queueing discipline. */
7436         dev_shutdown(dev);
7437
7438         /* Notify protocols that we are about to destroy
7439            this device. They should clean up all their state.
7440
7441            Note that dev->reg_state stays at NETREG_REGISTERED.
7442            This is wanted because this way 8021q and macvlan know
7443            the device is just moving and can keep their slaves up.
7444         */
7445         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7446         rcu_barrier();
7447         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7448         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7449
7450         /*
7451          *      Flush the unicast and multicast chains
7452          */
7453         dev_uc_flush(dev);
7454         dev_mc_flush(dev);
7455
7456         /* Send a netdev-removed uevent to the old namespace */
7457         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7458         netdev_adjacent_del_links(dev);
7459
7460         /* Actually switch the network namespace */
7461         dev_net_set(dev, net);
7462
7463         /* If there is an ifindex conflict assign a new one */
7464         if (__dev_get_by_index(net, dev->ifindex))
7465                 dev->ifindex = dev_new_index(net);
7466
7467         /* Send a netdev-add uevent to the new namespace */
7468         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7469         netdev_adjacent_add_links(dev);
7470
7471         /* Fixup kobjects */
7472         err = device_rename(&dev->dev, dev->name);
7473         WARN_ON(err);
7474
7475         /* Add the device back in the hashes */
7476         list_netdevice(dev);
7477
7478         /* Notify protocols, that a new device appeared. */
7479         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7480
7481         /*
7482          *      Prevent userspace races by waiting until the network
7483          *      device is fully setup before sending notifications.
7484          */
7485         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7486
7487         synchronize_net();
7488         err = 0;
7489 out:
7490         return err;
7491 }
7492 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7493
7494 static int dev_cpu_callback(struct notifier_block *nfb,
7495                             unsigned long action,
7496                             void *ocpu)
7497 {
7498         struct sk_buff **list_skb;
7499         struct sk_buff *skb;
7500         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7501         struct softnet_data *sd, *oldsd;
7502
7503         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7504                 return NOTIFY_OK;
7505
7506         local_irq_disable();
7507         cpu = smp_processor_id();
7508         sd = &per_cpu(softnet_data, cpu);
7509         oldsd = &per_cpu(softnet_data, oldcpu);
7510
7511         /* Find end of our completion_queue. */
7512         list_skb = &sd->completion_queue;
7513         while (*list_skb)
7514                 list_skb = &(*list_skb)->next;
7515         /* Append completion queue from offline CPU. */
7516         *list_skb = oldsd->completion_queue;
7517         oldsd->completion_queue = NULL;
7518
7519         /* Append output queue from offline CPU. */
7520         if (oldsd->output_queue) {
7521                 *sd->output_queue_tailp = oldsd->output_queue;
7522                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7523                 oldsd->output_queue = NULL;
7524                 oldsd->output_queue_tailp = &oldsd->output_queue;
7525         }
7526         /* Append NAPI poll list from offline CPU, with one exception:
7527          * process_backlog() must be called by the CPU owning the percpu backlog.
7528          * We properly handle process_queue & input_pkt_queue later.
7529          */
7530         while (!list_empty(&oldsd->poll_list)) {
7531                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7532                                                             struct napi_struct,
7533                                                             poll_list);
7534
7535                 list_del_init(&napi->poll_list);
7536                 if (napi->poll == process_backlog)
7537                         napi->state = 0;
7538                 else
7539                         ____napi_schedule(sd, napi);
7540         }
7541
7542         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7543         local_irq_enable();
7544         preempt_check_resched_rt();
7545
7546         /* Process offline CPU's input_pkt_queue */
7547         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7548                 netif_rx_ni(skb);
7549                 input_queue_head_incr(oldsd);
7550         }
7551         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
7552                 netif_rx_ni(skb);
7553                 input_queue_head_incr(oldsd);
7554         }
7555         while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
7556                 kfree_skb(skb);
7557         }
7558
7559         return NOTIFY_OK;
7560 }
7561
7562
7563 /**
7564  *      netdev_increment_features - increment feature set by one
7565  *      @all: current feature set
7566  *      @one: new feature set
7567  *      @mask: mask feature set
7568  *
7569  *      Computes a new feature set after adding a device with feature set
7570  *      @one to the master device with current feature set @all.  Will not
7571  *      enable anything that is off in @mask. Returns the new feature set.
7572  */
7573 netdev_features_t netdev_increment_features(netdev_features_t all,
7574         netdev_features_t one, netdev_features_t mask)
7575 {
7576         if (mask & NETIF_F_GEN_CSUM)
7577                 mask |= NETIF_F_ALL_CSUM;
7578         mask |= NETIF_F_VLAN_CHALLENGED;
7579
7580         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7581         all &= one | ~NETIF_F_ALL_FOR_ALL;
7582
7583         /* If one device supports hw checksumming, set for all. */
7584         if (all & NETIF_F_GEN_CSUM)
7585                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7586
7587         return all;
7588 }
7589 EXPORT_SYMBOL(netdev_increment_features);
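
/* Example (a minimal sketch): how an aggregating driver (bonding/team style)
 * might fold each slave's feature set into the master's.  "slave", "slaves"
 * and "mask" are hypothetical caller-side names, and the starting feature
 * set is caller policy.
 *
 *      netdev_features_t features = mask;
 *
 *      list_for_each_entry(slave, &slaves, list)
 *              features = netdev_increment_features(features,
 *                                                   slave->dev->features,
 *                                                   mask);
 */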
7590
7591 static struct hlist_head * __net_init netdev_create_hash(void)
7592 {
7593         int i;
7594         struct hlist_head *hash;
7595
7596         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7597         if (hash != NULL)
7598                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7599                         INIT_HLIST_HEAD(&hash[i]);
7600
7601         return hash;
7602 }
7603
7604 /* Initialize per network namespace state */
7605 static int __net_init netdev_init(struct net *net)
7606 {
7607         if (net != &init_net)
7608                 INIT_LIST_HEAD(&net->dev_base_head);
7609
7610         net->dev_name_head = netdev_create_hash();
7611         if (net->dev_name_head == NULL)
7612                 goto err_name;
7613
7614         net->dev_index_head = netdev_create_hash();
7615         if (net->dev_index_head == NULL)
7616                 goto err_idx;
7617
7618         return 0;
7619
7620 err_idx:
7621         kfree(net->dev_name_head);
7622 err_name:
7623         return -ENOMEM;
7624 }
7625
7626 /**
7627  *      netdev_drivername - network driver for the device
7628  *      @dev: network device
7629  *
7630  *      Determine network driver for device.
7631  */
7632 const char *netdev_drivername(const struct net_device *dev)
7633 {
7634         const struct device_driver *driver;
7635         const struct device *parent;
7636         const char *empty = "";
7637
7638         parent = dev->dev.parent;
7639         if (!parent)
7640                 return empty;
7641
7642         driver = parent->driver;
7643         if (driver && driver->name)
7644                 return driver->name;
7645         return empty;
7646 }
7647
7648 static void __netdev_printk(const char *level, const struct net_device *dev,
7649                             struct va_format *vaf)
7650 {
7651         if (dev && dev->dev.parent) {
7652                 dev_printk_emit(level[1] - '0',
7653                                 dev->dev.parent,
7654                                 "%s %s %s%s: %pV",
7655                                 dev_driver_string(dev->dev.parent),
7656                                 dev_name(dev->dev.parent),
7657                                 netdev_name(dev), netdev_reg_state(dev),
7658                                 vaf);
7659         } else if (dev) {
7660                 printk("%s%s%s: %pV",
7661                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7662         } else {
7663                 printk("%s(NULL net_device): %pV", level, vaf);
7664         }
7665 }
7666
7667 void netdev_printk(const char *level, const struct net_device *dev,
7668                    const char *format, ...)
7669 {
7670         struct va_format vaf;
7671         va_list args;
7672
7673         va_start(args, format);
7674
7675         vaf.fmt = format;
7676         vaf.va = &args;
7677
7678         __netdev_printk(level, dev, &vaf);
7679
7680         va_end(args);
7681 }
7682 EXPORT_SYMBOL(netdev_printk);
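
/*
 * Illustrative sketch: netdev_printk() takes a KERN_* level string
 * directly; most callers use the level-specific wrappers defined below.
 *
 *	netdev_printk(KERN_DEBUG, dev, "using MSI-X interrupts\n");
 */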
7683
7684 #define define_netdev_printk_level(func, level)                 \
7685 void func(const struct net_device *dev, const char *fmt, ...)   \
7686 {                                                               \
7687         struct va_format vaf;                                   \
7688         va_list args;                                           \
7689                                                                 \
7690         va_start(args, fmt);                                    \
7691                                                                 \
7692         vaf.fmt = fmt;                                          \
7693         vaf.va = &args;                                         \
7694                                                                 \
7695         __netdev_printk(level, dev, &vaf);                      \
7696                                                                 \
7697         va_end(args);                                           \
7698 }                                                               \
7699 EXPORT_SYMBOL(func);
7700
7701 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7702 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7703 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7704 define_netdev_printk_level(netdev_err, KERN_ERR);
7705 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7706 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7707 define_netdev_printk_level(netdev_info, KERN_INFO);
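
/*
 * Illustrative sketch: each helper generated above behaves like a printk()
 * of the matching severity, with the message prefixed by the driver name,
 * the parent device name and the interface name (see __netdev_printk()).
 * The 'i' variable is hypothetical.
 *
 *	netdev_err(dev, "failed to allocate rx ring %d\n", i);
 *	netdev_info(dev, "link up\n");
 */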
7708
7709 static void __net_exit netdev_exit(struct net *net)
7710 {
7711         kfree(net->dev_name_head);
7712         kfree(net->dev_index_head);
7713 }
7714
7715 static struct pernet_operations __net_initdata netdev_net_ops = {
7716         .init = netdev_init,
7717         .exit = netdev_exit,
7718 };
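
/*
 * netdev_net_ops is registered with register_pernet_subsys() in
 * net_dev_init() below, so netdev_init()/netdev_exit() run for every
 * network namespace that is created or destroyed.
 */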
7719
7720 static void __net_exit default_device_exit(struct net *net)
7721 {
7722         struct net_device *dev, *aux;
7723         /*
7724          * Push all migratable network devices back to the
7725          * initial network namespace
7726          */
7727         rtnl_lock();
7728         for_each_netdev_safe(net, dev, aux) {
7729                 int err;
7730                 char fb_name[IFNAMSIZ];
7731
7732                 /* Ignore unmovable devices (e.g. the loopback device) */
7733                 if (dev->features & NETIF_F_NETNS_LOCAL)
7734                         continue;
7735
7736                 /* Leave virtual devices for the generic cleanup */
7737                 if (dev->rtnl_link_ops)
7738                         continue;
7739
7740                 /* Push remaining network devices to init_net */
7741                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7742                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7743                 if (err) {
7744                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7745                                  __func__, dev->name, err);
7746                         BUG();
7747                 }
7748         }
7749         rtnl_unlock();
7750 }
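
/*
 * The "dev%d" string built above is only a fallback name: it is used when
 * the device's current name is already taken in init_net, so the move back
 * cannot fail with a name clash.
 */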
7751
7752 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7753 {
7754         /* Return with the rtnl_lock held when there are no network
7755          * devices unregistering in any network namespace in net_list.
7756          */
7757         struct net *net;
7758         bool unregistering;
7759         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7760
7761         add_wait_queue(&netdev_unregistering_wq, &wait);
7762         for (;;) {
7763                 unregistering = false;
7764                 rtnl_lock();
7765                 list_for_each_entry(net, net_list, exit_list) {
7766                         if (net->dev_unreg_count > 0) {
7767                                 unregistering = true;
7768                                 break;
7769                         }
7770                 }
7771                 if (!unregistering)
7772                         break;
7773                 __rtnl_unlock();
7774
7775                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7776         }
7777         remove_wait_queue(&netdev_unregistering_wq, &wait);
7778 }
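
/*
 * The sleepers above are woken from the netdev todo processing
 * (netdev_run_todo()) once a pending unregistration completes and the
 * namespace's dev_unreg_count drops, so the loop re-checks under rtnl_lock
 * until nothing in net_list is still unregistering.
 */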
7779
7780 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7781 {
7782         /* At exit, all network devices must be removed from a network
7783          * namespace.  Do this in the reverse order of registration.
7784          * Do this across as many network namespaces as possible to
7785          * improve batching efficiency.
7786          */
7787         struct net_device *dev;
7788         struct net *net;
7789         LIST_HEAD(dev_kill_list);
7790
7791         /* To prevent network device cleanup code from dereferencing
7792          * loopback devices or network devices that have been freed,
7793          * wait here for all pending unregistrations to complete
7794          * before unregistering the loopback device and allowing the
7795          * network namespace to be freed.
7796          *
7797          * The netdev todo list, containing all network device
7798          * unregistrations that happen in default_device_exit_batch,
7799          * will be processed in the rtnl_unlock() at the end of
7800          * default_device_exit_batch.
7801          */
7802         rtnl_lock_unregistering(net_list);
7803         list_for_each_entry(net, net_list, exit_list) {
7804                 for_each_netdev_reverse(net, dev) {
7805                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7806                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7807                         else
7808                                 unregister_netdevice_queue(dev, &dev_kill_list);
7809                 }
7810         }
7811         unregister_netdevice_many(&dev_kill_list);
7812         rtnl_unlock();
7813 }
7814
7815 static struct pernet_operations __net_initdata default_device_ops = {
7816         .exit = default_device_exit,
7817         .exit_batch = default_device_exit_batch,
7818 };
7819
7820 /*
7821  *      Initialize the DEV module. At boot time this walks the device list and
7822  *      unhooks any devices that fail to initialise (normally hardware not
7823  *      present) and leaves us with a valid list of present and active devices.
7824  *
7825  */
7826
7827 /*
7828  *       This is called single-threaded during boot, so no need
7829  *       to take the rtnl semaphore.
7830  */
7831 static int __init net_dev_init(void)
7832 {
7833         int i, rc = -ENOMEM;
7834
7835         BUG_ON(!dev_boot_phase);
7836
7837         if (dev_proc_init())
7838                 goto out;
7839
7840         if (netdev_kobject_init())
7841                 goto out;
7842
7843         INIT_LIST_HEAD(&ptype_all);
7844         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7845                 INIT_LIST_HEAD(&ptype_base[i]);
7846
7847         INIT_LIST_HEAD(&offload_base);
7848
7849         if (register_pernet_subsys(&netdev_net_ops))
7850                 goto out;
7851
7852         /*
7853          *      Initialise the packet receive queues.
7854          */
7855
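        /*
         * Each possible CPU gets its own softnet_data: queues for packets
         * received from interrupt context and for deferred processing, the
         * NAPI poll list, and the backlog NAPI instance that services
         * netif_rx() traffic through process_backlog().
         */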
7856         for_each_possible_cpu(i) {
7857                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7858
7859                 skb_queue_head_init_raw(&sd->input_pkt_queue);
7860                 skb_queue_head_init_raw(&sd->process_queue);
7861                 skb_queue_head_init_raw(&sd->tofree_queue);
7862                 INIT_LIST_HEAD(&sd->poll_list);
7863                 sd->output_queue_tailp = &sd->output_queue;
7864 #ifdef CONFIG_RPS
7865                 sd->csd.func = rps_trigger_softirq;
7866                 sd->csd.info = sd;
7867                 sd->cpu = i;
7868 #endif
7869
7870                 sd->backlog.poll = process_backlog;
7871                 sd->backlog.weight = weight_p;
7872         }
7873
7874         dev_boot_phase = 0;
7875
7876         /* The loopback device is special: if any other network device
7877          * is present in a network namespace, the loopback device must
7878          * be present too.  Since we now dynamically allocate and free
7879          * the loopback device, ensure this invariant is maintained by
7880          * keeping the loopback device as the first device on the
7881          * list of network devices, so that the loopback device
7882          * is the first device that appears and the last network device
7883          * that disappears.
7884          */
7885         if (register_pernet_device(&loopback_net_ops))
7886                 goto out;
7887
7888         if (register_pernet_device(&default_device_ops))
7889                 goto out;
7890
7891         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7892         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7893
7894         hotcpu_notifier(dev_cpu_callback, 0);
7895         dst_subsys_init();
7896         rc = 0;
7897 out:
7898         return rc;
7899 }
7900
7901 subsys_initcall(net_dev_init);
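
/*
 * subsys_initcall() runs net_dev_init() early in boot, before device
 * drivers (typically registered from device_initcall()/module_init())
 * get a chance to call register_netdev().
 */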