kvmfornfv.git: kernel/net/core/dev.c (Upgrade to 4.4.50-rt62)
1 /*
2  *      NET3    Protocol independent device support routines.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  *      Derived from the non IP parts of dev.c 1.0.19
10  *              Authors:        Ross Biro
11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *      Additional Authors:
15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
17  *              David Hinds <dahinds@users.sourceforge.net>
18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *              Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *      Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *                                      to 2 if register_netdev gets called
25  *                                      before net_dev_init & also removed a
26  *                                      few lines of code in the process.
27  *              Alan Cox        :       device private ioctl copies fields back.
28  *              Alan Cox        :       Transmit queue code does relevant
29  *                                      stunts to keep the queue safe.
30  *              Alan Cox        :       Fixed double lock.
31  *              Alan Cox        :       Fixed promisc NULL pointer trap
32  *              ????????        :       Support the full private ioctl range
33  *              Alan Cox        :       Moved ioctl permission check into
34  *                                      drivers
35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
36  *              Alan Cox        :       100 backlog just doesn't cut it when
37  *                                      you start doing multicast video 8)
38  *              Alan Cox        :       Rewrote net_bh and list manager.
39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
40  *              Alan Cox        :       Took out transmit every packet pass
41  *                                      Saved a few bytes in the ioctl handler
42  *              Alan Cox        :       Network driver sets packet type before
43  *                                      calling netif_rx. Saves a function
44  *                                      call a packet.
45  *              Alan Cox        :       Hashed net_bh()
46  *              Richard Kooijman:       Timestamp fixes.
47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
48  *              Alan Cox        :       Device lock protection.
49  *              Alan Cox        :       Fixed nasty side effect of device close
50  *                                      changes.
51  *              Rudi Cilibrasi  :       Pass the right thing to
52  *                                      set_mac_address()
53  *              Dave Miller     :       32bit quantity for the device lock to
54  *                                      make it work out on a Sparc.
55  *              Bjorn Ekwall    :       Added KERNELD hack.
56  *              Alan Cox        :       Cleaned up the backlog initialise.
57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
58  *                                      1 device.
59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
60  *                                      is no device open function.
61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
63  *              Cyrus Durgin    :       Cleaned for KMOD
64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
65  *                                      A network device unload needs to purge
66  *                                      the backlog queue.
67  *      Paul Rusty Russell      :       SIOCSIFNAME
68  *              Pekka Riikonen  :       Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *                                      indefinitely on dev->refcnt
71  *              J Hadi Salim    :       - Backlog queue sampling
72  *                                      - netif_rx() feedback
73  */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/dst_metadata.h>
103 #include <net/pkt_sched.h>
104 #include <net/checksum.h>
105 #include <net/xfrm.h>
106 #include <linux/highmem.h>
107 #include <linux/init.h>
108 #include <linux/module.h>
109 #include <linux/netpoll.h>
110 #include <linux/rcupdate.h>
111 #include <linux/delay.h>
112 #include <net/iw_handler.h>
113 #include <asm/current.h>
114 #include <linux/audit.h>
115 #include <linux/dmaengine.h>
116 #include <linux/err.h>
117 #include <linux/ctype.h>
118 #include <linux/if_arp.h>
119 #include <linux/if_vlan.h>
120 #include <linux/ip.h>
121 #include <net/ip.h>
122 #include <net/mpls.h>
123 #include <linux/ipv6.h>
124 #include <linux/in.h>
125 #include <linux/jhash.h>
126 #include <linux/random.h>
127 #include <trace/events/napi.h>
128 #include <trace/events/net.h>
129 #include <trace/events/skb.h>
130 #include <linux/pci.h>
131 #include <linux/inetdevice.h>
132 #include <linux/cpu_rmap.h>
133 #include <linux/static_key.h>
134 #include <linux/hashtable.h>
135 #include <linux/vmalloc.h>
136 #include <linux/if_macvlan.h>
137 #include <linux/errqueue.h>
138 #include <linux/hrtimer.h>
139 #include <linux/netfilter_ingress.h>
140
141 #include "net-sysfs.h"
142
143 /* Instead of increasing this, you should create a hash table. */
144 #define MAX_GRO_SKBS 8
145
146 /* This should be increased if a protocol with a bigger head is added. */
147 #define GRO_MAX_HEAD (MAX_HEADER + 128)
148
149 static DEFINE_SPINLOCK(ptype_lock);
150 static DEFINE_SPINLOCK(offload_lock);
151 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
152 struct list_head ptype_all __read_mostly;       /* Taps */
153 static struct list_head offload_base __read_mostly;
154
155 static int netif_rx_internal(struct sk_buff *skb);
156 static int call_netdevice_notifiers_info(unsigned long val,
157                                          struct net_device *dev,
158                                          struct netdev_notifier_info *info);
159
160 /*
161  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
162  * semaphore.
163  *
164  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
165  *
166  * Writers must hold the rtnl semaphore while they loop through the
167  * dev_base_head list, and hold dev_base_lock for writing when they do the
168  * actual updates.  This allows pure readers to access the list even
169  * while a writer is preparing to update it.
170  *
171  * To put it another way, dev_base_lock is held for writing only to
172  * protect against pure readers; the rtnl semaphore provides the
173  * protection against other writers.
174  *
175  * See, for example usages, register_netdevice() and
176  * unregister_netdevice(), which must be called with the rtnl
177  * semaphore held.
178  */
179 DEFINE_RWLOCK(dev_base_lock);
180 EXPORT_SYMBOL(dev_base_lock);
181
182 /* protects napi_hash addition/deletion and napi_gen_id */
183 static DEFINE_SPINLOCK(napi_hash_lock);
184
185 static unsigned int napi_gen_id;
186 static DEFINE_HASHTABLE(napi_hash, 8);
187
188 static seqcount_t devnet_rename_seq;
189 static DEFINE_MUTEX(devnet_rename_mutex);
190
191 static inline void dev_base_seq_inc(struct net *net)
192 {
193         while (++net->dev_base_seq == 0);
194 }
195
196 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
197 {
198         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
199
200         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
201 }
202
203 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
204 {
205         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
206 }
207
208 static inline void rps_lock(struct softnet_data *sd)
209 {
210 #ifdef CONFIG_RPS
211         raw_spin_lock(&sd->input_pkt_queue.raw_lock);
212 #endif
213 }
214
215 static inline void rps_unlock(struct softnet_data *sd)
216 {
217 #ifdef CONFIG_RPS
218         raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
219 #endif
220 }
221
222 /* Device list insertion */
223 static void list_netdevice(struct net_device *dev)
224 {
225         struct net *net = dev_net(dev);
226
227         ASSERT_RTNL();
228
229         write_lock_bh(&dev_base_lock);
230         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
231         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
232         hlist_add_head_rcu(&dev->index_hlist,
233                            dev_index_hash(net, dev->ifindex));
234         write_unlock_bh(&dev_base_lock);
235
236         dev_base_seq_inc(net);
237 }
238
239 /* Device list removal
240  * caller must respect a RCU grace period before freeing/reusing dev
241  */
242 static void unlist_netdevice(struct net_device *dev)
243 {
244         ASSERT_RTNL();
245
246         /* Unlink dev from the device chain */
247         write_lock_bh(&dev_base_lock);
248         list_del_rcu(&dev->dev_list);
249         hlist_del_rcu(&dev->name_hlist);
250         hlist_del_rcu(&dev->index_hlist);
251         write_unlock_bh(&dev_base_lock);
252
253         dev_base_seq_inc(dev_net(dev));
254 }
255
256 /*
257  *      Our notifier list
258  */
259
260 static RAW_NOTIFIER_HEAD(netdev_chain);
261
262 /*
263  *      Device drivers call our routines to queue packets here. We empty the
264  *      queue in the local softnet handler.
265  */
266
267 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
268 EXPORT_PER_CPU_SYMBOL(softnet_data);
269
270 #ifdef CONFIG_LOCKDEP
271 /*
272  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
273  * according to dev->type
274  */
275 static const unsigned short netdev_lock_type[] =
276         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
277          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
278          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
279          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
280          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
281          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
282          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
283          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
284          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
285          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
286          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
287          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
288          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
289          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
290          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
291
292 static const char *const netdev_lock_name[] =
293         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
294          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
295          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
296          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
297          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
298          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
299          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
300          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
301          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
302          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
303          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
304          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
305          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
306          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
307          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
308
309 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
310 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
311
312 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
313 {
314         int i;
315
316         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
317                 if (netdev_lock_type[i] == dev_type)
318                         return i;
319         /* the last key is used by default */
320         return ARRAY_SIZE(netdev_lock_type) - 1;
321 }
322
323 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
324                                                  unsigned short dev_type)
325 {
326         int i;
327
328         i = netdev_lock_pos(dev_type);
329         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
330                                    netdev_lock_name[i]);
331 }
332
333 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
334 {
335         int i;
336
337         i = netdev_lock_pos(dev->type);
338         lockdep_set_class_and_name(&dev->addr_list_lock,
339                                    &netdev_addr_lock_key[i],
340                                    netdev_lock_name[i]);
341 }
342 #else
343 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
344                                                  unsigned short dev_type)
345 {
346 }
347 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
348 {
349 }
350 #endif
351
352 /*******************************************************************************
353
354                 Protocol management and registration routines
355
356 *******************************************************************************/
357
358 /*
359  *      Add a protocol ID to the list. Now that the input handler is
360  *      smarter we can dispense with all the messy stuff that used to be
361  *      here.
362  *
363  *      BEWARE!!! Protocol handlers, mangling input packets,
364  *      MUST BE last in hash buckets and checking protocol handlers
365  *      MUST start from promiscuous ptype_all chain in net_bh.
366  *      It is true now, do not change it.
367  *      Explanation follows: if protocol handler, mangling packet, will
368  *      be the first on list, it is not able to sense, that packet
369  *      is cloned and should be copied-on-write, so that it will
370  *      change it and subsequent readers will get broken packet.
371  *                                                      --ANK (980803)
372  */
373
374 static inline struct list_head *ptype_head(const struct packet_type *pt)
375 {
376         if (pt->type == htons(ETH_P_ALL))
377                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
378         else
379                 return pt->dev ? &pt->dev->ptype_specific :
380                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
381 }
382
383 /**
384  *      dev_add_pack - add packet handler
385  *      @pt: packet type declaration
386  *
387  *      Add a protocol handler to the networking stack. The passed &packet_type
388  *      is linked into kernel lists and may not be freed until it has been
389  *      removed from the kernel lists.
390  *
391  *      This call does not sleep, therefore it cannot guarantee that
392  *      all CPUs that are in the middle of receiving packets
393  *      will see the new packet type (until the next received packet).
394  */
395
396 void dev_add_pack(struct packet_type *pt)
397 {
398         struct list_head *head = ptype_head(pt);
399
400         spin_lock(&ptype_lock);
401         list_add_rcu(&pt->list, head);
402         spin_unlock(&ptype_lock);
403 }
404 EXPORT_SYMBOL(dev_add_pack);
405
406 /**
407  *      __dev_remove_pack        - remove packet handler
408  *      @pt: packet type declaration
409  *
410  *      Remove a protocol handler that was previously added to the kernel
411  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
412  *      from the kernel lists and can be freed or reused once this function
413  *      returns.
414  *
415  *      The packet type might still be in use by receivers
416  *      and must not be freed until after all the CPU's have gone
417  *      through a quiescent state.
418  */
419 void __dev_remove_pack(struct packet_type *pt)
420 {
421         struct list_head *head = ptype_head(pt);
422         struct packet_type *pt1;
423
424         spin_lock(&ptype_lock);
425
426         list_for_each_entry(pt1, head, list) {
427                 if (pt == pt1) {
428                         list_del_rcu(&pt->list);
429                         goto out;
430                 }
431         }
432
433         pr_warn("dev_remove_pack: %p not found\n", pt);
434 out:
435         spin_unlock(&ptype_lock);
436 }
437 EXPORT_SYMBOL(__dev_remove_pack);
438
439 /**
440  *      dev_remove_pack  - remove packet handler
441  *      @pt: packet type declaration
442  *
443  *      Remove a protocol handler that was previously added to the kernel
444  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
445  *      from the kernel lists and can be freed or reused once this function
446  *      returns.
447  *
448  *      This call sleeps to guarantee that no CPU is looking at the packet
449  *      type after return.
450  */
451 void dev_remove_pack(struct packet_type *pt)
452 {
453         __dev_remove_pack(pt);
454
455         synchronize_net();
456 }
457 EXPORT_SYMBOL(dev_remove_pack);
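
/*
 * Illustrative sketch: how an out-of-tree module might use dev_add_pack()
 * and dev_remove_pack() above to install a receive tap.  The example_*
 * names are assumptions made for illustration; module boilerplate and
 * includes are omitted.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* The skb handed to a tap is shared; inspect it, then consume it. */
	pr_debug("tap: %u bytes on %s\n", skb->len, dev->name);
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* tap every protocol */
	.func = example_tap_rcv,	/* .dev left NULL: match all devices */
};

/* dev_add_pack(&example_tap) at module init,
 * dev_remove_pack(&example_tap) at module exit.
 */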
458
459
460 /**
461  *      dev_add_offload - register offload handlers
462  *      @po: protocol offload declaration
463  *
464  *      Add protocol offload handlers to the networking stack. The passed
465  *      &proto_offload is linked into kernel lists and may not be freed until
466  *      it has been removed from the kernel lists.
467  *
468  *      This call does not sleep, therefore it cannot guarantee that
469  *      all CPUs that are in the middle of receiving packets
470  *      will see the new offload handlers (until the next received packet).
471  */
472 void dev_add_offload(struct packet_offload *po)
473 {
474         struct packet_offload *elem;
475
476         spin_lock(&offload_lock);
477         list_for_each_entry(elem, &offload_base, list) {
478                 if (po->priority < elem->priority)
479                         break;
480         }
481         list_add_rcu(&po->list, elem->list.prev);
482         spin_unlock(&offload_lock);
483 }
484 EXPORT_SYMBOL(dev_add_offload);
485
486 /**
487  *      __dev_remove_offload     - remove offload handler
488  *      @po: packet offload declaration
489  *
490  *      Remove a protocol offload handler that was previously added to the
491  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
492  *      is removed from the kernel lists and can be freed or reused once this
493  *      function returns.
494  *
495  *      The packet type might still be in use by receivers
496  *      and must not be freed until after all the CPU's have gone
497  *      through a quiescent state.
498  */
499 static void __dev_remove_offload(struct packet_offload *po)
500 {
501         struct list_head *head = &offload_base;
502         struct packet_offload *po1;
503
504         spin_lock(&offload_lock);
505
506         list_for_each_entry(po1, head, list) {
507                 if (po == po1) {
508                         list_del_rcu(&po->list);
509                         goto out;
510                 }
511         }
512
513         pr_warn("dev_remove_offload: %p not found\n", po);
514 out:
515         spin_unlock(&offload_lock);
516 }
517
518 /**
519  *      dev_remove_offload       - remove packet offload handler
520  *      @po: packet offload declaration
521  *
522  *      Remove a packet offload handler that was previously added to the kernel
523  *      offload handlers by dev_add_offload(). The passed &offload_type is
524  *      removed from the kernel lists and can be freed or reused once this
525  *      function returns.
526  *
527  *      This call sleeps to guarantee that no CPU is looking at the packet
528  *      type after return.
529  */
530 void dev_remove_offload(struct packet_offload *po)
531 {
532         __dev_remove_offload(po);
533
534         synchronize_net();
535 }
536 EXPORT_SYMBOL(dev_remove_offload);
537
538 /******************************************************************************
539
540                       Device Boot-time Settings Routines
541
542 *******************************************************************************/
543
544 /* Boot time configuration table */
545 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
546
547 /**
548  *      netdev_boot_setup_add   - add new setup entry
549  *      @name: name of the device
550  *      @map: configured settings for the device
551  *
552  *      Adds a new setup entry to the dev_boot_setup list.  The function
553  *      returns 0 on error and 1 on success.  This is a generic routine for
554  *      all netdevices.
555  */
556 static int netdev_boot_setup_add(char *name, struct ifmap *map)
557 {
558         struct netdev_boot_setup *s;
559         int i;
560
561         s = dev_boot_setup;
562         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
563                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
564                         memset(s[i].name, 0, sizeof(s[i].name));
565                         strlcpy(s[i].name, name, IFNAMSIZ);
566                         memcpy(&s[i].map, map, sizeof(s[i].map));
567                         break;
568                 }
569         }
570
571         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
572 }
573
574 /**
575  *      netdev_boot_setup_check - check boot time settings
576  *      @dev: the netdevice
577  *
578  *      Check boot time settings for the device.
579  *      The found settings are set for the device to be used
580  *      later in the device probing.
581  *      Returns 0 if no settings are found, 1 if they are.
582  */
583 int netdev_boot_setup_check(struct net_device *dev)
584 {
585         struct netdev_boot_setup *s = dev_boot_setup;
586         int i;
587
588         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
589                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
590                     !strcmp(dev->name, s[i].name)) {
591                         dev->irq        = s[i].map.irq;
592                         dev->base_addr  = s[i].map.base_addr;
593                         dev->mem_start  = s[i].map.mem_start;
594                         dev->mem_end    = s[i].map.mem_end;
595                         return 1;
596                 }
597         }
598         return 0;
599 }
600 EXPORT_SYMBOL(netdev_boot_setup_check);
601
602
603 /**
604  *      netdev_boot_base        - get address from boot time settings
605  *      @prefix: prefix for network device
606  *      @unit: id for network device
607  *
608  *      Check boot time settings for the base address of the device.
609  *      The found settings are set for the device to be used
610  *      later in the device probing.
611  *      Returns 0 if no settings found.
612  */
613 unsigned long netdev_boot_base(const char *prefix, int unit)
614 {
615         const struct netdev_boot_setup *s = dev_boot_setup;
616         char name[IFNAMSIZ];
617         int i;
618
619         sprintf(name, "%s%d", prefix, unit);
620
621         /*
622          * If device already registered then return base of 1
623          * to indicate not to probe for this interface
624          */
625         if (__dev_get_by_name(&init_net, name))
626                 return 1;
627
628         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
629                 if (!strcmp(name, s[i].name))
630                         return s[i].map.base_addr;
631         return 0;
632 }
633
634 /*
635  * Saves at boot time configured settings for any netdevice.
636  */
637 int __init netdev_boot_setup(char *str)
638 {
639         int ints[5];
640         struct ifmap map;
641
642         str = get_options(str, ARRAY_SIZE(ints), ints);
643         if (!str || !*str)
644                 return 0;
645
646         /* Save settings */
647         memset(&map, 0, sizeof(map));
648         if (ints[0] > 0)
649                 map.irq = ints[1];
650         if (ints[0] > 1)
651                 map.base_addr = ints[2];
652         if (ints[0] > 2)
653                 map.mem_start = ints[3];
654         if (ints[0] > 3)
655                 map.mem_end = ints[4];
656
657         /* Add new entry to the list */
658         return netdev_boot_setup_add(str, &map);
659 }
660
661 __setup("netdev=", netdev_boot_setup);
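
/*
 * Example of the boot parameter handled above (values are made up):
 *
 *	netdev=5,0x300,0,0,eth0
 *
 * stores irq=5, base_addr=0x300 and zero mem_start/mem_end under the
 * name "eth0"; a legacy driver can later retrieve these settings by
 * calling netdev_boot_setup_check(dev) before it probes the hardware.
 */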
662
663 /*******************************************************************************
664
665                             Device Interface Subroutines
666
667 *******************************************************************************/
668
669 /**
670  *      dev_get_iflink  - get 'iflink' value of an interface
671  *      @dev: targeted interface
672  *
673  *      Indicates the ifindex the interface is linked to.
674  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
675  */
676
677 int dev_get_iflink(const struct net_device *dev)
678 {
679         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
680                 return dev->netdev_ops->ndo_get_iflink(dev);
681
682         return dev->ifindex;
683 }
684 EXPORT_SYMBOL(dev_get_iflink);
685
686 /**
687  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
688  *      @dev: targeted interface
689  *      @skb: The packet.
690  *
691  *      For better visibility of tunnel traffic OVS needs to retrieve
692  *      egress tunnel information for a packet. The following API allows
693  *      the user to get this info.
694  */
695 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
696 {
697         struct ip_tunnel_info *info;
698
699         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
700                 return -EINVAL;
701
702         info = skb_tunnel_info_unclone(skb);
703         if (!info)
704                 return -ENOMEM;
705         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
706                 return -EINVAL;
707
708         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
709 }
710 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
711
712 /**
713  *      __dev_get_by_name       - find a device by its name
714  *      @net: the applicable net namespace
715  *      @name: name to find
716  *
717  *      Find an interface by name. Must be called under RTNL semaphore
718  *      or @dev_base_lock. If the name is found a pointer to the device
719  *      is returned. If the name is not found then %NULL is returned. The
720  *      reference counters are not incremented so the caller must be
721  *      careful with locks.
722  */
723
724 struct net_device *__dev_get_by_name(struct net *net, const char *name)
725 {
726         struct net_device *dev;
727         struct hlist_head *head = dev_name_hash(net, name);
728
729         hlist_for_each_entry(dev, head, name_hlist)
730                 if (!strncmp(dev->name, name, IFNAMSIZ))
731                         return dev;
732
733         return NULL;
734 }
735 EXPORT_SYMBOL(__dev_get_by_name);
736
737 /**
738  *      dev_get_by_name_rcu     - find a device by its name
739  *      @net: the applicable net namespace
740  *      @name: name to find
741  *
742  *      Find an interface by name.
743  *      If the name is found a pointer to the device is returned.
744  *      If the name is not found then %NULL is returned.
745  *      The reference counters are not incremented so the caller must be
746  *      careful with locks. The caller must hold RCU lock.
747  */
748
749 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
750 {
751         struct net_device *dev;
752         struct hlist_head *head = dev_name_hash(net, name);
753
754         hlist_for_each_entry_rcu(dev, head, name_hlist)
755                 if (!strncmp(dev->name, name, IFNAMSIZ))
756                         return dev;
757
758         return NULL;
759 }
760 EXPORT_SYMBOL(dev_get_by_name_rcu);
761
762 /**
763  *      dev_get_by_name         - find a device by its name
764  *      @net: the applicable net namespace
765  *      @name: name to find
766  *
767  *      Find an interface by name. This can be called from any
768  *      context and does its own locking. The returned handle has
769  *      the usage count incremented and the caller must use dev_put() to
770  *      release it when it is no longer needed. %NULL is returned if no
771  *      matching device is found.
772  */
773
774 struct net_device *dev_get_by_name(struct net *net, const char *name)
775 {
776         struct net_device *dev;
777
778         rcu_read_lock();
779         dev = dev_get_by_name_rcu(net, name);
780         if (dev)
781                 dev_hold(dev);
782         rcu_read_unlock();
783         return dev;
784 }
785 EXPORT_SYMBOL(dev_get_by_name);
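
/*
 * Illustrative sketch: the two name-lookup disciplines above from a
 * caller's point of view.  The function name, "eth0" and the printouts
 * are assumptions for illustration only.
 */
static void example_name_lookups(struct net *net)
{
	struct net_device *dev;

	/* Refcounted variant: usable from any context, must dev_put(). */
	dev = dev_get_by_name(net, "eth0");
	if (dev) {
		pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
		dev_put(dev);
	}

	/* RCU variant: no reference taken, pointer is only valid inside
	 * the rcu_read_lock()/rcu_read_unlock() section.
	 */
	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, "eth0");
	if (dev)
		pr_info("%s is %s\n", dev->name,
			netif_running(dev) ? "running" : "down");
	rcu_read_unlock();
}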
786
787 /**
788  *      __dev_get_by_index - find a device by its ifindex
789  *      @net: the applicable net namespace
790  *      @ifindex: index of device
791  *
792  *      Search for an interface by index. Returns %NULL if the device
793  *      is not found or a pointer to the device. The device has not
794  *      had its reference counter increased so the caller must be careful
795  *      about locking. The caller must hold either the RTNL semaphore
796  *      or @dev_base_lock.
797  */
798
799 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
800 {
801         struct net_device *dev;
802         struct hlist_head *head = dev_index_hash(net, ifindex);
803
804         hlist_for_each_entry(dev, head, index_hlist)
805                 if (dev->ifindex == ifindex)
806                         return dev;
807
808         return NULL;
809 }
810 EXPORT_SYMBOL(__dev_get_by_index);
811
812 /**
813  *      dev_get_by_index_rcu - find a device by its ifindex
814  *      @net: the applicable net namespace
815  *      @ifindex: index of device
816  *
817  *      Search for an interface by index. Returns %NULL if the device
818  *      is not found or a pointer to the device. The device has not
819  *      had its reference counter increased so the caller must be careful
820  *      about locking. The caller must hold RCU lock.
821  */
822
823 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
824 {
825         struct net_device *dev;
826         struct hlist_head *head = dev_index_hash(net, ifindex);
827
828         hlist_for_each_entry_rcu(dev, head, index_hlist)
829                 if (dev->ifindex == ifindex)
830                         return dev;
831
832         return NULL;
833 }
834 EXPORT_SYMBOL(dev_get_by_index_rcu);
835
836
837 /**
838  *      dev_get_by_index - find a device by its ifindex
839  *      @net: the applicable net namespace
840  *      @ifindex: index of device
841  *
842  *      Search for an interface by index. Returns NULL if the device
843  *      is not found or a pointer to the device. The device returned has
844  *      had a reference added and the pointer is safe until the user calls
845  *      dev_put to indicate they have finished with it.
846  */
847
848 struct net_device *dev_get_by_index(struct net *net, int ifindex)
849 {
850         struct net_device *dev;
851
852         rcu_read_lock();
853         dev = dev_get_by_index_rcu(net, ifindex);
854         if (dev)
855                 dev_hold(dev);
856         rcu_read_unlock();
857         return dev;
858 }
859 EXPORT_SYMBOL(dev_get_by_index);
860
861 /**
862  *      netdev_get_name - get a netdevice name, knowing its ifindex.
863  *      @net: network namespace
864  *      @name: a pointer to the buffer where the name will be stored.
865  *      @ifindex: the ifindex of the interface to get the name from.
866  *
867  *      The use of raw_seqcount_begin() and of blocking on
868  *      devnet_rename_mutex before retrying is required as we want to give
869  *      the writers a chance to complete when CONFIG_PREEMPT is not set.
870  */
871 int netdev_get_name(struct net *net, char *name, int ifindex)
872 {
873         struct net_device *dev;
874         unsigned int seq;
875
876 retry:
877         seq = raw_seqcount_begin(&devnet_rename_seq);
878         rcu_read_lock();
879         dev = dev_get_by_index_rcu(net, ifindex);
880         if (!dev) {
881                 rcu_read_unlock();
882                 return -ENODEV;
883         }
884
885         strcpy(name, dev->name);
886         rcu_read_unlock();
887         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
888                 mutex_lock(&devnet_rename_mutex);
889                 mutex_unlock(&devnet_rename_mutex);
890                 goto retry;
891         }
892
893         return 0;
894 }
895
896 /**
897  *      dev_getbyhwaddr_rcu - find a device by its hardware address
898  *      @net: the applicable net namespace
899  *      @type: media type of device
900  *      @ha: hardware address
901  *
902  *      Search for an interface by MAC address. Returns NULL if the device
903  *      is not found or a pointer to the device.
904  *      The caller must hold RCU or RTNL.
905  *      The returned device has not had its ref count increased
906  *      and the caller must therefore be careful about locking
907  *
908  */
909
910 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
911                                        const char *ha)
912 {
913         struct net_device *dev;
914
915         for_each_netdev_rcu(net, dev)
916                 if (dev->type == type &&
917                     !memcmp(dev->dev_addr, ha, dev->addr_len))
918                         return dev;
919
920         return NULL;
921 }
922 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
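
/*
 * Illustrative sketch: checking whether an Ethernet MAC address is in
 * use via dev_getbyhwaddr_rcu() above.  The helper name is an
 * assumption for illustration.
 */
static bool example_mac_in_use(struct net *net, const char *mac)
{
	struct net_device *dev;
	bool found;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
	found = dev != NULL;	/* dev must not be used outside the RCU section */
	rcu_read_unlock();

	return found;
}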
923
924 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
925 {
926         struct net_device *dev;
927
928         ASSERT_RTNL();
929         for_each_netdev(net, dev)
930                 if (dev->type == type)
931                         return dev;
932
933         return NULL;
934 }
935 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
936
937 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
938 {
939         struct net_device *dev, *ret = NULL;
940
941         rcu_read_lock();
942         for_each_netdev_rcu(net, dev)
943                 if (dev->type == type) {
944                         dev_hold(dev);
945                         ret = dev;
946                         break;
947                 }
948         rcu_read_unlock();
949         return ret;
950 }
951 EXPORT_SYMBOL(dev_getfirstbyhwtype);
952
953 /**
954  *      __dev_get_by_flags - find any device with given flags
955  *      @net: the applicable net namespace
956  *      @if_flags: IFF_* values
957  *      @mask: bitmask of bits in if_flags to check
958  *
959  *      Search for any interface with the given flags. Returns NULL if a device
960  *      is not found or a pointer to the device. Must be called inside
961  *      rtnl_lock(), and result refcount is unchanged.
962  */
963
964 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
965                                       unsigned short mask)
966 {
967         struct net_device *dev, *ret;
968
969         ASSERT_RTNL();
970
971         ret = NULL;
972         for_each_netdev(net, dev) {
973                 if (((dev->flags ^ if_flags) & mask) == 0) {
974                         ret = dev;
975                         break;
976                 }
977         }
978         return ret;
979 }
980 EXPORT_SYMBOL(__dev_get_by_flags);
981
982 /**
983  *      dev_valid_name - check if name is okay for network device
984  *      @name: name string
985  *
986  *      Network device names need to be valid file names to
987  *      allow sysfs to work.  We also disallow any kind of
988  *      whitespace.
989  */
990 bool dev_valid_name(const char *name)
991 {
992         if (*name == '\0')
993                 return false;
994         if (strlen(name) >= IFNAMSIZ)
995                 return false;
996         if (!strcmp(name, ".") || !strcmp(name, ".."))
997                 return false;
998
999         while (*name) {
1000                 if (*name == '/' || *name == ':' || isspace(*name))
1001                         return false;
1002                 name++;
1003         }
1004         return true;
1005 }
1006 EXPORT_SYMBOL(dev_valid_name);
1007
1008 /**
1009  *      __dev_alloc_name - allocate a name for a device
1010  *      @net: network namespace to allocate the device name in
1011  *      @name: name format string
1012  *      @buf:  scratch buffer and result name string
1013  *
1014  *      Passed a format string - eg "lt%d" - it will try to find a suitable
1015  *      id. It scans the list of devices to build up a free map, then chooses
1016  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1017  *      while allocating the name and adding the device in order to avoid
1018  *      duplicates.
1019  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1020  *      Returns the number of the unit assigned or a negative errno code.
1021  */
1022
1023 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1024 {
1025         int i = 0;
1026         const char *p;
1027         const int max_netdevices = 8*PAGE_SIZE;
1028         unsigned long *inuse;
1029         struct net_device *d;
1030
1031         p = strnchr(name, IFNAMSIZ-1, '%');
1032         if (p) {
1033                 /*
1034                  * Verify the string as this thing may have come from
1035                  * the user.  There must be either one "%d" and no other "%"
1036                  * characters.
1037                  */
1038                 if (p[1] != 'd' || strchr(p + 2, '%'))
1039                         return -EINVAL;
1040
1041                 /* Use one page as a bit array of possible slots */
1042                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1043                 if (!inuse)
1044                         return -ENOMEM;
1045
1046                 for_each_netdev(net, d) {
1047                         if (!sscanf(d->name, name, &i))
1048                                 continue;
1049                         if (i < 0 || i >= max_netdevices)
1050                                 continue;
1051
1052                         /*  avoid cases where sscanf is not exact inverse of printf */
1053                         snprintf(buf, IFNAMSIZ, name, i);
1054                         if (!strncmp(buf, d->name, IFNAMSIZ))
1055                                 set_bit(i, inuse);
1056                 }
1057
1058                 i = find_first_zero_bit(inuse, max_netdevices);
1059                 free_page((unsigned long) inuse);
1060         }
1061
1062         if (buf != name)
1063                 snprintf(buf, IFNAMSIZ, name, i);
1064         if (!__dev_get_by_name(net, buf))
1065                 return i;
1066
1067         /* It is possible to run out of possible slots
1068          * when the name is long and there isn't enough space left
1069          * for the digits, or if all bits are used.
1070          */
1071         return -ENFILE;
1072 }
1073
1074 /**
1075  *      dev_alloc_name - allocate a name for a device
1076  *      @dev: device
1077  *      @name: name format string
1078  *
1079  *      Passed a format string - eg "lt%d" - it will try to find a suitable
1080  *      id. It scans the list of devices to build up a free map, then chooses
1081  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1082  *      while allocating the name and adding the device in order to avoid
1083  *      duplicates.
1084  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1085  *      Returns the number of the unit assigned or a negative errno code.
1086  */
1087
1088 int dev_alloc_name(struct net_device *dev, const char *name)
1089 {
1090         char buf[IFNAMSIZ];
1091         struct net *net;
1092         int ret;
1093
1094         BUG_ON(!dev_net(dev));
1095         net = dev_net(dev);
1096         ret = __dev_alloc_name(net, name, buf);
1097         if (ret >= 0)
1098                 strlcpy(dev->name, buf, IFNAMSIZ);
1099         return ret;
1100 }
1101 EXPORT_SYMBOL(dev_alloc_name);
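
/*
 * Illustrative sketch: how a requested name is typically resolved before
 * registration, mirroring dev_get_valid_name() below.  The example_*
 * name and the "myif%d" pattern are assumptions for illustration; the
 * real helper below additionally rejects duplicates of literal names.
 */
static int example_pick_name(struct net_device *dev, const char *req)
{
	if (!dev_valid_name(req))
		return -EINVAL;

	if (strchr(req, '%'))			/* e.g. "myif%d": first free unit */
		return dev_alloc_name(dev, req);

	strlcpy(dev->name, req, IFNAMSIZ);	/* literal name used as-is */
	return 0;
}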
1102
1103 static int dev_alloc_name_ns(struct net *net,
1104                              struct net_device *dev,
1105                              const char *name)
1106 {
1107         char buf[IFNAMSIZ];
1108         int ret;
1109
1110         ret = __dev_alloc_name(net, name, buf);
1111         if (ret >= 0)
1112                 strlcpy(dev->name, buf, IFNAMSIZ);
1113         return ret;
1114 }
1115
1116 static int dev_get_valid_name(struct net *net,
1117                               struct net_device *dev,
1118                               const char *name)
1119 {
1120         BUG_ON(!net);
1121
1122         if (!dev_valid_name(name))
1123                 return -EINVAL;
1124
1125         if (strchr(name, '%'))
1126                 return dev_alloc_name_ns(net, dev, name);
1127         else if (__dev_get_by_name(net, name))
1128                 return -EEXIST;
1129         else if (dev->name != name)
1130                 strlcpy(dev->name, name, IFNAMSIZ);
1131
1132         return 0;
1133 }
1134
1135 /**
1136  *      dev_change_name - change name of a device
1137  *      @dev: device
1138  *      @newname: name (or format string) must be at least IFNAMSIZ
1139  *
1140  *      Change the name of a device; a format string such as "eth%d"
1141  *      can be passed for wildcarding.
1142  */
1143 int dev_change_name(struct net_device *dev, const char *newname)
1144 {
1145         unsigned char old_assign_type;
1146         char oldname[IFNAMSIZ];
1147         int err = 0;
1148         int ret;
1149         struct net *net;
1150
1151         ASSERT_RTNL();
1152         BUG_ON(!dev_net(dev));
1153
1154         net = dev_net(dev);
1155         if (dev->flags & IFF_UP)
1156                 return -EBUSY;
1157
1158         mutex_lock(&devnet_rename_mutex);
1159         __raw_write_seqcount_begin(&devnet_rename_seq);
1160
1161         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1162                 goto outunlock;
1163
1164         memcpy(oldname, dev->name, IFNAMSIZ);
1165
1166         err = dev_get_valid_name(net, dev, newname);
1167         if (err < 0)
1168                 goto outunlock;
1169
1170         if (oldname[0] && !strchr(oldname, '%'))
1171                 netdev_info(dev, "renamed from %s\n", oldname);
1172
1173         old_assign_type = dev->name_assign_type;
1174         dev->name_assign_type = NET_NAME_RENAMED;
1175
1176 rollback:
1177         ret = device_rename(&dev->dev, dev->name);
1178         if (ret) {
1179                 memcpy(dev->name, oldname, IFNAMSIZ);
1180                 dev->name_assign_type = old_assign_type;
1181                 err = ret;
1182                 goto outunlock;
1183         }
1184
1185         __raw_write_seqcount_end(&devnet_rename_seq);
1186         mutex_unlock(&devnet_rename_mutex);
1187
1188         netdev_adjacent_rename_links(dev, oldname);
1189
1190         write_lock_bh(&dev_base_lock);
1191         hlist_del_rcu(&dev->name_hlist);
1192         write_unlock_bh(&dev_base_lock);
1193
1194         synchronize_rcu();
1195
1196         write_lock_bh(&dev_base_lock);
1197         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1198         write_unlock_bh(&dev_base_lock);
1199
1200         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1201         ret = notifier_to_errno(ret);
1202
1203         if (ret) {
1204                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1205                 if (err >= 0) {
1206                         err = ret;
1207                         mutex_lock(&devnet_rename_mutex);
1208                         __raw_write_seqcount_begin(&devnet_rename_seq);
1209                         memcpy(dev->name, oldname, IFNAMSIZ);
1210                         memcpy(oldname, newname, IFNAMSIZ);
1211                         dev->name_assign_type = old_assign_type;
1212                         old_assign_type = NET_NAME_RENAMED;
1213                         goto rollback;
1214                 } else {
1215                         pr_err("%s: name change rollback failed: %d\n",
1216                                dev->name, ret);
1217                 }
1218         }
1219
1220         return err;
1221
1222 outunlock:
1223         __raw_write_seqcount_end(&devnet_rename_seq);
1224         mutex_unlock(&devnet_rename_mutex);
1225         return err;
1226 }
1227
1228 /**
1229  *      dev_set_alias - change ifalias of a device
1230  *      @dev: device
1231  *      @alias: name up to IFALIASZ
1232  *      @len: limit of bytes to copy from info
1233  *
1234  *      Set ifalias for a device,
1235  */
1236 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1237 {
1238         char *new_ifalias;
1239
1240         ASSERT_RTNL();
1241
1242         if (len >= IFALIASZ)
1243                 return -EINVAL;
1244
1245         if (!len) {
1246                 kfree(dev->ifalias);
1247                 dev->ifalias = NULL;
1248                 return 0;
1249         }
1250
1251         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1252         if (!new_ifalias)
1253                 return -ENOMEM;
1254         dev->ifalias = new_ifalias;
1255
1256         strlcpy(dev->ifalias, alias, len+1);
1257         return len;
1258 }
1259
1260
1261 /**
1262  *      netdev_features_change - device changes features
1263  *      @dev: device to cause notification
1264  *
1265  *      Called to indicate a device has changed features.
1266  */
1267 void netdev_features_change(struct net_device *dev)
1268 {
1269         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1270 }
1271 EXPORT_SYMBOL(netdev_features_change);
1272
1273 /**
1274  *      netdev_state_change - device changes state
1275  *      @dev: device to cause notification
1276  *
1277  *      Called to indicate a device has changed state. This function calls
1278  *      the notifier chains for netdev_chain and sends a NEWLINK message
1279  *      to the routing socket.
1280  */
1281 void netdev_state_change(struct net_device *dev)
1282 {
1283         if (dev->flags & IFF_UP) {
1284                 struct netdev_notifier_change_info change_info;
1285
1286                 change_info.flags_changed = 0;
1287                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1288                                               &change_info.info);
1289                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1290         }
1291 }
1292 EXPORT_SYMBOL(netdev_state_change);
1293
1294 /**
1295  *      netdev_notify_peers - notify network peers about existence of @dev
1296  *      @dev: network device
1297  *
1298  * Generate traffic such that interested network peers are aware of
1299  * @dev, such as by generating a gratuitous ARP. This may be used when
1300  * a device wants to inform the rest of the network about some sort of
1301  * reconfiguration such as a failover event or virtual machine
1302  * migration.
1303  */
1304 void netdev_notify_peers(struct net_device *dev)
1305 {
1306         rtnl_lock();
1307         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1308         rtnl_unlock();
1309 }
1310 EXPORT_SYMBOL(netdev_notify_peers);
1311
1312 static int __dev_open(struct net_device *dev)
1313 {
1314         const struct net_device_ops *ops = dev->netdev_ops;
1315         int ret;
1316
1317         ASSERT_RTNL();
1318
1319         if (!netif_device_present(dev))
1320                 return -ENODEV;
1321
1322         /* Block netpoll from trying to do any rx path servicing.
1323          * If we don't do this there is a chance ndo_poll_controller
1324          * or ndo_poll may be running while we open the device
1325          */
1326         netpoll_poll_disable(dev);
1327
1328         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1329         ret = notifier_to_errno(ret);
1330         if (ret)
1331                 return ret;
1332
1333         set_bit(__LINK_STATE_START, &dev->state);
1334
1335         if (ops->ndo_validate_addr)
1336                 ret = ops->ndo_validate_addr(dev);
1337
1338         if (!ret && ops->ndo_open)
1339                 ret = ops->ndo_open(dev);
1340
1341         netpoll_poll_enable(dev);
1342
1343         if (ret)
1344                 clear_bit(__LINK_STATE_START, &dev->state);
1345         else {
1346                 dev->flags |= IFF_UP;
1347                 dev_set_rx_mode(dev);
1348                 dev_activate(dev);
1349                 add_device_randomness(dev->dev_addr, dev->addr_len);
1350         }
1351
1352         return ret;
1353 }
1354
1355 /**
1356  *      dev_open        - prepare an interface for use.
1357  *      @dev:   device to open
1358  *
1359  *      Takes a device from down to up state. The device's private open
1360  *      function is invoked and then the multicast lists are loaded. Finally
1361  *      the device is moved into the up state and a %NETDEV_UP message is
1362  *      sent to the netdev notifier chain.
1363  *
1364  *      Calling this function on an active interface is a nop. On a failure
1365  *      a negative errno code is returned.
1366  */
1367 int dev_open(struct net_device *dev)
1368 {
1369         int ret;
1370
1371         if (dev->flags & IFF_UP)
1372                 return 0;
1373
1374         ret = __dev_open(dev);
1375         if (ret < 0)
1376                 return ret;
1377
1378         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1379         call_netdevice_notifiers(NETDEV_UP, dev);
1380
1381         return ret;
1382 }
1383 EXPORT_SYMBOL(dev_open);
1384
1385 static int __dev_close_many(struct list_head *head)
1386 {
1387         struct net_device *dev;
1388
1389         ASSERT_RTNL();
1390         might_sleep();
1391
1392         list_for_each_entry(dev, head, close_list) {
1393                 /* Temporarily disable netpoll until the interface is down */
1394                 netpoll_poll_disable(dev);
1395
1396                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1397
1398                 clear_bit(__LINK_STATE_START, &dev->state);
1399
1400                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1401                  * can be even on different cpu. So just clear netif_running().
1402                  *
1403                  * dev->stop() will invoke napi_disable() on all of its
1404                  * napi_struct instances on this device.
1405                  */
1406                 smp_mb__after_atomic(); /* Commit netif_running(). */
1407         }
1408
1409         dev_deactivate_many(head);
1410
1411         list_for_each_entry(dev, head, close_list) {
1412                 const struct net_device_ops *ops = dev->netdev_ops;
1413
1414                 /*
1415                  *      Call the device specific close. This cannot fail.
1416                  *      Only if device is UP
1417                  *
1418                  *      We allow it to be called even after a DETACH hot-plug
1419                  *      event.
1420                  */
1421                 if (ops->ndo_stop)
1422                         ops->ndo_stop(dev);
1423
1424                 dev->flags &= ~IFF_UP;
1425                 netpoll_poll_enable(dev);
1426         }
1427
1428         return 0;
1429 }
1430
1431 static int __dev_close(struct net_device *dev)
1432 {
1433         int retval;
1434         LIST_HEAD(single);
1435
1436         list_add(&dev->close_list, &single);
1437         retval = __dev_close_many(&single);
1438         list_del(&single);
1439
1440         return retval;
1441 }
1442
1443 int dev_close_many(struct list_head *head, bool unlink)
1444 {
1445         struct net_device *dev, *tmp;
1446
1447         /* Remove the devices that don't need to be closed */
1448         list_for_each_entry_safe(dev, tmp, head, close_list)
1449                 if (!(dev->flags & IFF_UP))
1450                         list_del_init(&dev->close_list);
1451
1452         __dev_close_many(head);
1453
1454         list_for_each_entry_safe(dev, tmp, head, close_list) {
1455                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1456                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1457                 if (unlink)
1458                         list_del_init(&dev->close_list);
1459         }
1460
1461         return 0;
1462 }
1463 EXPORT_SYMBOL(dev_close_many);
1464
1465 /**
1466  *      dev_close - shutdown an interface.
1467  *      @dev: device to shutdown
1468  *
1469  *      This function moves an active device into down state. A
1470  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1471  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1472  *      chain.
1473  */
1474 int dev_close(struct net_device *dev)
1475 {
1476         if (dev->flags & IFF_UP) {
1477                 LIST_HEAD(single);
1478
1479                 list_add(&dev->close_list, &single);
1480                 dev_close_many(&single, true);
1481                 list_del(&single);
1482         }
1483         return 0;
1484 }
1485 EXPORT_SYMBOL(dev_close);
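
/*
 * Illustrative sketch: bouncing an interface from kernel code with
 * dev_open()/dev_close() above.  Both calls require the RTNL lock; the
 * caller is assumed to hold a reference on @dev.
 */
static int example_bounce_interface(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);		/* no-op if the device is already up */
	if (!err)
		dev_close(dev);		/* always returns 0 */
	rtnl_unlock();

	return err;
}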
1486
1487
1488 /**
1489  *      dev_disable_lro - disable Large Receive Offload on a device
1490  *      @dev: device
1491  *
1492  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1493  *      called under RTNL.  This is needed if received packets may be
1494  *      forwarded to another interface.
1495  */
1496 void dev_disable_lro(struct net_device *dev)
1497 {
1498         struct net_device *lower_dev;
1499         struct list_head *iter;
1500
1501         dev->wanted_features &= ~NETIF_F_LRO;
1502         netdev_update_features(dev);
1503
1504         if (unlikely(dev->features & NETIF_F_LRO))
1505                 netdev_WARN(dev, "failed to disable LRO!\n");
1506
1507         netdev_for_each_lower_dev(dev, lower_dev, iter)
1508                 dev_disable_lro(lower_dev);
1509 }
1510 EXPORT_SYMBOL(dev_disable_lro);
1511
1512 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1513                                    struct net_device *dev)
1514 {
1515         struct netdev_notifier_info info;
1516
1517         netdev_notifier_info_init(&info, dev);
1518         return nb->notifier_call(nb, val, &info);
1519 }
1520
1521 static int dev_boot_phase = 1;
1522
1523 /**
1524  *      register_netdevice_notifier - register a network notifier block
1525  *      @nb: notifier
1526  *
1527  *      Register a notifier to be called when network device events occur.
1528  *      The notifier passed is linked into the kernel structures and must
1529  *      not be reused until it has been unregistered. A negative errno code
1530  *      is returned on a failure.
1531  *
1532  *      When registered, all registration and up events are replayed
1533  *      to the new notifier to allow the device to have a race-free
1534  *      view of the network device list.
1535  */
1536
1537 int register_netdevice_notifier(struct notifier_block *nb)
1538 {
1539         struct net_device *dev;
1540         struct net_device *last;
1541         struct net *net;
1542         int err;
1543
1544         rtnl_lock();
1545         err = raw_notifier_chain_register(&netdev_chain, nb);
1546         if (err)
1547                 goto unlock;
1548         if (dev_boot_phase)
1549                 goto unlock;
1550         for_each_net(net) {
1551                 for_each_netdev(net, dev) {
1552                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1553                         err = notifier_to_errno(err);
1554                         if (err)
1555                                 goto rollback;
1556
1557                         if (!(dev->flags & IFF_UP))
1558                                 continue;
1559
1560                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1561                 }
1562         }
1563
1564 unlock:
1565         rtnl_unlock();
1566         return err;
1567
1568 rollback:
1569         last = dev;
1570         for_each_net(net) {
1571                 for_each_netdev(net, dev) {
1572                         if (dev == last)
1573                                 goto outroll;
1574
1575                         if (dev->flags & IFF_UP) {
1576                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1577                                                         dev);
1578                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1579                         }
1580                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1581                 }
1582         }
1583
1584 outroll:
1585         raw_notifier_chain_unregister(&netdev_chain, nb);
1586         goto unlock;
1587 }
1588 EXPORT_SYMBOL(register_netdevice_notifier);
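/* Illustrative sketch (not part of this file): how a subsystem might
 * register a notifier; "my_netdev_event" and "my_nb" are hypothetical.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	err = register_netdevice_notifier(&my_nb);
 */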
1589
1590 /**
1591  *      unregister_netdevice_notifier - unregister a network notifier block
1592  *      @nb: notifier
1593  *
1594  *      Unregister a notifier previously registered by
1595  *      register_netdevice_notifier(). The notifier is unlinked from the
1596  *      kernel structures and may then be reused. A negative errno code
1597  *      is returned on a failure.
1598  *
1599  *      After unregistering, unregister and down device events are synthesized
1600  *      for all devices on the device list and delivered to the removed
1601  *      notifier, removing the need for special-case cleanup code.
1602  */
1603
1604 int unregister_netdevice_notifier(struct notifier_block *nb)
1605 {
1606         struct net_device *dev;
1607         struct net *net;
1608         int err;
1609
1610         rtnl_lock();
1611         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1612         if (err)
1613                 goto unlock;
1614
1615         for_each_net(net) {
1616                 for_each_netdev(net, dev) {
1617                         if (dev->flags & IFF_UP) {
1618                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1619                                                         dev);
1620                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1621                         }
1622                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1623                 }
1624         }
1625 unlock:
1626         rtnl_unlock();
1627         return err;
1628 }
1629 EXPORT_SYMBOL(unregister_netdevice_notifier);
1630
1631 /**
1632  *      call_netdevice_notifiers_info - call all network notifier blocks
1633  *      @val: value passed unmodified to notifier function
1634  *      @dev: net_device pointer passed unmodified to notifier function
1635  *      @info: notifier information data
1636  *
1637  *      Call all network notifier blocks.  Parameters and return value
1638  *      are as for raw_notifier_call_chain().
1639  */
1640
1641 static int call_netdevice_notifiers_info(unsigned long val,
1642                                          struct net_device *dev,
1643                                          struct netdev_notifier_info *info)
1644 {
1645         ASSERT_RTNL();
1646         netdev_notifier_info_init(info, dev);
1647         return raw_notifier_call_chain(&netdev_chain, val, info);
1648 }
1649
1650 /**
1651  *      call_netdevice_notifiers - call all network notifier blocks
1652  *      @val: value passed unmodified to notifier function
1653  *      @dev: net_device pointer passed unmodified to notifier function
1654  *
1655  *      Call all network notifier blocks.  Parameters and return value
1656  *      are as for raw_notifier_call_chain().
1657  */
1658
1659 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1660 {
1661         struct netdev_notifier_info info;
1662
1663         return call_netdevice_notifiers_info(val, dev, &info);
1664 }
1665 EXPORT_SYMBOL(call_netdevice_notifiers);
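/* Illustrative sketch (not part of this file): core code fires a
 * notification after changing device state, always under RTNL, e.g.:
 *
 *	dev->mtu = new_mtu;
 *	call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 */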
1666
1667 #ifdef CONFIG_NET_INGRESS
1668 static struct static_key ingress_needed __read_mostly;
1669
1670 void net_inc_ingress_queue(void)
1671 {
1672         static_key_slow_inc(&ingress_needed);
1673 }
1674 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1675
1676 void net_dec_ingress_queue(void)
1677 {
1678         static_key_slow_dec(&ingress_needed);
1679 }
1680 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1681 #endif
1682
1683 static struct static_key netstamp_needed __read_mostly;
1684 #ifdef HAVE_JUMP_LABEL
1685 static atomic_t netstamp_needed_deferred;
1686 static void netstamp_clear(struct work_struct *work)
1687 {
1688         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1689
1690         while (deferred--)
1691                 static_key_slow_dec(&netstamp_needed);
1692 }
1693 static DECLARE_WORK(netstamp_work, netstamp_clear);
1694 #endif
1695
1696 void net_enable_timestamp(void)
1697 {
1698         static_key_slow_inc(&netstamp_needed);
1699 }
1700 EXPORT_SYMBOL(net_enable_timestamp);
1701
1702 void net_disable_timestamp(void)
1703 {
1704 #ifdef HAVE_JUMP_LABEL
1705         /* net_disable_timestamp() can be called from non-process context */
1706         atomic_inc(&netstamp_needed_deferred);
1707         schedule_work(&netstamp_work);
1708 #else
1709         static_key_slow_dec(&netstamp_needed);
1710 #endif
1711 }
1712 EXPORT_SYMBOL(net_disable_timestamp);
1713
1714 static inline void net_timestamp_set(struct sk_buff *skb)
1715 {
1716         skb->tstamp.tv64 = 0;
1717         if (static_key_false(&netstamp_needed))
1718                 __net_timestamp(skb);
1719 }
1720
1721 #define net_timestamp_check(COND, SKB)                  \
1722         if (static_key_false(&netstamp_needed)) {               \
1723                 if ((COND) && !(SKB)->tstamp.tv64)      \
1724                         __net_timestamp(SKB);           \
1725         }                                               \
1726
1727 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1728 {
1729         unsigned int len;
1730
1731         if (!(dev->flags & IFF_UP))
1732                 return false;
1733
1734         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1735         if (skb->len <= len)
1736                 return true;
1737
1738         /* if TSO is enabled, we don't care about the length as the packet
1739          * may be forwarded without being segmented first
1740          */
1741         if (skb_is_gso(skb))
1742                 return true;
1743
1744         return false;
1745 }
1746 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1747
1748 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1749 {
1750         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1751             unlikely(!is_skb_forwardable(dev, skb))) {
1752                 atomic_long_inc(&dev->rx_dropped);
1753                 kfree_skb(skb);
1754                 return NET_RX_DROP;
1755         }
1756
1757         skb_scrub_packet(skb, true);
1758         skb->priority = 0;
1759         skb->protocol = eth_type_trans(skb, dev);
1760         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1761
1762         return 0;
1763 }
1764 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1765
1766 /**
1767  * dev_forward_skb - loopback an skb to another netif
1768  *
1769  * @dev: destination network device
1770  * @skb: buffer to forward
1771  *
1772  * return values:
1773  *      NET_RX_SUCCESS  (no congestion)
1774  *      NET_RX_DROP     (packet was dropped, but freed)
1775  *
1776  * dev_forward_skb can be used for injecting an skb from the
1777  * start_xmit function of one device into the receive queue
1778  * of another device.
1779  *
1780  * The receiving device may be in another namespace, so
1781  * we have to clear all information in the skb that could
1782  * impact namespace isolation.
1783  */
1784 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1785 {
1786         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1787 }
1788 EXPORT_SYMBOL_GPL(dev_forward_skb);
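/* Illustrative sketch (not part of this file): a virtual driver such as
 * veth can hand a transmitted skb straight to its peer's receive path.
 * "my_start_xmit", "my_get_peer" and "peer" are hypothetical.
 *
 *	static netdev_tx_t my_start_xmit(struct sk_buff *skb,
 *					 struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);	// hypothetical
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */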
1789
1790 static inline int deliver_skb(struct sk_buff *skb,
1791                               struct packet_type *pt_prev,
1792                               struct net_device *orig_dev)
1793 {
1794         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1795                 return -ENOMEM;
1796         atomic_inc(&skb->users);
1797         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1798 }
1799
1800 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1801                                           struct packet_type **pt,
1802                                           struct net_device *orig_dev,
1803                                           __be16 type,
1804                                           struct list_head *ptype_list)
1805 {
1806         struct packet_type *ptype, *pt_prev = *pt;
1807
1808         list_for_each_entry_rcu(ptype, ptype_list, list) {
1809                 if (ptype->type != type)
1810                         continue;
1811                 if (pt_prev)
1812                         deliver_skb(skb, pt_prev, orig_dev);
1813                 pt_prev = ptype;
1814         }
1815         *pt = pt_prev;
1816 }
1817
1818 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1819 {
1820         if (!ptype->af_packet_priv || !skb->sk)
1821                 return false;
1822
1823         if (ptype->id_match)
1824                 return ptype->id_match(ptype, skb->sk);
1825         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1826                 return true;
1827
1828         return false;
1829 }
1830
1831 /*
1832  *      Support routine. Sends outgoing frames to any network
1833  *      taps currently in use.
1834  */
1835
1836 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1837 {
1838         struct packet_type *ptype;
1839         struct sk_buff *skb2 = NULL;
1840         struct packet_type *pt_prev = NULL;
1841         struct list_head *ptype_list = &ptype_all;
1842
1843         rcu_read_lock();
1844 again:
1845         list_for_each_entry_rcu(ptype, ptype_list, list) {
1846                 /* Never send packets back to the socket
1847                  * they originated from - MvS (miquels@drinkel.ow.org)
1848                  */
1849                 if (skb_loop_sk(ptype, skb))
1850                         continue;
1851
1852                 if (pt_prev) {
1853                         deliver_skb(skb2, pt_prev, skb->dev);
1854                         pt_prev = ptype;
1855                         continue;
1856                 }
1857
1858                 /* need to clone skb, done only once */
1859                 skb2 = skb_clone(skb, GFP_ATOMIC);
1860                 if (!skb2)
1861                         goto out_unlock;
1862
1863                 net_timestamp_set(skb2);
1864
1865                 /* skb->nh should be correctly
1866                  * set by sender, so that the second statement is
1867                  * just protection against buggy protocols.
1868                  */
1869                 skb_reset_mac_header(skb2);
1870
1871                 if (skb_network_header(skb2) < skb2->data ||
1872                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1873                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1874                                              ntohs(skb2->protocol),
1875                                              dev->name);
1876                         skb_reset_network_header(skb2);
1877                 }
1878
1879                 skb2->transport_header = skb2->network_header;
1880                 skb2->pkt_type = PACKET_OUTGOING;
1881                 pt_prev = ptype;
1882         }
1883
1884         if (ptype_list == &ptype_all) {
1885                 ptype_list = &dev->ptype_all;
1886                 goto again;
1887         }
1888 out_unlock:
1889         if (pt_prev)
1890                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1891         rcu_read_unlock();
1892 }
1893
1894 /**
1895  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1896  * @dev: Network device
1897  * @txq: number of queues available
1898  *
1899  * If real_num_tx_queues is changed the tc mappings may no longer be
1900  * valid. To resolve this, verify that each tc mapping remains valid and,
1901  * if not, zero the mapping. With no priorities mapping to an
1902  * offset/count pair, that pair will no longer be used. In the worst case,
1903  * when TC0 is invalid, nothing can be done, so priority mappings are
1904  * disabled entirely. It is expected that drivers will fix this mapping
1905  * if they can before calling netif_set_real_num_tx_queues.
1906  */
1907 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1908 {
1909         int i;
1910         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1911
1912         /* If TC0 is invalidated disable TC mapping */
1913         if (tc->offset + tc->count > txq) {
1914                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1915                 dev->num_tc = 0;
1916                 return;
1917         }
1918
1919         /* Invalidated prio to tc mappings set to TC0 */
1920         for (i = 1; i < TC_BITMASK + 1; i++) {
1921                 int q = netdev_get_prio_tc_map(dev, i);
1922
1923                 tc = &dev->tc_to_txq[q];
1924                 if (tc->offset + tc->count > txq) {
1925                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1926                                 i, q);
1927                         netdev_set_prio_tc_map(dev, i, 0);
1928                 }
1929         }
1930 }
1931
1932 #ifdef CONFIG_XPS
1933 static DEFINE_MUTEX(xps_map_mutex);
1934 #define xmap_dereference(P)             \
1935         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1936
1937 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1938                                         int cpu, u16 index)
1939 {
1940         struct xps_map *map = NULL;
1941         int pos;
1942
1943         if (dev_maps)
1944                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1945
1946         for (pos = 0; map && pos < map->len; pos++) {
1947                 if (map->queues[pos] == index) {
1948                         if (map->len > 1) {
1949                                 map->queues[pos] = map->queues[--map->len];
1950                         } else {
1951                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1952                                 kfree_rcu(map, rcu);
1953                                 map = NULL;
1954                         }
1955                         break;
1956                 }
1957         }
1958
1959         return map;
1960 }
1961
1962 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1963 {
1964         struct xps_dev_maps *dev_maps;
1965         int cpu, i;
1966         bool active = false;
1967
1968         mutex_lock(&xps_map_mutex);
1969         dev_maps = xmap_dereference(dev->xps_maps);
1970
1971         if (!dev_maps)
1972                 goto out_no_maps;
1973
1974         for_each_possible_cpu(cpu) {
1975                 for (i = index; i < dev->num_tx_queues; i++) {
1976                         if (!remove_xps_queue(dev_maps, cpu, i))
1977                                 break;
1978                 }
1979                 if (i == dev->num_tx_queues)
1980                         active = true;
1981         }
1982
1983         if (!active) {
1984                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1985                 kfree_rcu(dev_maps, rcu);
1986         }
1987
1988         for (i = index; i < dev->num_tx_queues; i++)
1989                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1990                                              NUMA_NO_NODE);
1991
1992 out_no_maps:
1993         mutex_unlock(&xps_map_mutex);
1994 }
1995
1996 static struct xps_map *expand_xps_map(struct xps_map *map,
1997                                       int cpu, u16 index)
1998 {
1999         struct xps_map *new_map;
2000         int alloc_len = XPS_MIN_MAP_ALLOC;
2001         int i, pos;
2002
2003         for (pos = 0; map && pos < map->len; pos++) {
2004                 if (map->queues[pos] != index)
2005                         continue;
2006                 return map;
2007         }
2008
2009         /* Need to add queue to this CPU's existing map */
2010         if (map) {
2011                 if (pos < map->alloc_len)
2012                         return map;
2013
2014                 alloc_len = map->alloc_len * 2;
2015         }
2016
2017         /* Need to allocate a new map to store the queue on this CPU */
2018         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2019                                cpu_to_node(cpu));
2020         if (!new_map)
2021                 return NULL;
2022
2023         for (i = 0; i < pos; i++)
2024                 new_map->queues[i] = map->queues[i];
2025         new_map->alloc_len = alloc_len;
2026         new_map->len = pos;
2027
2028         return new_map;
2029 }
2030
2031 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2032                         u16 index)
2033 {
2034         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2035         struct xps_map *map, *new_map;
2036         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2037         int cpu, numa_node_id = -2;
2038         bool active = false;
2039
2040         mutex_lock(&xps_map_mutex);
2041
2042         dev_maps = xmap_dereference(dev->xps_maps);
2043
2044         /* allocate memory for queue storage */
2045         for_each_online_cpu(cpu) {
2046                 if (!cpumask_test_cpu(cpu, mask))
2047                         continue;
2048
2049                 if (!new_dev_maps)
2050                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2051                 if (!new_dev_maps) {
2052                         mutex_unlock(&xps_map_mutex);
2053                         return -ENOMEM;
2054                 }
2055
2056                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2057                                  NULL;
2058
2059                 map = expand_xps_map(map, cpu, index);
2060                 if (!map)
2061                         goto error;
2062
2063                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2064         }
2065
2066         if (!new_dev_maps)
2067                 goto out_no_new_maps;
2068
2069         for_each_possible_cpu(cpu) {
2070                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2071                         /* add queue to CPU maps */
2072                         int pos = 0;
2073
2074                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2075                         while ((pos < map->len) && (map->queues[pos] != index))
2076                                 pos++;
2077
2078                         if (pos == map->len)
2079                                 map->queues[map->len++] = index;
2080 #ifdef CONFIG_NUMA
2081                         if (numa_node_id == -2)
2082                                 numa_node_id = cpu_to_node(cpu);
2083                         else if (numa_node_id != cpu_to_node(cpu))
2084                                 numa_node_id = -1;
2085 #endif
2086                 } else if (dev_maps) {
2087                         /* fill in the new device map from the old device map */
2088                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2089                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2090                 }
2091
2092         }
2093
2094         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2095
2096         /* Cleanup old maps */
2097         if (dev_maps) {
2098                 for_each_possible_cpu(cpu) {
2099                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2100                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2101                         if (map && map != new_map)
2102                                 kfree_rcu(map, rcu);
2103                 }
2104
2105                 kfree_rcu(dev_maps, rcu);
2106         }
2107
2108         dev_maps = new_dev_maps;
2109         active = true;
2110
2111 out_no_new_maps:
2112         /* update Tx queue numa node */
2113         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2114                                      (numa_node_id >= 0) ? numa_node_id :
2115                                      NUMA_NO_NODE);
2116
2117         if (!dev_maps)
2118                 goto out_no_maps;
2119
2120         /* removes queue from unused CPUs */
2121         for_each_possible_cpu(cpu) {
2122                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2123                         continue;
2124
2125                 if (remove_xps_queue(dev_maps, cpu, index))
2126                         active = true;
2127         }
2128
2129         /* free map if not active */
2130         if (!active) {
2131                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2132                 kfree_rcu(dev_maps, rcu);
2133         }
2134
2135 out_no_maps:
2136         mutex_unlock(&xps_map_mutex);
2137
2138         return 0;
2139 error:
2140         /* remove any maps that we added */
2141         for_each_possible_cpu(cpu) {
2142                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2143                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2144                                  NULL;
2145                 if (new_map && new_map != map)
2146                         kfree(new_map);
2147         }
2148
2149         mutex_unlock(&xps_map_mutex);
2150
2151         kfree(new_dev_maps);
2152         return -ENOMEM;
2153 }
2154 EXPORT_SYMBOL(netif_set_xps_queue);
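/* Illustrative sketch (not part of this file): a multiqueue driver might
 * pin each Tx queue to one CPU at setup time, assuming one queue per
 * online CPU.
 *
 *	int cpu, qi = 0;
 *
 *	for_each_online_cpu(cpu)
 *		netif_set_xps_queue(dev, cpumask_of(cpu), qi++);
 */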
2155
2156 #endif
2157 /*
2158  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2159  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2160  */
2161 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2162 {
2163         int rc;
2164
2165         if (txq < 1 || txq > dev->num_tx_queues)
2166                 return -EINVAL;
2167
2168         if (dev->reg_state == NETREG_REGISTERED ||
2169             dev->reg_state == NETREG_UNREGISTERING) {
2170                 ASSERT_RTNL();
2171
2172                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2173                                                   txq);
2174                 if (rc)
2175                         return rc;
2176
2177                 if (dev->num_tc)
2178                         netif_setup_tc(dev, txq);
2179
2180                 if (txq < dev->real_num_tx_queues) {
2181                         qdisc_reset_all_tx_gt(dev, txq);
2182 #ifdef CONFIG_XPS
2183                         netif_reset_xps_queues_gt(dev, txq);
2184 #endif
2185                 }
2186         }
2187
2188         dev->real_num_tx_queues = txq;
2189         return 0;
2190 }
2191 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
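/* Illustrative sketch (not part of this file): a driver that allocated the
 * maximum number of queues at alloc_etherdev_mq() time but only enables
 * "nch" channels can shrink the active set under RTNL; "nch" is
 * hypothetical.
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(dev, nch);
 *	if (!err)
 *		err = netif_set_real_num_rx_queues(dev, nch);
 *	rtnl_unlock();
 */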
2192
2193 #ifdef CONFIG_SYSFS
2194 /**
2195  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2196  *      @dev: Network device
2197  *      @rxq: Actual number of RX queues
2198  *
2199  *      This must be called either with the rtnl_lock held or before
2200  *      registration of the net device.  Returns 0 on success, or a
2201  *      negative error code.  If called before registration, it always
2202  *      succeeds.
2203  */
2204 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2205 {
2206         int rc;
2207
2208         if (rxq < 1 || rxq > dev->num_rx_queues)
2209                 return -EINVAL;
2210
2211         if (dev->reg_state == NETREG_REGISTERED) {
2212                 ASSERT_RTNL();
2213
2214                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2215                                                   rxq);
2216                 if (rc)
2217                         return rc;
2218         }
2219
2220         dev->real_num_rx_queues = rxq;
2221         return 0;
2222 }
2223 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2224 #endif
2225
2226 /**
2227  * netif_get_num_default_rss_queues - default number of RSS queues
2228  *
2229  * This routine should set an upper limit on the number of RSS queues
2230  * used by default by multiqueue devices.
2231  */
2232 int netif_get_num_default_rss_queues(void)
2233 {
2234         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2235 }
2236 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2237
2238 static inline void __netif_reschedule(struct Qdisc *q)
2239 {
2240         struct softnet_data *sd;
2241         unsigned long flags;
2242
2243         local_irq_save(flags);
2244         sd = this_cpu_ptr(&softnet_data);
2245         q->next_sched = NULL;
2246         *sd->output_queue_tailp = q;
2247         sd->output_queue_tailp = &q->next_sched;
2248         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2249         local_irq_restore(flags);
2250         preempt_check_resched_rt();
2251 }
2252
2253 void __netif_schedule(struct Qdisc *q)
2254 {
2255         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2256                 __netif_reschedule(q);
2257 }
2258 EXPORT_SYMBOL(__netif_schedule);
2259
2260 struct dev_kfree_skb_cb {
2261         enum skb_free_reason reason;
2262 };
2263
2264 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2265 {
2266         return (struct dev_kfree_skb_cb *)skb->cb;
2267 }
2268
2269 void netif_schedule_queue(struct netdev_queue *txq)
2270 {
2271         rcu_read_lock();
2272         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2273                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2274
2275                 __netif_schedule(q);
2276         }
2277         rcu_read_unlock();
2278 }
2279 EXPORT_SYMBOL(netif_schedule_queue);
2280
2281 /**
2282  *      netif_wake_subqueue - allow sending packets on subqueue
2283  *      @dev: network device
2284  *      @queue_index: sub queue index
2285  *
2286  * Resume individual transmit queue of a device with multiple transmit queues.
2287  */
2288 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2289 {
2290         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2291
2292         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2293                 struct Qdisc *q;
2294
2295                 rcu_read_lock();
2296                 q = rcu_dereference(txq->qdisc);
2297                 __netif_schedule(q);
2298                 rcu_read_unlock();
2299         }
2300 }
2301 EXPORT_SYMBOL(netif_wake_subqueue);
2302
2303 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2304 {
2305         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2306                 struct Qdisc *q;
2307
2308                 rcu_read_lock();
2309                 q = rcu_dereference(dev_queue->qdisc);
2310                 __netif_schedule(q);
2311                 rcu_read_unlock();
2312         }
2313 }
2314 EXPORT_SYMBOL(netif_tx_wake_queue);
2315
2316 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2317 {
2318         unsigned long flags;
2319
2320         if (likely(atomic_read(&skb->users) == 1)) {
2321                 smp_rmb();
2322                 atomic_set(&skb->users, 0);
2323         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2324                 return;
2325         }
2326         get_kfree_skb_cb(skb)->reason = reason;
2327         local_irq_save(flags);
2328         skb->next = __this_cpu_read(softnet_data.completion_queue);
2329         __this_cpu_write(softnet_data.completion_queue, skb);
2330         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2331         local_irq_restore(flags);
2332         preempt_check_resched_rt();
2333 }
2334 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2335
2336 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2337 {
2338         if (in_irq() || irqs_disabled())
2339                 __dev_kfree_skb_irq(skb, reason);
2340         else
2341                 dev_kfree_skb(skb);
2342 }
2343 EXPORT_SYMBOL(__dev_kfree_skb_any);
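/* Illustrative sketch (not part of this file): drivers normally use the
 * dev_kfree_skb_any()/dev_consume_skb_any() wrappers, which resolve to the
 * helpers above, e.g. in a Tx-completion handler that may run in hard-irq
 * context; "tx_error" is hypothetical.
 *
 *	if (unlikely(tx_error))
 *		dev_kfree_skb_any(skb);		// freed as a drop
 *	else
 *		dev_consume_skb_any(skb);	// freed as a normal completion
 */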
2344
2345
2346 /**
2347  * netif_device_detach - mark device as removed
2348  * @dev: network device
2349  *
2350  * Mark device as removed from system and therefore no longer available.
2351  */
2352 void netif_device_detach(struct net_device *dev)
2353 {
2354         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2355             netif_running(dev)) {
2356                 netif_tx_stop_all_queues(dev);
2357         }
2358 }
2359 EXPORT_SYMBOL(netif_device_detach);
2360
2361 /**
2362  * netif_device_attach - mark device as attached
2363  * @dev: network device
2364  *
2365  * Mark device as attached to the system and restart it if needed.
2366  */
2367 void netif_device_attach(struct net_device *dev)
2368 {
2369         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2370             netif_running(dev)) {
2371                 netif_tx_wake_all_queues(dev);
2372                 __netdev_watchdog_up(dev);
2373         }
2374 }
2375 EXPORT_SYMBOL(netif_device_attach);
2376
2377 /*
2378  * Returns a Tx hash based on the given packet descriptor and the number of
2379  * Tx queues to be used as a distribution range.
2380  */
2381 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2382                   unsigned int num_tx_queues)
2383 {
2384         u32 hash;
2385         u16 qoffset = 0;
2386         u16 qcount = num_tx_queues;
2387
2388         if (skb_rx_queue_recorded(skb)) {
2389                 hash = skb_get_rx_queue(skb);
2390                 while (unlikely(hash >= num_tx_queues))
2391                         hash -= num_tx_queues;
2392                 return hash;
2393         }
2394
2395         if (dev->num_tc) {
2396                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2397                 qoffset = dev->tc_to_txq[tc].offset;
2398                 qcount = dev->tc_to_txq[tc].count;
2399         }
2400
2401         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2402 }
2403 EXPORT_SYMBOL(__skb_tx_hash);
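/* Worked example (illustrative): with dev->num_tc set so that
 * tc_to_txq[tc] = { .offset = 8, .count = 4 }, reciprocal_scale(hash, 4)
 * maps the 32-bit flow hash into [0, 3], so the returned queue index
 * falls in [8, 11].
 */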
2404
2405 static void skb_warn_bad_offload(const struct sk_buff *skb)
2406 {
2407         static const netdev_features_t null_features = 0;
2408         struct net_device *dev = skb->dev;
2409         const char *name = "";
2410
2411         if (!net_ratelimit())
2412                 return;
2413
2414         if (dev) {
2415                 if (dev->dev.parent)
2416                         name = dev_driver_string(dev->dev.parent);
2417                 else
2418                         name = netdev_name(dev);
2419         }
2420         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2421              "gso_type=%d ip_summed=%d\n",
2422              name, dev ? &dev->features : &null_features,
2423              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2424              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2425              skb_shinfo(skb)->gso_type, skb->ip_summed);
2426 }
2427
2428 /*
2429  * Invalidate hardware checksum when packet is to be mangled, and
2430  * complete checksum manually on outgoing path.
2431  */
2432 int skb_checksum_help(struct sk_buff *skb)
2433 {
2434         __wsum csum;
2435         int ret = 0, offset;
2436
2437         if (skb->ip_summed == CHECKSUM_COMPLETE)
2438                 goto out_set_summed;
2439
2440         if (unlikely(skb_shinfo(skb)->gso_size)) {
2441                 skb_warn_bad_offload(skb);
2442                 return -EINVAL;
2443         }
2444
2445         /* Before computing a checksum, we should make sure no frag could
2446          * be modified by an external entity: the checksum could be wrong.
2447          */
2448         if (skb_has_shared_frag(skb)) {
2449                 ret = __skb_linearize(skb);
2450                 if (ret)
2451                         goto out;
2452         }
2453
2454         offset = skb_checksum_start_offset(skb);
2455         BUG_ON(offset >= skb_headlen(skb));
2456         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2457
2458         offset += skb->csum_offset;
2459         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2460
2461         if (skb_cloned(skb) &&
2462             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2463                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2464                 if (ret)
2465                         goto out;
2466         }
2467
2468         *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2469 out_set_summed:
2470         skb->ip_summed = CHECKSUM_NONE;
2471 out:
2472         return ret;
2473 }
2474 EXPORT_SYMBOL(skb_checksum_help);
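/* Illustrative sketch (not part of this file): a driver whose hardware
 * cannot checksum a given packet can fall back to software checksumming
 * before handing the frame to the NIC; "drop" is a hypothetical label.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    skb_checksum_help(skb))
 *		goto drop;
 */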
2475
2476 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2477 {
2478         __be16 type = skb->protocol;
2479
2480         /* Tunnel gso handlers can set protocol to ethernet. */
2481         if (type == htons(ETH_P_TEB)) {
2482                 struct ethhdr *eth;
2483
2484                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2485                         return 0;
2486
2487                 eth = (struct ethhdr *)skb_mac_header(skb);
2488                 type = eth->h_proto;
2489         }
2490
2491         return __vlan_get_protocol(skb, type, depth);
2492 }
2493
2494 /**
2495  *      skb_mac_gso_segment - mac layer segmentation handler.
2496  *      @skb: buffer to segment
2497  *      @features: features for the output path (see dev->features)
2498  */
2499 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2500                                     netdev_features_t features)
2501 {
2502         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2503         struct packet_offload *ptype;
2504         int vlan_depth = skb->mac_len;
2505         __be16 type = skb_network_protocol(skb, &vlan_depth);
2506
2507         if (unlikely(!type))
2508                 return ERR_PTR(-EINVAL);
2509
2510         __skb_pull(skb, vlan_depth);
2511
2512         rcu_read_lock();
2513         list_for_each_entry_rcu(ptype, &offload_base, list) {
2514                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2515                         segs = ptype->callbacks.gso_segment(skb, features);
2516                         break;
2517                 }
2518         }
2519         rcu_read_unlock();
2520
2521         __skb_push(skb, skb->data - skb_mac_header(skb));
2522
2523         return segs;
2524 }
2525 EXPORT_SYMBOL(skb_mac_gso_segment);
2526
2527
2528 /* openvswitch calls this on rx path, so we need a different check.
2529  */
2530 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2531 {
2532         if (tx_path)
2533                 return skb->ip_summed != CHECKSUM_PARTIAL;
2534         else
2535                 return skb->ip_summed == CHECKSUM_NONE;
2536 }
2537
2538 /**
2539  *      __skb_gso_segment - Perform segmentation on skb.
2540  *      @skb: buffer to segment
2541  *      @features: features for the output path (see dev->features)
2542  *      @tx_path: whether it is called in TX path
2543  *
2544  *      This function segments the given skb and returns a list of segments.
2545  *
2546  *      It may return NULL if the skb requires no segmentation.  This is
2547  *      only possible when GSO is used for verifying header integrity.
2548  *
2549  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2550  */
2551 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2552                                   netdev_features_t features, bool tx_path)
2553 {
2554         if (unlikely(skb_needs_check(skb, tx_path))) {
2555                 int err;
2556
2557                 skb_warn_bad_offload(skb);
2558
2559                 err = skb_cow_head(skb, 0);
2560                 if (err < 0)
2561                         return ERR_PTR(err);
2562         }
2563
2564         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2565                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2566
2567         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2568         SKB_GSO_CB(skb)->encap_level = 0;
2569
2570         skb_reset_mac_header(skb);
2571         skb_reset_mac_len(skb);
2572
2573         return skb_mac_gso_segment(skb, features);
2574 }
2575 EXPORT_SYMBOL(__skb_gso_segment);
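/* Illustrative sketch (not part of this file): callers normally use the
 * skb_gso_segment() wrapper and walk the returned list, where segments are
 * linked through ->next; "my_xmit_one" is hypothetical.
 *
 *	struct sk_buff *segs = skb_gso_segment(skb, features);
 *
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *	while (segs) {
 *		struct sk_buff *next = segs->next;
 *
 *		segs->next = NULL;
 *		my_xmit_one(segs);
 *		segs = next;
 *	}
 */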
2576
2577 /* Take action when hardware reception checksum errors are detected. */
2578 #ifdef CONFIG_BUG
2579 void netdev_rx_csum_fault(struct net_device *dev)
2580 {
2581         if (net_ratelimit()) {
2582                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2583                 dump_stack();
2584         }
2585 }
2586 EXPORT_SYMBOL(netdev_rx_csum_fault);
2587 #endif
2588
2589 /* Actually, we should eliminate this check as soon as we know that:
2590  * 1. An IOMMU is present and can map all the memory.
2591  * 2. No high memory really exists on this machine.
2592  */
2593
2594 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2595 {
2596 #ifdef CONFIG_HIGHMEM
2597         int i;
2598         if (!(dev->features & NETIF_F_HIGHDMA)) {
2599                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2600                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2601                         if (PageHighMem(skb_frag_page(frag)))
2602                                 return 1;
2603                 }
2604         }
2605
2606         if (PCI_DMA_BUS_IS_PHYS) {
2607                 struct device *pdev = dev->dev.parent;
2608
2609                 if (!pdev)
2610                         return 0;
2611                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2612                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2613                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2614                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2615                                 return 1;
2616                 }
2617         }
2618 #endif
2619         return 0;
2620 }
2621
2622 /* For an MPLS offload request, verify we are testing hardware MPLS features
2623  * instead of the standard features for the netdev.
2624  */
2625 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2626 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2627                                            netdev_features_t features,
2628                                            __be16 type)
2629 {
2630         if (eth_p_mpls(type))
2631                 features &= skb->dev->mpls_features;
2632
2633         return features;
2634 }
2635 #else
2636 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2637                                            netdev_features_t features,
2638                                            __be16 type)
2639 {
2640         return features;
2641 }
2642 #endif
2643
2644 static netdev_features_t harmonize_features(struct sk_buff *skb,
2645         netdev_features_t features)
2646 {
2647         int tmp;
2648         __be16 type;
2649
2650         type = skb_network_protocol(skb, &tmp);
2651         features = net_mpls_features(skb, features, type);
2652
2653         if (skb->ip_summed != CHECKSUM_NONE &&
2654             !can_checksum_protocol(features, type)) {
2655                 features &= ~NETIF_F_ALL_CSUM;
2656         }
2657         if (illegal_highdma(skb->dev, skb))
2658                 features &= ~NETIF_F_SG;
2659
2660         return features;
2661 }
2662
2663 netdev_features_t passthru_features_check(struct sk_buff *skb,
2664                                           struct net_device *dev,
2665                                           netdev_features_t features)
2666 {
2667         return features;
2668 }
2669 EXPORT_SYMBOL(passthru_features_check);
2670
2671 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2672                                              struct net_device *dev,
2673                                              netdev_features_t features)
2674 {
2675         return vlan_features_check(skb, features);
2676 }
2677
2678 netdev_features_t netif_skb_features(struct sk_buff *skb)
2679 {
2680         struct net_device *dev = skb->dev;
2681         netdev_features_t features = dev->features;
2682         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2683
2684         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2685                 features &= ~NETIF_F_GSO_MASK;
2686
2687         /* For an encapsulation offload request, verify we are testing
2688          * hardware encapsulation features instead of standard
2689          * features for the netdev
2690          */
2691         if (skb->encapsulation)
2692                 features &= dev->hw_enc_features;
2693
2694         if (skb_vlan_tagged(skb))
2695                 features = netdev_intersect_features(features,
2696                                                      dev->vlan_features |
2697                                                      NETIF_F_HW_VLAN_CTAG_TX |
2698                                                      NETIF_F_HW_VLAN_STAG_TX);
2699
2700         if (dev->netdev_ops->ndo_features_check)
2701                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2702                                                                 features);
2703         else
2704                 features &= dflt_features_check(skb, dev, features);
2705
2706         return harmonize_features(skb, features);
2707 }
2708 EXPORT_SYMBOL(netif_skb_features);
2709
2710 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2711                     struct netdev_queue *txq, bool more)
2712 {
2713         unsigned int len;
2714         int rc;
2715
2716         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2717                 dev_queue_xmit_nit(skb, dev);
2718
2719         len = skb->len;
2720         trace_net_dev_start_xmit(skb, dev);
2721         rc = netdev_start_xmit(skb, dev, txq, more);
2722         trace_net_dev_xmit(skb, rc, dev, len);
2723
2724         return rc;
2725 }
2726
2727 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2728                                     struct netdev_queue *txq, int *ret)
2729 {
2730         struct sk_buff *skb = first;
2731         int rc = NETDEV_TX_OK;
2732
2733         while (skb) {
2734                 struct sk_buff *next = skb->next;
2735
2736                 skb->next = NULL;
2737                 rc = xmit_one(skb, dev, txq, next != NULL);
2738                 if (unlikely(!dev_xmit_complete(rc))) {
2739                         skb->next = next;
2740                         goto out;
2741                 }
2742
2743                 skb = next;
2744                 if (netif_xmit_stopped(txq) && skb) {
2745                         rc = NETDEV_TX_BUSY;
2746                         break;
2747                 }
2748         }
2749
2750 out:
2751         *ret = rc;
2752         return skb;
2753 }
2754
2755 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2756                                           netdev_features_t features)
2757 {
2758         if (skb_vlan_tag_present(skb) &&
2759             !vlan_hw_offload_capable(features, skb->vlan_proto))
2760                 skb = __vlan_hwaccel_push_inside(skb);
2761         return skb;
2762 }
2763
2764 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2765 {
2766         netdev_features_t features;
2767
2768         if (skb->next)
2769                 return skb;
2770
2771         features = netif_skb_features(skb);
2772         skb = validate_xmit_vlan(skb, features);
2773         if (unlikely(!skb))
2774                 goto out_null;
2775
2776         if (netif_needs_gso(skb, features)) {
2777                 struct sk_buff *segs;
2778
2779                 segs = skb_gso_segment(skb, features);
2780                 if (IS_ERR(segs)) {
2781                         goto out_kfree_skb;
2782                 } else if (segs) {
2783                         consume_skb(skb);
2784                         skb = segs;
2785                 }
2786         } else {
2787                 if (skb_needs_linearize(skb, features) &&
2788                     __skb_linearize(skb))
2789                         goto out_kfree_skb;
2790
2791                 /* If packet is not checksummed and device does not
2792                  * support checksumming for this protocol, complete
2793                  * checksumming here.
2794                  */
2795                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2796                         if (skb->encapsulation)
2797                                 skb_set_inner_transport_header(skb,
2798                                                                skb_checksum_start_offset(skb));
2799                         else
2800                                 skb_set_transport_header(skb,
2801                                                          skb_checksum_start_offset(skb));
2802                         if (!(features & NETIF_F_ALL_CSUM) &&
2803                             skb_checksum_help(skb))
2804                                 goto out_kfree_skb;
2805                 }
2806         }
2807
2808         return skb;
2809
2810 out_kfree_skb:
2811         kfree_skb(skb);
2812 out_null:
2813         return NULL;
2814 }
2815
2816 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2817 {
2818         struct sk_buff *next, *head = NULL, *tail;
2819
2820         for (; skb != NULL; skb = next) {
2821                 next = skb->next;
2822                 skb->next = NULL;
2823
2824                 /* in case skb won't be segmented, point to itself */
2825                 skb->prev = skb;
2826
2827                 skb = validate_xmit_skb(skb, dev);
2828                 if (!skb)
2829                         continue;
2830
2831                 if (!head)
2832                         head = skb;
2833                 else
2834                         tail->next = skb;
2835                 /* If skb was segmented, skb->prev points to
2836                  * the last segment. If not, it still points to skb itself.
2837                  */
2838                 tail = skb->prev;
2839         }
2840         return head;
2841 }
2842 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
2843
2844 static void qdisc_pkt_len_init(struct sk_buff *skb)
2845 {
2846         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2847
2848         qdisc_skb_cb(skb)->pkt_len = skb->len;
2849
2850         /* To get a more precise estimate of the bytes sent on the wire,
2851          * we add the header size of all segments to pkt_len
2852          */
2853         if (shinfo->gso_size)  {
2854                 unsigned int hdr_len;
2855                 u16 gso_segs = shinfo->gso_segs;
2856
2857                 /* mac layer + network layer */
2858                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2859
2860                 /* + transport layer */
2861                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2862                         hdr_len += tcp_hdrlen(skb);
2863                 else
2864                         hdr_len += sizeof(struct udphdr);
2865
2866                 if (shinfo->gso_type & SKB_GSO_DODGY)
2867                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2868                                                 shinfo->gso_size);
2869
2870                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2871         }
2872 }
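/* Worked example (illustrative): a TSO skb with skb->len = 14546,
 * gso_size = 1448, 10 segments and hdr_len = 66 (Ethernet + IPv4 + TCP
 * with timestamps) gets pkt_len = 14546 + 9 * 66 = 15140, which matches
 * the 10 * (1448 + 66) bytes that actually hit the wire once the headers
 * are replicated for every segment.
 */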
2873
2874 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2875                                  struct net_device *dev,
2876                                  struct netdev_queue *txq)
2877 {
2878         spinlock_t *root_lock = qdisc_lock(q);
2879         bool contended;
2880         int rc;
2881
2882         qdisc_pkt_len_init(skb);
2883         qdisc_calculate_pkt_len(skb, q);
2884         /*
2885          * Heuristic to force contended enqueues to serialize on a
2886          * separate lock before trying to get the qdisc main lock.
2887          * This permits the __QDISC___STATE_RUNNING owner to get the lock more
2888          * often and dequeue packets faster.
2889          */
2890 #ifdef CONFIG_PREEMPT_RT_FULL
2891         contended = true;
2892 #else
2893         contended = qdisc_is_running(q);
2894 #endif
2895         if (unlikely(contended))
2896                 spin_lock(&q->busylock);
2897
2898         spin_lock(root_lock);
2899         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2900                 kfree_skb(skb);
2901                 rc = NET_XMIT_DROP;
2902         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2903                    qdisc_run_begin(q)) {
2904                 /*
2905                  * This is a work-conserving queue; there are no old skbs
2906                  * waiting to be sent out; and the qdisc is not running -
2907                  * xmit the skb directly.
2908                  */
2909
2910                 qdisc_bstats_update(q, skb);
2911
2912                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2913                         if (unlikely(contended)) {
2914                                 spin_unlock(&q->busylock);
2915                                 contended = false;
2916                         }
2917                         __qdisc_run(q);
2918                 } else
2919                         qdisc_run_end(q);
2920
2921                 rc = NET_XMIT_SUCCESS;
2922         } else {
2923                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2924                 if (qdisc_run_begin(q)) {
2925                         if (unlikely(contended)) {
2926                                 spin_unlock(&q->busylock);
2927                                 contended = false;
2928                         }
2929                         __qdisc_run(q);
2930                 }
2931         }
2932         spin_unlock(root_lock);
2933         if (unlikely(contended))
2934                 spin_unlock(&q->busylock);
2935         return rc;
2936 }
2937
2938 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2939 static void skb_update_prio(struct sk_buff *skb)
2940 {
2941         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2942
2943         if (!skb->priority && skb->sk && map) {
2944                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2945
2946                 if (prioidx < map->priomap_len)
2947                         skb->priority = map->priomap[prioidx];
2948         }
2949 }
2950 #else
2951 #define skb_update_prio(skb)
2952 #endif
2953
2954 #ifdef CONFIG_PREEMPT_RT_FULL
2955
2956 static inline int xmit_rec_read(void)
2957 {
2958        return current->xmit_recursion;
2959 }
2960
2961 static inline void xmit_rec_inc(void)
2962 {
2963        current->xmit_recursion++;
2964 }
2965
2966 static inline void xmit_rec_dec(void)
2967 {
2968        current->xmit_recursion--;
2969 }
2970
2971 #else
2972
2973 DEFINE_PER_CPU(int, xmit_recursion);
2974 EXPORT_SYMBOL(xmit_recursion);
2975
2976 static inline int xmit_rec_read(void)
2977 {
2978         return __this_cpu_read(xmit_recursion);
2979 }
2980
2981 static inline void xmit_rec_inc(void)
2982 {
2983         __this_cpu_inc(xmit_recursion);
2984 }
2985
2986 static inline void xmit_rec_dec(void)
2987 {
2988         __this_cpu_dec(xmit_recursion);
2989 }
2990 #endif
2991
2992 #define RECURSION_LIMIT 10
2993
2994 /**
2995  *      dev_loopback_xmit - loop back @skb
2996  *      @net: network namespace this loopback is happening in
2997  *      @sk:  sk needed to be a netfilter okfn
2998  *      @skb: buffer to transmit
2999  */
3000 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3001 {
3002         skb_reset_mac_header(skb);
3003         __skb_pull(skb, skb_network_offset(skb));
3004         skb->pkt_type = PACKET_LOOPBACK;
3005         skb->ip_summed = CHECKSUM_UNNECESSARY;
3006         WARN_ON(!skb_dst(skb));
3007         skb_dst_force(skb);
3008         netif_rx_ni(skb);
3009         return 0;
3010 }
3011 EXPORT_SYMBOL(dev_loopback_xmit);
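/* Illustrative sketch (not part of this file): dev_loopback_xmit() is
 * intended to be used as a netfilter okfn, e.g. to deliver a multicast
 * copy back to the local stack; "newskb" is a hypothetical clone.
 *
 *	NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb,
 *		NULL, newskb->dev, dev_loopback_xmit);
 */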
3012
3013 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3014 {
3015 #ifdef CONFIG_XPS
3016         struct xps_dev_maps *dev_maps;
3017         struct xps_map *map;
3018         int queue_index = -1;
3019
3020         rcu_read_lock();
3021         dev_maps = rcu_dereference(dev->xps_maps);
3022         if (dev_maps) {
3023                 map = rcu_dereference(
3024                     dev_maps->cpu_map[skb->sender_cpu - 1]);
3025                 if (map) {
3026                         if (map->len == 1)
3027                                 queue_index = map->queues[0];
3028                         else
3029                                 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3030                                                                            map->len)];
3031                         if (unlikely(queue_index >= dev->real_num_tx_queues))
3032                                 queue_index = -1;
3033                 }
3034         }
3035         rcu_read_unlock();
3036
3037         return queue_index;
3038 #else
3039         return -1;
3040 #endif
3041 }
3042
3043 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3044 {
3045         struct sock *sk = skb->sk;
3046         int queue_index = sk_tx_queue_get(sk);
3047
3048         if (queue_index < 0 || skb->ooo_okay ||
3049             queue_index >= dev->real_num_tx_queues) {
3050                 int new_index = get_xps_queue(dev, skb);
3051                 if (new_index < 0)
3052                         new_index = skb_tx_hash(dev, skb);
3053
3054                 if (queue_index != new_index && sk &&
3055                     sk_fullsock(sk) &&
3056                     rcu_access_pointer(sk->sk_dst_cache))
3057                         sk_tx_queue_set(sk, new_index);
3058
3059                 queue_index = new_index;
3060         }
3061
3062         return queue_index;
3063 }
3064
3065 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3066                                     struct sk_buff *skb,
3067                                     void *accel_priv)
3068 {
3069         int queue_index = 0;
3070
3071 #ifdef CONFIG_XPS
3072         if (skb->sender_cpu == 0)
3073                 skb->sender_cpu = raw_smp_processor_id() + 1;
3074 #endif
3075
3076         if (dev->real_num_tx_queues != 1) {
3077                 const struct net_device_ops *ops = dev->netdev_ops;
3078                 if (ops->ndo_select_queue)
3079                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3080                                                             __netdev_pick_tx);
3081                 else
3082                         queue_index = __netdev_pick_tx(dev, skb);
3083
3084                 if (!accel_priv)
3085                         queue_index = netdev_cap_txqueue(dev, queue_index);
3086         }
3087
3088         skb_set_queue_mapping(skb, queue_index);
3089         return netdev_get_tx_queue(dev, queue_index);
3090 }
3091
3092 /**
3093  *      __dev_queue_xmit - transmit a buffer
3094  *      @skb: buffer to transmit
3095  *      @accel_priv: private data used for L2 forwarding offload
3096  *
3097  *      Queue a buffer for transmission to a network device. The caller must
3098  *      have set the device and priority and built the buffer before calling
3099  *      this function. The function can be called from an interrupt.
3100  *
3101  *      A negative errno code is returned on a failure. A success does not
3102  *      guarantee the frame will be transmitted as it may be dropped due
3103  *      to congestion or traffic shaping.
3104  *
3105  * -----------------------------------------------------------------------------------
3106  *      I notice this method can also return errors from the queue disciplines,
3107  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3108  *      be positive.
3109  *
3110  *      Regardless of the return value, the skb is consumed, so it is currently
3111  *      difficult to retry a send to this method.  (You can bump the ref count
3112  *      before sending to hold a reference for retry if you are careful.)
3113  *
3114  *      When calling this method, interrupts MUST be enabled.  This is because
3115  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3116  *          --BLG
3117  */
3118 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3119 {
3120         struct net_device *dev = skb->dev;
3121         struct netdev_queue *txq;
3122         struct Qdisc *q;
3123         int rc = -ENOMEM;
3124
3125         skb_reset_mac_header(skb);
3126
3127         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3128                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3129
3130         /* Disable soft irqs for various locks below. Also
3131          * stops preemption for RCU.
3132          */
3133         rcu_read_lock_bh();
3134
3135         skb_update_prio(skb);
3136
3137         /* If the device/qdisc doesn't need skb->dst, release it right now while
3138          * it's hot in this CPU's cache.
3139          */
3140         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3141                 skb_dst_drop(skb);
3142         else
3143                 skb_dst_force(skb);
3144
3145 #ifdef CONFIG_NET_SWITCHDEV
3146         /* Don't forward if offload device already forwarded */
3147         if (skb->offload_fwd_mark &&
3148             skb->offload_fwd_mark == dev->offload_fwd_mark) {
3149                 consume_skb(skb);
3150                 rc = NET_XMIT_SUCCESS;
3151                 goto out;
3152         }
3153 #endif
3154
3155         txq = netdev_pick_tx(dev, skb, accel_priv);
3156         q = rcu_dereference_bh(txq->qdisc);
3157
3158 #ifdef CONFIG_NET_CLS_ACT
3159         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3160 #endif
3161         trace_net_dev_queue(skb);
3162         if (q->enqueue) {
3163                 rc = __dev_xmit_skb(skb, q, dev, txq);
3164                 goto out;
3165         }
3166
3167         /* The device has no queue. This is the common case for software
3168            devices: loopback, all sorts of tunnels...
3169
3170            Really, it is unlikely that netif_tx_lock protection is necessary
3171            here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
3172            counters.)
3173            However, it is possible that they rely on the protection
3174            we provide here.
3175
3176            Check this and take the lock anyway; it is not prone to deadlocks.
3177            Alternatively, shoot the noqueue qdisc; that is even simpler 8)
3178          */
3179         if (dev->flags & IFF_UP) {
3180                 int cpu = smp_processor_id(); /* ok because BHs are off */
3181
3182                 if (txq->xmit_lock_owner != cpu) {
3183
3184                         if (xmit_rec_read() > RECURSION_LIMIT)
3185                                 goto recursion_alert;
3186
3187                         skb = validate_xmit_skb(skb, dev);
3188                         if (!skb)
3189                                 goto drop;
3190
3191                         HARD_TX_LOCK(dev, txq, cpu);
3192
3193                         if (!netif_xmit_stopped(txq)) {
3194                                 xmit_rec_inc();
3195                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3196                                 xmit_rec_dec();
3197                                 if (dev_xmit_complete(rc)) {
3198                                         HARD_TX_UNLOCK(dev, txq);
3199                                         goto out;
3200                                 }
3201                         }
3202                         HARD_TX_UNLOCK(dev, txq);
3203                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3204                                              dev->name);
3205                 } else {
3206                         /* Recursion is detected! It is possible,
3207                          * unfortunately
3208                          */
3209 recursion_alert:
3210                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3211                                              dev->name);
3212                 }
3213         }
3214
3215         rc = -ENETDOWN;
3216 drop:
3217         rcu_read_unlock_bh();
3218
3219         atomic_long_inc(&dev->tx_dropped);
3220         kfree_skb_list(skb);
3221         return rc;
3222 out:
3223         rcu_read_unlock_bh();
3224         return rc;
3225 }
3226
3227 int dev_queue_xmit(struct sk_buff *skb)
3228 {
3229         return __dev_queue_xmit(skb, NULL);
3230 }
3231 EXPORT_SYMBOL(dev_queue_xmit);
3232
3233 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3234 {
3235         return __dev_queue_xmit(skb, accel_priv);
3236 }
3237 EXPORT_SYMBOL(dev_queue_xmit_accel);
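/*
 * Example (illustrative sketch): handing a prebuilt Ethernet frame to
 * dev_queue_xmit().  As the comment above notes, the skb is consumed
 * whatever the return value is, and positive NET_XMIT_* codes are
 * possible.  The helper name is hypothetical; "frame" is assumed to hold
 * a complete Ethernet header plus payload of "len" bytes.
 */
static int example_send_frame(struct net_device *dev,
			      const void *frame, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	/* Leave headroom for devices that want extra link-layer space. */
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), frame, len);

	skb->dev = dev;
	skb->protocol = ((const struct ethhdr *)frame)->h_proto;

	return dev_queue_xmit(skb);
}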
3238
3239
3240 /*=======================================================================
3241                         Receiver routines
3242   =======================================================================*/
3243
3244 int netdev_max_backlog __read_mostly = 1000;
3245 EXPORT_SYMBOL(netdev_max_backlog);
3246
3247 int netdev_tstamp_prequeue __read_mostly = 1;
3248 int netdev_budget __read_mostly = 300;
3249 int weight_p __read_mostly = 64;            /* old backlog weight */
3250
3251 /* Called with irq disabled */
3252 static inline void ____napi_schedule(struct softnet_data *sd,
3253                                      struct napi_struct *napi)
3254 {
3255         list_add_tail(&napi->poll_list, &sd->poll_list);
3256         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3257 }
3258
3259 #ifdef CONFIG_RPS
3260
3261 /* One global table that all flow-based protocols share. */
3262 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3263 EXPORT_SYMBOL(rps_sock_flow_table);
3264 u32 rps_cpu_mask __read_mostly;
3265 EXPORT_SYMBOL(rps_cpu_mask);
3266
3267 struct static_key rps_needed __read_mostly;
3268
3269 static struct rps_dev_flow *
3270 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3271             struct rps_dev_flow *rflow, u16 next_cpu)
3272 {
3273         if (next_cpu < nr_cpu_ids) {
3274 #ifdef CONFIG_RFS_ACCEL
3275                 struct netdev_rx_queue *rxqueue;
3276                 struct rps_dev_flow_table *flow_table;
3277                 struct rps_dev_flow *old_rflow;
3278                 u32 flow_id;
3279                 u16 rxq_index;
3280                 int rc;
3281
3282                 /* Should we steer this flow to a different hardware queue? */
3283                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3284                     !(dev->features & NETIF_F_NTUPLE))
3285                         goto out;
3286                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3287                 if (rxq_index == skb_get_rx_queue(skb))
3288                         goto out;
3289
3290                 rxqueue = dev->_rx + rxq_index;
3291                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3292                 if (!flow_table)
3293                         goto out;
3294                 flow_id = skb_get_hash(skb) & flow_table->mask;
3295                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3296                                                         rxq_index, flow_id);
3297                 if (rc < 0)
3298                         goto out;
3299                 old_rflow = rflow;
3300                 rflow = &flow_table->flows[flow_id];
3301                 rflow->filter = rc;
3302                 if (old_rflow->filter == rflow->filter)
3303                         old_rflow->filter = RPS_NO_FILTER;
3304         out:
3305 #endif
3306                 rflow->last_qtail =
3307                         per_cpu(softnet_data, next_cpu).input_queue_head;
3308         }
3309
3310         rflow->cpu = next_cpu;
3311         return rflow;
3312 }
3313
3314 /*
3315  * get_rps_cpu is called from netif_receive_skb and returns the target
3316  * CPU from the RPS map of the receiving queue for a given skb.
3317  * rcu_read_lock must be held on entry.
3318  */
3319 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3320                        struct rps_dev_flow **rflowp)
3321 {
3322         const struct rps_sock_flow_table *sock_flow_table;
3323         struct netdev_rx_queue *rxqueue = dev->_rx;
3324         struct rps_dev_flow_table *flow_table;
3325         struct rps_map *map;
3326         int cpu = -1;
3327         u32 tcpu;
3328         u32 hash;
3329
3330         if (skb_rx_queue_recorded(skb)) {
3331                 u16 index = skb_get_rx_queue(skb);
3332
3333                 if (unlikely(index >= dev->real_num_rx_queues)) {
3334                         WARN_ONCE(dev->real_num_rx_queues > 1,
3335                                   "%s received packet on queue %u, but number "
3336                                   "of RX queues is %u\n",
3337                                   dev->name, index, dev->real_num_rx_queues);
3338                         goto done;
3339                 }
3340                 rxqueue += index;
3341         }
3342
3343         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3344
3345         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3346         map = rcu_dereference(rxqueue->rps_map);
3347         if (!flow_table && !map)
3348                 goto done;
3349
3350         skb_reset_network_header(skb);
3351         hash = skb_get_hash(skb);
3352         if (!hash)
3353                 goto done;
3354
3355         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3356         if (flow_table && sock_flow_table) {
3357                 struct rps_dev_flow *rflow;
3358                 u32 next_cpu;
3359                 u32 ident;
3360
3361                 /* First, check the global flow table for a match */
3362                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3363                 if ((ident ^ hash) & ~rps_cpu_mask)
3364                         goto try_rps;
3365
3366                 next_cpu = ident & rps_cpu_mask;
3367
3368                 /* OK, now we know there is a match,
3369                  * we can look at the local (per receive queue) flow table
3370                  */
3371                 rflow = &flow_table->flows[hash & flow_table->mask];
3372                 tcpu = rflow->cpu;
3373
3374                 /*
3375                  * If the desired CPU (where last recvmsg was done) is
3376                  * different from current CPU (one in the rx-queue flow
3377                  * table entry), switch if one of the following holds:
3378                  *   - Current CPU is unset (>= nr_cpu_ids).
3379                  *   - Current CPU is offline.
3380                  *   - The current CPU's queue tail has advanced beyond the
3381                  *     last packet that was enqueued using this table entry.
3382                  *     This guarantees that all previous packets for the flow
3383                  *     have been dequeued, thus preserving in order delivery.
3384                  */
3385                 if (unlikely(tcpu != next_cpu) &&
3386                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3387                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3388                       rflow->last_qtail)) >= 0)) {
3389                         tcpu = next_cpu;
3390                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3391                 }
3392
3393                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3394                         *rflowp = rflow;
3395                         cpu = tcpu;
3396                         goto done;
3397                 }
3398         }
3399
3400 try_rps:
3401
3402         if (map) {
3403                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3404                 if (cpu_online(tcpu)) {
3405                         cpu = tcpu;
3406                         goto done;
3407                 }
3408         }
3409
3410 done:
3411         return cpu;
3412 }
3413
3414 #ifdef CONFIG_RFS_ACCEL
3415
3416 /**
3417  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3418  * @dev: Device on which the filter was set
3419  * @rxq_index: RX queue index
3420  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3421  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3422  *
3423  * Drivers that implement ndo_rx_flow_steer() should periodically call
3424  * this function for each installed filter and remove the filters for
3425  * which it returns %true.
3426  */
3427 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3428                          u32 flow_id, u16 filter_id)
3429 {
3430         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3431         struct rps_dev_flow_table *flow_table;
3432         struct rps_dev_flow *rflow;
3433         bool expire = true;
3434         unsigned int cpu;
3435
3436         rcu_read_lock();
3437         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3438         if (flow_table && flow_id <= flow_table->mask) {
3439                 rflow = &flow_table->flows[flow_id];
3440                 cpu = ACCESS_ONCE(rflow->cpu);
3441                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3442                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3443                            rflow->last_qtail) <
3444                      (int)(10 * flow_table->mask)))
3445                         expire = false;
3446         }
3447         rcu_read_unlock();
3448         return expire;
3449 }
3450 EXPORT_SYMBOL(rps_may_expire_flow);
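/*
 * Example (illustrative sketch): the kind of periodic scan the comment
 * above asks accelerated-RFS drivers to run.  The filter table and its
 * fields are hypothetical; a real driver would also remove the hardware
 * filter when the flow is allowed to expire.
 */
struct example_arfs_filter {
	u16	rxq_index;	/* queue passed to ndo_rx_flow_steer() */
	u32	flow_id;	/* flow_id passed to ndo_rx_flow_steer() */
	u16	filter_id;	/* value returned by ndo_rx_flow_steer() */
	bool	in_use;
};

static void example_expire_rfs_filters(struct net_device *dev,
				       struct example_arfs_filter *filters,
				       unsigned int count)
{
	unsigned int i;

	for (i = 0; i < count; i++) {
		if (!filters[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, filters[i].rxq_index,
					filters[i].flow_id,
					filters[i].filter_id))
			filters[i].in_use = false;
	}
}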
3451
3452 #endif /* CONFIG_RFS_ACCEL */
3453
3454 /* Called from hardirq (IPI) context */
3455 static void rps_trigger_softirq(void *data)
3456 {
3457         struct softnet_data *sd = data;
3458
3459         ____napi_schedule(sd, &sd->backlog);
3460         sd->received_rps++;
3461 }
3462
3463 #endif /* CONFIG_RPS */
3464
3465 /*
3466  * Check if this softnet_data structure belongs to another CPU.
3467  * If so, queue it on our IPI list and return 1;
3468  * if not, return 0.
3469  */
3470 static int rps_ipi_queued(struct softnet_data *sd)
3471 {
3472 #ifdef CONFIG_RPS
3473         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3474
3475         if (sd != mysd) {
3476                 sd->rps_ipi_next = mysd->rps_ipi_list;
3477                 mysd->rps_ipi_list = sd;
3478
3479                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3480                 return 1;
3481         }
3482 #endif /* CONFIG_RPS */
3483         return 0;
3484 }
3485
3486 #ifdef CONFIG_NET_FLOW_LIMIT
3487 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3488 #endif
3489
3490 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3491 {
3492 #ifdef CONFIG_NET_FLOW_LIMIT
3493         struct sd_flow_limit *fl;
3494         struct softnet_data *sd;
3495         unsigned int old_flow, new_flow;
3496
3497         if (qlen < (netdev_max_backlog >> 1))
3498                 return false;
3499
3500         sd = this_cpu_ptr(&softnet_data);
3501
3502         rcu_read_lock();
3503         fl = rcu_dereference(sd->flow_limit);
3504         if (fl) {
3505                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3506                 old_flow = fl->history[fl->history_head];
3507                 fl->history[fl->history_head] = new_flow;
3508
3509                 fl->history_head++;
3510                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3511
3512                 if (likely(fl->buckets[old_flow]))
3513                         fl->buckets[old_flow]--;
3514
3515                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3516                         fl->count++;
3517                         rcu_read_unlock();
3518                         return true;
3519                 }
3520         }
3521         rcu_read_unlock();
3522 #endif
3523         return false;
3524 }
3525
3526 /*
3527  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3528  * queue (may be a remote CPU queue).
3529  */
3530 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3531                               unsigned int *qtail)
3532 {
3533         struct softnet_data *sd;
3534         unsigned long flags;
3535         unsigned int qlen;
3536
3537         sd = &per_cpu(softnet_data, cpu);
3538
3539         local_irq_save(flags);
3540
3541         rps_lock(sd);
3542         if (!netif_running(skb->dev))
3543                 goto drop;
3544         qlen = skb_queue_len(&sd->input_pkt_queue);
3545         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3546                 if (qlen) {
3547 enqueue:
3548                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3549                         input_queue_tail_incr_save(sd, qtail);
3550                         rps_unlock(sd);
3551                         local_irq_restore(flags);
3552                         return NET_RX_SUCCESS;
3553                 }
3554
3555                 /* Schedule NAPI for the backlog device.
3556                  * We can use a non-atomic operation since we own the queue lock.
3557                  */
3558                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3559                         if (!rps_ipi_queued(sd))
3560                                 ____napi_schedule(sd, &sd->backlog);
3561                 }
3562                 goto enqueue;
3563         }
3564
3565 drop:
3566         sd->dropped++;
3567         rps_unlock(sd);
3568
3569         local_irq_restore(flags);
3570         preempt_check_resched_rt();
3571
3572         atomic_long_inc(&skb->dev->rx_dropped);
3573         kfree_skb(skb);
3574         return NET_RX_DROP;
3575 }
3576
3577 static int netif_rx_internal(struct sk_buff *skb)
3578 {
3579         int ret;
3580
3581         net_timestamp_check(netdev_tstamp_prequeue, skb);
3582
3583         trace_netif_rx(skb);
3584 #ifdef CONFIG_RPS
3585         if (static_key_false(&rps_needed)) {
3586                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3587                 int cpu;
3588
3589                 migrate_disable();
3590                 rcu_read_lock();
3591
3592                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3593                 if (cpu < 0)
3594                         cpu = smp_processor_id();
3595
3596                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3597
3598                 rcu_read_unlock();
3599                 migrate_enable();
3600         } else
3601 #endif
3602         {
3603                 unsigned int qtail;
3604                 ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
3605                 put_cpu_light();
3606         }
3607         return ret;
3608 }
3609
3610 /**
3611  *      netif_rx        -       post buffer to the network code
3612  *      @skb: buffer to post
3613  *
3614  *      This function receives a packet from a device driver and queues it for
3615  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3616  *      may be dropped during processing for congestion control or by the
3617  *      protocol layers.
3618  *
3619  *      return values:
3620  *      NET_RX_SUCCESS  (no congestion)
3621  *      NET_RX_DROP     (packet was dropped)
3622  *
3623  */
3624
3625 int netif_rx(struct sk_buff *skb)
3626 {
3627         trace_netif_rx_entry(skb);
3628
3629         return netif_rx_internal(skb);
3630 }
3631 EXPORT_SYMBOL(netif_rx);
3632
3633 int netif_rx_ni(struct sk_buff *skb)
3634 {
3635         int err;
3636
3637         trace_netif_rx_ni_entry(skb);
3638
3639         local_bh_disable();
3640         err = netif_rx_internal(skb);
3641         local_bh_enable();
3642
3643         return err;
3644 }
3645 EXPORT_SYMBOL(netif_rx_ni);
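/*
 * Example (illustrative sketch): the classical non-NAPI receive path.  A
 * hypothetical driver copies a received frame into a fresh skb and posts
 * it with netif_rx(); netif_rx_ni() above is the variant to use from
 * process context.
 */
static void example_rx_frame(struct net_device *dev,
			     const void *data, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
	netif_rx(skb);
}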
3646
3647 #ifdef CONFIG_PREEMPT_RT_FULL
3648 /*
3649  * RT runs ksoftirqd as a real time thread and the root_lock is a
3650  * "sleeping spinlock". If the trylock fails then we can go into an
3651  * infinite loop when ksoftirqd preempted the task which actually
3652  * holds the lock, because we requeue q and raise NET_TX softirq
3653  * causing ksoftirqd to loop forever.
3654  *
3655  * It's safe to use spin_lock on RT here as softirqs run in thread
3656  * context and cannot deadlock against the thread which is holding
3657  * root_lock.
3658  *
3659  * On !RT the trylock might fail, but there we bail out from the
3660  * softirq loop after 10 attempts which we can't do on RT. And the
3661  * task holding root_lock cannot be preempted, so the only downside of
3662  * that trylock is that we need 10 loops to decide that we should have
3663  * given up in the first one :)
3664  */
3665 static inline int take_root_lock(spinlock_t *lock)
3666 {
3667         spin_lock(lock);
3668         return 1;
3669 }
3670 #else
3671 static inline int take_root_lock(spinlock_t *lock)
3672 {
3673         return spin_trylock(lock);
3674 }
3675 #endif
3676
3677 static void net_tx_action(struct softirq_action *h)
3678 {
3679         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3680
3681         if (sd->completion_queue) {
3682                 struct sk_buff *clist;
3683
3684                 local_irq_disable();
3685                 clist = sd->completion_queue;
3686                 sd->completion_queue = NULL;
3687                 local_irq_enable();
3688
3689                 while (clist) {
3690                         struct sk_buff *skb = clist;
3691                         clist = clist->next;
3692
3693                         WARN_ON(atomic_read(&skb->users));
3694                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3695                                 trace_consume_skb(skb);
3696                         else
3697                                 trace_kfree_skb(skb, net_tx_action);
3698                         __kfree_skb(skb);
3699                 }
3700         }
3701
3702         if (sd->output_queue) {
3703                 struct Qdisc *head;
3704
3705                 local_irq_disable();
3706                 head = sd->output_queue;
3707                 sd->output_queue = NULL;
3708                 sd->output_queue_tailp = &sd->output_queue;
3709                 local_irq_enable();
3710
3711                 while (head) {
3712                         struct Qdisc *q = head;
3713                         spinlock_t *root_lock;
3714
3715                         head = head->next_sched;
3716
3717                         root_lock = qdisc_lock(q);
3718                         if (take_root_lock(root_lock)) {
3719                                 smp_mb__before_atomic();
3720                                 clear_bit(__QDISC_STATE_SCHED,
3721                                           &q->state);
3722                                 qdisc_run(q);
3723                                 spin_unlock(root_lock);
3724                         } else {
3725                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3726                                               &q->state)) {
3727                                         __netif_reschedule(q);
3728                                 } else {
3729                                         smp_mb__before_atomic();
3730                                         clear_bit(__QDISC_STATE_SCHED,
3731                                                   &q->state);
3732                                 }
3733                         }
3734                 }
3735         }
3736 }
3737
3738 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3739     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3740 /* This hook is defined here for ATM LANE */
3741 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3742                              unsigned char *addr) __read_mostly;
3743 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3744 #endif
3745
3746 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3747                                          struct packet_type **pt_prev,
3748                                          int *ret, struct net_device *orig_dev)
3749 {
3750 #ifdef CONFIG_NET_CLS_ACT
3751         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3752         struct tcf_result cl_res;
3753
3754         /* If there's at least one ingress present somewhere (so
3755          * we get here via enabled static key), remaining devices
3756          * that are not configured with an ingress qdisc will bail
3757          * out here.
3758          */
3759         if (!cl)
3760                 return skb;
3761         if (*pt_prev) {
3762                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3763                 *pt_prev = NULL;
3764         }
3765
3766         qdisc_skb_cb(skb)->pkt_len = skb->len;
3767         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3768         qdisc_bstats_cpu_update(cl->q, skb);
3769
3770         switch (tc_classify(skb, cl, &cl_res, false)) {
3771         case TC_ACT_OK:
3772         case TC_ACT_RECLASSIFY:
3773                 skb->tc_index = TC_H_MIN(cl_res.classid);
3774                 break;
3775         case TC_ACT_SHOT:
3776                 qdisc_qstats_cpu_drop(cl->q);
3777         case TC_ACT_STOLEN:
3778         case TC_ACT_QUEUED:
3779                 kfree_skb(skb);
3780                 return NULL;
3781         case TC_ACT_REDIRECT:
3782                 /* skb_mac_header check was done by cls/act_bpf, so
3783                  * we can safely push the L2 header back before
3784                  * redirecting to another netdev
3785                  */
3786                 __skb_push(skb, skb->mac_len);
3787                 skb_do_redirect(skb);
3788                 return NULL;
3789         default:
3790                 break;
3791         }
3792 #endif /* CONFIG_NET_CLS_ACT */
3793         return skb;
3794 }
3795
3796 /**
3797  *      netdev_is_rx_handler_busy - check if receive handler is registered
3798  *      @dev: device to check
3799  *
3800  *      Check if a receive handler is already registered for a given device.
3801  *      Return true if there is one.
3802  *
3803  *      The caller must hold the rtnl_mutex.
3804  */
3805 bool netdev_is_rx_handler_busy(struct net_device *dev)
3806 {
3807         ASSERT_RTNL();
3808         return dev && rtnl_dereference(dev->rx_handler);
3809 }
3810 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3811
3812 /**
3813  *      netdev_rx_handler_register - register receive handler
3814  *      @dev: device to register a handler for
3815  *      @rx_handler: receive handler to register
3816  *      @rx_handler_data: data pointer that is used by rx handler
3817  *
3818  *      Register a receive handler for a device. This handler will then be
3819  *      called from __netif_receive_skb. A negative errno code is returned
3820  *      on a failure.
3821  *
3822  *      The caller must hold the rtnl_mutex.
3823  *
3824  *      For a general description of rx_handler, see enum rx_handler_result.
3825  */
3826 int netdev_rx_handler_register(struct net_device *dev,
3827                                rx_handler_func_t *rx_handler,
3828                                void *rx_handler_data)
3829 {
3830         ASSERT_RTNL();
3831
3832         if (dev->rx_handler)
3833                 return -EBUSY;
3834
3835         /* Note: rx_handler_data must be set before rx_handler */
3836         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3837         rcu_assign_pointer(dev->rx_handler, rx_handler);
3838
3839         return 0;
3840 }
3841 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
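/*
 * Example (illustrative sketch): a minimal rx_handler in the style of
 * bridge/team/macvlan.  The port structure and helper names are
 * hypothetical; registration must run under rtnl_lock().
 */
struct example_port {
	unsigned long rx_frames;
};

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct example_port *port;

	/* Called under rcu_read_lock() from __netif_receive_skb_core(). */
	port = rcu_dereference(skb->dev->rx_handler_data);
	port->rx_frames++;

	/* Let normal protocol delivery continue with this skb. */
	return RX_HANDLER_PASS;
}

static int example_attach_port(struct net_device *dev,
			       struct example_port *port)
{
	ASSERT_RTNL();
	return netdev_rx_handler_register(dev, example_handle_frame, port);
}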
3842
3843 /**
3844  *      netdev_rx_handler_unregister - unregister receive handler
3845  *      @dev: device to unregister a handler from
3846  *
3847  *      Unregister a receive handler from a device.
3848  *
3849  *      The caller must hold the rtnl_mutex.
3850  */
3851 void netdev_rx_handler_unregister(struct net_device *dev)
3852 {
3853
3854         ASSERT_RTNL();
3855         RCU_INIT_POINTER(dev->rx_handler, NULL);
3856         /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
3857          * section is guaranteed to see a non-NULL rx_handler_data
3858          * as well.
3859          */
3860         synchronize_net();
3861         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3862 }
3863 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3864
3865 /*
3866  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3867  * the special handling of PFMEMALLOC skbs.
3868  */
3869 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3870 {
3871         switch (skb->protocol) {
3872         case htons(ETH_P_ARP):
3873         case htons(ETH_P_IP):
3874         case htons(ETH_P_IPV6):
3875         case htons(ETH_P_8021Q):
3876         case htons(ETH_P_8021AD):
3877                 return true;
3878         default:
3879                 return false;
3880         }
3881 }
3882
3883 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3884                              int *ret, struct net_device *orig_dev)
3885 {
3886 #ifdef CONFIG_NETFILTER_INGRESS
3887         if (nf_hook_ingress_active(skb)) {
3888                 if (*pt_prev) {
3889                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
3890                         *pt_prev = NULL;
3891                 }
3892
3893                 return nf_hook_ingress(skb);
3894         }
3895 #endif /* CONFIG_NETFILTER_INGRESS */
3896         return 0;
3897 }
3898
3899 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3900 {
3901         struct packet_type *ptype, *pt_prev;
3902         rx_handler_func_t *rx_handler;
3903         struct net_device *orig_dev;
3904         bool deliver_exact = false;
3905         int ret = NET_RX_DROP;
3906         __be16 type;
3907
3908         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3909
3910         trace_netif_receive_skb(skb);
3911
3912         orig_dev = skb->dev;
3913
3914         skb_reset_network_header(skb);
3915         if (!skb_transport_header_was_set(skb))
3916                 skb_reset_transport_header(skb);
3917         skb_reset_mac_len(skb);
3918
3919         pt_prev = NULL;
3920
3921 another_round:
3922         skb->skb_iif = skb->dev->ifindex;
3923
3924         __this_cpu_inc(softnet_data.processed);
3925
3926         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3927             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3928                 skb = skb_vlan_untag(skb);
3929                 if (unlikely(!skb))
3930                         goto out;
3931         }
3932
3933 #ifdef CONFIG_NET_CLS_ACT
3934         if (skb->tc_verd & TC_NCLS) {
3935                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3936                 goto ncls;
3937         }
3938 #endif
3939
3940         if (pfmemalloc)
3941                 goto skip_taps;
3942
3943         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3944                 if (pt_prev)
3945                         ret = deliver_skb(skb, pt_prev, orig_dev);
3946                 pt_prev = ptype;
3947         }
3948
3949         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3950                 if (pt_prev)
3951                         ret = deliver_skb(skb, pt_prev, orig_dev);
3952                 pt_prev = ptype;
3953         }
3954
3955 skip_taps:
3956 #ifdef CONFIG_NET_INGRESS
3957         if (static_key_false(&ingress_needed)) {
3958                 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3959                 if (!skb)
3960                         goto out;
3961
3962                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
3963                         goto out;
3964         }
3965 #endif
3966 #ifdef CONFIG_NET_CLS_ACT
3967         skb->tc_verd = 0;
3968 ncls:
3969 #endif
3970         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3971                 goto drop;
3972
3973         if (skb_vlan_tag_present(skb)) {
3974                 if (pt_prev) {
3975                         ret = deliver_skb(skb, pt_prev, orig_dev);
3976                         pt_prev = NULL;
3977                 }
3978                 if (vlan_do_receive(&skb))
3979                         goto another_round;
3980                 else if (unlikely(!skb))
3981                         goto out;
3982         }
3983
3984         rx_handler = rcu_dereference(skb->dev->rx_handler);
3985         if (rx_handler) {
3986                 if (pt_prev) {
3987                         ret = deliver_skb(skb, pt_prev, orig_dev);
3988                         pt_prev = NULL;
3989                 }
3990                 switch (rx_handler(&skb)) {
3991                 case RX_HANDLER_CONSUMED:
3992                         ret = NET_RX_SUCCESS;
3993                         goto out;
3994                 case RX_HANDLER_ANOTHER:
3995                         goto another_round;
3996                 case RX_HANDLER_EXACT:
3997                         deliver_exact = true;
3998                 case RX_HANDLER_PASS:
3999                         break;
4000                 default:
4001                         BUG();
4002                 }
4003         }
4004
4005         if (unlikely(skb_vlan_tag_present(skb))) {
4006                 if (skb_vlan_tag_get_id(skb))
4007                         skb->pkt_type = PACKET_OTHERHOST;
4008                 /* Note: we might in the future use prio bits
4009                  * and set skb->priority like in vlan_do_receive()
4010                  * For the time being, just ignore Priority Code Point
4011                  */
4012                 skb->vlan_tci = 0;
4013         }
4014
4015         type = skb->protocol;
4016
4017         /* deliver only exact match when indicated */
4018         if (likely(!deliver_exact)) {
4019                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4020                                        &ptype_base[ntohs(type) &
4021                                                    PTYPE_HASH_MASK]);
4022         }
4023
4024         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4025                                &orig_dev->ptype_specific);
4026
4027         if (unlikely(skb->dev != orig_dev)) {
4028                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4029                                        &skb->dev->ptype_specific);
4030         }
4031
4032         if (pt_prev) {
4033                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4034                         goto drop;
4035                 else
4036                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4037         } else {
4038 drop:
4039                 atomic_long_inc(&skb->dev->rx_dropped);
4040                 kfree_skb(skb);
4041                 /* Jamal, now you will not be able to escape explaining
4042                  * to me how you were going to use this. :-)
4043                  */
4044                 ret = NET_RX_DROP;
4045         }
4046
4047 out:
4048         return ret;
4049 }
4050
4051 static int __netif_receive_skb(struct sk_buff *skb)
4052 {
4053         int ret;
4054
4055         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4056                 unsigned long pflags = current->flags;
4057
4058                 /*
4059                  * PFMEMALLOC skbs are special, they should
4060                  * - be delivered to SOCK_MEMALLOC sockets only
4061                  * - stay away from userspace
4062                  * - have bounded memory usage
4063                  *
4064                  * Use PF_MEMALLOC as this saves us from propagating the allocation
4065                  * context down to all allocation sites.
4066                  */
4067                 current->flags |= PF_MEMALLOC;
4068                 ret = __netif_receive_skb_core(skb, true);
4069                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4070         } else
4071                 ret = __netif_receive_skb_core(skb, false);
4072
4073         return ret;
4074 }
4075
4076 static int netif_receive_skb_internal(struct sk_buff *skb)
4077 {
4078         int ret;
4079
4080         net_timestamp_check(netdev_tstamp_prequeue, skb);
4081
4082         if (skb_defer_rx_timestamp(skb))
4083                 return NET_RX_SUCCESS;
4084
4085         rcu_read_lock();
4086
4087 #ifdef CONFIG_RPS
4088         if (static_key_false(&rps_needed)) {
4089                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4090                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4091
4092                 if (cpu >= 0) {
4093                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4094                         rcu_read_unlock();
4095                         return ret;
4096                 }
4097         }
4098 #endif
4099         ret = __netif_receive_skb(skb);
4100         rcu_read_unlock();
4101         return ret;
4102 }
4103
4104 /**
4105  *      netif_receive_skb - process receive buffer from network
4106  *      @skb: buffer to process
4107  *
4108  *      netif_receive_skb() is the main receive data processing function.
4109  *      It always succeeds. The buffer may be dropped during processing
4110  *      for congestion control or by the protocol layers.
4111  *
4112  *      This function may only be called from softirq context and interrupts
4113  *      should be enabled.
4114  *
4115  *      Return values (usually ignored):
4116  *      NET_RX_SUCCESS: no congestion
4117  *      NET_RX_DROP: packet was dropped
4118  */
4119 int netif_receive_skb(struct sk_buff *skb)
4120 {
4121         trace_netif_receive_skb_entry(skb);
4122
4123         return netif_receive_skb_internal(skb);
4124 }
4125 EXPORT_SYMBOL(netif_receive_skb);
4126
4127 /* Network device is going away; flush any packets still pending.
4128  * Called with irqs disabled.
4129  */
4130 static void flush_backlog(void *arg)
4131 {
4132         struct net_device *dev = arg;
4133         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4134         struct sk_buff *skb, *tmp;
4135
4136         rps_lock(sd);
4137         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4138                 if (skb->dev == dev) {
4139                         __skb_unlink(skb, &sd->input_pkt_queue);
4140                         __skb_queue_tail(&sd->tofree_queue, skb);
4141                         input_queue_head_incr(sd);
4142                 }
4143         }
4144         rps_unlock(sd);
4145
4146         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4147                 if (skb->dev == dev) {
4148                         __skb_unlink(skb, &sd->process_queue);
4149                         __skb_queue_tail(&sd->tofree_queue, skb);
4150                         input_queue_head_incr(sd);
4151                 }
4152         }
4153
4154         if (!skb_queue_empty(&sd->tofree_queue))
4155                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
4156 }
4157
4158 static int napi_gro_complete(struct sk_buff *skb)
4159 {
4160         struct packet_offload *ptype;
4161         __be16 type = skb->protocol;
4162         struct list_head *head = &offload_base;
4163         int err = -ENOENT;
4164
4165         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4166
4167         if (NAPI_GRO_CB(skb)->count == 1) {
4168                 skb_shinfo(skb)->gso_size = 0;
4169                 goto out;
4170         }
4171
4172         rcu_read_lock();
4173         list_for_each_entry_rcu(ptype, head, list) {
4174                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4175                         continue;
4176
4177                 err = ptype->callbacks.gro_complete(skb, 0);
4178                 break;
4179         }
4180         rcu_read_unlock();
4181
4182         if (err) {
4183                 WARN_ON(&ptype->list == head);
4184                 kfree_skb(skb);
4185                 return NET_RX_SUCCESS;
4186         }
4187
4188 out:
4189         return netif_receive_skb_internal(skb);
4190 }
4191
4192 /* napi->gro_list contains packets ordered by age, with the
4193  * youngest packets at the head of the list.
4194  * Complete skbs in reverse order to reduce latencies.
4195  */
4196 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4197 {
4198         struct sk_buff *skb, *prev = NULL;
4199
4200         /* scan list and build reverse chain */
4201         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4202                 skb->prev = prev;
4203                 prev = skb;
4204         }
4205
4206         for (skb = prev; skb; skb = prev) {
4207                 skb->next = NULL;
4208
4209                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4210                         return;
4211
4212                 prev = skb->prev;
4213                 napi_gro_complete(skb);
4214                 napi->gro_count--;
4215         }
4216
4217         napi->gro_list = NULL;
4218 }
4219 EXPORT_SYMBOL(napi_gro_flush);
4220
4221 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4222 {
4223         struct sk_buff *p;
4224         unsigned int maclen = skb->dev->hard_header_len;
4225         u32 hash = skb_get_hash_raw(skb);
4226
4227         for (p = napi->gro_list; p; p = p->next) {
4228                 unsigned long diffs;
4229
4230                 NAPI_GRO_CB(p)->flush = 0;
4231
4232                 if (hash != skb_get_hash_raw(p)) {
4233                         NAPI_GRO_CB(p)->same_flow = 0;
4234                         continue;
4235                 }
4236
4237                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4238                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4239                 diffs |= skb_metadata_dst_cmp(p, skb);
4240                 if (maclen == ETH_HLEN)
4241                         diffs |= compare_ether_header(skb_mac_header(p),
4242                                                       skb_mac_header(skb));
4243                 else if (!diffs)
4244                         diffs = memcmp(skb_mac_header(p),
4245                                        skb_mac_header(skb),
4246                                        maclen);
4247                 NAPI_GRO_CB(p)->same_flow = !diffs;
4248         }
4249 }
4250
4251 static void skb_gro_reset_offset(struct sk_buff *skb)
4252 {
4253         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4254         const skb_frag_t *frag0 = &pinfo->frags[0];
4255
4256         NAPI_GRO_CB(skb)->data_offset = 0;
4257         NAPI_GRO_CB(skb)->frag0 = NULL;
4258         NAPI_GRO_CB(skb)->frag0_len = 0;
4259
4260         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4261             pinfo->nr_frags &&
4262             !PageHighMem(skb_frag_page(frag0))) {
4263                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4264                 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4265                                                     skb_frag_size(frag0),
4266                                                     skb->end - skb->tail);
4267         }
4268 }
4269
4270 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4271 {
4272         struct skb_shared_info *pinfo = skb_shinfo(skb);
4273
4274         BUG_ON(skb->end - skb->tail < grow);
4275
4276         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4277
4278         skb->data_len -= grow;
4279         skb->tail += grow;
4280
4281         pinfo->frags[0].page_offset += grow;
4282         skb_frag_size_sub(&pinfo->frags[0], grow);
4283
4284         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4285                 skb_frag_unref(skb, 0);
4286                 memmove(pinfo->frags, pinfo->frags + 1,
4287                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4288         }
4289 }
4290
4291 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4292 {
4293         struct sk_buff **pp = NULL;
4294         struct packet_offload *ptype;
4295         __be16 type = skb->protocol;
4296         struct list_head *head = &offload_base;
4297         int same_flow;
4298         enum gro_result ret;
4299         int grow;
4300
4301         if (!(skb->dev->features & NETIF_F_GRO))
4302                 goto normal;
4303
4304         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4305                 goto normal;
4306
4307         gro_list_prepare(napi, skb);
4308
4309         rcu_read_lock();
4310         list_for_each_entry_rcu(ptype, head, list) {
4311                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4312                         continue;
4313
4314                 skb_set_network_header(skb, skb_gro_offset(skb));
4315                 skb_reset_mac_len(skb);
4316                 NAPI_GRO_CB(skb)->same_flow = 0;
4317                 NAPI_GRO_CB(skb)->flush = 0;
4318                 NAPI_GRO_CB(skb)->free = 0;
4319                 NAPI_GRO_CB(skb)->encap_mark = 0;
4320                 NAPI_GRO_CB(skb)->recursion_counter = 0;
4321                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4322
4323                 /* Setup for GRO checksum validation */
4324                 switch (skb->ip_summed) {
4325                 case CHECKSUM_COMPLETE:
4326                         NAPI_GRO_CB(skb)->csum = skb->csum;
4327                         NAPI_GRO_CB(skb)->csum_valid = 1;
4328                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4329                         break;
4330                 case CHECKSUM_UNNECESSARY:
4331                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4332                         NAPI_GRO_CB(skb)->csum_valid = 0;
4333                         break;
4334                 default:
4335                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4336                         NAPI_GRO_CB(skb)->csum_valid = 0;
4337                 }
4338
4339                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4340                 break;
4341         }
4342         rcu_read_unlock();
4343
4344         if (&ptype->list == head)
4345                 goto normal;
4346
4347         same_flow = NAPI_GRO_CB(skb)->same_flow;
4348         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4349
4350         if (pp) {
4351                 struct sk_buff *nskb = *pp;
4352
4353                 *pp = nskb->next;
4354                 nskb->next = NULL;
4355                 napi_gro_complete(nskb);
4356                 napi->gro_count--;
4357         }
4358
4359         if (same_flow)
4360                 goto ok;
4361
4362         if (NAPI_GRO_CB(skb)->flush)
4363                 goto normal;
4364
4365         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4366                 struct sk_buff *nskb = napi->gro_list;
4367
4368                 /* locate the end of the list to select the 'oldest' flow */
4369                 while (nskb->next) {
4370                         pp = &nskb->next;
4371                         nskb = *pp;
4372                 }
4373                 *pp = NULL;
4374                 nskb->next = NULL;
4375                 napi_gro_complete(nskb);
4376         } else {
4377                 napi->gro_count++;
4378         }
4379         NAPI_GRO_CB(skb)->count = 1;
4380         NAPI_GRO_CB(skb)->age = jiffies;
4381         NAPI_GRO_CB(skb)->last = skb;
4382         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4383         skb->next = napi->gro_list;
4384         napi->gro_list = skb;
4385         ret = GRO_HELD;
4386
4387 pull:
4388         grow = skb_gro_offset(skb) - skb_headlen(skb);
4389         if (grow > 0)
4390                 gro_pull_from_frag0(skb, grow);
4391 ok:
4392         return ret;
4393
4394 normal:
4395         ret = GRO_NORMAL;
4396         goto pull;
4397 }
4398
4399 struct packet_offload *gro_find_receive_by_type(__be16 type)
4400 {
4401         struct list_head *offload_head = &offload_base;
4402         struct packet_offload *ptype;
4403
4404         list_for_each_entry_rcu(ptype, offload_head, list) {
4405                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4406                         continue;
4407                 return ptype;
4408         }
4409         return NULL;
4410 }
4411 EXPORT_SYMBOL(gro_find_receive_by_type);
4412
4413 struct packet_offload *gro_find_complete_by_type(__be16 type)
4414 {
4415         struct list_head *offload_head = &offload_base;
4416         struct packet_offload *ptype;
4417
4418         list_for_each_entry_rcu(ptype, offload_head, list) {
4419                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4420                         continue;
4421                 return ptype;
4422         }
4423         return NULL;
4424 }
4425 EXPORT_SYMBOL(gro_find_complete_by_type);
4426
4427 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4428 {
4429         switch (ret) {
4430         case GRO_NORMAL:
4431                 if (netif_receive_skb_internal(skb))
4432                         ret = GRO_DROP;
4433                 break;
4434
4435         case GRO_DROP:
4436                 kfree_skb(skb);
4437                 break;
4438
4439         case GRO_MERGED_FREE:
4440                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4441                         skb_dst_drop(skb);
4442                         kmem_cache_free(skbuff_head_cache, skb);
4443                 } else {
4444                         __kfree_skb(skb);
4445                 }
4446                 break;
4447
4448         case GRO_HELD:
4449         case GRO_MERGED:
4450                 break;
4451         }
4452
4453         return ret;
4454 }
4455
4456 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4457 {
4458         trace_napi_gro_receive_entry(skb);
4459
4460         skb_gro_reset_offset(skb);
4461
4462         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4463 }
4464 EXPORT_SYMBOL(napi_gro_receive);
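/*
 * Example (illustrative sketch): a NAPI poll function feeding completed
 * frames through GRO.  example_fetch_rx_skb() stands in for the driver's
 * RX ring handling and is purely hypothetical; re-enabling the device
 * interrupt is left as a comment.
 */
static struct sk_buff *example_fetch_rx_skb(struct net_device *dev)
{
	return NULL;	/* a real driver would return the next completed frame */
}

static int example_napi_poll(struct napi_struct *napi, int budget)
{
	int work_done = 0;
	struct sk_buff *skb;

	while (work_done < budget &&
	       (skb = example_fetch_rx_skb(napi->dev)) != NULL) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		napi_gro_receive(napi, skb);
		work_done++;
	}

	if (work_done < budget) {
		napi_complete_done(napi, work_done);
		/* re-enable the device's RX interrupt here */
	}

	return work_done;
}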
4465
4466 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4467 {
4468         if (unlikely(skb->pfmemalloc)) {
4469                 consume_skb(skb);
4470                 return;
4471         }
4472         __skb_pull(skb, skb_headlen(skb));
4473         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4474         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4475         skb->vlan_tci = 0;
4476         skb->dev = napi->dev;
4477         skb->skb_iif = 0;
4478         skb->encapsulation = 0;
4479         skb_shinfo(skb)->gso_type = 0;
4480         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4481
4482         napi->skb = skb;
4483 }
4484
4485 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4486 {
4487         struct sk_buff *skb = napi->skb;
4488
4489         if (!skb) {
4490                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4491                 napi->skb = skb;
4492         }
4493         return skb;
4494 }
4495 EXPORT_SYMBOL(napi_get_frags);
4496
4497 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4498                                       struct sk_buff *skb,
4499                                       gro_result_t ret)
4500 {
4501         switch (ret) {
4502         case GRO_NORMAL:
4503         case GRO_HELD:
4504                 __skb_push(skb, ETH_HLEN);
4505                 skb->protocol = eth_type_trans(skb, skb->dev);
4506                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4507                         ret = GRO_DROP;
4508                 break;
4509
4510         case GRO_DROP:
4511         case GRO_MERGED_FREE:
4512                 napi_reuse_skb(napi, skb);
4513                 break;
4514
4515         case GRO_MERGED:
4516                 break;
4517         }
4518
4519         return ret;
4520 }
4521
4522 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4523  * Drivers may call both napi_gro_frags() and napi_gro_receive(), so
4524  * we copy the ethernet header into skb->data to have a common layout.
4525  */
4526 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4527 {
4528         struct sk_buff *skb = napi->skb;
4529         const struct ethhdr *eth;
4530         unsigned int hlen = sizeof(*eth);
4531
4532         napi->skb = NULL;
4533
4534         skb_reset_mac_header(skb);
4535         skb_gro_reset_offset(skb);
4536
4537         eth = skb_gro_header_fast(skb, 0);
4538         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4539                 eth = skb_gro_header_slow(skb, hlen, 0);
4540                 if (unlikely(!eth)) {
4541                         napi_reuse_skb(napi, skb);
4542                         return NULL;
4543                 }
4544         } else {
4545                 gro_pull_from_frag0(skb, hlen);
4546                 NAPI_GRO_CB(skb)->frag0 += hlen;
4547                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4548         }
4549         __skb_pull(skb, hlen);
4550
4551         /*
4552          * This works because the only protocols we care about don't require
4553          * special handling.
4554          * We'll fix it up properly in napi_frags_finish()
4555          */
4556         skb->protocol = eth->h_proto;
4557
4558         return skb;
4559 }
4560
4561 gro_result_t napi_gro_frags(struct napi_struct *napi)
4562 {
4563         struct sk_buff *skb = napi_frags_skb(napi);
4564
4565         if (!skb)
4566                 return GRO_DROP;
4567
4568         trace_napi_gro_frags_entry(skb);
4569
4570         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4571 }
4572 EXPORT_SYMBOL(napi_gro_frags);
4573
4574 /* Compute the checksum from gro_offset and return the folded value
4575  * after adding in any pseudo checksum.
4576  */
4577 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4578 {
4579         __wsum wsum;
4580         __sum16 sum;
4581
4582         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4583
4584         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4585         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4586         if (likely(!sum)) {
4587                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4588                     !skb->csum_complete_sw)
4589                         netdev_rx_csum_fault(skb->dev);
4590         }
4591
4592         NAPI_GRO_CB(skb)->csum = wsum;
4593         NAPI_GRO_CB(skb)->csum_valid = 1;
4594
4595         return sum;
4596 }
4597 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4598
4599 /*
4600  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4601  * Note: called with local irq disabled, but exits with local irq enabled.
4602  */
4603 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4604 {
4605 #ifdef CONFIG_RPS
4606         struct softnet_data *remsd = sd->rps_ipi_list;
4607
4608         if (remsd) {
4609                 sd->rps_ipi_list = NULL;
4610
4611                 local_irq_enable();
4612                 preempt_check_resched_rt();
4613
4614                 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4615                 while (remsd) {
4616                         struct softnet_data *next = remsd->rps_ipi_next;
4617
4618                         if (cpu_online(remsd->cpu))
4619                                 smp_call_function_single_async(remsd->cpu,
4620                                                            &remsd->csd);
4621                         remsd = next;
4622                 }
4623         } else
4624 #endif
4625                 local_irq_enable();
4626         preempt_check_resched_rt();
4627 }
4628
4629 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4630 {
4631 #ifdef CONFIG_RPS
4632         return sd->rps_ipi_list != NULL;
4633 #else
4634         return false;
4635 #endif
4636 }
4637
4638 static int process_backlog(struct napi_struct *napi, int quota)
4639 {
4640         int work = 0;
4641         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4642
4643         /* Check if we have pending IPIs; it's better to send them now
4644          * rather than waiting for net_rx_action() to end.
4645          */
4646         if (sd_has_rps_ipi_waiting(sd)) {
4647                 local_irq_disable();
4648                 net_rps_action_and_irq_enable(sd);
4649         }
4650
4651         napi->weight = weight_p;
4652         local_irq_disable();
4653         while (1) {
4654                 struct sk_buff *skb;
4655
4656                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4657                         rcu_read_lock();
4658                         local_irq_enable();
4659                         __netif_receive_skb(skb);
4660                         rcu_read_unlock();
4661                         local_irq_disable();
4662                         input_queue_head_incr(sd);
4663                         if (++work >= quota) {
4664                                 local_irq_enable();
4665                                 return work;
4666                         }
4667                 }
4668
4669                 rps_lock(sd);
4670                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4671                         /*
4672                          * Inline a custom version of __napi_complete().
4673                          * Only the current CPU owns and manipulates this napi,
4674                          * and NAPI_STATE_SCHED is the only possible flag set
4675                          * on backlog.
4676                          * We can use a plain write instead of clear_bit(),
4677                          * and we don't need an smp_mb() memory barrier.
4678                          */
4679                         napi->state = 0;
4680                         rps_unlock(sd);
4681
4682                         break;
4683                 }
4684
4685                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4686                                            &sd->process_queue);
4687                 rps_unlock(sd);
4688         }
4689         local_irq_enable();
4690
4691         return work;
4692 }
4693
4694 /**
4695  * __napi_schedule - schedule for receive
4696  * @n: entry to schedule
4697  *
4698  * The entry's receive function will be scheduled to run.
4699  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4700  */
4701 void __napi_schedule(struct napi_struct *n)
4702 {
4703         unsigned long flags;
4704
4705         local_irq_save(flags);
4706         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4707         local_irq_restore(flags);
4708         preempt_check_resched_rt();
4709 }
4710 EXPORT_SYMBOL(__napi_schedule);
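
/*
 * Editor's illustration (compiled out, not part of the original file): a
 * sketch of how a driver interrupt handler typically defers RX work to NAPI,
 * ending up in __napi_schedule() above via napi_schedule_prep(). The
 * struct my_priv, my_rx_interrupt() and my_disable_rx_irq() names are
 * hypothetical.
 */
#if 0
static irqreturn_t my_rx_interrupt(int irq, void *dev_id)
{
        struct my_priv *priv = dev_id;

        if (napi_schedule_prep(&priv->napi)) {
                my_disable_rx_irq(priv);        /* mask further RX interrupts */
                __napi_schedule(&priv->napi);   /* run ->poll() from NET_RX softirq */
        }
        return IRQ_HANDLED;
}
#endif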
4711
4712 #ifndef CONFIG_PREEMPT_RT_FULL
4713 /**
4714  * __napi_schedule_irqoff - schedule for receive
4715  * @n: entry to schedule
4716  *
4717  * Variant of __napi_schedule() assuming hard irqs are masked
4718  */
4719 void __napi_schedule_irqoff(struct napi_struct *n)
4720 {
4721         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4722 }
4723 EXPORT_SYMBOL(__napi_schedule_irqoff);
4724 #endif
4725
4726 void __napi_complete(struct napi_struct *n)
4727 {
4728         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4729
4730         list_del_init(&n->poll_list);
4731         smp_mb__before_atomic();
4732         clear_bit(NAPI_STATE_SCHED, &n->state);
4733 }
4734 EXPORT_SYMBOL(__napi_complete);
4735
4736 void napi_complete_done(struct napi_struct *n, int work_done)
4737 {
4738         unsigned long flags;
4739
4740         /*
4741          * Don't let napi dequeue from the CPU poll list
4742          * just in case it's running on a different CPU.
4743          */
4744         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4745                 return;
4746
4747         if (n->gro_list) {
4748                 unsigned long timeout = 0;
4749
4750                 if (work_done)
4751                         timeout = n->dev->gro_flush_timeout;
4752
4753                 if (timeout)
4754                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4755                                       HRTIMER_MODE_REL_PINNED);
4756                 else
4757                         napi_gro_flush(n, false);
4758         }
4759         if (likely(list_empty(&n->poll_list))) {
4760                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4761         } else {
4762                 /* If n->poll_list is not empty, we need to mask irqs */
4763                 local_irq_save(flags);
4764                 __napi_complete(n);
4765                 local_irq_restore(flags);
4766         }
4767 }
4768 EXPORT_SYMBOL(napi_complete_done);
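
/*
 * Editor's illustration (compiled out, not part of the original file): the
 * usual shape of a driver ->poll() callback that pairs with
 * napi_complete_done() above. my_clean_rx_ring() and my_enable_rx_irq() are
 * hypothetical helpers; the contract is that no more than @budget packets are
 * processed and NAPI is only completed once the ring runs dry.
 */
#if 0
static int my_poll(struct napi_struct *napi, int budget)
{
        struct my_priv *priv = container_of(napi, struct my_priv, napi);
        int work = my_clean_rx_ring(priv, budget);

        if (work < budget) {
                /* Ring is empty: leave polling mode and re-arm interrupts. */
                napi_complete_done(napi, work);
                my_enable_rx_irq(priv);
        }
        return work;
}
#endif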
4769
4770 /* must be called under rcu_read_lock(), as we dont take a reference */
4771 struct napi_struct *napi_by_id(unsigned int napi_id)
4772 {
4773         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4774         struct napi_struct *napi;
4775
4776         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4777                 if (napi->napi_id == napi_id)
4778                         return napi;
4779
4780         return NULL;
4781 }
4782 EXPORT_SYMBOL_GPL(napi_by_id);
4783
4784 void napi_hash_add(struct napi_struct *napi)
4785 {
4786         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4787
4788                 spin_lock(&napi_hash_lock);
4789
4790                 /* 0 is not a valid id, and we also skip an id that is already taken;
4791                  * we expect both events to be extremely rare.
4792                  */
4793                 napi->napi_id = 0;
4794                 while (!napi->napi_id) {
4795                         napi->napi_id = ++napi_gen_id;
4796                         if (napi_by_id(napi->napi_id))
4797                                 napi->napi_id = 0;
4798                 }
4799
4800                 hlist_add_head_rcu(&napi->napi_hash_node,
4801                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4802
4803                 spin_unlock(&napi_hash_lock);
4804         }
4805 }
4806 EXPORT_SYMBOL_GPL(napi_hash_add);
4807
4808 /* Warning: the caller is responsible for making sure an RCU grace period
4809  * has elapsed before freeing the memory containing @napi.
4810  */
4811 void napi_hash_del(struct napi_struct *napi)
4812 {
4813         spin_lock(&napi_hash_lock);
4814
4815         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4816                 hlist_del_rcu(&napi->napi_hash_node);
4817
4818         spin_unlock(&napi_hash_lock);
4819 }
4820 EXPORT_SYMBOL_GPL(napi_hash_del);
4821
4822 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4823 {
4824         struct napi_struct *napi;
4825
4826         napi = container_of(timer, struct napi_struct, timer);
4827         if (napi->gro_list)
4828                 napi_schedule(napi);
4829
4830         return HRTIMER_NORESTART;
4831 }
4832
4833 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4834                     int (*poll)(struct napi_struct *, int), int weight)
4835 {
4836         INIT_LIST_HEAD(&napi->poll_list);
4837         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4838         napi->timer.function = napi_watchdog;
4839         napi->gro_count = 0;
4840         napi->gro_list = NULL;
4841         napi->skb = NULL;
4842         napi->poll = poll;
4843         if (weight > NAPI_POLL_WEIGHT)
4844                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4845                             weight, dev->name);
4846         napi->weight = weight;
4847         list_add(&napi->dev_list, &dev->napi_list);
4848         napi->dev = dev;
4849 #ifdef CONFIG_NETPOLL
4850         spin_lock_init(&napi->poll_lock);
4851         napi->poll_owner = -1;
4852 #endif
4853         set_bit(NAPI_STATE_SCHED, &napi->state);
4854 }
4855 EXPORT_SYMBOL(netif_napi_add);
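
/*
 * Editor's illustration (compiled out, not part of the original file):
 * registering the NAPI context at probe time, using the hypothetical my_poll()
 * sketched earlier. netif_napi_add() leaves NAPI_STATE_SCHED set, so the
 * driver must call napi_enable() before scheduling can happen.
 */
#if 0
static void my_setup_napi(struct my_priv *priv, struct net_device *netdev)
{
        netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
        napi_enable(&priv->napi);
}
#endif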
4856
4857 void napi_disable(struct napi_struct *n)
4858 {
4859         might_sleep();
4860         set_bit(NAPI_STATE_DISABLE, &n->state);
4861
4862         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4863                 msleep(1);
4864         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
4865                 msleep(1);
4866
4867         hrtimer_cancel(&n->timer);
4868
4869         clear_bit(NAPI_STATE_DISABLE, &n->state);
4870 }
4871 EXPORT_SYMBOL(napi_disable);
4872
4873 void netif_napi_del(struct napi_struct *napi)
4874 {
4875         list_del_init(&napi->dev_list);
4876         napi_free_frags(napi);
4877
4878         kfree_skb_list(napi->gro_list);
4879         napi->gro_list = NULL;
4880         napi->gro_count = 0;
4881 }
4882 EXPORT_SYMBOL(netif_napi_del);
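
/*
 * Editor's illustration (compiled out, not part of the original file): the
 * matching teardown order, typically from ndo_stop() or remove. napi_disable()
 * above waits for any in-flight ->poll() to finish before netif_napi_del()
 * drops the GRO state. struct my_priv is hypothetical.
 */
#if 0
static void my_teardown_napi(struct my_priv *priv)
{
        napi_disable(&priv->napi);      /* blocks until ->poll() has stopped */
        netif_napi_del(&priv->napi);    /* frees any packets left on gro_list */
}
#endif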
4883
4884 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4885 {
4886         void *have;
4887         int work, weight;
4888
4889         list_del_init(&n->poll_list);
4890
4891         have = netpoll_poll_lock(n);
4892
4893         weight = n->weight;
4894
4895         /* This NAPI_STATE_SCHED test is for avoiding a race
4896          * with netpoll's poll_napi().  Only the entity which
4897          * obtains the lock and sees NAPI_STATE_SCHED set will
4898          * actually make the ->poll() call.  Therefore we avoid
4899          * accidentally calling ->poll() when NAPI is not scheduled.
4900          */
4901         work = 0;
4902         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4903                 work = n->poll(n, weight);
4904                 trace_napi_poll(n);
4905         }
4906
4907         WARN_ON_ONCE(work > weight);
4908
4909         if (likely(work < weight))
4910                 goto out_unlock;
4911
4912         /* Drivers must not modify the NAPI state if they
4913          * consume the entire weight.  In such cases this code
4914          * still "owns" the NAPI instance and therefore can
4915          * move the instance around on the list at-will.
4916          */
4917         if (unlikely(napi_disable_pending(n))) {
4918                 napi_complete(n);
4919                 goto out_unlock;
4920         }
4921
4922         if (n->gro_list) {
4923                 /* Flush packets that are too old.
4924                  * If HZ < 1000, flush all packets.
4925                  */
4926                 napi_gro_flush(n, HZ >= 1000);
4927         }
4928
4929         /* Some drivers may have called napi_schedule
4930          * prior to exhausting their budget.
4931          */
4932         if (unlikely(!list_empty(&n->poll_list))) {
4933                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4934                              n->dev ? n->dev->name : "backlog");
4935                 goto out_unlock;
4936         }
4937
4938         list_add_tail(&n->poll_list, repoll);
4939
4940 out_unlock:
4941         netpoll_poll_unlock(have);
4942
4943         return work;
4944 }
4945
4946 static void net_rx_action(struct softirq_action *h)
4947 {
4948         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4949         unsigned long time_limit = jiffies + 2;
4950         int budget = netdev_budget;
4951         struct sk_buff_head tofree_q;
4952         struct sk_buff *skb;
4953         LIST_HEAD(list);
4954         LIST_HEAD(repoll);
4955
4956         __skb_queue_head_init(&tofree_q);
4957
4958         local_irq_disable();
4959         skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
4960         list_splice_init(&sd->poll_list, &list);
4961         local_irq_enable();
4962
4963         while ((skb = __skb_dequeue(&tofree_q)))
4964                 kfree_skb(skb);
4965
4966         for (;;) {
4967                 struct napi_struct *n;
4968
4969                 if (list_empty(&list)) {
4970                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4971                                 return;
4972                         break;
4973                 }
4974
4975                 n = list_first_entry(&list, struct napi_struct, poll_list);
4976                 budget -= napi_poll(n, &repoll);
4977
4978                 /* If softirq window is exhausted then punt.
4979                  * Allow this to run for 2 jiffies, which allows
4980                  * an average latency of 1.5/HZ.
4981                  */
4982                 if (unlikely(budget <= 0 ||
4983                              time_after_eq(jiffies, time_limit))) {
4984                         sd->time_squeeze++;
4985                         break;
4986                 }
4987         }
4988
4989         local_irq_disable();
4990
4991         list_splice_tail_init(&sd->poll_list, &list);
4992         list_splice_tail(&repoll, &list);
4993         list_splice(&list, &sd->poll_list);
4994         if (!list_empty(&sd->poll_list))
4995                 __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
4996
4997         net_rps_action_and_irq_enable(sd);
4998 }
4999
5000 struct netdev_adjacent {
5001         struct net_device *dev;
5002
5003         /* upper master flag, there can only be one master device per list */
5004         bool master;
5005
5006         /* counter for the number of times this device was added to us */
5007         u16 ref_nr;
5008
5009         /* private field for the users */
5010         void *private;
5011
5012         struct list_head list;
5013         struct rcu_head rcu;
5014 };
5015
5016 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5017                                                  struct list_head *adj_list)
5018 {
5019         struct netdev_adjacent *adj;
5020
5021         list_for_each_entry(adj, adj_list, list) {
5022                 if (adj->dev == adj_dev)
5023                         return adj;
5024         }
5025         return NULL;
5026 }
5027
5028 /**
5029  * netdev_has_upper_dev - Check if device is linked to an upper device
5030  * @dev: device
5031  * @upper_dev: upper device to check
5032  *
5033  * Find out if a device is linked to the specified upper device and return true
5034  * if it is. Note that this checks only the immediate upper device,
5035  * not through a complete stack of devices. The caller must hold the RTNL lock.
5036  */
5037 bool netdev_has_upper_dev(struct net_device *dev,
5038                           struct net_device *upper_dev)
5039 {
5040         ASSERT_RTNL();
5041
5042         return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5043 }
5044 EXPORT_SYMBOL(netdev_has_upper_dev);
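
/*
 * Editor's illustration (compiled out, not part of the original file): a
 * caller checking an adjacency under RTNL, e.g. whether @slave is already
 * linked below @bond. my_is_enslaved_to() is a hypothetical wrapper.
 */
#if 0
static bool my_is_enslaved_to(struct net_device *slave, struct net_device *bond)
{
        bool linked;

        rtnl_lock();
        linked = netdev_has_upper_dev(slave, bond);
        rtnl_unlock();

        return linked;
}
#endif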
5045
5046 /**
5047  * netdev_has_any_upper_dev - Check if device is linked to some device
5048  * @dev: device
5049  *
5050  * Find out if a device is linked to an upper device and return true if
5051  * it is. The caller must hold the RTNL lock.
5052  */
5053 static bool netdev_has_any_upper_dev(struct net_device *dev)
5054 {
5055         ASSERT_RTNL();
5056
5057         return !list_empty(&dev->all_adj_list.upper);
5058 }
5059
5060 /**
5061  * netdev_master_upper_dev_get - Get master upper device
5062  * @dev: device
5063  *
5064  * Find a master upper device and return pointer to it or NULL in case
5065  * it's not there. The caller must hold the RTNL lock.
5066  */
5067 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5068 {
5069         struct netdev_adjacent *upper;
5070
5071         ASSERT_RTNL();
5072
5073         if (list_empty(&dev->adj_list.upper))
5074                 return NULL;
5075
5076         upper = list_first_entry(&dev->adj_list.upper,
5077                                  struct netdev_adjacent, list);
5078         if (likely(upper->master))
5079                 return upper->dev;
5080         return NULL;
5081 }
5082 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5083
5084 void *netdev_adjacent_get_private(struct list_head *adj_list)
5085 {
5086         struct netdev_adjacent *adj;
5087
5088         adj = list_entry(adj_list, struct netdev_adjacent, list);
5089
5090         return adj->private;
5091 }
5092 EXPORT_SYMBOL(netdev_adjacent_get_private);
5093
5094 /**
5095  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5096  * @dev: device
5097  * @iter: list_head ** of the current position
5098  *
5099  * Gets the next device from the dev's upper list, starting from iter
5100  * position. The caller must hold RCU read lock.
5101  */
5102 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5103                                                  struct list_head **iter)
5104 {
5105         struct netdev_adjacent *upper;
5106
5107         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5108
5109         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5110
5111         if (&upper->list == &dev->adj_list.upper)
5112                 return NULL;
5113
5114         *iter = &upper->list;
5115
5116         return upper->dev;
5117 }
5118 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
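
/*
 * Editor's illustration (compiled out, not part of the original file): walking
 * the immediate upper devices with the iterator above under rcu_read_lock()
 * (netdevice.h also wraps this helper in netdev_for_each_upper_dev_rcu()).
 * my_print_upper_devs() is hypothetical.
 */
#if 0
static void my_print_upper_devs(struct net_device *dev)
{
        struct net_device *upper;
        struct list_head *iter = &dev->adj_list.upper;

        rcu_read_lock();
        while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
                pr_info("%s: upper device %s\n", dev->name, upper->name);
        rcu_read_unlock();
}
#endif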
5119
5120 /**
5121  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5122  * @dev: device
5123  * @iter: list_head ** of the current position
5124  *
5125  * Gets the next device from the dev's upper list, starting from iter
5126  * position. The caller must hold RCU read lock.
5127  */
5128 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5129                                                      struct list_head **iter)
5130 {
5131         struct netdev_adjacent *upper;
5132
5133         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5134
5135         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5136
5137         if (&upper->list == &dev->all_adj_list.upper)
5138                 return NULL;
5139
5140         *iter = &upper->list;
5141
5142         return upper->dev;
5143 }
5144 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5145
5146 /**
5147  * netdev_lower_get_next_private - Get the next ->private from the
5148  *                                 lower neighbour list
5149  * @dev: device
5150  * @iter: list_head ** of the current position
5151  *
5152  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5153  * list, starting from iter position. The caller must hold either the
5154  * RTNL lock or its own locking that guarantees that the neighbour lower
5155  * list will remain unchanged.
5156  */
5157 void *netdev_lower_get_next_private(struct net_device *dev,
5158                                     struct list_head **iter)
5159 {
5160         struct netdev_adjacent *lower;
5161
5162         lower = list_entry(*iter, struct netdev_adjacent, list);
5163
5164         if (&lower->list == &dev->adj_list.lower)
5165                 return NULL;
5166
5167         *iter = lower->list.next;
5168
5169         return lower->private;
5170 }
5171 EXPORT_SYMBOL(netdev_lower_get_next_private);
5172
5173 /**
5174  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5175  *                                     lower neighbour list, RCU
5176  *                                     variant
5177  * @dev: device
5178  * @iter: list_head ** of the current position
5179  *
5180  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5181  * list, starting from iter position. The caller must hold RCU read lock.
5182  */
5183 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5184                                         struct list_head **iter)
5185 {
5186         struct netdev_adjacent *lower;
5187
5188         WARN_ON_ONCE(!rcu_read_lock_held());
5189
5190         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5191
5192         if (&lower->list == &dev->adj_list.lower)
5193                 return NULL;
5194
5195         *iter = &lower->list;
5196
5197         return lower->private;
5198 }
5199 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5200
5201 /**
5202  * netdev_lower_get_next - Get the next device from the lower neighbour
5203  *                         list
5204  * @dev: device
5205  * @iter: list_head ** of the current position
5206  *
5207  * Gets the next netdev_adjacent from the dev's lower neighbour
5208  * list, starting from iter position. The caller must hold RTNL lock or
5209  * its own locking that guarantees that the neighbour lower
5210  * list will remain unchanged.
5211  */
5212 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5213 {
5214         struct netdev_adjacent *lower;
5215
5216         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5217
5218         if (&lower->list == &dev->adj_list.lower)
5219                 return NULL;
5220
5221         *iter = &lower->list;
5222
5223         return lower->dev;
5224 }
5225 EXPORT_SYMBOL(netdev_lower_get_next);
5226
5227 /**
5228  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5229  *                                     lower neighbour list, RCU
5230  *                                     variant
5231  * @dev: device
5232  *
5233  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5234  * list. The caller must hold RCU read lock.
5235  */
5236 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5237 {
5238         struct netdev_adjacent *lower;
5239
5240         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5241                         struct netdev_adjacent, list);
5242         if (lower)
5243                 return lower->private;
5244         return NULL;
5245 }
5246 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5247
5248 /**
5249  * netdev_master_upper_dev_get_rcu - Get master upper device
5250  * @dev: device
5251  *
5252  * Find a master upper device and return pointer to it or NULL in case
5253  * it's not there. The caller must hold the RCU read lock.
5254  */
5255 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5256 {
5257         struct netdev_adjacent *upper;
5258
5259         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5260                                        struct netdev_adjacent, list);
5261         if (upper && likely(upper->master))
5262                 return upper->dev;
5263         return NULL;
5264 }
5265 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5266
5267 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5268                               struct net_device *adj_dev,
5269                               struct list_head *dev_list)
5270 {
5271         char linkname[IFNAMSIZ+7];
5272         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5273                 "upper_%s" : "lower_%s", adj_dev->name);
5274         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5275                                  linkname);
5276 }
5277 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5278                                char *name,
5279                                struct list_head *dev_list)
5280 {
5281         char linkname[IFNAMSIZ+7];
5282         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5283                 "upper_%s" : "lower_%s", name);
5284         sysfs_remove_link(&(dev->dev.kobj), linkname);
5285 }
5286
5287 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5288                                                  struct net_device *adj_dev,
5289                                                  struct list_head *dev_list)
5290 {
5291         return (dev_list == &dev->adj_list.upper ||
5292                 dev_list == &dev->adj_list.lower) &&
5293                 net_eq(dev_net(dev), dev_net(adj_dev));
5294 }
5295
5296 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5297                                         struct net_device *adj_dev,
5298                                         u16 ref_nr,
5299                                         struct list_head *dev_list,
5300                                         void *private, bool master)
5301 {
5302         struct netdev_adjacent *adj;
5303         int ret;
5304
5305         adj = __netdev_find_adj(adj_dev, dev_list);
5306
5307         if (adj) {
5308                 adj->ref_nr += ref_nr;
5309                 return 0;
5310         }
5311
5312         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5313         if (!adj)
5314                 return -ENOMEM;
5315
5316         adj->dev = adj_dev;
5317         adj->master = master;
5318         adj->ref_nr = ref_nr;
5319         adj->private = private;
5320         dev_hold(adj_dev);
5321
5322         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5323                  adj_dev->name, dev->name, adj_dev->name);
5324
5325         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5326                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5327                 if (ret)
5328                         goto free_adj;
5329         }
5330
5331         /* Ensure that the master link is always the first item in the list. */
5332         if (master) {
5333                 ret = sysfs_create_link(&(dev->dev.kobj),
5334                                         &(adj_dev->dev.kobj), "master");
5335                 if (ret)
5336                         goto remove_symlinks;
5337
5338                 list_add_rcu(&adj->list, dev_list);
5339         } else {
5340                 list_add_tail_rcu(&adj->list, dev_list);
5341         }
5342
5343         return 0;
5344
5345 remove_symlinks:
5346         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5347                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5348 free_adj:
5349         kfree(adj);
5350         dev_put(adj_dev);
5351
5352         return ret;
5353 }
5354
5355 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5356                                          struct net_device *adj_dev,
5357                                          u16 ref_nr,
5358                                          struct list_head *dev_list)
5359 {
5360         struct netdev_adjacent *adj;
5361
5362         adj = __netdev_find_adj(adj_dev, dev_list);
5363
5364         if (!adj) {
5365                 pr_err("tried to remove device %s from %s\n",
5366                        dev->name, adj_dev->name);
5367                 BUG();
5368         }
5369
5370         if (adj->ref_nr > ref_nr) {
5371                 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
5372                          ref_nr, adj->ref_nr-ref_nr);
5373                 adj->ref_nr -= ref_nr;
5374                 return;
5375         }
5376
5377         if (adj->master)
5378                 sysfs_remove_link(&(dev->dev.kobj), "master");
5379
5380         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5381                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5382
5383         list_del_rcu(&adj->list);
5384         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5385                  adj_dev->name, dev->name, adj_dev->name);
5386         dev_put(adj_dev);
5387         kfree_rcu(adj, rcu);
5388 }
5389
5390 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5391                                             struct net_device *upper_dev,
5392                                             u16 ref_nr,
5393                                             struct list_head *up_list,
5394                                             struct list_head *down_list,
5395                                             void *private, bool master)
5396 {
5397         int ret;
5398
5399         ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5400                                            private, master);
5401         if (ret)
5402                 return ret;
5403
5404         ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5405                                            private, false);
5406         if (ret) {
5407                 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5408                 return ret;
5409         }
5410
5411         return 0;
5412 }
5413
5414 static int __netdev_adjacent_dev_link(struct net_device *dev,
5415                                       struct net_device *upper_dev,
5416                                       u16 ref_nr)
5417 {
5418         return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5419                                                 &dev->all_adj_list.upper,
5420                                                 &upper_dev->all_adj_list.lower,
5421                                                 NULL, false);
5422 }
5423
5424 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5425                                                struct net_device *upper_dev,
5426                                                u16 ref_nr,
5427                                                struct list_head *up_list,
5428                                                struct list_head *down_list)
5429 {
5430         __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5431         __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5432 }
5433
5434 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5435                                          struct net_device *upper_dev,
5436                                          u16 ref_nr)
5437 {
5438         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5439                                            &dev->all_adj_list.upper,
5440                                            &upper_dev->all_adj_list.lower);
5441 }
5442
5443 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5444                                                 struct net_device *upper_dev,
5445                                                 void *private, bool master)
5446 {
5447         int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5448
5449         if (ret)
5450                 return ret;
5451
5452         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5453                                                &dev->adj_list.upper,
5454                                                &upper_dev->adj_list.lower,
5455                                                private, master);
5456         if (ret) {
5457                 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5458                 return ret;
5459         }
5460
5461         return 0;
5462 }
5463
5464 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5465                                                    struct net_device *upper_dev)
5466 {
5467         __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5468         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5469                                            &dev->adj_list.upper,
5470                                            &upper_dev->adj_list.lower);
5471 }
5472
5473 static int __netdev_upper_dev_link(struct net_device *dev,
5474                                    struct net_device *upper_dev, bool master,
5475                                    void *private)
5476 {
5477         struct netdev_notifier_changeupper_info changeupper_info;
5478         struct netdev_adjacent *i, *j, *to_i, *to_j;
5479         int ret = 0;
5480
5481         ASSERT_RTNL();
5482
5483         if (dev == upper_dev)
5484                 return -EBUSY;
5485
5486         /* To prevent loops, check if dev is not upper device to upper_dev. */
5487         if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5488                 return -EBUSY;
5489
5490         if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5491                 return -EEXIST;
5492
5493         if (master && netdev_master_upper_dev_get(dev))
5494                 return -EBUSY;
5495
5496         changeupper_info.upper_dev = upper_dev;
5497         changeupper_info.master = master;
5498         changeupper_info.linking = true;
5499
5500         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5501                                             &changeupper_info.info);
5502         ret = notifier_to_errno(ret);
5503         if (ret)
5504                 return ret;
5505
5506         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5507                                                    master);
5508         if (ret)
5509                 return ret;
5510
5511         /* Now that we linked these devs, make all the upper_dev's
5512          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5513          * vice versa, and don't forget the devices themselves. All of these
5514          * links are non-neighbours.
5515          */
5516         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5517                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5518                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5519                                  i->dev->name, j->dev->name);
5520                         ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5521                         if (ret)
5522                                 goto rollback_mesh;
5523                 }
5524         }
5525
5526         /* add dev to every upper_dev's upper device */
5527         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5528                 pr_debug("linking %s's upper device %s with %s\n",
5529                          upper_dev->name, i->dev->name, dev->name);
5530                 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5531                 if (ret)
5532                         goto rollback_upper_mesh;
5533         }
5534
5535         /* add upper_dev to every dev's lower device */
5536         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5537                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5538                          i->dev->name, upper_dev->name);
5539                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5540                 if (ret)
5541                         goto rollback_lower_mesh;
5542         }
5543
5544         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5545                                       &changeupper_info.info);
5546         return 0;
5547
5548 rollback_lower_mesh:
5549         to_i = i;
5550         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5551                 if (i == to_i)
5552                         break;
5553                 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5554         }
5555
5556         i = NULL;
5557
5558 rollback_upper_mesh:
5559         to_i = i;
5560         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5561                 if (i == to_i)
5562                         break;
5563                 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5564         }
5565
5566         i = j = NULL;
5567
5568 rollback_mesh:
5569         to_i = i;
5570         to_j = j;
5571         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5572                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5573                         if (i == to_i && j == to_j)
5574                                 break;
5575                         __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5576                 }
5577                 if (i == to_i)
5578                         break;
5579         }
5580
5581         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5582
5583         return ret;
5584 }
5585
5586 /**
5587  * netdev_upper_dev_link - Add a link to the upper device
5588  * @dev: device
5589  * @upper_dev: new upper device
5590  *
5591  * Adds a link to a device which is upper to this one. The caller must hold
5592  * the RTNL lock. On a failure a negative errno code is returned.
5593  * On success the reference counts are adjusted and the function
5594  * returns zero.
5595  */
5596 int netdev_upper_dev_link(struct net_device *dev,
5597                           struct net_device *upper_dev)
5598 {
5599         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5600 }
5601 EXPORT_SYMBOL(netdev_upper_dev_link);
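
/*
 * Editor's illustration (compiled out, not part of the original file): how a
 * stacking driver (bonding, team, bridge, ...) typically creates the
 * master/slave relationship under RTNL and undoes it on error. my_enslave()
 * and the "driver-specific setup" step are hypothetical.
 */
#if 0
static int my_enslave(struct net_device *bond, struct net_device *slave)
{
        int err;

        ASSERT_RTNL();

        err = netdev_master_upper_dev_link(slave, bond);
        if (err)
                return err;

        /* driver-specific setup would go here; on failure undo the link with
         * netdev_upper_dev_unlink(slave, bond);
         */
        return 0;
}
#endif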
5602
5603 /**
5604  * netdev_master_upper_dev_link - Add a master link to the upper device
5605  * @dev: device
5606  * @upper_dev: new upper device
5607  *
5608  * Adds a link to a device which is upper to this one. In this case, only
5609  * one master upper device can be linked, although other non-master devices
5610  * might be linked as well. The caller must hold the RTNL lock.
5611  * On a failure a negative errno code is returned. On success the reference
5612  * counts are adjusted and the function returns zero.
5613  */
5614 int netdev_master_upper_dev_link(struct net_device *dev,
5615                                  struct net_device *upper_dev)
5616 {
5617         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5618 }
5619 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5620
5621 int netdev_master_upper_dev_link_private(struct net_device *dev,
5622                                          struct net_device *upper_dev,
5623                                          void *private)
5624 {
5625         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5626 }
5627 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5628
5629 /**
5630  * netdev_upper_dev_unlink - Removes a link to upper device
5631  * @dev: device
5632  * @upper_dev: upper device to remove
5633  *
5634  * Removes a link to a device which is upper to this one. The caller must hold
5635  * the RTNL lock.
5636  */
5637 void netdev_upper_dev_unlink(struct net_device *dev,
5638                              struct net_device *upper_dev)
5639 {
5640         struct netdev_notifier_changeupper_info changeupper_info;
5641         struct netdev_adjacent *i, *j;
5642         ASSERT_RTNL();
5643
5644         changeupper_info.upper_dev = upper_dev;
5645         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5646         changeupper_info.linking = false;
5647
5648         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5649                                       &changeupper_info.info);
5650
5651         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5652
5653         /* Here is the tricky part. We must remove all dev's lower
5654          * devices from all upper_dev's upper devices and vice
5655          * versa, to maintain the graph relationship.
5656          */
5657         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5658                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5659                         __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5660
5661         /* also remove the devices themselves from the lower/upper device
5662          * lists
5663          */
5664         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5665                 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5666
5667         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5668                 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5669
5670         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5671                                       &changeupper_info.info);
5672 }
5673 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5674
5675 /**
5676  * netdev_bonding_info_change - Dispatch event about slave change
5677  * @dev: device
5678  * @bonding_info: info to dispatch
5679  *
5680  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5681  * The caller must hold the RTNL lock.
5682  */
5683 void netdev_bonding_info_change(struct net_device *dev,
5684                                 struct netdev_bonding_info *bonding_info)
5685 {
5686         struct netdev_notifier_bonding_info     info;
5687
5688         memcpy(&info.bonding_info, bonding_info,
5689                sizeof(struct netdev_bonding_info));
5690         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5691                                       &info.info);
5692 }
5693 EXPORT_SYMBOL(netdev_bonding_info_change);
5694
5695 static void netdev_adjacent_add_links(struct net_device *dev)
5696 {
5697         struct netdev_adjacent *iter;
5698
5699         struct net *net = dev_net(dev);
5700
5701         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5702                 if (!net_eq(net,dev_net(iter->dev)))
5703                         continue;
5704                 netdev_adjacent_sysfs_add(iter->dev, dev,
5705                                           &iter->dev->adj_list.lower);
5706                 netdev_adjacent_sysfs_add(dev, iter->dev,
5707                                           &dev->adj_list.upper);
5708         }
5709
5710         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5711                 if (!net_eq(net,dev_net(iter->dev)))
5712                         continue;
5713                 netdev_adjacent_sysfs_add(iter->dev, dev,
5714                                           &iter->dev->adj_list.upper);
5715                 netdev_adjacent_sysfs_add(dev, iter->dev,
5716                                           &dev->adj_list.lower);
5717         }
5718 }
5719
5720 static void netdev_adjacent_del_links(struct net_device *dev)
5721 {
5722         struct netdev_adjacent *iter;
5723
5724         struct net *net = dev_net(dev);
5725
5726         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5727                 if (!net_eq(net,dev_net(iter->dev)))
5728                         continue;
5729                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5730                                           &iter->dev->adj_list.lower);
5731                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5732                                           &dev->adj_list.upper);
5733         }
5734
5735         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5736                 if (!net_eq(net,dev_net(iter->dev)))
5737                         continue;
5738                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5739                                           &iter->dev->adj_list.upper);
5740                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5741                                           &dev->adj_list.lower);
5742         }
5743 }
5744
5745 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5746 {
5747         struct netdev_adjacent *iter;
5748
5749         struct net *net = dev_net(dev);
5750
5751         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5752                 if (!net_eq(net,dev_net(iter->dev)))
5753                         continue;
5754                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5755                                           &iter->dev->adj_list.lower);
5756                 netdev_adjacent_sysfs_add(iter->dev, dev,
5757                                           &iter->dev->adj_list.lower);
5758         }
5759
5760         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5761                 if (!net_eq(net,dev_net(iter->dev)))
5762                         continue;
5763                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5764                                           &iter->dev->adj_list.upper);
5765                 netdev_adjacent_sysfs_add(iter->dev, dev,
5766                                           &iter->dev->adj_list.upper);
5767         }
5768 }
5769
5770 void *netdev_lower_dev_get_private(struct net_device *dev,
5771                                    struct net_device *lower_dev)
5772 {
5773         struct netdev_adjacent *lower;
5774
5775         if (!lower_dev)
5776                 return NULL;
5777         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5778         if (!lower)
5779                 return NULL;
5780
5781         return lower->private;
5782 }
5783 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5784
5785
5786 int dev_get_nest_level(struct net_device *dev,
5787                        bool (*type_check)(struct net_device *dev))
5788 {
5789         struct net_device *lower = NULL;
5790         struct list_head *iter;
5791         int max_nest = -1;
5792         int nest;
5793
5794         ASSERT_RTNL();
5795
5796         netdev_for_each_lower_dev(dev, lower, iter) {
5797                 nest = dev_get_nest_level(lower, type_check);
5798                 if (max_nest < nest)
5799                         max_nest = nest;
5800         }
5801
5802         if (type_check(dev))
5803                 max_nest++;
5804
5805         return max_nest;
5806 }
5807 EXPORT_SYMBOL(dev_get_nest_level);
5808
5809 static void dev_change_rx_flags(struct net_device *dev, int flags)
5810 {
5811         const struct net_device_ops *ops = dev->netdev_ops;
5812
5813         if (ops->ndo_change_rx_flags)
5814                 ops->ndo_change_rx_flags(dev, flags);
5815 }
5816
5817 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5818 {
5819         unsigned int old_flags = dev->flags;
5820         kuid_t uid;
5821         kgid_t gid;
5822
5823         ASSERT_RTNL();
5824
5825         dev->flags |= IFF_PROMISC;
5826         dev->promiscuity += inc;
5827         if (dev->promiscuity == 0) {
5828                 /*
5829                  * Avoid overflow.
5830                  * If inc causes overflow, leave promiscuity untouched and return an error.
5831                  */
5832                 if (inc < 0)
5833                         dev->flags &= ~IFF_PROMISC;
5834                 else {
5835                         dev->promiscuity -= inc;
5836                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5837                                 dev->name);
5838                         return -EOVERFLOW;
5839                 }
5840         }
5841         if (dev->flags != old_flags) {
5842                 pr_info("device %s %s promiscuous mode\n",
5843                         dev->name,
5844                         dev->flags & IFF_PROMISC ? "entered" : "left");
5845                 if (audit_enabled) {
5846                         current_uid_gid(&uid, &gid);
5847                         audit_log(current->audit_context, GFP_ATOMIC,
5848                                 AUDIT_ANOM_PROMISCUOUS,
5849                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5850                                 dev->name, (dev->flags & IFF_PROMISC),
5851                                 (old_flags & IFF_PROMISC),
5852                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5853                                 from_kuid(&init_user_ns, uid),
5854                                 from_kgid(&init_user_ns, gid),
5855                                 audit_get_sessionid(current));
5856                 }
5857
5858                 dev_change_rx_flags(dev, IFF_PROMISC);
5859         }
5860         if (notify)
5861                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5862         return 0;
5863 }
5864
5865 /**
5866  *      dev_set_promiscuity     - update promiscuity count on a device
5867  *      @dev: device
5868  *      @inc: modifier
5869  *
5870  *      Add or remove promiscuity from a device. While the count in the device
5871  *      remains above zero the interface remains promiscuous. Once it hits zero
5872  *      the device reverts back to normal filtering operation. A negative inc
5873  *      value is used to drop promiscuity on the device.
5874  *      Return 0 if successful or a negative errno code on error.
5875  */
5876 int dev_set_promiscuity(struct net_device *dev, int inc)
5877 {
5878         unsigned int old_flags = dev->flags;
5879         int err;
5880
5881         err = __dev_set_promiscuity(dev, inc, true);
5882         if (err < 0)
5883                 return err;
5884         if (dev->flags != old_flags)
5885                 dev_set_rx_mode(dev);
5886         return err;
5887 }
5888 EXPORT_SYMBOL(dev_set_promiscuity);
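
/*
 * Editor's illustration (compiled out, not part of the original file): a
 * packet-tap style user taking and releasing a promiscuity reference under
 * RTNL. Calls must be balanced; the device only leaves promiscuous mode when
 * the count drops back to zero. my_tap_attach()/my_tap_detach() are
 * hypothetical.
 */
#if 0
static void my_tap_attach(struct net_device *dev)
{
        rtnl_lock();
        if (dev_set_promiscuity(dev, 1))
                netdev_warn(dev, "could not enter promiscuous mode\n");
        rtnl_unlock();
}

static void my_tap_detach(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);
        rtnl_unlock();
}
#endif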
5889
5890 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5891 {
5892         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5893
5894         ASSERT_RTNL();
5895
5896         dev->flags |= IFF_ALLMULTI;
5897         dev->allmulti += inc;
5898         if (dev->allmulti == 0) {
5899                 /*
5900                  * Avoid overflow.
5901                  * If inc causes overflow, leave allmulti untouched and return an error.
5902                  */
5903                 if (inc < 0)
5904                         dev->flags &= ~IFF_ALLMULTI;
5905                 else {
5906                         dev->allmulti -= inc;
5907                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5908                                 dev->name);
5909                         return -EOVERFLOW;
5910                 }
5911         }
5912         if (dev->flags ^ old_flags) {
5913                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5914                 dev_set_rx_mode(dev);
5915                 if (notify)
5916                         __dev_notify_flags(dev, old_flags,
5917                                            dev->gflags ^ old_gflags);
5918         }
5919         return 0;
5920 }
5921
5922 /**
5923  *      dev_set_allmulti        - update allmulti count on a device
5924  *      @dev: device
5925  *      @inc: modifier
5926  *
5927  *      Add or remove reception of all multicast frames to a device. While the
5928  *      count in the device remains above zero the interface keeps listening
5929  *      to all multicast frames. Once it hits zero the device reverts back to normal
5930  *      filtering operation. A negative @inc value is used to drop the counter
5931  *      when releasing a resource needing all multicasts.
5932  *      Return 0 if successful or a negative errno code on error.
5933  */
5934
5935 int dev_set_allmulti(struct net_device *dev, int inc)
5936 {
5937         return __dev_set_allmulti(dev, inc, true);
5938 }
5939 EXPORT_SYMBOL(dev_set_allmulti);
5940
5941 /*
5942  *      Upload unicast and multicast address lists to device and
5943  *      configure RX filtering. When the device doesn't support unicast
5944  *      filtering it is put in promiscuous mode while unicast addresses
5945  *      are present.
5946  */
5947 void __dev_set_rx_mode(struct net_device *dev)
5948 {
5949         const struct net_device_ops *ops = dev->netdev_ops;
5950
5951         /* dev_open will call this function so the list will stay sane. */
5952         if (!(dev->flags&IFF_UP))
5953                 return;
5954
5955         if (!netif_device_present(dev))
5956                 return;
5957
5958         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5959                 /* Unicast address changes may only happen under the rtnl,
5960                  * therefore calling __dev_set_promiscuity here is safe.
5961                  */
5962                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5963                         __dev_set_promiscuity(dev, 1, false);
5964                         dev->uc_promisc = true;
5965                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5966                         __dev_set_promiscuity(dev, -1, false);
5967                         dev->uc_promisc = false;
5968                 }
5969         }
5970
5971         if (ops->ndo_set_rx_mode)
5972                 ops->ndo_set_rx_mode(dev);
5973 }
5974
5975 void dev_set_rx_mode(struct net_device *dev)
5976 {
5977         netif_addr_lock_bh(dev);
5978         __dev_set_rx_mode(dev);
5979         netif_addr_unlock_bh(dev);
5980 }
5981
5982 /**
5983  *      dev_get_flags - get flags reported to userspace
5984  *      @dev: device
5985  *
5986  *      Get the combination of flag bits exported through APIs to userspace.
5987  */
5988 unsigned int dev_get_flags(const struct net_device *dev)
5989 {
5990         unsigned int flags;
5991
5992         flags = (dev->flags & ~(IFF_PROMISC |
5993                                 IFF_ALLMULTI |
5994                                 IFF_RUNNING |
5995                                 IFF_LOWER_UP |
5996                                 IFF_DORMANT)) |
5997                 (dev->gflags & (IFF_PROMISC |
5998                                 IFF_ALLMULTI));
5999
6000         if (netif_running(dev)) {
6001                 if (netif_oper_up(dev))
6002                         flags |= IFF_RUNNING;
6003                 if (netif_carrier_ok(dev))
6004                         flags |= IFF_LOWER_UP;
6005                 if (netif_dormant(dev))
6006                         flags |= IFF_DORMANT;
6007         }
6008
6009         return flags;
6010 }
6011 EXPORT_SYMBOL(dev_get_flags);
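
/*
 * Editor's illustration (compiled out, not part of the original file): using
 * the userspace flag view, e.g. to test operational state. my_link_is_up() is
 * hypothetical.
 */
#if 0
static bool my_link_is_up(const struct net_device *dev)
{
        return dev_get_flags(dev) & IFF_RUNNING;
}
#endif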
6012
6013 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6014 {
6015         unsigned int old_flags = dev->flags;
6016         int ret;
6017
6018         ASSERT_RTNL();
6019
6020         /*
6021          *      Set the flags on our device.
6022          */
6023
6024         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6025                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6026                                IFF_AUTOMEDIA)) |
6027                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6028                                     IFF_ALLMULTI));
6029
6030         /*
6031          *      Load in the correct multicast list now the flags have changed.
6032          */
6033
6034         if ((old_flags ^ flags) & IFF_MULTICAST)
6035                 dev_change_rx_flags(dev, IFF_MULTICAST);
6036
6037         dev_set_rx_mode(dev);
6038
6039         /*
6040          *      Have we downed the interface? We handle IFF_UP ourselves
6041          *      according to user attempts to set it, rather than blindly
6042          *      setting it.
6043          */
6044
6045         ret = 0;
6046         if ((old_flags ^ flags) & IFF_UP)
6047                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6048
6049         if ((flags ^ dev->gflags) & IFF_PROMISC) {
6050                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6051                 unsigned int old_flags = dev->flags;
6052
6053                 dev->gflags ^= IFF_PROMISC;
6054
6055                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6056                         if (dev->flags != old_flags)
6057                                 dev_set_rx_mode(dev);
6058         }
6059
6060         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6061            is important. Some (broken) drivers set IFF_PROMISC when
6062            IFF_ALLMULTI is requested, without asking us and without reporting it.
6063          */
6064         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6065                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6066
6067                 dev->gflags ^= IFF_ALLMULTI;
6068                 __dev_set_allmulti(dev, inc, false);
6069         }
6070
6071         return ret;
6072 }
6073
6074 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6075                         unsigned int gchanges)
6076 {
6077         unsigned int changes = dev->flags ^ old_flags;
6078
6079         if (gchanges)
6080                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6081
6082         if (changes & IFF_UP) {
6083                 if (dev->flags & IFF_UP)
6084                         call_netdevice_notifiers(NETDEV_UP, dev);
6085                 else
6086                         call_netdevice_notifiers(NETDEV_DOWN, dev);
6087         }
6088
6089         if (dev->flags & IFF_UP &&
6090             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6091                 struct netdev_notifier_change_info change_info;
6092
6093                 change_info.flags_changed = changes;
6094                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6095                                               &change_info.info);
6096         }
6097 }
6098
6099 /**
6100  *      dev_change_flags - change device settings
6101  *      @dev: device
6102  *      @flags: device state flags
6103  *
6104  *      Change settings on device based state flags. The flags are
6105  *      in the userspace exported format.
6106  */
6107 int dev_change_flags(struct net_device *dev, unsigned int flags)
6108 {
6109         int ret;
6110         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6111
6112         ret = __dev_change_flags(dev, flags);
6113         if (ret < 0)
6114                 return ret;
6115
6116         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6117         __dev_notify_flags(dev, old_flags, changes);
6118         return ret;
6119 }
6120 EXPORT_SYMBOL(dev_change_flags);
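
/* A minimal illustrative sketch of how an in-kernel caller might toggle
 * promiscuous mode through dev_change_flags().  The helper name
 * example_set_promisc() is hypothetical; the requirement carried over from
 * the code above is that the RTNL lock is held around the call.
 */
static int example_set_promisc(struct net_device *dev, bool enable)
{
	unsigned int flags;
	int err;

	rtnl_lock();
	flags = dev->flags;
	if (enable)
		flags |= IFF_PROMISC;
	else
		flags &= ~IFF_PROMISC;
	err = dev_change_flags(dev, flags);
	rtnl_unlock();
	return err;
}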
6121
6122 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6123 {
6124         const struct net_device_ops *ops = dev->netdev_ops;
6125
6126         if (ops->ndo_change_mtu)
6127                 return ops->ndo_change_mtu(dev, new_mtu);
6128
6129         dev->mtu = new_mtu;
6130         return 0;
6131 }
6132
6133 /**
6134  *      dev_set_mtu - Change maximum transfer unit
6135  *      @dev: device
6136  *      @new_mtu: new transfer unit
6137  *
6138  *      Change the maximum transfer size of the network device.
6139  */
6140 int dev_set_mtu(struct net_device *dev, int new_mtu)
6141 {
6142         int err, orig_mtu;
6143
6144         if (new_mtu == dev->mtu)
6145                 return 0;
6146
6147         /*      MTU must not be negative.        */
6148         if (new_mtu < 0)
6149                 return -EINVAL;
6150
6151         if (!netif_device_present(dev))
6152                 return -ENODEV;
6153
6154         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6155         err = notifier_to_errno(err);
6156         if (err)
6157                 return err;
6158
6159         orig_mtu = dev->mtu;
6160         err = __dev_set_mtu(dev, new_mtu);
6161
6162         if (!err) {
6163                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6164                 err = notifier_to_errno(err);
6165                 if (err) {
6166                         /* setting mtu back and notifying everyone again,
6167                          * so that they have a chance to revert changes.
6168                          */
6169                         __dev_set_mtu(dev, orig_mtu);
6170                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6171                 }
6172         }
6173         return err;
6174 }
6175 EXPORT_SYMBOL(dev_set_mtu);
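
/* A minimal illustrative sketch of raising the MTU from configuration code.
 * example_enable_jumbo() is a made-up name and 9000 is just a typical
 * jumbo-frame value; dev_set_mtu() is normally called with the RTNL lock
 * held, as done here.
 */
static int example_enable_jumbo(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	if (err)
		netdev_warn(dev, "could not switch to jumbo MTU: %d\n", err);
	rtnl_unlock();
	return err;
}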
6176
6177 /**
6178  *      dev_set_group - Change group this device belongs to
6179  *      @dev: device
6180  *      @new_group: group this device should belong to
6181  */
6182 void dev_set_group(struct net_device *dev, int new_group)
6183 {
6184         dev->group = new_group;
6185 }
6186 EXPORT_SYMBOL(dev_set_group);
6187
6188 /**
6189  *      dev_set_mac_address - Change Media Access Control Address
6190  *      @dev: device
6191  *      @sa: new address
6192  *
6193  *      Change the hardware (MAC) address of the device
6194  */
6195 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6196 {
6197         const struct net_device_ops *ops = dev->netdev_ops;
6198         int err;
6199
6200         if (!ops->ndo_set_mac_address)
6201                 return -EOPNOTSUPP;
6202         if (sa->sa_family != dev->type)
6203                 return -EINVAL;
6204         if (!netif_device_present(dev))
6205                 return -ENODEV;
6206         err = ops->ndo_set_mac_address(dev, sa);
6207         if (err)
6208                 return err;
6209         dev->addr_assign_type = NET_ADDR_SET;
6210         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6211         add_device_randomness(dev->dev_addr, dev->addr_len);
6212         return 0;
6213 }
6214 EXPORT_SYMBOL(dev_set_mac_address);
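
/* A minimal illustrative sketch of building the struct sockaddr that
 * dev_set_mac_address() expects: sa_family must match dev->type and
 * sa_data carries the raw hardware address.  The helper name
 * example_set_mac() is hypothetical.
 */
static int example_set_mac(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;
	int err;

	if (dev->addr_len > sizeof(sa.sa_data))
		return -EINVAL;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}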
6215
6216 /**
6217  *      dev_change_carrier - Change device carrier
6218  *      @dev: device
6219  *      @new_carrier: new value
6220  *
6221  *      Change device carrier
6222  */
6223 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6224 {
6225         const struct net_device_ops *ops = dev->netdev_ops;
6226
6227         if (!ops->ndo_change_carrier)
6228                 return -EOPNOTSUPP;
6229         if (!netif_device_present(dev))
6230                 return -ENODEV;
6231         return ops->ndo_change_carrier(dev, new_carrier);
6232 }
6233 EXPORT_SYMBOL(dev_change_carrier);
6234
6235 /**
6236  *      dev_get_phys_port_id - Get device physical port ID
6237  *      @dev: device
6238  *      @ppid: port ID
6239  *
6240  *      Get device physical port ID
6241  */
6242 int dev_get_phys_port_id(struct net_device *dev,
6243                          struct netdev_phys_item_id *ppid)
6244 {
6245         const struct net_device_ops *ops = dev->netdev_ops;
6246
6247         if (!ops->ndo_get_phys_port_id)
6248                 return -EOPNOTSUPP;
6249         return ops->ndo_get_phys_port_id(dev, ppid);
6250 }
6251 EXPORT_SYMBOL(dev_get_phys_port_id);
6252
6253 /**
6254  *      dev_get_phys_port_name - Get device physical port name
6255  *      @dev: device
6256  *      @name: port name
6257  *
6258  *      Get device physical port name
6259  */
6260 int dev_get_phys_port_name(struct net_device *dev,
6261                            char *name, size_t len)
6262 {
6263         const struct net_device_ops *ops = dev->netdev_ops;
6264
6265         if (!ops->ndo_get_phys_port_name)
6266                 return -EOPNOTSUPP;
6267         return ops->ndo_get_phys_port_name(dev, name, len);
6268 }
6269 EXPORT_SYMBOL(dev_get_phys_port_name);
6270
6271 /**
6272  *      dev_change_proto_down - update protocol port state information
6273  *      @dev: device
6274  *      @proto_down: new value
6275  *
6276  *      This info can be used by switch drivers to set the phys state of the
6277  *      port.
6278  */
6279 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6280 {
6281         const struct net_device_ops *ops = dev->netdev_ops;
6282
6283         if (!ops->ndo_change_proto_down)
6284                 return -EOPNOTSUPP;
6285         if (!netif_device_present(dev))
6286                 return -ENODEV;
6287         return ops->ndo_change_proto_down(dev, proto_down);
6288 }
6289 EXPORT_SYMBOL(dev_change_proto_down);
6290
6291 /**
6292  *      dev_new_index   -       allocate an ifindex
6293  *      @net: the applicable net namespace
6294  *
6295  *      Returns a suitable unique value for a new device interface
6296  *      number.  The caller must hold the rtnl semaphore or the
6297  *      dev_base_lock to be sure it remains unique.
6298  */
6299 static int dev_new_index(struct net *net)
6300 {
6301         int ifindex = net->ifindex;
6302         for (;;) {
6303                 if (++ifindex <= 0)
6304                         ifindex = 1;
6305                 if (!__dev_get_by_index(net, ifindex))
6306                         return net->ifindex = ifindex;
6307         }
6308 }
6309
6310 /* Delayed registration/unregistration */
6311 static LIST_HEAD(net_todo_list);
6312 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6313
6314 static void net_set_todo(struct net_device *dev)
6315 {
6316         list_add_tail(&dev->todo_list, &net_todo_list);
6317         dev_net(dev)->dev_unreg_count++;
6318 }
6319
6320 static void rollback_registered_many(struct list_head *head)
6321 {
6322         struct net_device *dev, *tmp;
6323         LIST_HEAD(close_head);
6324
6325         BUG_ON(dev_boot_phase);
6326         ASSERT_RTNL();
6327
6328         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6329                 /* Some devices get here without ever having been
6330                  * registered, as part of initialization unwind.
6331                  * Remove those devices and proceed with the rest.
6332                  */
6333                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6334                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6335                                  dev->name, dev);
6336
6337                         WARN_ON(1);
6338                         list_del(&dev->unreg_list);
6339                         continue;
6340                 }
6341                 dev->dismantle = true;
6342                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6343         }
6344
6345         /* If device is running, close it first. */
6346         list_for_each_entry(dev, head, unreg_list)
6347                 list_add_tail(&dev->close_list, &close_head);
6348         dev_close_many(&close_head, true);
6349
6350         list_for_each_entry(dev, head, unreg_list) {
6351                 /* And unlink it from device chain. */
6352                 unlist_netdevice(dev);
6353
6354                 dev->reg_state = NETREG_UNREGISTERING;
6355                 on_each_cpu(flush_backlog, dev, 1);
6356         }
6357
6358         synchronize_net();
6359
6360         list_for_each_entry(dev, head, unreg_list) {
6361                 struct sk_buff *skb = NULL;
6362
6363                 /* Shutdown queueing discipline. */
6364                 dev_shutdown(dev);
6365
6366
6367                 /* Notify protocols that we are about to destroy
6368                    this device. They should clean up all their state.
6369                 */
6370                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6371
6372                 if (!dev->rtnl_link_ops ||
6373                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6374                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6375                                                      GFP_KERNEL);
6376
6377                 /*
6378                  *      Flush the unicast and multicast chains
6379                  */
6380                 dev_uc_flush(dev);
6381                 dev_mc_flush(dev);
6382
6383                 if (dev->netdev_ops->ndo_uninit)
6384                         dev->netdev_ops->ndo_uninit(dev);
6385
6386                 if (skb)
6387                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6388
6389                 /* Notifier chain MUST detach us all upper devices. */
6390                 WARN_ON(netdev_has_any_upper_dev(dev));
6391
6392                 /* Remove entries from kobject tree */
6393                 netdev_unregister_kobject(dev);
6394 #ifdef CONFIG_XPS
6395                 /* Remove XPS queueing entries */
6396                 netif_reset_xps_queues_gt(dev, 0);
6397 #endif
6398         }
6399
6400         synchronize_net();
6401
6402         list_for_each_entry(dev, head, unreg_list)
6403                 dev_put(dev);
6404 }
6405
6406 static void rollback_registered(struct net_device *dev)
6407 {
6408         LIST_HEAD(single);
6409
6410         list_add(&dev->unreg_list, &single);
6411         rollback_registered_many(&single);
6412         list_del(&single);
6413 }
6414
6415 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6416         struct net_device *upper, netdev_features_t features)
6417 {
6418         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6419         netdev_features_t feature;
6420         int feature_bit;
6421
6422         for_each_netdev_feature(&upper_disables, feature_bit) {
6423                 feature = __NETIF_F_BIT(feature_bit);
6424                 if (!(upper->wanted_features & feature)
6425                     && (features & feature)) {
6426                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6427                                    &feature, upper->name);
6428                         features &= ~feature;
6429                 }
6430         }
6431
6432         return features;
6433 }
6434
6435 static void netdev_sync_lower_features(struct net_device *upper,
6436         struct net_device *lower, netdev_features_t features)
6437 {
6438         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6439         netdev_features_t feature;
6440         int feature_bit;
6441
6442         for_each_netdev_feature(&upper_disables, feature_bit) {
6443                 feature = __NETIF_F_BIT(feature_bit);
6444                 if (!(features & feature) && (lower->features & feature)) {
6445                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6446                                    &feature, lower->name);
6447                         lower->wanted_features &= ~feature;
6448                         netdev_update_features(lower);
6449
6450                         if (unlikely(lower->features & feature))
6451                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6452                                             &feature, lower->name);
6453                 }
6454         }
6455 }
6456
6457 static netdev_features_t netdev_fix_features(struct net_device *dev,
6458         netdev_features_t features)
6459 {
6460         /* Fix illegal checksum combinations */
6461         if ((features & NETIF_F_HW_CSUM) &&
6462             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6463                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6464                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6465         }
6466
6467         /* TSO requires that SG is present as well. */
6468         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6469                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6470                 features &= ~NETIF_F_ALL_TSO;
6471         }
6472
6473         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6474                                         !(features & NETIF_F_IP_CSUM)) {
6475                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6476                 features &= ~NETIF_F_TSO;
6477                 features &= ~NETIF_F_TSO_ECN;
6478         }
6479
6480         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6481                                          !(features & NETIF_F_IPV6_CSUM)) {
6482                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6483                 features &= ~NETIF_F_TSO6;
6484         }
6485
6486         /* TSO ECN requires that TSO is present as well. */
6487         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6488                 features &= ~NETIF_F_TSO_ECN;
6489
6490         /* Software GSO depends on SG. */
6491         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6492                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6493                 features &= ~NETIF_F_GSO;
6494         }
6495
6496         /* UFO needs SG and checksumming */
6497         if (features & NETIF_F_UFO) {
6498                 /* maybe split UFO into V4 and V6? */
6499                 if (!((features & NETIF_F_GEN_CSUM) ||
6500                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6501                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6502                         netdev_dbg(dev,
6503                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6504                         features &= ~NETIF_F_UFO;
6505                 }
6506
6507                 if (!(features & NETIF_F_SG)) {
6508                         netdev_dbg(dev,
6509                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6510                         features &= ~NETIF_F_UFO;
6511                 }
6512         }
6513
6514 #ifdef CONFIG_NET_RX_BUSY_POLL
6515         if (dev->netdev_ops->ndo_busy_poll)
6516                 features |= NETIF_F_BUSY_POLL;
6517         else
6518 #endif
6519                 features &= ~NETIF_F_BUSY_POLL;
6520
6521         return features;
6522 }
6523
6524 int __netdev_update_features(struct net_device *dev)
6525 {
6526         struct net_device *upper, *lower;
6527         netdev_features_t features;
6528         struct list_head *iter;
6529         int err = -1;
6530
6531         ASSERT_RTNL();
6532
6533         features = netdev_get_wanted_features(dev);
6534
6535         if (dev->netdev_ops->ndo_fix_features)
6536                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6537
6538         /* driver might be less strict about feature dependencies */
6539         features = netdev_fix_features(dev, features);
6540
6541         /* some features can't be enabled if they're off on an upper device */
6542         netdev_for_each_upper_dev_rcu(dev, upper, iter)
6543                 features = netdev_sync_upper_features(dev, upper, features);
6544
6545         if (dev->features == features)
6546                 goto sync_lower;
6547
6548         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6549                 &dev->features, &features);
6550
6551         if (dev->netdev_ops->ndo_set_features)
6552                 err = dev->netdev_ops->ndo_set_features(dev, features);
6553         else
6554                 err = 0;
6555
6556         if (unlikely(err < 0)) {
6557                 netdev_err(dev,
6558                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6559                         err, &features, &dev->features);
6560                 /* return non-0 since some features might have changed and
6561                  * it's better to fire a spurious notification than miss it
6562                  */
6563                 return -1;
6564         }
6565
6566 sync_lower:
6567         /* some features must be disabled on lower devices when disabled
6568          * on an upper device (think: bonding master or bridge)
6569          */
6570         netdev_for_each_lower_dev(dev, lower, iter)
6571                 netdev_sync_lower_features(dev, lower, features);
6572
6573         if (!err)
6574                 dev->features = features;
6575
6576         return err < 0 ? 0 : 1;
6577 }
6578
6579 /**
6580  *      netdev_update_features - recalculate device features
6581  *      @dev: the device to check
6582  *
6583  *      Recalculate dev->features set and send notifications if it
6584  *      has changed. Should be called after driver or hardware dependent
6585  *      conditions might have changed that influence the features.
6586  */
6587 void netdev_update_features(struct net_device *dev)
6588 {
6589         if (__netdev_update_features(dev))
6590                 netdev_features_change(dev);
6591 }
6592 EXPORT_SYMBOL(netdev_update_features);
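
/* A minimal illustrative sketch: a driver that must give up a hardware
 * offload clears the bit from hw_features and asks the core to recompute
 * dev->features.  example_disable_tso() is a hypothetical name; the call
 * must be made with the RTNL lock held, as __netdev_update_features()
 * asserts.
 */
static void example_disable_tso(struct net_device *dev)
{
	ASSERT_RTNL();

	dev->hw_features &= ~NETIF_F_ALL_TSO;
	netdev_update_features(dev);
}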
6593
6594 /**
6595  *      netdev_change_features - recalculate device features
6596  *      @dev: the device to check
6597  *
6598  *      Recalculate dev->features set and send notifications even
6599  *      if they have not changed. Should be called instead of
6600  *      netdev_update_features() if also dev->vlan_features might
6601  *      have changed to allow the changes to be propagated to stacked
6602  *      VLAN devices.
6603  */
6604 void netdev_change_features(struct net_device *dev)
6605 {
6606         __netdev_update_features(dev);
6607         netdev_features_change(dev);
6608 }
6609 EXPORT_SYMBOL(netdev_change_features);
6610
6611 /**
6612  *      netif_stacked_transfer_operstate -      transfer operstate
6613  *      @rootdev: the root or lower level device to transfer state from
6614  *      @dev: the device to transfer operstate to
6615  *
6616  *      Transfer operational state from root to device. This is normally
6617  *      called when a stacking relationship exists between the root
6618  *      device and the device (a leaf device).
6619  */
6620 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6621                                         struct net_device *dev)
6622 {
6623         if (rootdev->operstate == IF_OPER_DORMANT)
6624                 netif_dormant_on(dev);
6625         else
6626                 netif_dormant_off(dev);
6627
6628         if (netif_carrier_ok(rootdev)) {
6629                 if (!netif_carrier_ok(dev))
6630                         netif_carrier_on(dev);
6631         } else {
6632                 if (netif_carrier_ok(dev))
6633                         netif_carrier_off(dev);
6634         }
6635 }
6636 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6637
6638 #ifdef CONFIG_SYSFS
6639 static int netif_alloc_rx_queues(struct net_device *dev)
6640 {
6641         unsigned int i, count = dev->num_rx_queues;
6642         struct netdev_rx_queue *rx;
6643         size_t sz = count * sizeof(*rx);
6644
6645         BUG_ON(count < 1);
6646
6647         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6648         if (!rx) {
6649                 rx = vzalloc(sz);
6650                 if (!rx)
6651                         return -ENOMEM;
6652         }
6653         dev->_rx = rx;
6654
6655         for (i = 0; i < count; i++)
6656                 rx[i].dev = dev;
6657         return 0;
6658 }
6659 #endif
6660
6661 static void netdev_init_one_queue(struct net_device *dev,
6662                                   struct netdev_queue *queue, void *_unused)
6663 {
6664         /* Initialize queue lock */
6665         spin_lock_init(&queue->_xmit_lock);
6666         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6667         queue->xmit_lock_owner = -1;
6668         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6669         queue->dev = dev;
6670 #ifdef CONFIG_BQL
6671         dql_init(&queue->dql, HZ);
6672 #endif
6673 }
6674
6675 static void netif_free_tx_queues(struct net_device *dev)
6676 {
6677         kvfree(dev->_tx);
6678 }
6679
6680 static int netif_alloc_netdev_queues(struct net_device *dev)
6681 {
6682         unsigned int count = dev->num_tx_queues;
6683         struct netdev_queue *tx;
6684         size_t sz = count * sizeof(*tx);
6685
6686         if (count < 1 || count > 0xffff)
6687                 return -EINVAL;
6688
6689         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6690         if (!tx) {
6691                 tx = vzalloc(sz);
6692                 if (!tx)
6693                         return -ENOMEM;
6694         }
6695         dev->_tx = tx;
6696
6697         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6698         spin_lock_init(&dev->tx_global_lock);
6699
6700         return 0;
6701 }
6702
6703 void netif_tx_stop_all_queues(struct net_device *dev)
6704 {
6705         unsigned int i;
6706
6707         for (i = 0; i < dev->num_tx_queues; i++) {
6708                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6709                 netif_tx_stop_queue(txq);
6710         }
6711 }
6712 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6713
6714 /**
6715  *      register_netdevice      - register a network device
6716  *      @dev: device to register
6717  *
6718  *      Take a completed network device structure and add it to the kernel
6719  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6720  *      chain. 0 is returned on success. A negative errno code is returned
6721  *      on a failure to set up the device, or if the name is a duplicate.
6722  *
6723  *      Callers must hold the rtnl semaphore. You may want
6724  *      register_netdev() instead of this.
6725  *
6726  *      BUGS:
6727  *      The locking appears insufficient to guarantee two parallel registers
6728  *      will not get the same name.
6729  */
6730
6731 int register_netdevice(struct net_device *dev)
6732 {
6733         int ret;
6734         struct net *net = dev_net(dev);
6735
6736         BUG_ON(dev_boot_phase);
6737         ASSERT_RTNL();
6738
6739         might_sleep();
6740
6741         /* When net_device's are persistent, this will be fatal. */
6742         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6743         BUG_ON(!net);
6744
6745         spin_lock_init(&dev->addr_list_lock);
6746         netdev_set_addr_lockdep_class(dev);
6747
6748         ret = dev_get_valid_name(net, dev, dev->name);
6749         if (ret < 0)
6750                 goto out;
6751
6752         /* Init, if this function is available */
6753         if (dev->netdev_ops->ndo_init) {
6754                 ret = dev->netdev_ops->ndo_init(dev);
6755                 if (ret) {
6756                         if (ret > 0)
6757                                 ret = -EIO;
6758                         goto out;
6759                 }
6760         }
6761
6762         if (((dev->hw_features | dev->features) &
6763              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6764             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6765              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6766                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6767                 ret = -EINVAL;
6768                 goto err_uninit;
6769         }
6770
6771         ret = -EBUSY;
6772         if (!dev->ifindex)
6773                 dev->ifindex = dev_new_index(net);
6774         else if (__dev_get_by_index(net, dev->ifindex))
6775                 goto err_uninit;
6776
6777         /* Transfer changeable features to wanted_features and enable
6778          * software offloads (GSO and GRO).
6779          */
6780         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6781         dev->features |= NETIF_F_SOFT_FEATURES;
6782         dev->wanted_features = dev->features & dev->hw_features;
6783
6784         if (!(dev->flags & IFF_LOOPBACK)) {
6785                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6786         }
6787
6788         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6789          */
6790         dev->vlan_features |= NETIF_F_HIGHDMA;
6791
6792         /* Make NETIF_F_SG inheritable to tunnel devices.
6793          */
6794         dev->hw_enc_features |= NETIF_F_SG;
6795
6796         /* Make NETIF_F_SG inheritable to MPLS.
6797          */
6798         dev->mpls_features |= NETIF_F_SG;
6799
6800         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6801         ret = notifier_to_errno(ret);
6802         if (ret)
6803                 goto err_uninit;
6804
6805         ret = netdev_register_kobject(dev);
6806         if (ret)
6807                 goto err_uninit;
6808         dev->reg_state = NETREG_REGISTERED;
6809
6810         __netdev_update_features(dev);
6811
6812         /*
6813          *      Default initial state at registration is that the
6814          *      device is present.
6815          */
6816
6817         set_bit(__LINK_STATE_PRESENT, &dev->state);
6818
6819         linkwatch_init_dev(dev);
6820
6821         dev_init_scheduler(dev);
6822         dev_hold(dev);
6823         list_netdevice(dev);
6824         add_device_randomness(dev->dev_addr, dev->addr_len);
6825
6826         /* If the device has a permanent hardware address, the driver
6827          * should set dev_addr and leave addr_assign_type at
6828          * NET_ADDR_PERM (the default value).
6829          */
6830         if (dev->addr_assign_type == NET_ADDR_PERM)
6831                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6832
6833         /* Notify protocols, that a new device appeared. */
6834         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6835         ret = notifier_to_errno(ret);
6836         if (ret) {
6837                 rollback_registered(dev);
6838                 dev->reg_state = NETREG_UNREGISTERED;
6839         }
6840         /*
6841          *      Prevent userspace races by waiting until the network
6842          *      device is fully set up before sending notifications.
6843          */
6844         if (!dev->rtnl_link_ops ||
6845             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6846                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6847
6848 out:
6849         return ret;
6850
6851 err_uninit:
6852         if (dev->netdev_ops->ndo_uninit)
6853                 dev->netdev_ops->ndo_uninit(dev);
6854         goto out;
6855 }
6856 EXPORT_SYMBOL(register_netdevice);
6857
6858 /**
6859  *      init_dummy_netdev       - init a dummy network device for NAPI
6860  *      @dev: device to init
6861  *
6862  *      This takes a network device structure and initializes the minimum
6863  *      number of fields so it can be used to schedule NAPI polls without
6864  *      registering a full-blown interface. This is to be used by drivers
6865  *      that need to tie several hardware interfaces to a single NAPI
6866  *      poll scheduler due to HW limitations.
6867  */
6868 int init_dummy_netdev(struct net_device *dev)
6869 {
6870         /* Clear everything. Note we don't initialize spinlocks,
6871          * as they aren't supposed to be taken by any of the
6872          * NAPI code and this dummy netdev is supposed to be
6873          * only ever used for NAPI polls.
6874          */
6875         memset(dev, 0, sizeof(struct net_device));
6876
6877         /* make sure we BUG if trying to hit standard
6878          * register/unregister code path
6879          */
6880         dev->reg_state = NETREG_DUMMY;
6881
6882         /* NAPI wants this */
6883         INIT_LIST_HEAD(&dev->napi_list);
6884
6885         /* a dummy interface is started by default */
6886         set_bit(__LINK_STATE_PRESENT, &dev->state);
6887         set_bit(__LINK_STATE_START, &dev->state);
6888
6889         /* Note: We don't allocate pcpu_refcnt for dummy devices,
6890          * because users of this 'device' don't need to change
6891          * its refcount.
6892          */
6893
6894         return 0;
6895 }
6896 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6897
6898
6899 /**
6900  *      register_netdev - register a network device
6901  *      @dev: device to register
6902  *
6903  *      Take a completed network device structure and add it to the kernel
6904  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6905  *      chain. 0 is returned on success. A negative errno code is returned
6906  *      on a failure to set up the device, or if the name is a duplicate.
6907  *
6908  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6909  *      and expands the device name if you passed a format string to
6910  *      alloc_netdev.
6911  */
6912 int register_netdev(struct net_device *dev)
6913 {
6914         int err;
6915
6916         rtnl_lock();
6917         err = register_netdevice(dev);
6918         rtnl_unlock();
6919         return err;
6920 }
6921 EXPORT_SYMBOL(register_netdev);
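
/* A minimal illustrative sketch of the usual probe pattern around
 * register_netdev(): allocate, fill in the ops table and address,
 * register, and free on failure.  example_probe() is hypothetical and the
 * ops table is assumed to be supplied by the driver; alloc_etherdev(),
 * SET_NETDEV_DEV() and eth_hw_addr_random() come from the regular netdev
 * helpers.
 */
static int example_probe(struct device *parent,
			 const struct net_device_ops *ops)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);	/* Ethernet-style device, no priv area */
	if (!dev)
		return -ENOMEM;

	SET_NETDEV_DEV(dev, parent);
	dev->netdev_ops = ops;		/* driver-provided ops table */
	eth_hw_addr_random(dev);

	err = register_netdev(dev);
	if (err)
		free_netdev(dev);	/* registration failed: drop the netdev */
	return err;
}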
6922
6923 int netdev_refcnt_read(const struct net_device *dev)
6924 {
6925         int i, refcnt = 0;
6926
6927         for_each_possible_cpu(i)
6928                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6929         return refcnt;
6930 }
6931 EXPORT_SYMBOL(netdev_refcnt_read);
6932
6933 /**
6934  * netdev_wait_allrefs - wait until all references are gone.
6935  * @dev: target net_device
6936  *
6937  * This is called when unregistering network devices.
6938  *
6939  * Any protocol or device that holds a reference should register
6940  * for netdevice notification, and clean up and put back the
6941  * reference if they receive an UNREGISTER event.
6942  * We can get stuck here if buggy protocols don't correctly
6943  * call dev_put.
6944  */
6945 static void netdev_wait_allrefs(struct net_device *dev)
6946 {
6947         unsigned long rebroadcast_time, warning_time;
6948         int refcnt;
6949
6950         linkwatch_forget_dev(dev);
6951
6952         rebroadcast_time = warning_time = jiffies;
6953         refcnt = netdev_refcnt_read(dev);
6954
6955         while (refcnt != 0) {
6956                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6957                         rtnl_lock();
6958
6959                         /* Rebroadcast unregister notification */
6960                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6961
6962                         __rtnl_unlock();
6963                         rcu_barrier();
6964                         rtnl_lock();
6965
6966                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6967                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6968                                      &dev->state)) {
6969                                 /* We must not have linkwatch events
6970                                  * pending on unregister. If this
6971                                  * happens, we simply run the queue
6972                                  * unscheduled, resulting in a noop
6973                                  * for this device.
6974                                  */
6975                                 linkwatch_run_queue();
6976                         }
6977
6978                         __rtnl_unlock();
6979
6980                         rebroadcast_time = jiffies;
6981                 }
6982
6983                 msleep(250);
6984
6985                 refcnt = netdev_refcnt_read(dev);
6986
6987                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6988                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6989                                  dev->name, refcnt);
6990                         warning_time = jiffies;
6991                 }
6992         }
6993 }
6994
6995 /* The sequence is:
6996  *
6997  *      rtnl_lock();
6998  *      ...
6999  *      register_netdevice(x1);
7000  *      register_netdevice(x2);
7001  *      ...
7002  *      unregister_netdevice(y1);
7003  *      unregister_netdevice(y2);
7004  *      ...
7005  *      rtnl_unlock();
7006  *      free_netdev(y1);
7007  *      free_netdev(y2);
7008  *
7009  * We are invoked by rtnl_unlock().
7010  * This allows us to deal with problems:
7011  * 1) We can delete sysfs objects which invoke hotplug
7012  *    without deadlocking with linkwatch via keventd.
7013  * 2) Since we run with the RTNL semaphore not held, we can sleep
7014  *    safely in order to wait for the netdev refcnt to drop to zero.
7015  *
7016  * We must not return until all unregister events added during
7017  * the interval the lock was held have been completed.
7018  */
7019 void netdev_run_todo(void)
7020 {
7021         struct list_head list;
7022
7023         /* Snapshot list, allow later requests */
7024         list_replace_init(&net_todo_list, &list);
7025
7026         __rtnl_unlock();
7027
7028
7029         /* Wait for rcu callbacks to finish before next phase */
7030         if (!list_empty(&list))
7031                 rcu_barrier();
7032
7033         while (!list_empty(&list)) {
7034                 struct net_device *dev
7035                         = list_first_entry(&list, struct net_device, todo_list);
7036                 list_del(&dev->todo_list);
7037
7038                 rtnl_lock();
7039                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7040                 __rtnl_unlock();
7041
7042                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7043                         pr_err("network todo '%s' but state %d\n",
7044                                dev->name, dev->reg_state);
7045                         dump_stack();
7046                         continue;
7047                 }
7048
7049                 dev->reg_state = NETREG_UNREGISTERED;
7050
7051                 netdev_wait_allrefs(dev);
7052
7053                 /* paranoia */
7054                 BUG_ON(netdev_refcnt_read(dev));
7055                 BUG_ON(!list_empty(&dev->ptype_all));
7056                 BUG_ON(!list_empty(&dev->ptype_specific));
7057                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7058                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7059                 WARN_ON(dev->dn_ptr);
7060
7061                 if (dev->destructor)
7062                         dev->destructor(dev);
7063
7064                 /* Report a network device has been unregistered */
7065                 rtnl_lock();
7066                 dev_net(dev)->dev_unreg_count--;
7067                 __rtnl_unlock();
7068                 wake_up(&netdev_unregistering_wq);
7069
7070                 /* Free network device */
7071                 kobject_put(&dev->dev.kobj);
7072         }
7073 }
7074
7075 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
7076  * fields in the same order, with only the type differing.
7077  */
7078 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7079                              const struct net_device_stats *netdev_stats)
7080 {
7081 #if BITS_PER_LONG == 64
7082         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
7083         memcpy(stats64, netdev_stats, sizeof(*stats64));
7084 #else
7085         size_t i, n = sizeof(*stats64) / sizeof(u64);
7086         const unsigned long *src = (const unsigned long *)netdev_stats;
7087         u64 *dst = (u64 *)stats64;
7088
7089         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
7090                      sizeof(*stats64) / sizeof(u64));
7091         for (i = 0; i < n; i++)
7092                 dst[i] = src[i];
7093 #endif
7094 }
7095 EXPORT_SYMBOL(netdev_stats_to_stats64);
7096
7097 /**
7098  *      dev_get_stats   - get network device statistics
7099  *      @dev: device to get statistics from
7100  *      @storage: place to store stats
7101  *
7102  *      Get network statistics from device. Return @storage.
7103  *      The device driver may provide its own method by setting
7104  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7105  *      otherwise the internal statistics structure is used.
7106  */
7107 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7108                                         struct rtnl_link_stats64 *storage)
7109 {
7110         const struct net_device_ops *ops = dev->netdev_ops;
7111
7112         if (ops->ndo_get_stats64) {
7113                 memset(storage, 0, sizeof(*storage));
7114                 ops->ndo_get_stats64(dev, storage);
7115         } else if (ops->ndo_get_stats) {
7116                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7117         } else {
7118                 netdev_stats_to_stats64(storage, &dev->stats);
7119         }
7120         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7121         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7122         return storage;
7123 }
7124 EXPORT_SYMBOL(dev_get_stats);
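
/* A minimal illustrative sketch of snapshotting a device's counters via
 * dev_get_stats().  The helper name example_log_stats() is hypothetical;
 * the storage may live on the stack because dev_get_stats() fills the
 * whole structure in every branch above.
 */
static void example_log_stats(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	netdev_info(dev, "rx %llu pkts / tx %llu pkts, %llu rx dropped\n",
		    (unsigned long long)stats.rx_packets,
		    (unsigned long long)stats.tx_packets,
		    (unsigned long long)stats.rx_dropped);
}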
7125
7126 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7127 {
7128         struct netdev_queue *queue = dev_ingress_queue(dev);
7129
7130 #ifdef CONFIG_NET_CLS_ACT
7131         if (queue)
7132                 return queue;
7133         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7134         if (!queue)
7135                 return NULL;
7136         netdev_init_one_queue(dev, queue, NULL);
7137         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7138         queue->qdisc_sleeping = &noop_qdisc;
7139         rcu_assign_pointer(dev->ingress_queue, queue);
7140 #endif
7141         return queue;
7142 }
7143
7144 static const struct ethtool_ops default_ethtool_ops;
7145
7146 void netdev_set_default_ethtool_ops(struct net_device *dev,
7147                                     const struct ethtool_ops *ops)
7148 {
7149         if (dev->ethtool_ops == &default_ethtool_ops)
7150                 dev->ethtool_ops = ops;
7151 }
7152 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7153
7154 void netdev_freemem(struct net_device *dev)
7155 {
7156         char *addr = (char *)dev - dev->padded;
7157
7158         kvfree(addr);
7159 }
7160
7161 /**
7162  *      alloc_netdev_mqs - allocate network device
7163  *      @sizeof_priv:           size of private data to allocate space for
7164  *      @name:                  device name format string
7165  *      @name_assign_type:      origin of device name
7166  *      @setup:                 callback to initialize device
7167  *      @txqs:                  the number of TX subqueues to allocate
7168  *      @rxqs:                  the number of RX subqueues to allocate
7169  *
7170  *      Allocates a struct net_device with private data area for driver use
7171  *      and performs basic initialization.  Also allocates subqueue structs
7172  *      for each queue on the device.
7173  */
7174 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7175                 unsigned char name_assign_type,
7176                 void (*setup)(struct net_device *),
7177                 unsigned int txqs, unsigned int rxqs)
7178 {
7179         struct net_device *dev;
7180         size_t alloc_size;
7181         struct net_device *p;
7182
7183         BUG_ON(strlen(name) >= sizeof(dev->name));
7184
7185         if (txqs < 1) {
7186                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7187                 return NULL;
7188         }
7189
7190 #ifdef CONFIG_SYSFS
7191         if (rxqs < 1) {
7192                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7193                 return NULL;
7194         }
7195 #endif
7196
7197         alloc_size = sizeof(struct net_device);
7198         if (sizeof_priv) {
7199                 /* ensure 32-byte alignment of private area */
7200                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7201                 alloc_size += sizeof_priv;
7202         }
7203         /* ensure 32-byte alignment of whole construct */
7204         alloc_size += NETDEV_ALIGN - 1;
7205
7206         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7207         if (!p)
7208                 p = vzalloc(alloc_size);
7209         if (!p)
7210                 return NULL;
7211
7212         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7213         dev->padded = (char *)dev - (char *)p;
7214
7215         dev->pcpu_refcnt = alloc_percpu(int);
7216         if (!dev->pcpu_refcnt)
7217                 goto free_dev;
7218
7219         if (dev_addr_init(dev))
7220                 goto free_pcpu;
7221
7222         dev_mc_init(dev);
7223         dev_uc_init(dev);
7224
7225         dev_net_set(dev, &init_net);
7226
7227         dev->gso_max_size = GSO_MAX_SIZE;
7228         dev->gso_max_segs = GSO_MAX_SEGS;
7229         dev->gso_min_segs = 0;
7230
7231         INIT_LIST_HEAD(&dev->napi_list);
7232         INIT_LIST_HEAD(&dev->unreg_list);
7233         INIT_LIST_HEAD(&dev->close_list);
7234         INIT_LIST_HEAD(&dev->link_watch_list);
7235         INIT_LIST_HEAD(&dev->adj_list.upper);
7236         INIT_LIST_HEAD(&dev->adj_list.lower);
7237         INIT_LIST_HEAD(&dev->all_adj_list.upper);
7238         INIT_LIST_HEAD(&dev->all_adj_list.lower);
7239         INIT_LIST_HEAD(&dev->ptype_all);
7240         INIT_LIST_HEAD(&dev->ptype_specific);
7241         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7242         setup(dev);
7243
7244         if (!dev->tx_queue_len) {
7245                 dev->priv_flags |= IFF_NO_QUEUE;
7246                 dev->tx_queue_len = 1;
7247         }
7248
7249         dev->num_tx_queues = txqs;
7250         dev->real_num_tx_queues = txqs;
7251         if (netif_alloc_netdev_queues(dev))
7252                 goto free_all;
7253
7254 #ifdef CONFIG_SYSFS
7255         dev->num_rx_queues = rxqs;
7256         dev->real_num_rx_queues = rxqs;
7257         if (netif_alloc_rx_queues(dev))
7258                 goto free_all;
7259 #endif
7260
7261         strcpy(dev->name, name);
7262         dev->name_assign_type = name_assign_type;
7263         dev->group = INIT_NETDEV_GROUP;
7264         if (!dev->ethtool_ops)
7265                 dev->ethtool_ops = &default_ethtool_ops;
7266
7267         nf_hook_ingress_init(dev);
7268
7269         return dev;
7270
7271 free_all:
7272         free_netdev(dev);
7273         return NULL;
7274
7275 free_pcpu:
7276         free_percpu(dev->pcpu_refcnt);
7277 free_dev:
7278         netdev_freemem(dev);
7279         return NULL;
7280 }
7281 EXPORT_SYMBOL(alloc_netdev_mqs);
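
/* A minimal illustrative sketch of allocating a multiqueue device directly
 * with alloc_netdev_mqs().  example_setup(), example_mq_priv and the
 * "ex%d" name template are all hypothetical; NET_NAME_UNKNOWN and
 * ether_setup() are the regular netdev helpers.
 */
struct example_mq_priv {
	int example_state;
};

static void example_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->flags |= IFF_NOARP;
}

static struct net_device *example_alloc_mq(void)
{
	/* 4 TX queues and 4 RX queues, name expanded at register time */
	return alloc_netdev_mqs(sizeof(struct example_mq_priv), "ex%d",
				NET_NAME_UNKNOWN, example_setup, 4, 4);
}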
7282
7283 /**
7284  *      free_netdev - free network device
7285  *      @dev: device
7286  *
7287  *      This function does the last stage of destroying an allocated device
7288  *      interface. The reference to the device object is released.
7289  *      If this is the last reference then it will be freed.
7290  */
7291 void free_netdev(struct net_device *dev)
7292 {
7293         struct napi_struct *p, *n;
7294
7295         netif_free_tx_queues(dev);
7296 #ifdef CONFIG_SYSFS
7297         kvfree(dev->_rx);
7298 #endif
7299
7300         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7301
7302         /* Flush device addresses */
7303         dev_addr_flush(dev);
7304
7305         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7306                 netif_napi_del(p);
7307
7308         free_percpu(dev->pcpu_refcnt);
7309         dev->pcpu_refcnt = NULL;
7310
7311         /*  Compatibility with error handling in drivers */
7312         if (dev->reg_state == NETREG_UNINITIALIZED) {
7313                 netdev_freemem(dev);
7314                 return;
7315         }
7316
7317         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7318         dev->reg_state = NETREG_RELEASED;
7319
7320         /* will free via device release */
7321         put_device(&dev->dev);
7322 }
7323 EXPORT_SYMBOL(free_netdev);
7324
7325 /**
7326  *      synchronize_net -  Synchronize with packet receive processing
7327  *
7328  *      Wait for packets currently being received to be done.
7329  *      Does not block later packets from starting.
7330  */
7331 void synchronize_net(void)
7332 {
7333         might_sleep();
7334         if (rtnl_is_locked() && !IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
7335                 synchronize_rcu_expedited();
7336         else
7337                 synchronize_rcu();
7338 }
7339 EXPORT_SYMBOL(synchronize_net);
7340
7341 /**
7342  *      unregister_netdevice_queue - remove device from the kernel
7343  *      @dev: device
7344  *      @head: list
7345  *
7346  *      This function shuts down a device interface and removes it
7347  *      from the kernel tables.
7348  *      If head not NULL, device is queued to be unregistered later.
7349  *
7350  *      Callers must hold the rtnl semaphore.  You may want
7351  *      unregister_netdev() instead of this.
7352  */
7353
7354 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7355 {
7356         ASSERT_RTNL();
7357
7358         if (head) {
7359                 list_move_tail(&dev->unreg_list, head);
7360         } else {
7361                 rollback_registered(dev);
7362                 /* Finish processing unregister after unlock */
7363                 net_set_todo(dev);
7364         }
7365 }
7366 EXPORT_SYMBOL(unregister_netdevice_queue);
7367
7368 /**
7369  *      unregister_netdevice_many - unregister many devices
7370  *      @head: list of devices
7371  *
7372  *  Note: As most callers use a stack-allocated list_head,
7373  *  we force a list_del() to make sure the stack won't be corrupted later.
7374  */
7375 void unregister_netdevice_many(struct list_head *head)
7376 {
7377         struct net_device *dev;
7378
7379         if (!list_empty(head)) {
7380                 rollback_registered_many(head);
7381                 list_for_each_entry(dev, head, unreg_list)
7382                         net_set_todo(dev);
7383                 list_del(head);
7384         }
7385 }
7386 EXPORT_SYMBOL(unregister_netdevice_many);
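
/* A minimal illustrative sketch of batching several unregisters so the
 * synchronize_net() calls in rollback_registered_many() are paid once for
 * the whole group.  example_remove_pair() is a made-up name; the caller
 * must hold the RTNL lock.
 */
static void example_remove_pair(struct net_device *a, struct net_device *b)
{
	LIST_HEAD(kill_list);

	unregister_netdevice_queue(a, &kill_list);
	unregister_netdevice_queue(b, &kill_list);
	unregister_netdevice_many(&kill_list);
}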
7387
7388 /**
7389  *      unregister_netdev - remove device from the kernel
7390  *      @dev: device
7391  *
7392  *      This function shuts down a device interface and removes it
7393  *      from the kernel tables.
7394  *
7395  *      This is just a wrapper for unregister_netdevice that takes
7396  *      the rtnl semaphore.  In general you want to use this and not
7397  *      unregister_netdevice.
7398  */
7399 void unregister_netdev(struct net_device *dev)
7400 {
7401         rtnl_lock();
7402         unregister_netdevice(dev);
7403         rtnl_unlock();
7404 }
7405 EXPORT_SYMBOL(unregister_netdev);
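
/* A minimal illustrative sketch of the teardown counterpart of the probe
 * pattern shown after register_netdev().  example_remove() is
 * hypothetical; unregister_netdev() takes the RTNL lock itself, and
 * free_netdev() drops the final reference afterwards.
 */
static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);
	free_netdev(dev);
}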
7406
7407 /**
7408  *      dev_change_net_namespace - move a device to a different network namespace
7409  *      @dev: device
7410  *      @net: network namespace
7411  *      @pat: If not NULL name pattern to try if the current device name
7412  *            is already taken in the destination network namespace.
7413  *
7414  *      This function shuts down a device interface and moves it
7415  *      to a new network namespace. On success 0 is returned, on
7416  *      a failure a negative errno code is returned.
7417  *
7418  *      Callers must hold the rtnl semaphore.
7419  */
7420
7421 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7422 {
7423         int err;
7424
7425         ASSERT_RTNL();
7426
7427         /* Don't allow namespace local devices to be moved. */
7428         err = -EINVAL;
7429         if (dev->features & NETIF_F_NETNS_LOCAL)
7430                 goto out;
7431
7432         /* Ensure the device has been registered */
7433         if (dev->reg_state != NETREG_REGISTERED)
7434                 goto out;
7435
7436         /* Get out if there is nothing to do */
7437         err = 0;
7438         if (net_eq(dev_net(dev), net))
7439                 goto out;
7440
7441         /* Pick the destination device name, and ensure
7442          * we can use it in the destination network namespace.
7443          */
7444         err = -EEXIST;
7445         if (__dev_get_by_name(net, dev->name)) {
7446                 /* We get here if we can't use the current device name */
7447                 if (!pat)
7448                         goto out;
7449                 if (dev_get_valid_name(net, dev, pat) < 0)
7450                         goto out;
7451         }
7452
7453         /*
7454          * And now a mini version of register_netdevice() and unregister_netdevice().
7455          */
7456
7457         /* If device is running close it first. */
7458         dev_close(dev);
7459
7460         /* And unlink it from device chain */
7461         err = -ENODEV;
7462         unlist_netdevice(dev);
7463
7464         synchronize_net();
7465
7466         /* Shutdown queueing discipline. */
7467         dev_shutdown(dev);
7468
7469         /* Notify protocols that we are about to destroy
7470            this device. They should clean up all their state.
7471
7472            Note that dev->reg_state stays at NETREG_REGISTERED.
7473            This is wanted because this way 8021q and macvlan know
7474            the device is just moving and can keep their slaves up.
7475         */
7476         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7477         rcu_barrier();
7478         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7479         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7480
7481         /*
7482          *      Flush the unicast and multicast chains
7483          */
7484         dev_uc_flush(dev);
7485         dev_mc_flush(dev);
7486
7487         /* Send a netdev-removed uevent to the old namespace */
7488         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7489         netdev_adjacent_del_links(dev);
7490
7491         /* Actually switch the network namespace */
7492         dev_net_set(dev, net);
7493
7494         /* If there is an ifindex conflict assign a new one */
7495         if (__dev_get_by_index(net, dev->ifindex))
7496                 dev->ifindex = dev_new_index(net);
7497
7498         /* Send a netdev-add uevent to the new namespace */
7499         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7500         netdev_adjacent_add_links(dev);
7501
7502         /* Fixup kobjects */
7503         err = device_rename(&dev->dev, dev->name);
7504         WARN_ON(err);
7505
7506         /* Add the device back in the hashes */
7507         list_netdevice(dev);
7508
7509         /* Notify protocols, that a new device appeared. */
7510         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7511
7512         /*
7513          *      Prevent userspace races by waiting until the network
7514          *      device is fully set up before sending notifications.
7515          */
7516         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7517
7518         synchronize_net();
7519         err = 0;
7520 out:
7521         return err;
7522 }
7523 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7524
7525 static int dev_cpu_callback(struct notifier_block *nfb,
7526                             unsigned long action,
7527                             void *ocpu)
7528 {
7529         struct sk_buff **list_skb;
7530         struct sk_buff *skb;
7531         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7532         struct softnet_data *sd, *oldsd;
7533
7534         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7535                 return NOTIFY_OK;
7536
7537         local_irq_disable();
7538         cpu = smp_processor_id();
7539         sd = &per_cpu(softnet_data, cpu);
7540         oldsd = &per_cpu(softnet_data, oldcpu);
7541
7542         /* Find end of our completion_queue. */
7543         list_skb = &sd->completion_queue;
7544         while (*list_skb)
7545                 list_skb = &(*list_skb)->next;
7546         /* Append completion queue from offline CPU. */
7547         *list_skb = oldsd->completion_queue;
7548         oldsd->completion_queue = NULL;
7549
7550         /* Append output queue from offline CPU. */
7551         if (oldsd->output_queue) {
7552                 *sd->output_queue_tailp = oldsd->output_queue;
7553                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7554                 oldsd->output_queue = NULL;
7555                 oldsd->output_queue_tailp = &oldsd->output_queue;
7556         }
7557         /* Append the NAPI poll list from the offline CPU, with one exception:
7558          * process_backlog() must be called by the CPU owning the percpu backlog.
7559          * We properly handle process_queue & input_pkt_queue later.
7560          */
7561         while (!list_empty(&oldsd->poll_list)) {
7562                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7563                                                             struct napi_struct,
7564                                                             poll_list);
7565
7566                 list_del_init(&napi->poll_list);
7567                 if (napi->poll == process_backlog)
7568                         napi->state = 0;
7569                 else
7570                         ____napi_schedule(sd, napi);
7571         }
7572
7573         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7574         local_irq_enable();
7575         preempt_check_resched_rt();
7576
7577         /* Process offline CPU's input_pkt_queue */
7578         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7579                 netif_rx_ni(skb);
7580                 input_queue_head_incr(oldsd);
7581         }
7582         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
7583                 netif_rx_ni(skb);
7584                 input_queue_head_incr(oldsd);
7585         }
7586         while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
7587                 kfree_skb(skb);
7588         }
7589
7590         return NOTIFY_OK;
7591 }
7592
7593
7594 /**
7595  *      netdev_increment_features - increment feature set by one
7596  *      @all: current feature set
7597  *      @one: new feature set
7598  *      @mask: mask feature set
7599  *
7600  *      Computes a new feature set after adding a device with feature set
7601  *      @one to the master device with current feature set @all.  Will not
7602  *      enable anything that is off in @mask. Returns the new feature set.
7603  */
7604 netdev_features_t netdev_increment_features(netdev_features_t all,
7605         netdev_features_t one, netdev_features_t mask)
7606 {
7607         if (mask & NETIF_F_GEN_CSUM)
7608                 mask |= NETIF_F_ALL_CSUM;
7609         mask |= NETIF_F_VLAN_CHALLENGED;
7610
7611         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7612         all &= one | ~NETIF_F_ALL_FOR_ALL;
7613
7614         /* If one device supports hw checksumming, set for all. */
7615         if (all & NETIF_F_GEN_CSUM)
7616                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7617
7618         return all;
7619 }
7620 EXPORT_SYMBOL(netdev_increment_features);
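
/* A minimal illustrative sketch of how a stacking driver (bond/bridge
 * style) might fold one lower device's feature set into its own.
 * example_absorb_lower_features() is a made-up name, and using
 * vlan_features as the mask is only an assumption for the sake of the
 * example.
 */
static netdev_features_t
example_absorb_lower_features(struct net_device *master,
			      struct net_device *lower)
{
	return netdev_increment_features(master->features, lower->features,
					 master->vlan_features);
}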
7621
7622 static struct hlist_head * __net_init netdev_create_hash(void)
7623 {
7624         int i;
7625         struct hlist_head *hash;
7626
7627         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7628         if (hash != NULL)
7629                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7630                         INIT_HLIST_HEAD(&hash[i]);
7631
7632         return hash;
7633 }
7634
7635 /* Initialize per network namespace state */
7636 static int __net_init netdev_init(struct net *net)
7637 {
7638         if (net != &init_net)
7639                 INIT_LIST_HEAD(&net->dev_base_head);
7640
7641         net->dev_name_head = netdev_create_hash();
7642         if (net->dev_name_head == NULL)
7643                 goto err_name;
7644
7645         net->dev_index_head = netdev_create_hash();
7646         if (net->dev_index_head == NULL)
7647                 goto err_idx;
7648
7649         return 0;
7650
7651 err_idx:
7652         kfree(net->dev_name_head);
7653 err_name:
7654         return -ENOMEM;
7655 }
7656
7657 /**
7658  *      netdev_drivername - network driver for the device
7659  *      @dev: network device
7660  *
7661  *      Determine network driver for device.
7662  */
7663 const char *netdev_drivername(const struct net_device *dev)
7664 {
7665         const struct device_driver *driver;
7666         const struct device *parent;
7667         const char *empty = "";
7668
7669         parent = dev->dev.parent;
7670         if (!parent)
7671                 return empty;
7672
7673         driver = parent->driver;
7674         if (driver && driver->name)
7675                 return driver->name;
7676         return empty;
7677 }
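
/*
 * Illustrative only: netdev_drivername() is mostly useful in diagnostics.
 * A watchdog-style message might look roughly like the line below; the
 * message text is made up for illustration.
 *
 *	netdev_err(dev, "transmit queue timed out (driver: %s)\n",
 *		   netdev_drivername(dev));
 */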
7678
7679 static void __netdev_printk(const char *level, const struct net_device *dev,
7680                             struct va_format *vaf)
7681 {
7682         if (dev && dev->dev.parent) {
7683                 dev_printk_emit(level[1] - '0',
7684                                 dev->dev.parent,
7685                                 "%s %s %s%s: %pV",
7686                                 dev_driver_string(dev->dev.parent),
7687                                 dev_name(dev->dev.parent),
7688                                 netdev_name(dev), netdev_reg_state(dev),
7689                                 vaf);
7690         } else if (dev) {
7691                 printk("%s%s%s: %pV",
7692                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7693         } else {
7694                 printk("%s(NULL net_device): %pV", level, vaf);
7695         }
7696 }
7697
7698 void netdev_printk(const char *level, const struct net_device *dev,
7699                    const char *format, ...)
7700 {
7701         struct va_format vaf;
7702         va_list args;
7703
7704         va_start(args, format);
7705
7706         vaf.fmt = format;
7707         vaf.va = &args;
7708
7709         __netdev_printk(level, dev, &vaf);
7710
7711         va_end(args);
7712 }
7713 EXPORT_SYMBOL(netdev_printk);
7714
7715 #define define_netdev_printk_level(func, level)                 \
7716 void func(const struct net_device *dev, const char *fmt, ...)   \
7717 {                                                               \
7718         struct va_format vaf;                                   \
7719         va_list args;                                           \
7720                                                                 \
7721         va_start(args, fmt);                                    \
7722                                                                 \
7723         vaf.fmt = fmt;                                          \
7724         vaf.va = &args;                                         \
7725                                                                 \
7726         __netdev_printk(level, dev, &vaf);                      \
7727                                                                 \
7728         va_end(args);                                           \
7729 }                                                               \
7730 EXPORT_SYMBOL(func);
7731
7732 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7733 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7734 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7735 define_netdev_printk_level(netdev_err, KERN_ERR);
7736 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7737 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7738 define_netdev_printk_level(netdev_info, KERN_INFO);
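
/*
 * Illustrative only: the level helpers generated above are the usual way
 * drivers log per-netdevice messages.  A hypothetical call such as
 *
 *	netdev_info(dev, "link up, %u Mbps\n", speed);
 *
 * gets prefixed with the driver name, the parent bus device name and the
 * interface name (plus registration state) by __netdev_printk() when the
 * device has a parent.
 */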
7739
7740 static void __net_exit netdev_exit(struct net *net)
7741 {
7742         kfree(net->dev_name_head);
7743         kfree(net->dev_index_head);
7744 }
7745
7746 static struct pernet_operations __net_initdata netdev_net_ops = {
7747         .init = netdev_init,
7748         .exit = netdev_exit,
7749 };
7750
7751 static void __net_exit default_device_exit(struct net *net)
7752 {
7753         struct net_device *dev, *aux;
7754         /*
7755          * Push all migratable network devices back to the
7756          * initial network namespace
7757          */
7758         rtnl_lock();
7759         for_each_netdev_safe(net, dev, aux) {
7760                 int err;
7761                 char fb_name[IFNAMSIZ];
7762
7763                 /* Ignore unmoveable devices (i.e. loopback) */
7764                 if (dev->features & NETIF_F_NETNS_LOCAL)
7765                         continue;
7766
7767                 /* Leave virtual devices for the generic cleanup */
7768                 if (dev->rtnl_link_ops)
7769                         continue;
7770
7771                 /* Push remaining network devices to init_net */
7772                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7773                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7774                 if (err) {
7775                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7776                                  __func__, dev->name, err);
7777                         BUG();
7778                 }
7779         }
7780         rtnl_unlock();
7781 }
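
/*
 * Illustrative sketch only: a device opts out of namespace moves by setting
 * NETIF_F_NETNS_LOCAL in its setup routine (the loopback driver does this),
 * roughly as below; such devices are skipped by default_device_exit() above
 * and are instead torn down together with their namespace.
 */
#if 0
static void example_setup(struct net_device *dev)	/* hypothetical */
{
	dev->features |= NETIF_F_NETNS_LOCAL;	/* pin the device to its netns */
}
#endif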
7782
7783 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7784 {
7785         /* Return with the rtnl_lock held when there are no network
7786          * devices unregistering in any network namespace in net_list.
7787          */
7788         struct net *net;
7789         bool unregistering;
7790         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7791
7792         add_wait_queue(&netdev_unregistering_wq, &wait);
7793         for (;;) {
7794                 unregistering = false;
7795                 rtnl_lock();
7796                 list_for_each_entry(net, net_list, exit_list) {
7797                         if (net->dev_unreg_count > 0) {
7798                                 unregistering = true;
7799                                 break;
7800                         }
7801                 }
7802                 if (!unregistering)
7803                         break;
7804                 __rtnl_unlock();
7805
7806                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7807         }
7808         remove_wait_queue(&netdev_unregistering_wq, &wait);
7809 }
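
/*
 * The wait above is paired with netdev_run_todo(), which decrements the
 * owning namespace's dev_unreg_count and wakes netdev_unregistering_wq as
 * each pending unregistration completes.
 */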
7810
7811 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7812 {
7813         /* At exit, all network devices must be removed from a network
7814          * namespace.  Do this in the reverse order of registration.
7815          * Do this across as many network namespaces as possible to
7816          * improve batching efficiency.
7817          */
7818         struct net_device *dev;
7819         struct net *net;
7820         LIST_HEAD(dev_kill_list);
7821
7822         /* To prevent network device cleanup code from dereferencing
7823          * loopback devices or network devices that have been freed,
7824          * wait here for all pending unregistrations to complete
7825          * before unregistering the loopback device and allowing the
7826          * network namespace to be freed.
7827          *
7828          * The netdev todo list containing all network device
7829          * unregistrations that happen in default_device_exit_batch
7830          * will run in the rtnl_unlock() at the end of
7831          * default_device_exit_batch.
7832          */
7833         rtnl_lock_unregistering(net_list);
7834         list_for_each_entry(net, net_list, exit_list) {
7835                 for_each_netdev_reverse(net, dev) {
7836                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7837                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7838                         else
7839                                 unregister_netdevice_queue(dev, &dev_kill_list);
7840                 }
7841         }
7842         unregister_netdevice_many(&dev_kill_list);
7843         rtnl_unlock();
7844 }
7845
7846 static struct pernet_operations __net_initdata default_device_ops = {
7847         .exit = default_device_exit,
7848         .exit_batch = default_device_exit_batch,
7849 };
7850
7851 /*
7852  *      Initialize the DEV module. At boot time this walks the device list and
7853  *      unhooks any devices that fail to initialise (normally hardware not
7854  *      present) and leaves us with a valid list of present and active devices.
7855  *
7856  */
7857
7858 /*
7859  *       This is called from a single thread during boot, so there is
7860  *       no need to take the rtnl semaphore.
7861  */
7862 static int __init net_dev_init(void)
7863 {
7864         int i, rc = -ENOMEM;
7865
7866         BUG_ON(!dev_boot_phase);
7867
7868         if (dev_proc_init())
7869                 goto out;
7870
7871         if (netdev_kobject_init())
7872                 goto out;
7873
7874         INIT_LIST_HEAD(&ptype_all);
7875         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7876                 INIT_LIST_HEAD(&ptype_base[i]);
7877
7878         INIT_LIST_HEAD(&offload_base);
7879
7880         if (register_pernet_subsys(&netdev_net_ops))
7881                 goto out;
7882
7883         /*
7884          *      Initialise the packet receive queues.
7885          */
7886
7887         for_each_possible_cpu(i) {
7888                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7889
7890                 skb_queue_head_init_raw(&sd->input_pkt_queue);
7891                 skb_queue_head_init_raw(&sd->process_queue);
7892                 skb_queue_head_init_raw(&sd->tofree_queue);
7893                 INIT_LIST_HEAD(&sd->poll_list);
7894                 sd->output_queue_tailp = &sd->output_queue;
7895 #ifdef CONFIG_RPS
7896                 sd->csd.func = rps_trigger_softirq;
7897                 sd->csd.info = sd;
7898                 sd->cpu = i;
7899 #endif
7900
7901                 sd->backlog.poll = process_backlog;
7902                 sd->backlog.weight = weight_p;
7903         }
7904
7905         dev_boot_phase = 0;
7906
7907         /* The loopback device is special: if any other network device
7908          * is present in a network namespace, the loopback device must
7909          * be present too. Since we now dynamically allocate and free
7910          * the loopback device, ensure this invariant is maintained by
7911          * keeping the loopback device as the first device on the list
7912          * of network devices, so that it is the first device that
7913          * appears and the last network device that disappears when a
7914          * namespace is torn down.
7915          */
7916         if (register_pernet_device(&loopback_net_ops))
7917                 goto out;
7918
7919         if (register_pernet_device(&default_device_ops))
7920                 goto out;
7921
7922         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7923         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7924
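	/*
	 * Register the CPU hotplug callback: when a CPU goes offline,
	 * dev_cpu_callback() replays the packets left in its per-CPU
	 * queues on another CPU and frees anything still sitting in its
	 * tofree_queue.
	 */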
7925         hotcpu_notifier(dev_cpu_callback, 0);
7926         dst_subsys_init();
7927         rc = 0;
7928 out:
7929         return rc;
7930 }
7931
7932 subsys_initcall(net_dev_init);
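
/*
 * net_dev_init() runs at subsys_initcall time, before device- and
 * module-level initcalls, so the packet-type lists, per-CPU softnet state
 * and softirq handlers set up above are in place before network drivers
 * begin registering devices.
 */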