These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
diff --git a/kernel/net/core/dev.c b/kernel/net/core/dev.c
index f8c23de..0e17592 100644
@@ -99,6 +99,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/stat.h>
 #include <net/dst.h>
+#include <net/dst_metadata.h>
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
 #include <net/xfrm.h>
 #include <linux/if_macvlan.h>
 #include <linux/errqueue.h>
 #include <linux/hrtimer.h>
+#include <linux/netfilter_ingress.h>
 
 #include "net-sysfs.h"
 
@@ -469,10 +471,14 @@ EXPORT_SYMBOL(dev_remove_pack);
  */
 void dev_add_offload(struct packet_offload *po)
 {
-       struct list_head *head = &offload_base;
+       struct packet_offload *elem;
 
        spin_lock(&offload_lock);
-       list_add_rcu(&po->list, head);
+       list_for_each_entry(elem, &offload_base, list) {
+               if (po->priority < elem->priority)
+                       break;
+       }
+       list_add_rcu(&po->list, elem->list.prev);
        spin_unlock(&offload_lock);
 }
 EXPORT_SYMBOL(dev_add_offload);
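
With this hunk, dev_add_offload() inserts handlers in ascending priority order instead of always at the head: the walk stops at the first entry with a numerically higher priority, so lower values land earlier in the list and are matched first (if nothing ranks higher, the new entry goes to the tail). A minimal registration sketch against the 4.4-era struct packet_offload; the handler and priority value are hypothetical:

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>

/* hypothetical GSO callback: report "no segmentation needed" */
static struct sk_buff *my_gso_segment(struct sk_buff *skb,
				      netdev_features_t features)
{
	return NULL;
}

static struct packet_offload my_offload __read_mostly = {
	.type	   = cpu_to_be16(ETH_P_IP),
	.priority  = 10,	/* lower values sort, and match, earlier */
	.callbacks = {
		.gso_segment = my_gso_segment,
	},
};

static int __init my_offload_init(void)
{
	dev_add_offload(&my_offload);
	return 0;
}
module_init(my_offload_init);
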
@@ -677,6 +683,32 @@ int dev_get_iflink(const struct net_device *dev)
 }
 EXPORT_SYMBOL(dev_get_iflink);
 
+/**
+ *     dev_fill_metadata_dst - Retrieve tunnel egress information.
+ *     @dev: targeted interface
+ *     @skb: The packet.
+ *
+ *     For better visibility of tunnel traffic, OVS needs to retrieve the
+ *     egress tunnel information for a packet. This API allows the caller
+ *     to obtain that info.
+ */
+int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+       struct ip_tunnel_info *info;
+
+       if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
+               return -EINVAL;
+
+       info = skb_tunnel_info_unclone(skb);
+       if (!info)
+               return -ENOMEM;
+       if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
+               return -EINVAL;
+
+       return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
+}
+EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
+
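
dev_fill_metadata_dst() itself only validates and dispatches; the actual egress lookup lives in the new ndo_fill_metadata_dst hook that tunnel drivers (e.g. vxlan in 4.4) implement. A hedged sketch of the driver side, with hypothetical names:

static int my_tunnel_fill_metadata_dst(struct net_device *dev,
				       struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);

	if (!info)
		return -EINVAL;
	/* placeholder: a real driver fills info->key (saddr/daddr,
	 * tun_id, tp_src/tp_dst, ...) from its route lookup here
	 */
	memset(&info->key, 0, sizeof(info->key));
	return 0;
}

static const struct net_device_ops my_tunnel_netdev_ops = {
	/* ... ndo_open, ndo_stop, ndo_start_xmit ... */
	.ndo_fill_metadata_dst	= my_tunnel_fill_metadata_dst,
};
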
 /**
  *     __dev_get_by_name       - find a device by its name
  *     @net: the applicable net namespace
@@ -1632,7 +1664,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 }
 EXPORT_SYMBOL(call_netdevice_notifiers);
 
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
 static struct static_key ingress_needed __read_mostly;
 
 void net_inc_ingress_queue(void)
@@ -2347,21 +2379,52 @@ void netif_device_attach(struct net_device *dev)
 }
 EXPORT_SYMBOL(netif_device_attach);
 
+/*
+ * Returns a Tx hash based on the given packet descriptor and the number
+ * of Tx queues to be used as a distribution range.
+ */
+u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
+                 unsigned int num_tx_queues)
+{
+       u32 hash;
+       u16 qoffset = 0;
+       u16 qcount = num_tx_queues;
+
+       if (skb_rx_queue_recorded(skb)) {
+               hash = skb_get_rx_queue(skb);
+               while (unlikely(hash >= num_tx_queues))
+                       hash -= num_tx_queues;
+               return hash;
+       }
+
+       if (dev->num_tc) {
+               u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+               qoffset = dev->tc_to_txq[tc].offset;
+               qcount = dev->tc_to_txq[tc].count;
+       }
+
+       return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
+}
+EXPORT_SYMBOL(__skb_tx_hash);
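
The final mapping leans on reciprocal_scale() from linux/kernel.h, which projects a full-range 32-bit hash onto [0, qcount) with one multiply and a shift instead of a modulo:

/*
 *	reciprocal_scale(val, ep_ro) = (u32)(((u64)val * ep_ro) >> 32)
 *
 * Worked example: val = 0x9e3779b9 (2654435769), ep_ro = 8 gives
 * (2654435769ULL * 8) >> 32 = 4, so that skb would map to txq
 * 4 + qoffset.
 */
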
+
 static void skb_warn_bad_offload(const struct sk_buff *skb)
 {
        static const netdev_features_t null_features = 0;
        struct net_device *dev = skb->dev;
-       const char *driver = "";
+       const char *name = "";
 
        if (!net_ratelimit())
                return;
 
-       if (dev && dev->dev.parent)
-               driver = dev_driver_string(dev->dev.parent);
-
+       if (dev) {
+               if (dev->dev.parent)
+                       name = dev_driver_string(dev->dev.parent);
+               else
+                       name = netdev_name(dev);
+       }
        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
             "gso_type=%d ip_summed=%d\n",
-            driver, dev ? &dev->features : &null_features,
+            name, dev ? &dev->features : &null_features,
             skb->sk ? &skb->sk->sk_route_caps : &null_features,
             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
             skb_shinfo(skb)->gso_type, skb->ip_summed);
@@ -2487,6 +2550,8 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
  *
  *     It may return NULL if the skb requires no segmentation.  This is
  *     only possible when GSO is used for verifying header integrity.
+ *
+ *     Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
  */
 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
                                  netdev_features_t features, bool tx_path)
@@ -2501,6 +2566,9 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
                        return ERR_PTR(err);
        }
 
+       BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
+                    sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
+
        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
        SKB_GSO_CB(skb)->encap_level = 0;
 
@@ -2823,7 +2891,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
         * This permits __QDISC___STATE_RUNNING owner to get the lock more
         * often and dequeue packets faster.
         */
+#ifdef CONFIG_PREEMPT_RT_FULL
+       contended = true;
+#else
        contended = qdisc_is_running(q);
+#endif
        if (unlikely(contended))
                spin_lock(&q->busylock);
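
The unconditional contended = true under CONFIG_PREEMPT_RT_FULL is an -rt specific change: with softirqs running in preemptible thread context, a task that owns the qdisc can be preempted mid-transmit, so qdisc_is_running() stops being a useful contention hint. Always funnelling senders through busylock makes them queue on the sleeping lock in FIFO order instead of spinning on the root lock behind a preempted owner.
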
 
@@ -2883,16 +2955,53 @@ static void skb_update_prio(struct sk_buff *skb)
 #define skb_update_prio(skb)
 #endif
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+
+static inline int xmit_rec_read(void)
+{
+       return current->xmit_recursion;
+}
+
+static inline void xmit_rec_inc(void)
+{
+       current->xmit_recursion++;
+}
+
+static inline void xmit_rec_dec(void)
+{
+       current->xmit_recursion--;
+}
+
+#else
+
 DEFINE_PER_CPU(int, xmit_recursion);
 EXPORT_SYMBOL(xmit_recursion);
 
+static inline int xmit_rec_read(void)
+{
+       return __this_cpu_read(xmit_recursion);
+}
+
+static inline void xmit_rec_inc(void)
+{
+       __this_cpu_inc(xmit_recursion);
+}
+
+static inline void xmit_rec_dec(void)
+{
+       __this_cpu_dec(xmit_recursion);
+}
+#endif
+
 #define RECURSION_LIMIT 10
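
Upstream counts transmit recursion per CPU, which is only sound while the xmit path runs with preemption disabled; under PREEMPT_RT_FULL that region is preemptible, and a task migrated between the inc and dec would corrupt another CPU's counter. The -rt side therefore counts per task. A sketch of the companion task_struct change assumed by the helpers above (carried elsewhere in the -rt series):

struct task_struct {
	/* ... */
#ifdef CONFIG_PREEMPT_RT_FULL
	int xmit_recursion;
#endif
	/* ... */
};
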
 
 /**
  *     dev_loopback_xmit - loop back @skb
+ *     @net: network namespace this loopback is happening in
+ *     @sk:  the socket; unused here, but the netfilter okfn signature requires it
  *     @skb: buffer to transmit
  */
-int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
+int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
        skb_reset_mac_header(skb);
        __skb_pull(skb, skb_network_offset(skb));
@@ -2905,6 +3014,85 @@ int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(dev_loopback_xmit);
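
The extra net and sk parameters track the 4.4 change of the netfilter okfn prototype to int (*)(struct net *, struct sock *, struct sk_buff *); dev_loopback_xmit() is passed directly as an okfn, e.g. when IPv4 loops a multicast copy back on output:

	/* sketch of an existing call site (net/ipv4/ip_output.c) */
	NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
		net, sk, newskb, NULL, newskb->dev,
		dev_loopback_xmit);
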
 
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+       struct xps_dev_maps *dev_maps;
+       struct xps_map *map;
+       int queue_index = -1;
+
+       rcu_read_lock();
+       dev_maps = rcu_dereference(dev->xps_maps);
+       if (dev_maps) {
+               map = rcu_dereference(
+                   dev_maps->cpu_map[skb->sender_cpu - 1]);
+               if (map) {
+                       if (map->len == 1)
+                               queue_index = map->queues[0];
+                       else
+                               queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
+                                                                          map->len)];
+                       if (unlikely(queue_index >= dev->real_num_tx_queues))
+                               queue_index = -1;
+               }
+       }
+       rcu_read_unlock();
+
+       return queue_index;
+#else
+       return -1;
+#endif
+}
+
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+{
+       struct sock *sk = skb->sk;
+       int queue_index = sk_tx_queue_get(sk);
+
+       if (queue_index < 0 || skb->ooo_okay ||
+           queue_index >= dev->real_num_tx_queues) {
+               int new_index = get_xps_queue(dev, skb);
+               if (new_index < 0)
+                       new_index = skb_tx_hash(dev, skb);
+
+               if (queue_index != new_index && sk &&
+                   sk_fullsock(sk) &&
+                   rcu_access_pointer(sk->sk_dst_cache))
+                       sk_tx_queue_set(sk, new_index);
+
+               queue_index = new_index;
+       }
+
+       return queue_index;
+}
+
+struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+                                   struct sk_buff *skb,
+                                   void *accel_priv)
+{
+       int queue_index = 0;
+
+#ifdef CONFIG_XPS
+       if (skb->sender_cpu == 0)
+               skb->sender_cpu = raw_smp_processor_id() + 1;
+#endif
+
+       if (dev->real_num_tx_queues != 1) {
+               const struct net_device_ops *ops = dev->netdev_ops;
+               if (ops->ndo_select_queue)
+                       queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+                                                           __netdev_pick_tx);
+               else
+                       queue_index = __netdev_pick_tx(dev, skb);
+
+               if (!accel_priv)
+                       queue_index = netdev_cap_txqueue(dev, queue_index);
+       }
+
+       skb_set_queue_mapping(skb, queue_index);
+       return netdev_get_tx_queue(dev, queue_index);
+}
+
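
netdev_pick_tx() consults the driver's ndo_select_queue() first and hands it __netdev_pick_tx as the fallback, so a driver only needs to special-case the traffic it cares about. A hypothetical hook, sketched against the 4.4 signature:

static u16 my_select_queue(struct net_device *dev, struct sk_buff *skb,
			   void *accel_priv,
			   select_queue_fallback_t fallback)
{
	/* pin control-priority frames to queue 0, let the stack's
	 * XPS/hash logic place everything else
	 */
	if (skb->priority == TC_PRIO_CONTROL)
		return 0;
	return fallback(dev, skb);
}
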
 /**
  *     __dev_queue_xmit - transmit a buffer
  *     @skb: buffer to transmit
@@ -2958,6 +3146,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
        else
                skb_dst_force(skb);
 
+#ifdef CONFIG_NET_SWITCHDEV
+       /* Don't forward if offload device already forwarded */
+       if (skb->offload_fwd_mark &&
+           skb->offload_fwd_mark == dev->offload_fwd_mark) {
+               consume_skb(skb);
+               rc = NET_XMIT_SUCCESS;
+               goto out;
+       }
+#endif
+
        txq = netdev_pick_tx(dev, skb, accel_priv);
        q = rcu_dereference_bh(txq->qdisc);
 
@@ -2987,7 +3185,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 
                if (txq->xmit_lock_owner != cpu) {
 
-                       if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
+                       if (xmit_rec_read() > RECURSION_LIMIT)
                                goto recursion_alert;
 
                        skb = validate_xmit_skb(skb, dev);
@@ -2997,9 +3195,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
                        HARD_TX_LOCK(dev, txq, cpu);
 
                        if (!netif_xmit_stopped(txq)) {
-                               __this_cpu_inc(xmit_recursion);
+                               xmit_rec_inc();
                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
-                               __this_cpu_dec(xmit_recursion);
+                               xmit_rec_dec();
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
@@ -3030,11 +3228,11 @@ out:
        return rc;
 }
 
-int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
+int dev_queue_xmit(struct sk_buff *skb)
 {
        return __dev_queue_xmit(skb, NULL);
 }
-EXPORT_SYMBOL(dev_queue_xmit_sk);
+EXPORT_SYMBOL(dev_queue_xmit);
 
 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
 {
@@ -3549,66 +3747,55 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev,
 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 #endif
 
-#ifdef CONFIG_NET_CLS_ACT
-/* TODO: Maybe we should just force sch_ingress to be compiled in
- * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
- * a compare and 2 stores extra right now if we dont have it on
- * but have CONFIG_NET_CLS_ACT
- * NOTE: This doesn't stop any functionality; if you dont have
- * the ingress scheduler, you just can't add policies on ingress.
- *
- */
-static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
-{
-       struct net_device *dev = skb->dev;
-       u32 ttl = G_TC_RTTL(skb->tc_verd);
-       int result = TC_ACT_OK;
-       struct Qdisc *q;
-
-       if (unlikely(MAX_RED_LOOP < ttl++)) {
-               net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
-                                    skb->skb_iif, dev->ifindex);
-               return TC_ACT_SHOT;
-       }
-
-       skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
-       skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
-
-       q = rcu_dereference(rxq->qdisc);
-       if (q != &noop_qdisc) {
-               spin_lock(qdisc_lock(q));
-               if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
-                       result = qdisc_enqueue_root(skb, q);
-               spin_unlock(qdisc_lock(q));
-       }
-
-       return result;
-}
-
 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
                                         struct packet_type **pt_prev,
                                         int *ret, struct net_device *orig_dev)
 {
-       struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+#ifdef CONFIG_NET_CLS_ACT
+       struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
+       struct tcf_result cl_res;
 
-       if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
+       /* If there's at least one ingress present somewhere (so
+        * we get here via enabled static key), remaining devices
+        * that are not configured with an ingress qdisc will bail
+        * out here.
+        */
+       if (!cl)
                return skb;
-
        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }
 
-       switch (ing_filter(skb, rxq)) {
+       qdisc_skb_cb(skb)->pkt_len = skb->len;
+       skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+       qdisc_bstats_cpu_update(cl->q, skb);
+
+       switch (tc_classify(skb, cl, &cl_res, false)) {
+       case TC_ACT_OK:
+       case TC_ACT_RECLASSIFY:
+               skb->tc_index = TC_H_MIN(cl_res.classid);
+               break;
        case TC_ACT_SHOT:
+               qdisc_qstats_cpu_drop(cl->q);
        case TC_ACT_STOLEN:
+       case TC_ACT_QUEUED:
                kfree_skb(skb);
                return NULL;
+       case TC_ACT_REDIRECT:
+               /* skb_mac_header check was done by cls/act_bpf, so
+                * we can safely push the L2 header back before
+                * redirecting to another netdev
+                */
+               __skb_push(skb, skb->mac_len);
+               skb_do_redirect(skb);
+               return NULL;
+       default:
+               break;
        }
-
+#endif /* CONFIG_NET_CLS_ACT */
        return skb;
 }
-#endif
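
handle_ing() now classifies directly against the device's ingress_cl_list under the RCU read side already held by the RX path, with per-cpu stats, instead of grabbing the qdisc lock and enqueuing into the ingress qdisc; it also gains TC_ACT_REDIRECT, letting classifiers such as cls_bpf hand the skb to another device via skb_do_redirect(). Administratively nothing changes: the path is still armed with something like 'tc qdisc add dev eth0 handle ffff: ingress' plus filters, but the per-packet datapath is now lockless.
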
 
 /**
  *     netdev_rx_handler_register - register receive handler
@@ -3681,6 +3868,22 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
        }
 }
 
+static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
+                            int *ret, struct net_device *orig_dev)
+{
+#ifdef CONFIG_NETFILTER_INGRESS
+       if (nf_hook_ingress_active(skb)) {
+               if (*pt_prev) {
+                       *ret = deliver_skb(skb, *pt_prev, orig_dev);
+                       *pt_prev = NULL;
+               }
+
+               return nf_hook_ingress(skb);
+       }
+#endif /* CONFIG_NETFILTER_INGRESS */
+       return 0;
+}
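
nf_ingress() is the RX entry point for the new NFPROTO_NETDEV/NF_NETDEV_INGRESS hook family (netfilter on ingress, per device). A minimal, hypothetical registration, assuming the 4.4 nf_hookfn prototype:

#include <linux/netfilter.h>
#include <linux/netdevice.h>

static unsigned int my_ingress_hook(void *priv, struct sk_buff *skb,
				    const struct nf_hook_state *state)
{
	return NF_ACCEPT;	/* observe only */
}

static struct nf_hook_ops my_ingress_ops = {
	.hook		= my_ingress_hook,
	.pf		= NFPROTO_NETDEV,
	.hooknum	= NF_NETDEV_INGRESS,
	.priority	= 0,
};

/* before registering, bind the ops to one device, e.g.:
 *	my_ingress_ops.dev = dev_get_by_name(&init_net, "eth0");
 *	nf_register_net_hook(&init_net, &my_ingress_ops);
 */
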
+
 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 {
        struct packet_type *ptype, *pt_prev;
@@ -3738,13 +3941,17 @@ another_round:
        }
 
 skip_taps:
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
        if (static_key_false(&ingress_needed)) {
                skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
                if (!skb)
                        goto out;
-       }
 
+               if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
+                       goto out;
+       }
+#endif
+#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = 0;
 ncls:
 #endif
@@ -3897,13 +4104,13 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
  *     NET_RX_SUCCESS: no congestion
  *     NET_RX_DROP: packet was dropped
  */
-int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
+int netif_receive_skb(struct sk_buff *skb)
 {
        trace_netif_receive_skb_entry(skb);
 
        return netif_receive_skb_internal(skb);
 }
-EXPORT_SYMBOL(netif_receive_skb_sk);
+EXPORT_SYMBOL(netif_receive_skb);
 
 /* Network device is going away, flush any packets still pending
  * Called with irqs disabled.
@@ -4017,6 +4224,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 
                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
                diffs |= p->vlan_tci ^ skb->vlan_tci;
+               diffs |= skb_metadata_dst_cmp(p, skb);
                if (maclen == ETH_HLEN)
                        diffs |= compare_ether_header(skb_mac_header(p),
                                                      skb_mac_header(skb));
@@ -4214,10 +4422,12 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
                break;
 
        case GRO_MERGED_FREE:
-               if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+               if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
+                       skb_dst_drop(skb);
                        kmem_cache_free(skbuff_head_cache, skb);
-               else
+               } else {
                        __kfree_skb(skb);
+               }
                break;
 
        case GRO_HELD:
@@ -4634,6 +4844,8 @@ void napi_disable(struct napi_struct *n)
 
        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
                msleep(1);
+       while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
+               msleep(1);
 
        hrtimer_cancel(&n->timer);
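
The second wait loop closes a race with netpoll: poll_one_napi() services NAPI contexts outside the normal softirq rules under NAPI_STATE_NPSVC, so napi_disable() must also wait for that bit to clear before the context can be torn down.
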
 
@@ -4755,7 +4967,7 @@ static void net_rx_action(struct softirq_action *h)
        list_splice_tail(&repoll, &list);
        list_splice(&list, &sd->poll_list);
        if (!list_empty(&sd->poll_list))
-               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
 
        net_rps_action_and_irq_enable(sd);
 }
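
__raise_softirq_irqoff_ksoft() is an -rt helper: when net_rx_action() runs out of budget, the leftover NET_RX work is handed to the ksoftirqd thread rather than re-raised in the current context, so sustained RX load cannot monopolize the (possibly high-priority) task that happened to run the softirq.
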
@@ -4776,8 +4988,7 @@ struct netdev_adjacent {
        struct rcu_head rcu;
 };
 
-static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
-                                                struct net_device *adj_dev,
+static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
                                                 struct list_head *adj_list)
 {
        struct netdev_adjacent *adj;
@@ -4803,7 +5014,7 @@ bool netdev_has_upper_dev(struct net_device *dev,
 {
        ASSERT_RTNL();
 
-       return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
+       return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
 }
 EXPORT_SYMBOL(netdev_has_upper_dev);
 
@@ -4916,7 +5127,7 @@ EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
  * Gets the next netdev_adjacent->private from the dev's lower neighbour
  * list, starting from iter position. The caller must either hold the
  * RTNL lock or its own locking that guarantees that the neighbour lower
- * list will remain unchainged.
+ * list will remain unchanged.
  */
 void *netdev_lower_get_next_private(struct net_device *dev,
                                    struct list_head **iter)
@@ -4971,7 +5182,7 @@ EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
  * Gets the next netdev_adjacent from the dev's lower neighbour
  * list, starting from iter position. The caller must hold RTNL lock or
  * its own locking that guarantees that the neighbour lower
- * list will remain unchainged.
+ * list will remain unchanged.
  */
 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 {
@@ -5065,7 +5276,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
        struct netdev_adjacent *adj;
        int ret;
 
-       adj = __netdev_find_adj(dev, adj_dev, dev_list);
+       adj = __netdev_find_adj(adj_dev, dev_list);
 
        if (adj) {
                adj->ref_nr++;
@@ -5121,7 +5332,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
 {
        struct netdev_adjacent *adj;
 
-       adj = __netdev_find_adj(dev, adj_dev, dev_list);
+       adj = __netdev_find_adj(adj_dev, dev_list);
 
        if (!adj) {
                pr_err("tried to remove device %s from %s\n",
@@ -5232,6 +5443,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
                                   struct net_device *upper_dev, bool master,
                                   void *private)
 {
+       struct netdev_notifier_changeupper_info changeupper_info;
        struct netdev_adjacent *i, *j, *to_i, *to_j;
        int ret = 0;
 
@@ -5241,15 +5453,25 @@ static int __netdev_upper_dev_link(struct net_device *dev,
                return -EBUSY;
 
        /* To prevent loops, check if dev is not upper device to upper_dev. */
-       if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
+       if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
                return -EBUSY;
 
-       if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
+       if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
                return -EEXIST;
 
        if (master && netdev_master_upper_dev_get(dev))
                return -EBUSY;
 
+       changeupper_info.upper_dev = upper_dev;
+       changeupper_info.master = master;
+       changeupper_info.linking = true;
+
+       ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
+                                           &changeupper_info.info);
+       ret = notifier_to_errno(ret);
+       if (ret)
+               return ret;
+
        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
                                                   master);
        if (ret)
@@ -5288,7 +5510,8 @@ static int __netdev_upper_dev_link(struct net_device *dev,
                        goto rollback_lower_mesh;
        }
 
-       call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+       call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+                                     &changeupper_info.info);
        return 0;
 
 rollback_lower_mesh:
@@ -5383,9 +5606,17 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
 void netdev_upper_dev_unlink(struct net_device *dev,
                             struct net_device *upper_dev)
 {
+       struct netdev_notifier_changeupper_info changeupper_info;
        struct netdev_adjacent *i, *j;
        ASSERT_RTNL();
 
+       changeupper_info.upper_dev = upper_dev;
+       changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
+       changeupper_info.linking = false;
+
+       call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
+                                     &changeupper_info.info);
+
        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 
        /* Here is the tricky part. We must remove all dev's lower
@@ -5405,7 +5636,8 @@ void netdev_upper_dev_unlink(struct net_device *dev,
        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
                __netdev_adjacent_dev_unlink(dev, i->dev);
 
-       call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+       call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+                                     &changeupper_info.info);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
@@ -5511,7 +5743,7 @@ void *netdev_lower_dev_get_private(struct net_device *dev,
 
        if (!lower_dev)
                return NULL;
-       lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
+       lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
        if (!lower)
                return NULL;
 
@@ -6005,6 +6237,26 @@ int dev_get_phys_port_name(struct net_device *dev,
 }
 EXPORT_SYMBOL(dev_get_phys_port_name);
 
+/**
+ *     dev_change_proto_down - update protocol port state information
+ *     @dev: device
+ *     @proto_down: new value
+ *
+ *     This info can be used by switch drivers to set the phys state of the
+ *     port.
+ */
+int dev_change_proto_down(struct net_device *dev, bool proto_down)
+{
+       const struct net_device_ops *ops = dev->netdev_ops;
+
+       if (!ops->ndo_change_proto_down)
+               return -EOPNOTSUPP;
+       if (!netif_device_present(dev))
+               return -ENODEV;
+       return ops->ndo_change_proto_down(dev, proto_down);
+}
+EXPORT_SYMBOL(dev_change_proto_down);
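
dev_change_proto_down() only validates and forwards; the driver is expected to act on the physical port and record the new state. A hypothetical implementation sketch:

static int my_change_proto_down(struct net_device *dev, bool proto_down)
{
	/* e.g. force the port link down while proto_down is set;
	 * my_port_set_enable() is a stand-in for driver internals
	 */
	my_port_set_enable(netdev_priv(dev), !proto_down);
	dev->proto_down = proto_down;
	return 0;
}
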
+
 /**
  *     dev_new_index   -       allocate an ifindex
  *     @net: the applicable net namespace
@@ -6129,6 +6381,48 @@ static void rollback_registered(struct net_device *dev)
        list_del(&single);
 }
 
+static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
+       struct net_device *upper, netdev_features_t features)
+{
+       netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+       netdev_features_t feature;
+       int feature_bit;
+
+       for_each_netdev_feature(&upper_disables, feature_bit) {
+               feature = __NETIF_F_BIT(feature_bit);
+               if (!(upper->wanted_features & feature)
+                   && (features & feature)) {
+                       netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
+                                  &feature, upper->name);
+                       features &= ~feature;
+               }
+       }
+
+       return features;
+}
+
+static void netdev_sync_lower_features(struct net_device *upper,
+       struct net_device *lower, netdev_features_t features)
+{
+       netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+       netdev_features_t feature;
+       int feature_bit;
+
+       for_each_netdev_feature(&upper_disables, feature_bit) {
+               feature = __NETIF_F_BIT(feature_bit);
+               if (!(features & feature) && (lower->features & feature)) {
+                       netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
+                                  &feature, lower->name);
+                       lower->wanted_features &= ~feature;
+                       netdev_update_features(lower);
+
+                       if (unlikely(lower->features & feature))
+                               netdev_WARN(upper, "failed to disable %pNF on %s!\n",
+                                           &feature, lower->name);
+               }
+       }
+}
+
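
Both sync helpers iterate NETIF_F_UPPER_DISABLES, the set of features that must not remain enabled on a lower device once an upper device has turned them off; in 4.4 that set is just LRO:

/* include/linux/netdev_features.h (4.4) */
#define NETIF_F_UPPER_DISABLES	NETIF_F_LRO

So disabling LRO on a bond or bridge master propagates down to every slave, where leaving it on can mangle forwarded traffic.
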
 static netdev_features_t netdev_fix_features(struct net_device *dev,
        netdev_features_t features)
 {
@@ -6198,8 +6492,10 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 
 int __netdev_update_features(struct net_device *dev)
 {
+       struct net_device *upper, *lower;
        netdev_features_t features;
-       int err = 0;
+       struct list_head *iter;
+       int err = -1;
 
        ASSERT_RTNL();
 
@@ -6211,26 +6507,42 @@ int __netdev_update_features(struct net_device *dev)
        /* driver might be less strict about feature dependencies */
        features = netdev_fix_features(dev, features);
 
+       /* some features can't be enabled if they're off on an upper device */
+       netdev_for_each_upper_dev_rcu(dev, upper, iter)
+               features = netdev_sync_upper_features(dev, upper, features);
+
        if (dev->features == features)
-               return 0;
+               goto sync_lower;
 
        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
                &dev->features, &features);
 
        if (dev->netdev_ops->ndo_set_features)
                err = dev->netdev_ops->ndo_set_features(dev, features);
+       else
+               err = 0;
 
        if (unlikely(err < 0)) {
                netdev_err(dev,
                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
                        err, &features, &dev->features);
+               /* return non-0 since some features might have changed and
+                * it's better to fire a spurious notification than miss it
+                */
                return -1;
        }
 
+sync_lower:
+       /* some features must be disabled on lower devices when disabled
+        * on an upper device (think: bonding master or bridge)
+        */
+       netdev_for_each_lower_dev(dev, lower, iter)
+               netdev_sync_lower_features(dev, lower, features);
+
        if (!err)
                dev->features = features;
 
-       return 1;
+       return err < 0 ? 0 : 1;
 }
 
 /**
@@ -6357,6 +6669,17 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
        return 0;
 }
 
+void netif_tx_stop_all_queues(struct net_device *dev)
+{
+       unsigned int i;
+
+       for (i = 0; i < dev->num_tx_queues; i++) {
+               struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+               netif_tx_stop_queue(txq);
+       }
+}
+EXPORT_SYMBOL(netif_tx_stop_all_queues);
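
netif_tx_stop_all_queues() simply stops every TX queue on the device; drivers typically call it while quiescing, e.g. in a hypothetical ndo_stop:

static int my_ndo_stop(struct net_device *dev)
{
	netif_tx_stop_all_queues(dev);	/* no further xmits on any queue */
	/* ... stop DMA, free rings ... */
	return 0;
}
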
+
 /**
  *     register_netdevice      - register a network device
  *     @dev: device to register
@@ -6887,6 +7210,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
        setup(dev);
 
+       if (!dev->tx_queue_len) {
+               dev->priv_flags |= IFF_NO_QUEUE;
+               dev->tx_queue_len = 1;
+       }
+
        dev->num_tx_queues = txqs;
        dev->real_num_tx_queues = txqs;
        if (netif_alloc_netdev_queues(dev))
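
The tx_queue_len check keeps compatibility for virtual drivers that zero tx_queue_len in their setup() callback to mean "no queue": the core now translates that into IFF_NO_QUEUE plus a nominal length of 1, so attaching a queueing discipline later still works. A driver opts in roughly like:

static void my_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->tx_queue_len = 0;	/* core converts this to IFF_NO_QUEUE */
}
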
@@ -6904,6 +7232,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
        dev->group = INIT_NETDEV_GROUP;
        if (!dev->ethtool_ops)
                dev->ethtool_ops = &default_ethtool_ops;
+
+       nf_hook_ingress_init(dev);
+
        return dev;
 
 free_all:
@@ -6969,7 +7300,7 @@ EXPORT_SYMBOL(free_netdev);
 void synchronize_net(void)
 {
        might_sleep();
-       if (rtnl_is_locked())
+       if (rtnl_is_locked() && !IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
                synchronize_rcu_expedited();
        else
                synchronize_rcu();
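
Skipping synchronize_rcu_expedited() on PREEMPT_RT_FULL is another -rt latency fix: the expedited grace period IPIs every CPU to push things along, exactly the kind of system-wide disturbance the RT kernel tries to avoid, so RT always falls back to the quieter synchronize_rcu().
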
@@ -7561,7 +7892,7 @@ static int __init net_dev_init(void)
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
 
        hotcpu_notifier(dev_cpu_callback, 0);
-       dst_init();
+       dst_subsys_init();
        rc = 0;
 out:
        return rc;