These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
diff --git a/kernel/net/core/dev.c b/kernel/net/core/dev.c
index f8c23de..0e17592 100644
@@ -99,6 +99,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/stat.h>
 #include <net/dst.h>
+#include <net/dst_metadata.h>
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
 #include <net/xfrm.h>
 #include <linux/if_macvlan.h>
 #include <linux/errqueue.h>
 #include <linux/hrtimer.h>
+#include <linux/netfilter_ingress.h>
 
 #include "net-sysfs.h"
 
@@ -469,10 +471,14 @@ EXPORT_SYMBOL(dev_remove_pack);
  */
 void dev_add_offload(struct packet_offload *po)
 {
-       struct list_head *head = &offload_base;
+       struct packet_offload *elem;
 
        spin_lock(&offload_lock);
-       list_add_rcu(&po->list, head);
+       list_for_each_entry(elem, &offload_base, list) {
+               if (po->priority < elem->priority)
+                       break;
+       }
+       list_add_rcu(&po->list, elem->list.prev);
        spin_unlock(&offload_lock);
 }
 EXPORT_SYMBOL(dev_add_offload);
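
With this hunk, dev_add_offload() inserts handlers in ascending priority order instead of always at the head: the walk stops at the first entry with a numerically higher priority, so lower values land earlier in the list and are matched first (if nothing ranks higher, the new entry goes to the tail). A minimal registration sketch against the 4.4-era struct packet_offload; the handler and priority value are hypothetical:

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>

/* hypothetical GSO callback: report "no segmentation needed" */
static struct sk_buff *my_gso_segment(struct sk_buff *skb,
				      netdev_features_t features)
{
	return NULL;
}

static struct packet_offload my_offload __read_mostly = {
	.type	   = cpu_to_be16(ETH_P_IP),
	.priority  = 10,	/* lower values sort, and match, earlier */
	.callbacks = {
		.gso_segment = my_gso_segment,
	},
};

static int __init my_offload_init(void)
{
	dev_add_offload(&my_offload);
	return 0;
}
module_init(my_offload_init);
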
@@ -677,6 +683,32 @@ int dev_get_iflink(const struct net_device *dev)
 }
 EXPORT_SYMBOL(dev_get_iflink);
 
+/**
+ *     dev_fill_metadata_dst - Retrieve tunnel egress information.
+ *     @dev: targeted interface
+ *     @skb: The packet.
+ *
+ *     For better visibility of tunnel traffic, OVS needs to retrieve the
+ *     egress tunnel information for a packet. This API allows the caller
+ *     to obtain that info.
+ */
+int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+       struct ip_tunnel_info *info;
+
+       if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
+               return -EINVAL;
+
+       info = skb_tunnel_info_unclone(skb);
+       if (!info)
+               return -ENOMEM;
+       if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
+               return -EINVAL;
+
+       return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
+}
+EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
+
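
dev_fill_metadata_dst() itself only validates and dispatches; the actual egress lookup lives in the new ndo_fill_metadata_dst hook that tunnel drivers (e.g. vxlan in 4.4) implement. A hedged sketch of the driver side, with hypothetical names:

static int my_tunnel_fill_metadata_dst(struct net_device *dev,
				       struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);

	if (!info)
		return -EINVAL;
	/* placeholder: a real driver fills info->key (saddr/daddr,
	 * tun_id, tp_src/tp_dst, ...) from its route lookup here
	 */
	memset(&info->key, 0, sizeof(info->key));
	return 0;
}

static const struct net_device_ops my_tunnel_netdev_ops = {
	/* ... ndo_open, ndo_stop, ndo_start_xmit ... */
	.ndo_fill_metadata_dst	= my_tunnel_fill_metadata_dst,
};
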
 /**
  *     __dev_get_by_name       - find a device by its name
  *     @net: the applicable net namespace
@@ -1632,7 +1664,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 }
 EXPORT_SYMBOL(call_netdevice_notifiers);
 
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
 static struct static_key ingress_needed __read_mostly;
 
 void net_inc_ingress_queue(void)
@@ -2347,21 +2379,52 @@ void netif_device_attach(struct net_device *dev)
 }
 EXPORT_SYMBOL(netif_device_attach);
 
+/*
+ * Returns a Tx hash based on the given packet descriptor and the number
+ * of Tx queues to be used as a distribution range.
+ */
+u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
+                 unsigned int num_tx_queues)
+{
+       u32 hash;
+       u16 qoffset = 0;
+       u16 qcount = num_tx_queues;
+
+       if (skb_rx_queue_recorded(skb)) {
+               hash = skb_get_rx_queue(skb);
+               while (unlikely(hash >= num_tx_queues))
+                       hash -= num_tx_queues;
+               return hash;
+       }
+
+       if (dev->num_tc) {
+               u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+               qoffset = dev->tc_to_txq[tc].offset;
+               qcount = dev->tc_to_txq[tc].count;
+       }
+
+       return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
+}
+EXPORT_SYMBOL(__skb_tx_hash);
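
The final mapping leans on reciprocal_scale() from linux/kernel.h, which projects a full-range 32-bit hash onto [0, qcount) with one multiply and a shift instead of a modulo:

/*
 *	reciprocal_scale(val, ep_ro) = (u32)(((u64)val * ep_ro) >> 32)
 *
 * Worked example: val = 0x9e3779b9 (2654435769), ep_ro = 8 gives
 * (2654435769ULL * 8) >> 32 = 4, so that skb would map to txq
 * 4 + qoffset.
 */
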
+
 static void skb_warn_bad_offload(const struct sk_buff *skb)
 {
        static const netdev_features_t null_features = 0;
        struct net_device *dev = skb->dev;
-       const char *driver = "";
+       const char *name = "";
 
        if (!net_ratelimit())
                return;
 
-       if (dev && dev->dev.parent)
-               driver = dev_driver_string(dev->dev.parent);
-
+       if (dev) {
+               if (dev->dev.parent)
+                       name = dev_driver_string(dev->dev.parent);
+               else
+                       name = netdev_name(dev);
+       }
        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
             "gso_type=%d ip_summed=%d\n",
-            driver, dev ? &dev->features : &null_features,
+            name, dev ? &dev->features : &null_features,
             skb->sk ? &skb->sk->sk_route_caps : &null_features,
             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
             skb_shinfo(skb)->gso_type, skb->ip_summed);
@@ -2487,6 +2550,8 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
  *
  *     It may return NULL if the skb requires no segmentation.  This is
  *     only possible when GSO is used for verifying header integrity.
+ *
+ *     Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
  */
 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
                                  netdev_features_t features, bool tx_path)
@@ -2501,6 +2566,9 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
                        return ERR_PTR(err);
        }
 
+       BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
+                    sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
+
        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
        SKB_GSO_CB(skb)->encap_level = 0;
 
@@ -2823,7 +2891,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
         * This permits __QDISC___STATE_RUNNING owner to get the lock more
         * often and dequeue packets faster.
         */
+#ifdef CONFIG_PREEMPT_RT_FULL
+       contended = true;
+#else
        contended = qdisc_is_running(q);
+#endif
        if (unlikely(contended))
                spin_lock(&q->busylock);
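
The unconditional contended = true under CONFIG_PREEMPT_RT_FULL is an -rt specific change: with softirqs running in preemptible thread context, a task that owns the qdisc can be preempted mid-transmit, so qdisc_is_running() stops being a useful contention hint. Always funnelling senders through busylock makes them queue on the sleeping lock in FIFO order instead of spinning on the root lock behind a preempted owner.
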
 
@@ -2883,16 +2955,53 @@ static void skb_update_prio(struct sk_buff *skb)
 #define skb_update_prio(skb)
 #endif
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+
+static inline int xmit_rec_read(void)
+{
+       return current->xmit_recursion;
+}
+
+static inline void xmit_rec_inc(void)
+{
+       current->xmit_recursion++;
+}
+
+static inline void xmit_rec_dec(void)
+{
+       current->xmit_recursion--;
+}
+
+#else
+
 DEFINE_PER_CPU(int, xmit_recursion);
 EXPORT_SYMBOL(xmit_recursion);
 
+static inline int xmit_rec_read(void)
+{
+       return __this_cpu_read(xmit_recursion);
+}
+
+static inline void xmit_rec_inc(void)
+{
+       __this_cpu_inc(xmit_recursion);
+}
+
+static inline void xmit_rec_dec(void)
+{
+       __this_cpu_dec(xmit_recursion);
+}
+#endif
+
 #define RECURSION_LIMIT 10
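
Upstream counts transmit recursion per CPU, which is only sound while the xmit path runs with preemption disabled; under PREEMPT_RT_FULL that region is preemptible, and a task migrated between the inc and dec would corrupt another CPU's counter. The -rt side therefore counts per task. A sketch of the companion task_struct change assumed by the helpers above (carried elsewhere in the -rt series):

struct task_struct {
	/* ... */
#ifdef CONFIG_PREEMPT_RT_FULL
	int xmit_recursion;
#endif
	/* ... */
};
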
 
 /**
  *     dev_loopback_xmit - loop back @skb
+ *     @net: network namespace this loopback is happening in
+ *     @sk:  the socket; unused here, but the netfilter okfn signature requires it
  *     @skb: buffer to transmit
  */
-int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
+int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
        skb_reset_mac_header(skb);
        __skb_pull(skb, skb_network_offset(skb));
@@ -2905,6 +3014,85 @@ int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(dev_loopback_xmit);
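
The extra net and sk parameters track the 4.4 change of the netfilter okfn prototype to int (*)(struct net *, struct sock *, struct sk_buff *); dev_loopback_xmit() is passed directly as an okfn, e.g. when IPv4 loops a multicast copy back on output:

	/* sketch of an existing call site (net/ipv4/ip_output.c) */
	NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
		net, sk, newskb, NULL, newskb->dev,
		dev_loopback_xmit);
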
 
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+       struct xps_dev_maps *dev_maps;
+       struct xps_map *map;
+       int queue_index = -1;
+
+       rcu_read_lock();
+       dev_maps = rcu_dereference(dev->xps_maps);
+       if (dev_maps) {
+               map = rcu_dereference(
+                   dev_maps->cpu_map[skb->sender_cpu - 1]);
+               if (map) {
+                       if (map->len == 1)
+                               queue_index = map->queues[0];
+                       else
+                               queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
+                                                                          map->len)];
+                       if (unlikely(queue_index >= dev->real_num_tx_queues))
+                               queue_index = -1;
+               }
+       }
+       rcu_read_unlock();
+
+       return queue_index;
+#else
+       return -1;
+#endif
+}
+
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+{
+       struct sock *sk = skb->sk;
+       int queue_index = sk_tx_queue_get(sk);
+
+       if (queue_index < 0 || skb->ooo_okay ||
+           queue_index >= dev->real_num_tx_queues) {
+               int new_index = get_xps_queue(dev, skb);
+               if (new_index < 0)
+                       new_index = skb_tx_hash(dev, skb);
+
+               if (queue_index != new_index && sk &&
+                   sk_fullsock(sk) &&
+                   rcu_access_pointer(sk->sk_dst_cache))
+                       sk_tx_queue_set(sk, new_index);
+
+               queue_index = new_index;
+       }
+
+       return queue_index;
+}
+
+struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+                                   struct sk_buff *skb,
+                                   void *accel_priv)
+{
+       int queue_index = 0;
+
+#ifdef CONFIG_XPS
+       if (skb->sender_cpu == 0)
+               skb->sender_cpu = raw_smp_processor_id() + 1;
+#endif
+
+       if (dev->real_num_tx_queues != 1) {
+               const struct net_device_ops *ops = dev->netdev_ops;
+               if (ops->ndo_select_queue)
+                       queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+                                                           __netdev_pick_tx);
+               else
+                       queue_index = __netdev_pick_tx(dev, skb);
+
+               if (!accel_priv)
+                       queue_index = netdev_cap_txqueue(dev, queue_index);
+       }
+
+       skb_set_queue_mapping(skb, queue_index);
+       return netdev_get_tx_queue(dev, queue_index);
+}
+
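
netdev_pick_tx() consults the driver's ndo_select_queue() first and hands it __netdev_pick_tx as the fallback, so a driver only needs to special-case the traffic it cares about. A hypothetical hook, sketched against the 4.4 signature:

static u16 my_select_queue(struct net_device *dev, struct sk_buff *skb,
			   void *accel_priv,
			   select_queue_fallback_t fallback)
{
	/* pin control-priority frames to queue 0, let the stack's
	 * XPS/hash logic place everything else
	 */
	if (skb->priority == TC_PRIO_CONTROL)
		return 0;
	return fallback(dev, skb);
}
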
 /**
  *     __dev_queue_xmit - transmit a buffer
  *     @skb: buffer to transmit
@@ -2958,6 +3146,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
        else
                skb_dst_force(skb);
 
+#ifdef CONFIG_NET_SWITCHDEV
+       /* Don't forward if offload device already forwarded */
+       if (skb->offload_fwd_mark &&
+           skb->offload_fwd_mark == dev->offload_fwd_mark) {
+               consume_skb(skb);
+               rc = NET_XMIT_SUCCESS;
+               goto out;
+       }
+#endif
+
        txq = netdev_pick_tx(dev, skb, accel_priv);
        q = rcu_dereference_bh(txq->qdisc);
 
@@ -2987,7 +3185,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 
                if (txq->xmit_lock_owner != cpu) {
 
-                       if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
+                       if (xmit_rec_read() > RECURSION_LIMIT)
                                goto recursion_alert;
 
                        skb = validate_xmit_skb(skb, dev);
@@ -2997,9 +3195,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
                        HARD_TX_LOCK(dev, txq, cpu);
 
                        if (!netif_xmit_stopped(txq)) {
-                               __this_cpu_inc(xmit_recursion);
+                               xmit_rec_inc();
                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
-                               __this_cpu_dec(xmit_recursion);
+                               xmit_rec_dec();
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
@@ -3030,11 +3228,11 @@ out:
        return rc;
 }
 
-int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
+int dev_queue_xmit(struct sk_buff *skb)
 {
        return __dev_queue_xmit(skb, NULL);
 }
-EXPORT_SYMBOL(dev_queue_xmit_sk);
+EXPORT_SYMBOL(dev_queue_xmit);
 
 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
 {
@@ -3549,66 +3747,55 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev,
 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 #endif
 
-#ifdef CONFIG_NET_CLS_ACT
-/* TODO: Maybe we should just force sch_ingress to be compiled in
- * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
- * a compare and 2 stores extra right now if we dont have it on
- * but have CONFIG_NET_CLS_ACT
- * NOTE: This doesn't stop any functionality; if you dont have
- * the ingress scheduler, you just can't add policies on ingress.
- *
- */
-static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
-{
-       struct net_device *dev = skb->dev;
-       u32 ttl = G_TC_RTTL(skb->tc_verd);
-       int result = TC_ACT_OK;
-       struct Qdisc *q;
-
-       if (unlikely(MAX_RED_LOOP < ttl++)) {
-               net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
-                                    skb->skb_iif, dev->ifindex);
-               return TC_ACT_SHOT;
-       }
-
-       skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
-       skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
-
-       q = rcu_dereference(rxq->qdisc);
-       if (q != &noop_qdisc) {
-               spin_lock(qdisc_lock(q));
-               if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
-                       result = qdisc_enqueue_root(skb, q);
-               spin_unlock(qdisc_lock(q));
-       }
-
-       return result;
-}
-
 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
                                         struct packet_type **pt_prev,
                                         int *ret, struct net_device *orig_dev)
 {
-       struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+#ifdef CONFIG_NET_CLS_ACT
+       struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
+       struct tcf_result cl_res;
 
-       if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
+       /* If there's at least one ingress present somewhere (so
+        * we get here via enabled static key), remaining devices
+        * that are not configured with an ingress qdisc will bail
+        * out here.
+        */
+       if (!cl)
                return skb;
-
        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }
 
-       switch (ing_filter(skb, rxq)) {
+       qdisc_skb_cb(skb)->pkt_len = skb->len;
+       skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+       qdisc_bstats_cpu_update(cl->q, skb);
+
+       switch (tc_classify(skb, cl, &cl_res, false)) {
+       case TC_ACT_OK:
+       case TC_ACT_RECLASSIFY:
+               skb->tc_index = TC_H_MIN(cl_res.classid);
+               break;
        case TC_ACT_SHOT:
+               qdisc_qstats_cpu_drop(cl->q);
        case TC_ACT_STOLEN:
+       case TC_ACT_QUEUED:
                kfree_skb(skb);
                return NULL;
+       case TC_ACT_REDIRECT:
+               /* skb_mac_header check was done by cls/act_bpf, so
+                * we can safely push the L2 header back before
+                * redirecting to another netdev
+                */
+               __skb_push(skb, skb->mac_len);
+               skb_do_redirect(skb);
+               return NULL;
+       default:
+               break;
        }
-
+#endif /* CONFIG_NET_CLS_ACT */
        return skb;
 }
-#endif
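
handle_ing() now classifies directly against the device's ingress_cl_list under the RCU read side already held by the RX path, with per-cpu stats, instead of grabbing the qdisc lock and enqueuing into the ingress qdisc; it also gains TC_ACT_REDIRECT, letting classifiers such as cls_bpf hand the skb to another device via skb_do_redirect(). Administratively nothing changes: the path is still armed with something like 'tc qdisc add dev eth0 handle ffff: ingress' plus filters, but the per-packet datapath is now lockless.
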
 
 /**
  *     netdev_rx_handler_register - register receive handler
@@ -3681,6 +3868,22 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
        }
 }
 
+static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
+                            int *ret, struct net_device *orig_dev)
+{
+#ifdef CONFIG_NETFILTER_INGRESS
+       if (nf_hook_ingress_active(skb)) {
+               if (*pt_prev) {
+                       *ret = deliver_skb(skb, *pt_prev, orig_dev);
+                       *pt_prev = NULL;
+               }
+
+               return nf_hook_ingress(skb);
+       }
+#endif /* CONFIG_NETFILTER_INGRESS */
+       return 0;
+}
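
nf_ingress() is the RX entry point for the new NFPROTO_NETDEV/NF_NETDEV_INGRESS hook family (netfilter on ingress, per device). A minimal, hypothetical registration, assuming the 4.4 nf_hookfn prototype:

#include <linux/netfilter.h>
#include <linux/netdevice.h>

static unsigned int my_ingress_hook(void *priv, struct sk_buff *skb,
				    const struct nf_hook_state *state)
{
	return NF_ACCEPT;	/* observe only */
}

static struct nf_hook_ops my_ingress_ops = {
	.hook		= my_ingress_hook,
	.pf		= NFPROTO_NETDEV,
	.hooknum	= NF_NETDEV_INGRESS,
	.priority	= 0,
};

/* before registering, bind the ops to one device, e.g.:
 *	my_ingress_ops.dev = dev_get_by_name(&init_net, "eth0");
 *	nf_register_net_hook(&init_net, &my_ingress_ops);
 */
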
+
 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 {
        struct packet_type *ptype, *pt_prev;
@@ -3738,13 +3941,17 @@ another_round:
        }
 
 skip_taps:
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
        if (static_key_false(&ingress_needed)) {
                skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
                if (!skb)
                        goto out;
-       }
 
+               if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
+                       goto out;
+       }
+#endif
+#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = 0;
 ncls:
 #endif
@@ -3897,13 +4104,13 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
  *     NET_RX_SUCCESS: no congestion
  *     NET_RX_DROP: packet was dropped
  */
-int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
+int netif_receive_skb(struct sk_buff *skb)
 {
        trace_netif_receive_skb_entry(skb);
 
        return netif_receive_skb_internal(skb);
 }
-EXPORT_SYMBOL(netif_receive_skb_sk);
+EXPORT_SYMBOL(netif_receive_skb);
 
 /* Network device is going away, flush any packets still pending
  * Called with irqs disabled.
@@ -4017,6 +4224,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 
                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
                diffs |= p->vlan_tci ^ skb->vlan_tci;
+               diffs |= skb_metadata_dst_cmp(p, skb);
                if (maclen == ETH_HLEN)
                        diffs |= compare_ether_header(skb_mac_header(p),
                                                      skb_mac_header(skb));
@@ -4214,10 +4422,12 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
                break;
 
        case GRO_MERGED_FREE:
-               if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+               if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
+                       skb_dst_drop(skb);
                        kmem_cache_free(skbuff_head_cache, skb);
-               else
+               } else {
                        __kfree_skb(skb);
+               }
                break;
 
        case GRO_HELD:
@@ -4634,6 +4844,8 @@ void napi_disable(struct napi_struct *n)
 
        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
                msleep(1);
+       while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
+               msleep(1);
 
        hrtimer_cancel(&n->timer);
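
The second wait loop closes a race with netpoll: poll_one_napi() services NAPI contexts outside the normal softirq rules under NAPI_STATE_NPSVC, so napi_disable() must also wait for that bit to clear before the context can be torn down.
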
 
@@ -4755,7 +4967,7 @@ static void net_rx_action(struct softirq_action *h)
        list_splice_tail(&repoll, &list);
        list_splice(&list, &sd->poll_list);
        if (!list_empty(&sd->poll_list))
-               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
 
        net_rps_action_and_irq_enable(sd);
 }
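
__raise_softirq_irqoff_ksoft() is an -rt helper: when net_rx_action() runs out of budget, the leftover NET_RX work is handed to the ksoftirqd thread rather than re-raised in the current context, so sustained RX load cannot monopolize the (possibly high-priority) task that happened to run the softirq.
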
@@ -4776,8 +4988,7 @@ struct netdev_adjacent {
        struct rcu_head rcu;
 };
 
-static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
-                                                struct net_device *adj_dev,
+static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
                                                 struct list_head *adj_list)
 {
        struct netdev_adjacent *adj;
@@ -4803,7 +5014,7 @@ bool netdev_has_upper_dev(struct net_device *dev,
 {
        ASSERT_RTNL();
 
-       return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
+       return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
 }
 EXPORT_SYMBOL(netdev_has_upper_dev);
 
@@ -4916,7 +5127,7 @@ EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
  * Gets the next netdev_adjacent->private from the dev's lower neighbour
  * list, starting from iter position. The caller must either hold the
  * RTNL lock or its own locking that guarantees that the neighbour lower
- * list will remain unchainged.
+ * list will remain unchanged.
  */
 void *netdev_lower_get_next_private(struct net_device *dev,
                                    struct list_head **iter)
@@ -4971,7 +5182,7 @@ EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
  * Gets the next netdev_adjacent from the dev's lower neighbour
  * list, starting from iter position. The caller must hold RTNL lock or
  * its own locking that guarantees that the neighbour lower
- * list will remain unchainged.
+ * list will remain unchanged.
  */
 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 {
@@ -5065,7 +5276,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
        struct netdev_adjacent *adj;
        int ret;
 
-       adj = __netdev_find_adj(dev, adj_dev, dev_list);
+       adj = __netdev_find_adj(adj_dev, dev_list);
 
        if (adj) {
                adj->ref_nr++;
@@ -5121,7 +5332,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
 {
        struct netdev_adjacent *adj;
 
-       adj = __netdev_find_adj(dev, adj_dev, dev_list);
+       adj = __netdev_find_adj(adj_dev, dev_list);
 
        if (!adj) {
                pr_err("tried to remove device %s from %s\n",
@@ -5232,6 +5443,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
                                   struct net_device *upper_dev, bool master,
                                   void *private)
 {
+       struct netdev_notifier_changeupper_info changeupper_info;
        struct netdev_adjacent *i, *j, *to_i, *to_j;
        int ret = 0;
 
@@ -5241,15 +5453,25 @@ static int __netdev_upper_dev_link(struct net_device *dev,
                return -EBUSY;
 
        /* To prevent loops, check if dev is not upper device to upper_dev. */
-       if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
+       if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
                return -EBUSY;
 
-       if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
+       if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
                return -EEXIST;
 
        if (master && netdev_master_upper_dev_get(dev))
                return -EBUSY;
 
+       changeupper_info.upper_dev = upper_dev;
+       changeupper_info.master = master;
+       changeupper_info.linking = true;
+
+       ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
+                                           &changeupper_info.info);
+       ret = notifier_to_errno(ret);
+       if (ret)
+               return ret;
+
        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
                                                   master);
        if (ret)
@@ -5288,7 +5510,8 @@ static int __netdev_upper_dev_link(struct net_device *dev,
                        goto rollback_lower_mesh;
        }
 
-       call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+       call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+                                     &changeupper_info.info);
        return 0;
 
 rollback_lower_mesh:
@@ -5383,9 +5606,17 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
 void netdev_upper_dev_unlink(struct net_device *dev,
                             struct net_device *upper_dev)
 {
+       struct netdev_notifier_changeupper_info changeupper_info;
        struct netdev_adjacent *i, *j;
        ASSERT_RTNL();
 
+       changeupper_info.upper_dev = upper_dev;
+       changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
+       changeupper_info.linking = false;
+
+       call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
+                                     &changeupper_info.info);
+
        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 
        /* Here is the tricky part. We must remove all dev's lower
@@ -5405,7 +5636,8 @@ void netdev_upper_dev_unlink(struct net_device *dev,
        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
                __netdev_adjacent_dev_unlink(dev, i->dev);
 
-       call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+       call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+                                     &changeupper_info.info);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
@@ -5511,7 +5743,7 @@ void *netdev_lower_dev_get_private(struct net_device *dev,
 
        if (!lower_dev)
                return NULL;
-       lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
+       lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
        if (!lower)
                return NULL;
 
@@ -6005,6 +6237,26 @@ int dev_get_phys_port_name(struct net_device *dev,
 }
 EXPORT_SYMBOL(dev_get_phys_port_name);
 
+/**
+ *     dev_change_proto_down - update protocol port state information
+ *     @dev: device
+ *     @proto_down: new value
+ *
+ *     This info can be used by switch drivers to set the phys state of the
+ *     port.
+ */
+int dev_change_proto_down(struct net_device *dev, bool proto_down)
+{
+       const struct net_device_ops *ops = dev->netdev_ops;
+
+       if (!ops->ndo_change_proto_down)
+               return -EOPNOTSUPP;
+       if (!netif_device_present(dev))
+               return -ENODEV;
+       return ops->ndo_change_proto_down(dev, proto_down);
+}
+EXPORT_SYMBOL(dev_change_proto_down);
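
dev_change_proto_down() only validates and forwards; the driver is expected to act on the physical port and record the new state. A hypothetical implementation sketch:

static int my_change_proto_down(struct net_device *dev, bool proto_down)
{
	/* e.g. force the port link down while proto_down is set;
	 * my_port_set_enable() is a stand-in for driver internals
	 */
	my_port_set_enable(netdev_priv(dev), !proto_down);
	dev->proto_down = proto_down;
	return 0;
}
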
+
 /**
  *     dev_new_index   -       allocate an ifindex
  *     @net: the applicable net namespace
@@ -6129,6 +6381,48 @@ static void rollback_registered(struct net_device *dev)
        list_del(&single);
 }
 
+static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
+       struct net_device *upper, netdev_features_t features)
+{
+       netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+       netdev_features_t feature;
+       int feature_bit;
+
+       for_each_netdev_feature(&upper_disables, feature_bit) {
+               feature = __NETIF_F_BIT(feature_bit);
+               if (!(upper->wanted_features & feature)
+                   && (features & feature)) {
+                       netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
+                                  &feature, upper->name);
+                       features &= ~feature;
+               }
+       }
+
+       return features;
+}
+
+static void netdev_sync_lower_features(struct net_device *upper,
+       struct net_device *lower, netdev_features_t features)
+{
+       netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
+       netdev_features_t feature;
+       int feature_bit;
+
+       for_each_netdev_feature(&upper_disables, feature_bit) {
+               feature = __NETIF_F_BIT(feature_bit);
+               if (!(features & feature) && (lower->features & feature)) {
+                       netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
+                                  &feature, lower->name);
+                       lower->wanted_features &= ~feature;
+                       netdev_update_features(lower);
+
+                       if (unlikely(lower->features & feature))
+                               netdev_WARN(upper, "failed to disable %pNF on %s!\n",
+                                           &feature, lower->name);
+               }
+       }
+}
+
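
Both sync helpers iterate NETIF_F_UPPER_DISABLES, the set of features that must not remain enabled on a lower device once an upper device has turned them off; in 4.4 that set is just LRO:

/* include/linux/netdev_features.h (4.4) */
#define NETIF_F_UPPER_DISABLES	NETIF_F_LRO

So disabling LRO on a bond or bridge master propagates down to every slave, where leaving it on can mangle forwarded traffic.
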
 static netdev_features_t netdev_fix_features(struct net_device *dev,
        netdev_features_t features)
 {
@@ -6198,8 +6492,10 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 
 int __netdev_update_features(struct net_device *dev)
 {
+       struct net_device *upper, *lower;
        netdev_features_t features;
-       int err = 0;
+       struct list_head *iter;
+       int err = -1;
 
        ASSERT_RTNL();
 
@@ -6211,26 +6507,42 @@ int __netdev_update_features(struct net_device *dev)
        /* driver might be less strict about feature dependencies */
        features = netdev_fix_features(dev, features);
 
+       /* some features can't be enabled if they're off on an upper device */
+       netdev_for_each_upper_dev_rcu(dev, upper, iter)
+               features = netdev_sync_upper_features(dev, upper, features);
+
        if (dev->features == features)
-               return 0;
+               goto sync_lower;
 
        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
                &dev->features, &features);
 
        if (dev->netdev_ops->ndo_set_features)
                err = dev->netdev_ops->ndo_set_features(dev, features);
+       else
+               err = 0;
 
        if (unlikely(err < 0)) {
                netdev_err(dev,
                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
                        err, &features, &dev->features);
+               /* return non-0 since some features might have changed and
+                * it's better to fire a spurious notification than miss it
+                */
                return -1;
        }
 
+sync_lower:
+       /* some features must be disabled on lower devices when disabled
+        * on an upper device (think: bonding master or bridge)
+        */
+       netdev_for_each_lower_dev(dev, lower, iter)
+               netdev_sync_lower_features(dev, lower, features);
+
        if (!err)
                dev->features = features;
 
-       return 1;
+       return err < 0 ? 0 : 1;
 }
 
 /**
@@ -6357,6 +6669,17 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
        return 0;
 }
 
+void netif_tx_stop_all_queues(struct net_device *dev)
+{
+       unsigned int i;
+
+       for (i = 0; i < dev->num_tx_queues; i++) {
+               struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+               netif_tx_stop_queue(txq);
+       }
+}
+EXPORT_SYMBOL(netif_tx_stop_all_queues);
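
netif_tx_stop_all_queues() simply stops every TX queue on the device; drivers typically call it while quiescing, e.g. in a hypothetical ndo_stop:

static int my_ndo_stop(struct net_device *dev)
{
	netif_tx_stop_all_queues(dev);	/* no further xmits on any queue */
	/* ... stop DMA, free rings ... */
	return 0;
}
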
+
 /**
  *     register_netdevice      - register a network device
  *     @dev: device to register
@@ -6887,6 +7210,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
        setup(dev);
 
+       if (!dev->tx_queue_len) {
+               dev->priv_flags |= IFF_NO_QUEUE;
+               dev->tx_queue_len = 1;
+       }
+
        dev->num_tx_queues = txqs;
        dev->real_num_tx_queues = txqs;
        if (netif_alloc_netdev_queues(dev))
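
The tx_queue_len check keeps compatibility for virtual drivers that zero tx_queue_len in their setup() callback to mean "no queue": the core now translates that into IFF_NO_QUEUE plus a nominal length of 1, so attaching a queueing discipline later still works. A driver opts in roughly like:

static void my_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->tx_queue_len = 0;	/* core converts this to IFF_NO_QUEUE */
}
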
@@ -6904,6 +7232,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
        dev->group = INIT_NETDEV_GROUP;
        if (!dev->ethtool_ops)
                dev->ethtool_ops = &default_ethtool_ops;
+
+       nf_hook_ingress_init(dev);
+
        return dev;
 
 free_all:
@@ -6969,7 +7300,7 @@ EXPORT_SYMBOL(free_netdev);
 void synchronize_net(void)
 {
        might_sleep();
-       if (rtnl_is_locked())
+       if (rtnl_is_locked() && !IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
                synchronize_rcu_expedited();
        else
                synchronize_rcu();
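
Skipping synchronize_rcu_expedited() on PREEMPT_RT_FULL is another -rt latency fix: the expedited grace period IPIs every CPU to push things along, exactly the kind of system-wide disturbance the RT kernel tries to avoid, so RT always falls back to the quieter synchronize_rcu().
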
@@ -7561,7 +7892,7 @@ static int __init net_dev_init(void)
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
 
        hotcpu_notifier(dev_cpu_callback, 0);
-       dst_init();
+       dst_subsys_init();
        rc = 0;
 out:
        return rc;