These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / net / ipv4 / route.c
index f45f2a1..02c6229 100644 (file)
@@ -91,6 +91,7 @@
 #include <linux/slab.h>
 #include <linux/jhash.h>
 #include <net/dst.h>
+#include <net/dst_metadata.h>
 #include <net/net_namespace.h>
 #include <net/protocol.h>
 #include <net/ip.h>
 #include <net/tcp.h>
 #include <net/icmp.h>
 #include <net/xfrm.h>
+#include <net/lwtunnel.h>
 #include <net/netevent.h>
 #include <net/rtnetlink.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/kmemleak.h>
 #endif
 #include <net/secure_seq.h>
+#include <net/ip_tunnels.h>
+#include <net/l3mdev.h>
 
 #define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
@@ -125,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly  = 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly                = 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly      = 256;
 
+static int ip_rt_gc_timeout __read_mostly      = RT_GC_TIMEOUT;
 /*
  *     Interface to generic destination cache.
  */
@@ -457,12 +462,9 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 }
 
 #define IP_IDENTS_SZ 2048u
-struct ip_ident_bucket {
-       atomic_t        id;
-       u32             stamp32;
-};
 
-static struct ip_ident_bucket *ip_idents __read_mostly;
+static atomic_t *ip_idents __read_mostly;
+static u32 *ip_tstamps __read_mostly;
 
 /* In order to protect privacy, we add a perturbation to identifiers
  * if one generator is seldom used. This makes hard for an attacker
@@ -470,15 +472,16 @@ static struct ip_ident_bucket *ip_idents __read_mostly;
  */
 u32 ip_idents_reserve(u32 hash, int segs)
 {
-       struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
-       u32 old = ACCESS_ONCE(bucket->stamp32);
+       u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; /* slot's last-use stamp */
+       atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; /* slot's ID counter, same index */
+       u32 old = ACCESS_ONCE(*p_tstamp);
         u32 now = (u32)jiffies;
         u32 delta = 0;
 
-       if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
+       if (old != now && cmpxchg(p_tstamp, old, now) == old) /* only the stamp updater perturbs */
                 delta = prandom_u32_max(now - old);
 
-       return atomic_add_return(segs + delta, &bucket->id) - segs;
+       return atomic_add_return(segs + delta, p_id) - segs; /* i.e. previous counter + delta */
 }
 EXPORT_SYMBOL(ip_idents_reserve);
 
@@ -749,11 +752,11 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
-                       if (fib_lookup(net, fl4, &res) == 0) {
+                       if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);
 
                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
-                                                     0, 0);
+                                               0, jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -836,6 +839,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
+       int vif;
 
        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
@@ -844,10 +848,11 @@ void ip_rt_send_redirect(struct sk_buff *skb)
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
+       vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();
 
        net = dev_net(rt->dst.dev);
-       peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+       peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
@@ -936,7 +941,8 @@ static int ip_error(struct sk_buff *skb)
                break;
        }
 
-       peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+       peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
+                              l3mdev_master_ifindex(skb->dev), 1);
 
        send = true;
        if (peer) {
@@ -977,7 +983,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
                return;
 
        rcu_read_lock();
-       if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
+       if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);
 
                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
@@ -1147,7 +1153,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
                dst_set_expires(&rt->dst, 0);
 }
 
-static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
+static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
@@ -1188,7 +1194,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
                fl4.flowi4_mark = skb->mark;
 
                rcu_read_lock();
-               if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
+               if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
@@ -1405,6 +1411,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 #ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
 #endif
+               rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
@@ -1432,12 +1439,34 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 }
 
 static struct rtable *rt_dst_alloc(struct net_device *dev,
+                                  unsigned int flags, u16 type,
                                    bool nopolicy, bool noxfrm, bool will_cache)
 {
-       return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
-                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
-                        (nopolicy ? DST_NOPOLICY : 0) |
-                        (noxfrm ? DST_NOXFRM : 0));
+       struct rtable *rt;
+       /* Routes that will not be cached get DST_HOST | DST_NOCACHE. */
+       rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+                      (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
+                      (nopolicy ? DST_NOPOLICY : 0) |
+                      (noxfrm ? DST_NOXFRM : 0));
+       /* On success, fill in defaults; callers override fields afterwards. */
+       if (rt) {
+               rt->rt_genid = rt_genid_ipv4(dev_net(dev));
+               rt->rt_flags = flags;
+               rt->rt_type = type;
+               rt->rt_is_input = 0;
+               rt->rt_iif = 0;
+               rt->rt_pmtu = 0;
+               rt->rt_gateway = 0;
+               rt->rt_uses_gateway = 0;
+               rt->rt_table_id = 0;
+               INIT_LIST_HEAD(&rt->rt_uncached);
+               /* output defaults to ip_output; input is set only for RTCF_LOCAL. */
+               rt->dst.output = ip_output;
+               if (flags & RTCF_LOCAL)
+                       rt->dst.input = ip_local_deliver;
+       }
+
+       return rt;
 }
 
 /* called in rcu_read_lock() section */
@@ -1446,6 +1475,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 {
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
+       unsigned int flags = RTCF_MULTICAST;
        u32 itag = 0;
        int err;
 
@@ -1458,9 +1488,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;
 
-       if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
-               if (ipv4_is_loopback(saddr))
-                       goto e_inval;
+       if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
+               goto e_inval;
 
        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
@@ -1471,7 +1500,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                if (err < 0)
                        goto e_err;
        }
-       rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
+       if (our)
+               flags |= RTCF_LOCAL;
+
+       rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;
@@ -1480,20 +1512,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        rth->dst.tclassid = itag;
 #endif
        rth->dst.output = ip_rt_bug;
-
-       rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
-       rth->rt_flags   = RTCF_MULTICAST;
-       rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input= 1;
-       rth->rt_iif     = 0;
-       rth->rt_pmtu    = 0;
-       rth->rt_gateway = 0;
-       rth->rt_uses_gateway = 0;
-       INIT_LIST_HEAD(&rth->rt_uncached);
-       if (our) {
-               rth->dst.input= ip_local_deliver;
-               rth->rt_flags |= RTCF_LOCAL;
-       }
 
 #ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
@@ -1538,6 +1557,36 @@ static void ip_handle_martian_source(struct net_device *dev,
 #endif
 }
 
+static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+       struct fnhe_hash_bucket *hash;
+       struct fib_nh_exception *fnhe, __rcu **fnhe_p;
+       u32 hval = fnhe_hashfun(daddr);
+       /* Remove the exception entry for daddr from nh's table, if present. */
+       spin_lock_bh(&fnhe_lock);
+       /* Select daddr's bucket; pointer reads below rely on fnhe_lock held. */
+       hash = rcu_dereference_protected(nh->nh_exceptions,
+                                        lockdep_is_held(&fnhe_lock));
+       hash += hval;
+       /* Walk the chain, keeping fnhe_p on the link that points at fnhe. */
+       fnhe_p = &hash->chain;
+       fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
+       while (fnhe) {
+               if (fnhe->fnhe_daddr == daddr) { /* match: unlink, flush, free */
+                       rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
+                               fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+                       fnhe_flush_routes(fnhe);
+                       kfree_rcu(fnhe, rcu);
+                       break;
+               }
+               fnhe_p = &fnhe->fnhe_next;
+               fnhe = rcu_dereference_protected(fnhe->fnhe_next,
+                                                lockdep_is_held(&fnhe_lock));
+       }
+
+       spin_unlock_bh(&fnhe_lock);
+}
+
 /* called in rcu_read_lock() section */
 static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
@@ -1548,7 +1597,6 @@ static int __mkroute_input(struct sk_buff *skb,
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
-       unsigned int flags = 0;
        bool do_cache;
        u32 itag = 0;
 
@@ -1592,18 +1640,27 @@ static int __mkroute_input(struct sk_buff *skb,
 
        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
        if (do_cache) {
-               if (fnhe)
+               if (fnhe) {
                        rth = rcu_dereference(fnhe->fnhe_rth_input);
-               else
-                       rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+                       if (rth && rth->dst.expires &&
+                           time_after(jiffies, rth->dst.expires)) {
+                               ip_del_fnhe(&FIB_RES_NH(*res), daddr);
+                               fnhe = NULL;
+                       } else {
+                               goto rt_cache;
+                       }
+               }
+
+               rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
 
+rt_cache:
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
                }
        }
 
-       rth = rt_dst_alloc(out_dev->dev,
+       rth = rt_dst_alloc(out_dev->dev, 0, res->type,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
        if (!rth) {
@@ -1611,21 +1668,22 @@ static int __mkroute_input(struct sk_buff *skb,
                goto cleanup;
        }
 
-       rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
-       rth->rt_flags = flags;
-       rth->rt_type = res->type;
        rth->rt_is_input = 1;
-       rth->rt_iif     = 0;
-       rth->rt_pmtu    = 0;
-       rth->rt_gateway = 0;
-       rth->rt_uses_gateway = 0;
-       INIT_LIST_HEAD(&rth->rt_uncached);
+       if (res->table)
+               rth->rt_table_id = res->table->tb_id;
        RT_CACHE_STAT_INC(in_slow_tot);
 
        rth->dst.input = ip_forward;
-       rth->dst.output = ip_output;
 
        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+       if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+               rth->dst.lwtstate->orig_output = rth->dst.output;
+               rth->dst.output = lwtunnel_output;
+       }
+       if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+               rth->dst.lwtstate->orig_input = rth->dst.input;
+               rth->dst.input = lwtunnel_input;
+       }
        skb_dst_set(skb, &rth->dst);
 out:
        err = 0;
@@ -1633,6 +1691,48 @@ out:
        return err;
 }
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* To make ICMP packets follow the right flow, the multipath hash is
+ * calculated from the inner IP addresses in reverse order.
+ */
+static int ip_multipath_icmp_hash(struct sk_buff *skb)
+{
+       const struct iphdr *outer_iph = ip_hdr(skb);
+       struct icmphdr _icmph;
+       const struct icmphdr *icmph;
+       struct iphdr _inner_iph;
+       const struct iphdr *inner_iph;
+       /* Nonzero fragment offset: use the outer addresses instead. */
+       if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
+               goto standard_hash;
+       /* The ICMP header follows the outer IP header (ihl * 4 bytes in). */
+       icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
+                                  &_icmph);
+       if (!icmph)
+               goto standard_hash;
+       /* Only these four ICMP types are hashed on the inner header. */
+       if (icmph->type != ICMP_DEST_UNREACH &&
+           icmph->type != ICMP_REDIRECT &&
+           icmph->type != ICMP_TIME_EXCEEDED &&
+           icmph->type != ICMP_PARAMETERPROB) {
+               goto standard_hash;
+       }
+       /* The inner IP header is read from right after the ICMP header. */
+       inner_iph = skb_header_pointer(skb,
+                                      outer_iph->ihl * 4 + sizeof(_icmph),
+                                      sizeof(_inner_iph), &_inner_iph);
+       if (!inner_iph)
+               goto standard_hash;
+       /* Inner addresses are passed swapped relative to the outer call below. */
+       return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
+
+standard_hash:
+       return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
+}
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
 static int ip_mkroute_input(struct sk_buff *skb,
                            struct fib_result *res,
                            const struct flowi4 *fl4,
@@ -1640,8 +1740,15 @@ static int ip_mkroute_input(struct sk_buff *skb,
                            __be32 daddr, __be32 saddr, u32 tos)
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-       if (res->fi && res->fi->fib_nhs > 1)
-               fib_select_multipath(res);
+       if (res->fi && res->fi->fib_nhs > 1) {
+               int h;
+
+               if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
+                       h = ip_multipath_icmp_hash(skb);
+               else
+                       h = fib_multipath_hash(saddr, daddr);
+               fib_select_multipath(res, h);
+       }
 #endif
 
        /* create a routing cache entry */
@@ -1664,6 +1771,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 {
        struct fib_result res;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
+       struct ip_tunnel_info *tun_info;
        struct flowi4   fl4;
        unsigned int    flags = 0;
        u32             itag = 0;
@@ -1681,10 +1789,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
           by fib_lookup.
         */
 
+       tun_info = skb_tunnel_info(skb);
+       if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
+               fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
+       else
+               fl4.flowi4_tun_key.tun_id = 0;
+       skb_dst_drop(skb);
+
        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
                goto martian_source;
 
        res.fi = NULL;
+       res.table = NULL;
        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
                goto brd_input;
 
@@ -1712,13 +1828,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
         *      Now we are ready to route packet.
         */
        fl4.flowi4_oif = 0;
-       fl4.flowi4_iif = dev->ifindex;
+       fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
        fl4.flowi4_mark = skb->mark;
        fl4.flowi4_tos = tos;
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+       fl4.flowi4_flags = 0;
        fl4.daddr = daddr;
        fl4.saddr = saddr;
-       err = fib_lookup(net, &fl4, &res);
+       err = fib_lookup(net, &fl4, &res, 0);
        if (err != 0) {
                if (!IN_DEV_FORWARD(in_dev))
                        err = -EHOSTUNREACH;
@@ -1732,7 +1849,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                err = fib_validate_source(skb, saddr, daddr, tos,
                                          0, dev, in_dev, &itag);
                if (err < 0)
-                       goto martian_source_keep_err;
+                       goto martian_source;
                goto local_input;
        }
 
@@ -1754,7 +1871,7 @@ brd_input:
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
-                       goto martian_source_keep_err;
+                       goto martian_source;
        }
        flags |= RTCF_BROADCAST;
        res.type = RTN_BROADCAST;
@@ -1774,26 +1891,19 @@ local_input:
                }
        }
 
-       rth = rt_dst_alloc(net->loopback_dev,
+       rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
        if (!rth)
                goto e_nobufs;
 
-       rth->dst.input= ip_local_deliver;
        rth->dst.output= ip_rt_bug;
 #ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
 #endif
-
-       rth->rt_genid = rt_genid_ipv4(net);
-       rth->rt_flags   = flags|RTCF_LOCAL;
-       rth->rt_type    = res.type;
        rth->rt_is_input = 1;
-       rth->rt_iif     = 0;
-       rth->rt_pmtu    = 0;
-       rth->rt_gateway = 0;
-       rth->rt_uses_gateway = 0;
-       INIT_LIST_HEAD(&rth->rt_uncached);
+       if (res.table)
+               rth->rt_table_id = res.table->tb_id;
+
        RT_CACHE_STAT_INC(in_slow_tot);
        if (res.type == RTN_UNREACHABLE) {
                rth->dst.input= ip_error;
@@ -1814,6 +1924,7 @@ no_route:
        RT_CACHE_STAT_INC(in_no_route);
        res.type = RTN_UNREACHABLE;
        res.fi = NULL;
+       res.table = NULL;
        goto local_input;
 
        /*
@@ -1836,8 +1947,6 @@ e_nobufs:
        goto out;
 
 martian_source:
-       err = -EINVAL;
-martian_source_keep_err:
        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
        goto out;
 }
@@ -1945,19 +2054,29 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
                struct fib_nh *nh = &FIB_RES_NH(*res);
 
                fnhe = find_exception(nh, fl4->daddr);
-               if (fnhe)
+               if (fnhe) {
                        prth = &fnhe->fnhe_rth_output;
-               else {
-                       if (unlikely(fl4->flowi4_flags &
-                                    FLOWI_FLAG_KNOWN_NH &&
-                                    !(nh->nh_gw &&
-                                      nh->nh_scope == RT_SCOPE_LINK))) {
-                               do_cache = false;
-                               goto add;
+                       rth = rcu_dereference(*prth);
+                       if (rth && rth->dst.expires &&
+                           time_after(jiffies, rth->dst.expires)) {
+                               ip_del_fnhe(nh, fl4->daddr);
+                               fnhe = NULL;
+                       } else {
+                               goto rt_cache;
                        }
-                       prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
                }
+
+               if (unlikely(fl4->flowi4_flags &
+                            FLOWI_FLAG_KNOWN_NH &&
+                            !(nh->nh_gw &&
+                              nh->nh_scope == RT_SCOPE_LINK))) {
+                       do_cache = false;
+                       goto add;
+               }
+               prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
                rth = rcu_dereference(*prth);
+
+rt_cache:
                if (rt_cache_valid(rth)) {
                        dst_hold(&rth->dst);
                        return rth;
@@ -1965,29 +2084,19 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
        }
 
 add:
-       rth = rt_dst_alloc(dev_out,
+       rth = rt_dst_alloc(dev_out, flags, type,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(in_dev, NOXFRM),
                           do_cache);
        if (!rth)
                return ERR_PTR(-ENOBUFS);
 
-       rth->dst.output = ip_output;
-
-       rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
-       rth->rt_flags   = flags;
-       rth->rt_type    = type;
-       rth->rt_is_input = 0;
        rth->rt_iif     = orig_oif ? : 0;
-       rth->rt_pmtu    = 0;
-       rth->rt_gateway = 0;
-       rth->rt_uses_gateway = 0;
-       INIT_LIST_HEAD(&rth->rt_uncached);
+       if (res->table)
+               rth->rt_table_id = res->table->tb_id;
 
        RT_CACHE_STAT_INC(out_slow_tot);
 
-       if (flags & RTCF_LOCAL)
-               rth->dst.input = ip_local_deliver;
        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                if (flags & RTCF_LOCAL &&
                    !(dev_out->flags & IFF_LOOPBACK)) {
@@ -2006,6 +2115,8 @@ add:
        }
 
        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
+       if (lwtunnel_output_redirect(rth->dst.lwtstate))
+               rth->dst.output = lwtunnel_output;
 
        return rth;
 }
@@ -2014,7 +2125,8 @@ add:
  * Major route resolver routine.
  */
 
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
+                                         int mp_hash)
 {
        struct net_device *dev_out = NULL;
        __u8 tos = RT_FL_TOS(fl4);
@@ -2022,6 +2134,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
        struct fib_result res;
        struct rtable *rth;
        int orig_oif;
+       int err = -ENETUNREACH;
 
        res.tclassid    = 0;
        res.fi          = NULL;
@@ -2097,7 +2210,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
                        goto out;
                }
                if (ipv4_is_local_multicast(fl4->daddr) ||
-                   ipv4_is_lbcast(fl4->daddr)) {
+                   ipv4_is_lbcast(fl4->daddr) ||
+                   fl4->flowi4_proto == IPPROTO_IGMP) {
                        if (!fl4->saddr)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
@@ -2111,6 +2225,10 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_HOST);
                }
+
+               rth = l3mdev_get_rtable(dev_out, fl4);
+               if (rth)
+                       goto out;
        }
 
        if (!fl4->daddr) {
@@ -2124,10 +2242,12 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
                goto make_route;
        }
 
-       if (fib_lookup(net, fl4, &res)) {
+       err = fib_lookup(net, fl4, &res, 0);
+       if (err) {
                res.fi = NULL;
                res.table = NULL;
-               if (fl4->flowi4_oif) {
+               if (fl4->flowi4_oif &&
+                   !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
                        /* Apparently, routing tables are wrong. Assume,
                           that the destination is on link.
 
@@ -2152,7 +2272,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
                        res.type = RTN_UNICAST;
                        goto make_route;
                }
-               rth = ERR_PTR(-ENETUNREACH);
+               rth = ERR_PTR(err);
                goto out;
        }
 
@@ -2169,18 +2289,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
                goto make_route;
        }
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-       if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
-               fib_select_multipath(&res);
-       else
-#endif
-       if (!res.prefixlen &&
-           res.table->tb_num_default > 1 &&
-           res.type == RTN_UNICAST && !fl4->flowi4_oif)
-               fib_select_default(&res);
-
-       if (!fl4->saddr)
-               fl4->saddr = FIB_RES_PREFSRC(net, res);
+       fib_select_path(net, &res, fl4, mp_hash);
 
        dev_out = FIB_RES_DEV(res);
        fl4->flowi4_oif = dev_out->ifindex;
@@ -2193,7 +2302,7 @@ out:
        rcu_read_unlock();
        return rth;
 }
-EXPORT_SYMBOL_GPL(__ip_route_output_key);
+EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
 
 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
 {
@@ -2245,7 +2354,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 
                new->__use = 1;
                new->input = dst_discard;
-               new->output = dst_discard_sk;
+               new->output = dst_discard_out;
 
                new->dev = ort->dst.dev;
                if (new->dev)
@@ -2262,7 +2371,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
                rt->rt_uses_gateway = ort->rt_uses_gateway;
 
                INIT_LIST_HEAD(&rt->rt_uncached);
-
                dst_free(new);
        }
 
@@ -2272,7 +2380,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 }
 
 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
-                                   struct sock *sk)
+                                   const struct sock *sk)
 {
        struct rtable *rt = __ip_route_output_key(net, flp4);
 
@@ -2288,7 +2396,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
+static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
                        u32 seq, int event, int nowait, unsigned int flags)
 {
@@ -2308,8 +2416,8 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
        r->rtm_dst_len  = 32;
        r->rtm_src_len  = 0;
        r->rtm_tos      = fl4->flowi4_tos;
-       r->rtm_table    = RT_TABLE_MAIN;
-       if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
+       r->rtm_table    = table_id;
+       if (nla_put_u32(skb, RTA_TABLE, table_id))
                goto nla_put_failure;
        r->rtm_type     = rt->rt_type;
        r->rtm_scope    = RT_SCOPE_UNIVERSE;
@@ -2414,6 +2522,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
        int err;
        int mark;
        struct sk_buff *skb;
+       u32 table_id = RT_TABLE_MAIN;
 
        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
        if (err < 0)
@@ -2449,6 +2558,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;
 
+       if (netif_index_is_l3_master(net, fl4.flowi4_oif))
+               fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
+
        if (iif) {
                struct net_device *dev;
 
@@ -2483,7 +2595,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;
 
-       err = rt_fill_info(net, dst, src, &fl4, skb,
+       if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
+               table_id = rt->rt_table_id;
+
+       err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
                           NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
                           RTM_NEWROUTE, 0, 0);
        if (err < 0)
@@ -2504,7 +2619,6 @@ void ip_rt_multicast_event(struct in_device *in_dev)
 }
 
 #ifdef CONFIG_SYSCTL
-static int ip_rt_gc_timeout __read_mostly      = RT_GC_TIMEOUT;
 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
 static int ip_rt_gc_elasticity __read_mostly   = 8;
@@ -2742,6 +2856,10 @@ int __init ip_rt_init(void)
 
        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
 
+       ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
+       if (!ip_tstamps)
+               panic("IP: failed to allocate ip_tstamps\n");
+
        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);