These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / net / ipv6 / route.c
index f371fef..3f164d3 100644 (file)
 #include <net/tcp.h>
 #include <linux/rtnetlink.h>
 #include <net/dst.h>
+#include <net/dst_metadata.h>
 #include <net/xfrm.h>
 #include <net/netevent.h>
 #include <net/netlink.h>
 #include <net/nexthop.h>
+#include <net/lwtunnel.h>
+#include <net/ip_tunnels.h>
+#include <net/l3mdev.h>
 
 #include <asm/uaccess.h>
 
@@ -72,8 +76,7 @@ enum rt6_nud_state {
        RT6_NUD_SUCCEED = 1
 };
 
-static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
-                                   const struct in6_addr *dest);
+static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
 static struct dst_entry        *ip6_dst_check(struct dst_entry *dst, u32 cookie);
 static unsigned int     ip6_default_advmss(const struct dst_entry *dst);
 static unsigned int     ip6_mtu(const struct dst_entry *dst);
@@ -84,14 +87,15 @@ static void         ip6_dst_ifdown(struct dst_entry *,
 static int              ip6_dst_gc(struct dst_ops *ops);
 
 static int             ip6_pkt_discard(struct sk_buff *skb);
-static int             ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
+static int             ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 static int             ip6_pkt_prohibit(struct sk_buff *skb);
-static int             ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
+static int             ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 static void            ip6_link_failure(struct sk_buff *skb);
 static void            ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
 static void            rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
+static void            rt6_dst_from_metrics_check(struct rt6_info *rt);
 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
@@ -104,65 +108,83 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *gwaddr, int ifindex);
 #endif
 
-static void rt6_bind_peer(struct rt6_info *rt, int create)
+struct uncached_list {
+       spinlock_t              lock;
+       struct list_head        head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
+
+static void rt6_uncached_list_add(struct rt6_info *rt)
 {
-       struct inet_peer_base *base;
-       struct inet_peer *peer;
+       struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
 
-       base = inetpeer_base_ptr(rt->_rt6i_peer);
-       if (!base)
-               return;
+       rt->dst.flags |= DST_NOCACHE;
+       rt->rt6i_uncached_list = ul;
 
-       peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
-       if (peer) {
-               if (!rt6_set_peer(rt, peer))
-                       inet_putpeer(peer);
-       }
+       spin_lock_bh(&ul->lock);
+       list_add_tail(&rt->rt6i_uncached, &ul->head);
+       spin_unlock_bh(&ul->lock);
 }
 
-static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create)
+static void rt6_uncached_list_del(struct rt6_info *rt)
 {
-       if (rt6_has_peer(rt))
-               return rt6_peer_ptr(rt);
+       if (!list_empty(&rt->rt6i_uncached)) {
+               struct uncached_list *ul = rt->rt6i_uncached_list;
 
-       rt6_bind_peer(rt, create);
-       return (rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL);
+               spin_lock_bh(&ul->lock);
+               list_del(&rt->rt6i_uncached);
+               spin_unlock_bh(&ul->lock);
+       }
 }
 
-static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt)
+static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
 {
-       return __rt6_get_peer(rt, 1);
-}
+       struct net_device *loopback_dev = net->loopback_dev;
+       int cpu;
 
-static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
-{
-       struct rt6_info *rt = (struct rt6_info *) dst;
-       struct inet_peer *peer;
-       u32 *p = NULL;
-
-       if (!(rt->dst.flags & DST_HOST))
-               return dst_cow_metrics_generic(dst, old);
+       if (dev == loopback_dev)
+               return;
 
-       peer = rt6_get_peer_create(rt);
-       if (peer) {
-               u32 *old_p = __DST_METRICS_PTR(old);
-               unsigned long prev, new;
+       for_each_possible_cpu(cpu) {
+               struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
+               struct rt6_info *rt;
 
-               p = peer->metrics;
-               if (inet_metrics_new(peer) ||
-                   (old & DST_METRICS_FORCE_OVERWRITE))
-                       memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+               spin_lock_bh(&ul->lock);
+               list_for_each_entry(rt, &ul->head, rt6i_uncached) {
+                       struct inet6_dev *rt_idev = rt->rt6i_idev;
+                       struct net_device *rt_dev = rt->dst.dev;
 
-               new = (unsigned long) p;
-               prev = cmpxchg(&dst->_metrics, old, new);
+                       if (rt_idev->dev == dev) {
+                               rt->rt6i_idev = in6_dev_get(loopback_dev);
+                               in6_dev_put(rt_idev);
+                       }
 
-               if (prev != old) {
-                       p = __DST_METRICS_PTR(prev);
-                       if (prev & DST_METRICS_READ_ONLY)
-                               p = NULL;
+                       if (rt_dev == dev) {
+                               rt->dst.dev = loopback_dev;
+                               dev_hold(rt->dst.dev);
+                               dev_put(rt_dev);
+                       }
                }
+               spin_unlock_bh(&ul->lock);
        }
-       return p;
+}
+
+static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
+{
+       return dst_metrics_write_ptr(rt->dst.from);
+}
+
+static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+       struct rt6_info *rt = (struct rt6_info *)dst;
+
+       if (rt->rt6i_flags & RTF_PCPU)
+               return rt6_pcpu_cow_metrics(rt);
+       else if (rt->rt6i_flags & RTF_CACHE)
+               return NULL;
+       else
+               return dst_cow_metrics_generic(dst, old);
 }
 
 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
@@ -227,12 +249,6 @@ static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
 {
 }
 
-static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
-                                        unsigned long old)
-{
-       return NULL;
-}
-
 static struct dst_ops ip6_dst_blackhole_ops = {
        .family                 =       AF_INET6,
        .destroy                =       ip6_dst_destroy,
@@ -241,7 +257,7 @@ static struct dst_ops ip6_dst_blackhole_ops = {
        .default_advmss         =       ip6_default_advmss,
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
        .redirect               =       ip6_rt_blackhole_redirect,
-       .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
+       .cow_metrics            =       dst_cow_metrics_generic,
        .neigh_lookup           =       ip6_neigh_lookup,
 };
 
@@ -288,7 +304,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
                .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -EINVAL,
                .input          = dst_discard,
-               .output         = dst_discard_sk,
+               .output         = dst_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
@@ -298,34 +314,67 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
 
 #endif
 
+static void rt6_info_init(struct rt6_info *rt)
+{
+       struct dst_entry *dst = &rt->dst;
+
+       memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
+       INIT_LIST_HEAD(&rt->rt6i_siblings);
+       INIT_LIST_HEAD(&rt->rt6i_uncached);
+}
+
 /* allocate dst with ip6_dst_ops */
-static inline struct rt6_info *ip6_dst_alloc(struct net *net,
-                                            struct net_device *dev,
-                                            int flags,
-                                            struct fib6_table *table)
+static struct rt6_info *__ip6_dst_alloc(struct net *net,
+                                       struct net_device *dev,
+                                       int flags)
 {
        struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
                                        0, DST_OBSOLETE_FORCE_CHK, flags);
 
+       if (rt)
+               rt6_info_init(rt);
+
+       return rt;
+}
+
+static struct rt6_info *ip6_dst_alloc(struct net *net,
+                                     struct net_device *dev,
+                                     int flags)
+{
+       struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
+
        if (rt) {
-               struct dst_entry *dst = &rt->dst;
+               rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
+               if (rt->rt6i_pcpu) {
+                       int cpu;
+
+                       for_each_possible_cpu(cpu) {
+                               struct rt6_info **p;
 
-               memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
-               rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
-               INIT_LIST_HEAD(&rt->rt6i_siblings);
+                               p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
+                               /* no one shares rt */
+                               *p =  NULL;
+                       }
+               } else {
+                       dst_destroy((struct dst_entry *)rt);
+                       return NULL;
+               }
        }
+
        return rt;
 }
 
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
        struct rt6_info *rt = (struct rt6_info *)dst;
-       struct inet6_dev *idev = rt->rt6i_idev;
        struct dst_entry *from = dst->from;
+       struct inet6_dev *idev;
 
-       if (!(rt->dst.flags & DST_HOST))
-               dst_destroy_metrics_generic(dst);
+       dst_destroy_metrics_generic(dst);
+       free_percpu(rt->rt6i_pcpu);
+       rt6_uncached_list_del(rt);
 
+       idev = rt->rt6i_idev;
        if (idev) {
                rt->rt6i_idev = NULL;
                in6_dev_put(idev);
@@ -333,11 +382,6 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 
        dst->from = NULL;
        dst_release(from);
-
-       if (rt6_has_peer(rt)) {
-               struct inet_peer *peer = rt6_peer_ptr(rt);
-               inet_putpeer(peer);
-       }
 }
 
 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -360,6 +404,14 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
        }
 }
 
+static bool __rt6_check_expired(const struct rt6_info *rt)
+{
+       if (rt->rt6i_flags & RTF_EXPIRES)
+               return time_after(jiffies, rt->dst.expires);
+       else
+               return false;
+}
+
 static bool rt6_check_expired(const struct rt6_info *rt)
 {
        if (rt->rt6i_flags & RTF_EXPIRES) {
@@ -378,31 +430,7 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
                               const struct flowi6 *fl6)
 {
-       unsigned int val = fl6->flowi6_proto;
-
-       val ^= ipv6_addr_hash(&fl6->daddr);
-       val ^= ipv6_addr_hash(&fl6->saddr);
-
-       /* Work only if this not encapsulated */
-       switch (fl6->flowi6_proto) {
-       case IPPROTO_UDP:
-       case IPPROTO_TCP:
-       case IPPROTO_SCTP:
-               val ^= (__force u16)fl6->fl6_sport;
-               val ^= (__force u16)fl6->fl6_dport;
-               break;
-
-       case IPPROTO_ICMPV6:
-               val ^= (__force u16)fl6->fl6_icmp_type;
-               val ^= (__force u16)fl6->fl6_icmp_code;
-               break;
-       }
-       /* RFC6438 recommands to use flowlabel */
-       val ^= (__force u32)fl6->flowlabel;
-
-       /* Perhaps, we need to tune, this function? */
-       val = val ^ (val >> 7) ^ (val >> 12);
-       return val % candidate_count;
+       return get_hash_from_flowi6(fl6) % candidate_count;
 }
 
 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
@@ -455,10 +483,10 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
                        if (dev->flags & IFF_LOOPBACK) {
                                if (!sprt->rt6i_idev ||
                                    sprt->rt6i_idev->dev->ifindex != oif) {
-                                       if (flags & RT6_LOOKUP_F_IFACE && oif)
+                                       if (flags & RT6_LOOKUP_F_IFACE)
                                                continue;
-                                       if (local && (!oif ||
-                                                     local->rt6i_idev->dev->ifindex == oif))
+                                       if (local &&
+                                           local->rt6i_idev->dev->ifindex == oif)
                                                continue;
                                }
                                local = sprt;
@@ -495,13 +523,14 @@ static void rt6_probe_deferred(struct work_struct *w)
                container_of(w, struct __rt6_probe_work, work);
 
        addrconf_addr_solict_mult(&work->target, &mcaddr);
-       ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
+       ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
        dev_put(work->dev);
        kfree(work);
 }
 
 static void rt6_probe(struct rt6_info *rt)
 {
+       struct __rt6_probe_work *work;
        struct neighbour *neigh;
        /*
         * Okay, this does not seem to be appropriate
@@ -516,34 +545,33 @@ static void rt6_probe(struct rt6_info *rt)
        rcu_read_lock_bh();
        neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
        if (neigh) {
-               write_lock(&neigh->lock);
                if (neigh->nud_state & NUD_VALID)
                        goto out;
-       }
-
-       if (!neigh ||
-           time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
-               struct __rt6_probe_work *work;
 
+               work = NULL;
+               write_lock(&neigh->lock);
+               if (!(neigh->nud_state & NUD_VALID) &&
+                   time_after(jiffies,
+                              neigh->updated +
+                              rt->rt6i_idev->cnf.rtr_probe_interval)) {
+                       work = kmalloc(sizeof(*work), GFP_ATOMIC);
+                       if (work)
+                               __neigh_set_probe_once(neigh);
+               }
+               write_unlock(&neigh->lock);
+       } else {
                work = kmalloc(sizeof(*work), GFP_ATOMIC);
+       }
 
-               if (neigh && work)
-                       __neigh_set_probe_once(neigh);
-
-               if (neigh)
-                       write_unlock(&neigh->lock);
+       if (work) {
+               INIT_WORK(&work->work, rt6_probe_deferred);
+               work->target = rt->rt6i_gateway;
+               dev_hold(rt->dst.dev);
+               work->dev = rt->dst.dev;
+               schedule_work(&work->work);
+       }
 
-               if (work) {
-                       INIT_WORK(&work->work, rt6_probe_deferred);
-                       work->target = rt->rt6i_gateway;
-                       dev_hold(rt->dst.dev);
-                       work->dev = rt->dst.dev;
-                       schedule_work(&work->work);
-               }
-       } else {
 out:
-               write_unlock(&neigh->lock);
-       }
        rcu_read_unlock_bh();
 }
 #else
@@ -622,6 +650,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 {
        int m;
        bool match_do_rr = false;
+       struct inet6_dev *idev = rt->rt6i_idev;
+       struct net_device *dev = rt->dst.dev;
+
+       if (dev && !netif_carrier_ok(dev) &&
+           idev->cnf.ignore_routes_with_linkdown)
+               goto out;
 
        if (rt6_check_expired(rt))
                goto out;
@@ -652,15 +686,33 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
                                     u32 metric, int oif, int strict,
                                     bool *do_rr)
 {
-       struct rt6_info *rt, *match;
+       struct rt6_info *rt, *match, *cont;
        int mpri = -1;
 
        match = NULL;
-       for (rt = rr_head; rt && rt->rt6i_metric == metric;
-            rt = rt->dst.rt6_next)
+       cont = NULL;
+       for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
+               if (rt->rt6i_metric != metric) {
+                       cont = rt;
+                       break;
+               }
+
                match = find_match(rt, oif, strict, &mpri, match, do_rr);
-       for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
-            rt = rt->dst.rt6_next)
+       }
+
+       for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
+               if (rt->rt6i_metric != metric) {
+                       cont = rt;
+                       break;
+               }
+
+               match = find_match(rt, oif, strict, &mpri, match, do_rr);
+       }
+
+       if (match || !cont)
+               return match;
+
+       for (rt = cont; rt; rt = rt->dst.rt6_next)
                match = find_match(rt, oif, strict, &mpri, match, do_rr);
 
        return match;
@@ -694,6 +746,11 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
        return match ? match : net->ipv6.ip6_null_entry;
 }
 
+static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
+{
+       return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
+}
+
 #ifdef CONFIG_IPV6_ROUTE_INFO
 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
@@ -872,9 +929,9 @@ int ip6_ins_rt(struct rt6_info *rt)
        return __ip6_ins_rt(rt, &info, &mxc);
 }
 
-static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
-                                     const struct in6_addr *daddr,
-                                     const struct in6_addr *saddr)
+static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
+                                          const struct in6_addr *daddr,
+                                          const struct in6_addr *saddr)
 {
        struct rt6_info *rt;
 
@@ -882,15 +939,25 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
         *      Clone the route.
         */
 
-       rt = ip6_rt_copy(ort, daddr);
+       if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
+               ort = (struct rt6_info *)ort->dst.from;
 
-       if (rt) {
+       rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
+
+       if (!rt)
+               return NULL;
+
+       ip6_rt_copy_init(rt, ort);
+       rt->rt6i_flags |= RTF_CACHE;
+       rt->rt6i_metric = 0;
+       rt->dst.flags |= DST_HOST;
+       rt->rt6i_dst.addr = *daddr;
+       rt->rt6i_dst.plen = 128;
+
+       if (!rt6_is_gw_or_nonexthop(ort)) {
                if (ort->rt6i_dst.plen != 128 &&
                    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
                        rt->rt6i_flags |= RTF_ANYCAST;
-
-               rt->rt6i_flags |= RTF_CACHE;
-
 #ifdef CONFIG_IPV6_SUBTREES
                if (rt->rt6i_src.plen && saddr) {
                        rt->rt6i_src.addr = *saddr;
@@ -902,35 +969,93 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
        return rt;
 }
 
-static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
-                                       const struct in6_addr *daddr)
+static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
 {
-       struct rt6_info *rt = ip6_rt_copy(ort, daddr);
+       struct rt6_info *pcpu_rt;
 
-       if (rt)
-               rt->rt6i_flags |= RTF_CACHE;
-       return rt;
+       pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
+                                 rt->dst.dev, rt->dst.flags);
+
+       if (!pcpu_rt)
+               return NULL;
+       ip6_rt_copy_init(pcpu_rt, rt);
+       pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
+       pcpu_rt->rt6i_flags |= RTF_PCPU;
+       return pcpu_rt;
+}
+
+/* It should be called with read_lock_bh(&tb6_lock) acquired */
+static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
+{
+       struct rt6_info *pcpu_rt, **p;
+
+       p = this_cpu_ptr(rt->rt6i_pcpu);
+       pcpu_rt = *p;
+
+       if (pcpu_rt) {
+               dst_hold(&pcpu_rt->dst);
+               rt6_dst_from_metrics_check(pcpu_rt);
+       }
+       return pcpu_rt;
+}
+
+static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
+{
+       struct fib6_table *table = rt->rt6i_table;
+       struct rt6_info *pcpu_rt, *prev, **p;
+
+       pcpu_rt = ip6_rt_pcpu_alloc(rt);
+       if (!pcpu_rt) {
+               struct net *net = dev_net(rt->dst.dev);
+
+               dst_hold(&net->ipv6.ip6_null_entry->dst);
+               return net->ipv6.ip6_null_entry;
+       }
+
+       read_lock_bh(&table->tb6_lock);
+       if (rt->rt6i_pcpu) {
+               p = this_cpu_ptr(rt->rt6i_pcpu);
+               prev = cmpxchg(p, NULL, pcpu_rt);
+               if (prev) {
+                       /* If someone did it before us, return prev instead */
+                       dst_destroy(&pcpu_rt->dst);
+                       pcpu_rt = prev;
+               }
+       } else {
+               /* rt has been removed from the fib6 tree
+                * before we have a chance to acquire the read_lock.
+                * In this case, don't brother to create a pcpu rt
+                * since rt is going away anyway.  The next
+                * dst_check() will trigger a re-lookup.
+                */
+               dst_destroy(&pcpu_rt->dst);
+               pcpu_rt = rt;
+       }
+       dst_hold(&pcpu_rt->dst);
+       rt6_dst_from_metrics_check(pcpu_rt);
+       read_unlock_bh(&table->tb6_lock);
+       return pcpu_rt;
 }
 
 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi6 *fl6, int flags)
 {
        struct fib6_node *fn, *saved_fn;
-       struct rt6_info *rt, *nrt;
+       struct rt6_info *rt;
        int strict = 0;
-       int attempts = 3;
-       int err;
 
        strict |= flags & RT6_LOOKUP_F_IFACE;
        if (net->ipv6.devconf_all->forwarding == 0)
                strict |= RT6_LOOKUP_F_REACHABLE;
 
-redo_fib6_lookup_lock:
        read_lock_bh(&table->tb6_lock);
 
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        saved_fn = fn;
 
+       if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
+               oif = 0;
+
 redo_rt6_select:
        rt = rt6_select(fn, oif, strict);
        if (rt->rt6i_nsiblings)
@@ -944,51 +1069,65 @@ redo_rt6_select:
                        strict &= ~RT6_LOOKUP_F_REACHABLE;
                        fn = saved_fn;
                        goto redo_rt6_select;
-               } else {
-                       dst_hold(&rt->dst);
-                       read_unlock_bh(&table->tb6_lock);
-                       goto out2;
                }
        }
 
-       dst_hold(&rt->dst);
-       read_unlock_bh(&table->tb6_lock);
 
-       if (rt->rt6i_flags & RTF_CACHE)
-               goto out2;
+       if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
+               dst_use(&rt->dst, jiffies);
+               read_unlock_bh(&table->tb6_lock);
 
-       if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
-               nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
-       else if (!(rt->dst.flags & DST_HOST))
-               nrt = rt6_alloc_clone(rt, &fl6->daddr);
-       else
-               goto out2;
+               rt6_dst_from_metrics_check(rt);
+               return rt;
+       } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
+                           !(rt->rt6i_flags & RTF_GATEWAY))) {
+               /* Create a RTF_CACHE clone which will not be
+                * owned by the fib6 tree.  It is for the special case where
+                * the daddr in the skb during the neighbor look-up is different
+                * from the fl6->daddr used to look-up route here.
+                */
 
-       ip6_rt_put(rt);
-       rt = nrt ? : net->ipv6.ip6_null_entry;
+               struct rt6_info *uncached_rt;
 
-       dst_hold(&rt->dst);
-       if (nrt) {
-               err = ip6_ins_rt(nrt);
-               if (!err)
-                       goto out2;
-       }
+               dst_use(&rt->dst, jiffies);
+               read_unlock_bh(&table->tb6_lock);
 
-       if (--attempts <= 0)
-               goto out2;
+               uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
+               dst_release(&rt->dst);
 
-       /*
-        * Race condition! In the gap, when table->tb6_lock was
-        * released someone could insert this route.  Relookup.
-        */
-       ip6_rt_put(rt);
-       goto redo_fib6_lookup_lock;
+               if (uncached_rt)
+                       rt6_uncached_list_add(uncached_rt);
+               else
+                       uncached_rt = net->ipv6.ip6_null_entry;
 
-out2:
-       rt->dst.lastuse = jiffies;
-       rt->dst.__use++;
+               dst_hold(&uncached_rt->dst);
+               return uncached_rt;
 
-       return rt;
+       } else {
+               /* Get a percpu copy */
+
+               struct rt6_info *pcpu_rt;
+
+               rt->dst.lastuse = jiffies;
+               rt->dst.__use++;
+               pcpu_rt = rt6_get_pcpu_route(rt);
+
+               if (pcpu_rt) {
+                       read_unlock_bh(&table->tb6_lock);
+               } else {
+                       /* We have to do the read_unlock first
+                        * because rt6_make_pcpu_route() may trigger
+                        * ip6_dst_gc() which will take the write_lock.
+                        */
+                       dst_hold(&rt->dst);
+                       read_unlock_bh(&table->tb6_lock);
+                       pcpu_rt = rt6_make_pcpu_route(rt);
+                       dst_release(&rt->dst);
+               }
+
+               return pcpu_rt;
+
+       }
 }
 
 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
@@ -1012,8 +1151,9 @@ void ip6_route_input(struct sk_buff *skb)
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        struct net *net = dev_net(skb->dev);
        int flags = RT6_LOOKUP_F_HAS_SADDR;
+       struct ip_tunnel_info *tun_info;
        struct flowi6 fl6 = {
-               .flowi6_iif = skb->dev->ifindex,
+               .flowi6_iif = l3mdev_fib_oif(skb->dev),
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = ip6_flowinfo(iph),
@@ -1021,6 +1161,10 @@ void ip6_route_input(struct sk_buff *skb)
                .flowi6_proto = iph->nexthdr,
        };
 
+       tun_info = skb_tunnel_info(skb);
+       if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
+               fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
+       skb_dst_drop(skb);
        skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
 }
 
@@ -1030,24 +1174,31 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table
        return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
 }
 
-struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
-                                   struct flowi6 *fl6)
+struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
+                                        struct flowi6 *fl6, int flags)
 {
-       int flags = 0;
+       struct dst_entry *dst;
+       bool any_src;
+
+       dst = l3mdev_rt6_dst_by_oif(net, fl6);
+       if (dst)
+               return dst;
 
        fl6->flowi6_iif = LOOPBACK_IFINDEX;
 
-       if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
+       any_src = ipv6_addr_any(&fl6->saddr);
+       if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
+           (fl6->flowi6_oif && any_src))
                flags |= RT6_LOOKUP_F_IFACE;
 
-       if (!ipv6_addr_any(&fl6->saddr))
+       if (!any_src)
                flags |= RT6_LOOKUP_F_HAS_SADDR;
        else if (sk)
                flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
 
        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
 }
-EXPORT_SYMBOL(ip6_route_output);
+EXPORT_SYMBOL_GPL(ip6_route_output_flags);
 
 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
 {
@@ -1056,25 +1207,20 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
 
        rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
        if (rt) {
-               new = &rt->dst;
-
-               memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
-               rt6_init_peer(rt, net->ipv6.peers);
+               rt6_info_init(rt);
 
+               new = &rt->dst;
                new->__use = 1;
                new->input = dst_discard;
-               new->output = dst_discard_sk;
+               new->output = dst_discard_out;
 
-               if (dst_metrics_read_only(&ort->dst))
-                       new->_metrics = ort->dst._metrics;
-               else
-                       dst_copy_metrics(new, &ort->dst);
+               dst_copy_metrics(new, &ort->dst);
                rt->rt6i_idev = ort->rt6i_idev;
                if (rt->rt6i_idev)
                        in6_dev_hold(rt->rt6i_idev);
 
                rt->rt6i_gateway = ort->rt6i_gateway;
-               rt->rt6i_flags = ort->rt6i_flags;
+               rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
                rt->rt6i_metric = 0;
 
                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
@@ -1093,6 +1239,34 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
  *     Destination cache support functions
  */
 
+static void rt6_dst_from_metrics_check(struct rt6_info *rt)
+{
+       if (rt->dst.from &&
+           dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
+               dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
+}
+
+static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
+{
+       if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
+               return NULL;
+
+       if (rt6_check_expired(rt))
+               return NULL;
+
+       return &rt->dst;
+}
+
+static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
+{
+       if (!__rt6_check_expired(rt) &&
+           rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+           rt6_check((struct rt6_info *)(rt->dst.from), cookie))
+               return &rt->dst;
+       else
+               return NULL;
+}
+
 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 {
        struct rt6_info *rt;
@@ -1103,13 +1277,14 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         */
-       if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
-               return NULL;
 
-       if (rt6_check_expired(rt))
-               return NULL;
+       rt6_dst_from_metrics_check(rt);
 
-       return dst;
+       if (rt->rt6i_flags & RTF_PCPU ||
+           (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
+               return rt6_dst_from_check(rt, cookie);
+       else
+               return rt6_check(rt, cookie);
 }
 
 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
@@ -1140,32 +1315,76 @@ static void ip6_link_failure(struct sk_buff *skb)
        if (rt) {
                if (rt->rt6i_flags & RTF_CACHE) {
                        dst_hold(&rt->dst);
-                       if (ip6_del_rt(rt))
-                               dst_free(&rt->dst);
+                       ip6_del_rt(rt);
                } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
                        rt->rt6i_node->fn_sernum = -1;
                }
        }
 }
 
-static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
-                              struct sk_buff *skb, u32 mtu)
+static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
+{
+       struct net *net = dev_net(rt->dst.dev);
+
+       rt->rt6i_flags |= RTF_MODIFIED;
+       rt->rt6i_pmtu = mtu;
+       rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
+}
+
+static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
+{
+       return !(rt->rt6i_flags & RTF_CACHE) &&
+               (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
+}
+
+static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
+                                const struct ipv6hdr *iph, u32 mtu)
 {
        struct rt6_info *rt6 = (struct rt6_info *)dst;
 
-       dst_confirm(dst);
-       if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
-               struct net *net = dev_net(dst->dev);
+       if (rt6->rt6i_flags & RTF_LOCAL)
+               return;
 
-               rt6->rt6i_flags |= RTF_MODIFIED;
-               if (mtu < IPV6_MIN_MTU)
-                       mtu = IPV6_MIN_MTU;
+       dst_confirm(dst);
+       mtu = max_t(u32, mtu, IPV6_MIN_MTU);
+       if (mtu >= dst_mtu(dst))
+               return;
 
-               dst_metric_set(dst, RTAX_MTU, mtu);
-               rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
+       if (!rt6_cache_allowed_for_pmtu(rt6)) {
+               rt6_do_update_pmtu(rt6, mtu);
+       } else {
+               const struct in6_addr *daddr, *saddr;
+               struct rt6_info *nrt6;
+
+               if (iph) {
+                       daddr = &iph->daddr;
+                       saddr = &iph->saddr;
+               } else if (sk) {
+                       daddr = &sk->sk_v6_daddr;
+                       saddr = &inet6_sk(sk)->saddr;
+               } else {
+                       return;
+               }
+               nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
+               if (nrt6) {
+                       rt6_do_update_pmtu(nrt6, mtu);
+
+                       /* ip6_ins_rt(nrt6) will bump the
+                        * rt6->rt6i_node->fn_sernum
+                        * which will fail the next rt6_check() and
+                        * invalidate the sk->sk_dst_cache.
+                        */
+                       ip6_ins_rt(nrt6);
+               }
        }
 }
 
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+                              struct sk_buff *skb, u32 mtu)
+{
+       __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
+}
+
 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
                     int oif, u32 mark)
 {
@@ -1182,7 +1401,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
 
        dst = ip6_route_output(net, NULL, &fl6);
        if (!dst->error)
-               ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
+               __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
        dst_release(dst);
 }
 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
@@ -1341,9 +1560,14 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
 
 static unsigned int ip6_mtu(const struct dst_entry *dst)
 {
+       const struct rt6_info *rt = (const struct rt6_info *)dst;
+       unsigned int mtu = rt->rt6i_pmtu;
        struct inet6_dev *idev;
-       unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
 
+       if (mtu)
+               goto out;
+
+       mtu = dst_metric_raw(dst, RTAX_MTU);
        if (mtu)
                goto out;
 
@@ -1373,7 +1597,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
        if (unlikely(!idev))
                return ERR_PTR(-ENODEV);
 
-       rt = ip6_dst_alloc(net, dev, 0, NULL);
+       rt = ip6_dst_alloc(net, dev, 0);
        if (unlikely(!rt)) {
                in6_dev_put(idev);
                dst = ERR_PTR(-ENOMEM);
@@ -1472,6 +1696,7 @@ out:
 static int ip6_convert_metrics(struct mx6_config *mxc,
                               const struct fib6_config *cfg)
 {
+       bool ecn_ca = false;
        struct nlattr *nla;
        int remaining;
        u32 *mp;
@@ -1485,51 +1710,57 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
 
        nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
                int type = nla_type(nla);
+               u32 val;
+
+               if (!type)
+                       continue;
+               if (unlikely(type > RTAX_MAX))
+                       goto err;
 
-               if (type) {
-                       u32 val;
+               if (type == RTAX_CC_ALGO) {
+                       char tmp[TCP_CA_NAME_MAX];
 
-                       if (unlikely(type > RTAX_MAX))
+                       nla_strlcpy(tmp, nla, sizeof(tmp));
+                       val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+                       if (val == TCP_CA_UNSPEC)
                                goto err;
-                       if (type == RTAX_CC_ALGO) {
-                               char tmp[TCP_CA_NAME_MAX];
+               } else {
+                       val = nla_get_u32(nla);
+               }
+               if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+                       goto err;
 
-                               nla_strlcpy(tmp, nla, sizeof(tmp));
-                               val = tcp_ca_get_key_by_name(tmp);
-                               if (val == TCP_CA_UNSPEC)
-                                       goto err;
-                       } else {
-                               val = nla_get_u32(nla);
-                       }
+               mp[type - 1] = val;
+               __set_bit(type - 1, mxc->mx_valid);
+       }
 
-                       mp[type - 1] = val;
-                       __set_bit(type - 1, mxc->mx_valid);
-               }
+       if (ecn_ca) {
+               __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
+               mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
        }
 
        mxc->mx = mp;
-
        return 0;
  err:
        kfree(mp);
        return -EINVAL;
 }
 
-int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
+static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
 {
-       int err;
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct rt6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;
+       int err = -EINVAL;
 
        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
-               return -EINVAL;
+               goto out;
 #ifndef CONFIG_IPV6_SUBTREES
        if (cfg->fc_src_len)
-               return -EINVAL;
+               goto out;
 #endif
        if (cfg->fc_ifindex) {
                err = -ENODEV;
@@ -1559,7 +1790,8 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
        if (!table)
                goto out;
 
-       rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
+       rt = ip6_dst_alloc(net, NULL,
+                          (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
 
        if (!rt) {
                err = -ENOMEM;
@@ -1587,12 +1819,29 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
 
        rt->dst.output = ip6_output;
 
+       if (cfg->fc_encap) {
+               struct lwtunnel_state *lwtstate;
+
+               err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+                                          cfg->fc_encap, AF_INET6, cfg,
+                                          &lwtstate);
+               if (err)
+                       goto out;
+               rt->dst.lwtstate = lwtstate_get(lwtstate);
+               if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
+                       rt->dst.lwtstate->orig_output = rt->dst.output;
+                       rt->dst.output = lwtunnel_output;
+               }
+               if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
+                       rt->dst.lwtstate->orig_input = rt->dst.input;
+                       rt->dst.input = lwtunnel_input;
+               }
+       }
+
        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
-       if (rt->rt6i_dst.plen == 128) {
+       if (rt->rt6i_dst.plen == 128)
                rt->dst.flags |= DST_HOST;
-               dst_metrics_set_force_overwrite(&rt->dst);
-       }
 
 #ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
@@ -1626,7 +1875,7 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
                switch (cfg->fc_type) {
                case RTN_BLACKHOLE:
                        rt->dst.error = -EINVAL;
-                       rt->dst.output = dst_discard_sk;
+                       rt->dst.output = dst_discard_out;
                        rt->dst.input = dst_discard;
                        break;
                case RTN_PROHIBIT:
@@ -1635,9 +1884,11 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
                        rt->dst.input = ip6_pkt_prohibit;
                        break;
                case RTN_THROW:
+               case RTN_UNREACHABLE:
                default:
                        rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
-                                       : -ENETUNREACH;
+                                       : (cfg->fc_type == RTN_UNREACHABLE)
+                                       ? -EHOSTUNREACH : -ENETUNREACH;
                        rt->dst.output = ip6_pkt_discard_out;
                        rt->dst.input = ip6_pkt_discard;
                        break;
@@ -1650,9 +1901,21 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
                int gwa_type;
 
                gw_addr = &cfg->fc_gateway;
-               rt->rt6i_gateway = *gw_addr;
                gwa_type = ipv6_addr_type(gw_addr);
 
+               /* if gw_addr is local we will fail to detect this in case
+                * address is still TENTATIVE (DAD in progress). rt6_lookup()
+                * will return already-added prefix route via interface that
+                * prefix route was assigned to, which might be non-loopback.
+                */
+               err = -EINVAL;
+               if (ipv6_chk_addr_and_flags(net, gw_addr,
+                                           gwa_type & IPV6_ADDR_LINKLOCAL ?
+                                           dev : NULL, 0, 0))
+                       goto out;
+
+               rt->rt6i_gateway = *gw_addr;
+
                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
                        struct rt6_info *grt;
 
@@ -1663,7 +1926,6 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
                           (SIT, PtP, NBMA NOARP links) it is handy to allow
                           some exceptions. --ANK
                         */
-                       err = -EINVAL;
                        if (!(gwa_type & IPV6_ADDR_UNICAST))
                                goto out;
 
@@ -1718,9 +1980,7 @@ install_route:
 
        cfg->fc_nlinfo.nl_net = dev_net(dev);
 
-       *rt_ret = rt;
-
-       return 0;
+       return rt;
 out:
        if (dev)
                dev_put(dev);
@@ -1729,20 +1989,21 @@ out:
        if (rt)
                dst_free(&rt->dst);
 
-       *rt_ret = NULL;
-
-       return err;
+       return ERR_PTR(err);
 }
 
 int ip6_route_add(struct fib6_config *cfg)
 {
        struct mx6_config mxc = { .mx = NULL, };
-       struct rt6_info *rt = NULL;
+       struct rt6_info *rt;
        int err;
 
-       err = ip6_route_info_create(cfg, &rt);
-       if (err)
+       rt = ip6_route_info_create(cfg);
+       if (IS_ERR(rt)) {
+               err = PTR_ERR(rt);
+               rt = NULL;
                goto out;
+       }
 
        err = ip6_convert_metrics(&mxc, cfg);
        if (err)
@@ -1766,7 +2027,8 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
        struct fib6_table *table;
        struct net *net = dev_net(rt->dst.dev);
 
-       if (rt == net->ipv6.ip6_null_entry) {
+       if (rt == net->ipv6.ip6_null_entry ||
+           rt->dst.flags & DST_NOCACHE) {
                err = -ENOENT;
                goto out;
        }
@@ -1808,6 +2070,9 @@ static int ip6_route_del(struct fib6_config *cfg)
 
        if (fn) {
                for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+                       if ((rt->rt6i_flags & RTF_CACHE) &&
+                           !(cfg->fc_flags & RTF_CACHE))
+                               continue;
                        if (cfg->fc_ifindex &&
                            (!rt->dst.dev ||
                             rt->dst.dev->ifindex != cfg->fc_ifindex))
@@ -1830,7 +2095,6 @@ static int ip6_route_del(struct fib6_config *cfg)
 
 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 {
-       struct net *net = dev_net(skb->dev);
        struct netevent_redirect netevent;
        struct rt6_info *rt, *nrt = NULL;
        struct ndisc_options ndopts;
@@ -1891,7 +2155,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
        }
 
        rt = (struct rt6_info *) dst;
-       if (rt == net->ipv6.ip6_null_entry) {
+       if (rt->rt6i_flags & RTF_REJECT) {
                net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
                return;
        }
@@ -1917,7 +2181,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
                                     NEIGH_UPDATE_F_ISROUTER))
                     );
 
-       nrt = ip6_rt_copy(rt, &msg->dest);
+       nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
        if (!nrt)
                goto out;
 
@@ -1949,42 +2213,36 @@ out:
  *     Misc support functions
  */
 
-static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
-                                   const struct in6_addr *dest)
+static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
 {
-       struct net *net = dev_net(ort->dst.dev);
-       struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
-                                           ort->rt6i_table);
+       BUG_ON(from->dst.from);
 
-       if (rt) {
-               rt->dst.input = ort->dst.input;
-               rt->dst.output = ort->dst.output;
-               rt->dst.flags |= DST_HOST;
-
-               rt->rt6i_dst.addr = *dest;
-               rt->rt6i_dst.plen = 128;
-               dst_copy_metrics(&rt->dst, &ort->dst);
-               rt->dst.error = ort->dst.error;
-               rt->rt6i_idev = ort->rt6i_idev;
-               if (rt->rt6i_idev)
-                       in6_dev_hold(rt->rt6i_idev);
-               rt->dst.lastuse = jiffies;
-
-               if (ort->rt6i_flags & RTF_GATEWAY)
-                       rt->rt6i_gateway = ort->rt6i_gateway;
-               else
-                       rt->rt6i_gateway = *dest;
-               rt->rt6i_flags = ort->rt6i_flags;
-               rt6_set_from(rt, ort);
-               rt->rt6i_metric = 0;
+       rt->rt6i_flags &= ~RTF_EXPIRES;
+       dst_hold(&from->dst);
+       rt->dst.from = &from->dst;
+       dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
+}
 
+static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
+{
+       rt->dst.input = ort->dst.input;
+       rt->dst.output = ort->dst.output;
+       rt->rt6i_dst = ort->rt6i_dst;
+       rt->dst.error = ort->dst.error;
+       rt->rt6i_idev = ort->rt6i_idev;
+       if (rt->rt6i_idev)
+               in6_dev_hold(rt->rt6i_idev);
+       rt->dst.lastuse = jiffies;
+       rt->rt6i_gateway = ort->rt6i_gateway;
+       rt->rt6i_flags = ort->rt6i_flags;
+       rt6_set_from(rt, ort);
+       rt->rt6i_metric = ort->rt6i_metric;
 #ifdef CONFIG_IPV6_SUBTREES
-               memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+       rt->rt6i_src = ort->rt6i_src;
 #endif
-               memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
-               rt->rt6i_table = ort->rt6i_table;
-       }
-       return rt;
+       rt->rt6i_prefsrc = ort->rt6i_prefsrc;
+       rt->rt6i_table = ort->rt6i_table;
+       rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
 }
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
@@ -2026,7 +2284,6 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
                                           unsigned int pref)
 {
        struct fib6_config cfg = {
-               .fc_table       = RT6_TABLE_INFO,
                .fc_metric      = IP6_RT_PRIO_USER,
                .fc_ifindex     = ifindex,
                .fc_dst_len     = prefixlen,
@@ -2037,6 +2294,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
                .fc_nlinfo.nl_net = net,
        };
 
+       cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO;
        cfg.fc_dst = *prefix;
        cfg.fc_gateway = *gwaddr;
 
@@ -2077,7 +2335,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
                                     unsigned int pref)
 {
        struct fib6_config cfg = {
-               .fc_table       = RT6_TABLE_DFLT,
+               .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
                .fc_metric      = IP6_RT_PRIO_USER,
                .fc_ifindex     = dev->ifindex,
                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
@@ -2124,7 +2382,8 @@ static void rtmsg_to_fib6_config(struct net *net,
 {
        memset(cfg, 0, sizeof(*cfg));
 
-       cfg->fc_table = RT6_TABLE_MAIN;
+       cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
+                        : RT6_TABLE_MAIN;
        cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
        cfg->fc_metric = rtmsg->rtmsg_metric;
        cfg->fc_expires = rtmsg->rtmsg_info;
@@ -2208,7 +2467,7 @@ static int ip6_pkt_discard(struct sk_buff *skb)
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
 }
 
-static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
+static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
@@ -2219,7 +2478,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb)
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
 }
 
-static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
+static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
@@ -2233,9 +2492,10 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
                                    const struct in6_addr *addr,
                                    bool anycast)
 {
+       u32 tb_id;
        struct net *net = dev_net(idev->dev);
        struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
-                                           DST_NOCOUNT, NULL);
+                                           DST_NOCOUNT);
        if (!rt)
                return ERR_PTR(-ENOMEM);
 
@@ -2255,7 +2515,9 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
        rt->rt6i_gateway  = *addr;
        rt->rt6i_dst.addr = *addr;
        rt->rt6i_dst.plen = 128;
-       rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
+       tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
+       rt->rt6i_table = fib6_get_table(net, tb_id);
+       rt->dst.flags |= DST_NOCACHE;
 
        atomic_set(&rt->dst.__refcnt, 1);
 
@@ -2359,6 +2621,8 @@ void rt6_ifdown(struct net *net, struct net_device *dev)
 
        fib6_clean_all(net, fib6_ifdown, &adn);
        icmp6_clean_all(fib6_ifdown, &adn);
+       if (dev)
+               rt6_uncached_list_flush_dev(net, dev);
 }
 
 struct rt6_mtu_change_arg {
@@ -2396,11 +2660,20 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
           PMTU discouvery.
         */
        if (rt->dst.dev == arg->dev &&
-           !dst_metric_locked(&rt->dst, RTAX_MTU) &&
-           (dst_mtu(&rt->dst) >= arg->mtu ||
-            (dst_mtu(&rt->dst) < arg->mtu &&
-             dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
-               dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
+           !dst_metric_locked(&rt->dst, RTAX_MTU)) {
+               if (rt->rt6i_flags & RTF_CACHE) {
+                       /* For RTF_CACHE with rt6i_pmtu == 0
+                        * (i.e. a redirected route),
+                        * the metrics of its rt->dst.from has already
+                        * been updated.
+                        */
+                       if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
+                               rt->rt6i_pmtu = arg->mtu;
+               } else if (dst_mtu(&rt->dst) >= arg->mtu ||
+                          (dst_mtu(&rt->dst) < arg->mtu &&
+                           dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
+                       dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
+               }
        }
        return 0;
 }
@@ -2423,6 +2696,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
        [RTA_METRICS]           = { .type = NLA_NESTED },
        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
        [RTA_PREF]              = { .type = NLA_U8 },
+       [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
+       [RTA_ENCAP]             = { .type = NLA_NESTED },
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2457,6 +2732,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
        if (rtm->rtm_type == RTN_LOCAL)
                cfg->fc_flags |= RTF_LOCAL;
 
+       if (rtm->rtm_flags & RTM_F_CLONED)
+               cfg->fc_flags |= RTF_CACHE;
+
        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
        cfg->fc_nlinfo.nlh = nlh;
        cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
@@ -2514,6 +2792,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
                cfg->fc_flags |= RTF_PREF(pref);
        }
 
+       if (tb[RTA_ENCAP])
+               cfg->fc_encap = tb[RTA_ENCAP];
+
+       if (tb[RTA_ENCAP_TYPE])
+               cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
+
        err = 0;
 errout:
        return err;
@@ -2605,11 +2889,18 @@ static int ip6_route_multipath_add(struct fib6_config *cfg)
                                r_cfg.fc_gateway = nla_get_in6_addr(nla);
                                r_cfg.fc_flags |= RTF_GATEWAY;
                        }
+                       r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+                       nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+                       if (nla)
+                               r_cfg.fc_encap_type = nla_get_u16(nla);
                }
 
-               err = ip6_route_info_create(&r_cfg, &rt);
-               if (err)
+               rt = ip6_route_info_create(&r_cfg);
+               if (IS_ERR(rt)) {
+                       err = PTR_ERR(rt);
+                       rt = NULL;
                        goto cleanup;
+               }
 
                err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
                if (err) {
@@ -2658,8 +2949,7 @@ cleanup:
        list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
                if (nh->rt6_info)
                        dst_free(&nh->rt6_info->dst);
-               if (nh->mxc.mx)
-                       kfree(nh->mxc.mx);
+               kfree(nh->mxc.mx);
                list_del(&nh->next);
                kfree(nh);
        }
@@ -2734,7 +3024,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
                return ip6_route_add(&cfg);
 }
 
-static inline size_t rt6_nlmsg_size(void)
+static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
 {
        return NLMSG_ALIGN(sizeof(struct rtmsg))
               + nla_total_size(16) /* RTA_SRC */
@@ -2748,7 +3038,8 @@ static inline size_t rt6_nlmsg_size(void)
               + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
               + nla_total_size(sizeof(struct rta_cacheinfo))
               + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
-              + nla_total_size(1); /* RTA_PREF */
+              + nla_total_size(1) /* RTA_PREF */
+              + lwtunnel_get_encap_size(rt->dst.lwtstate);
 }
 
 static int rt6_fill_node(struct net *net,
@@ -2757,6 +3048,7 @@ static int rt6_fill_node(struct net *net,
                         int iif, int type, u32 portid, u32 seq,
                         int prefix, int nowait, unsigned int flags)
 {
+       u32 metrics[RTAX_MAX];
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        long expires;
@@ -2808,6 +3100,11 @@ static int rt6_fill_node(struct net *net,
        else
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
+       if (!netif_carrier_ok(rt->dst.dev)) {
+               rtm->rtm_flags |= RTNH_F_LINKDOWN;
+               if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+                       rtm->rtm_flags |= RTNH_F_DEAD;
+       }
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        rtm->rtm_protocol = rt->rt6i_protocol;
        if (rt->rt6i_flags & RTF_DYNAMIC)
@@ -2870,7 +3167,10 @@ static int rt6_fill_node(struct net *net,
                        goto nla_put_failure;
        }
 
-       if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+       memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
+       if (rt->rt6i_pmtu)
+               metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
+       if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;
 
        if (rt->rt6i_flags & RTF_GATEWAY) {
@@ -2892,6 +3192,8 @@ static int rt6_fill_node(struct net *net,
        if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
                goto nla_put_failure;
 
+       lwtunnel_fill_encap(skb, rt->dst.lwtstate);
+
        nlmsg_end(skb, nlh);
        return 0;
 
@@ -2977,6 +3279,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
        } else {
                fl6.flowi6_oif = oif;
 
+               if (netif_index_is_l3_master(net, oif)) {
+                       fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
+                                          FLOWI_FLAG_SKIP_NH_OIF;
+               }
+
                rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
        }
 
@@ -3008,7 +3315,8 @@ errout:
        return err;
 }
 
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
+void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
+                    unsigned int nlm_flags)
 {
        struct sk_buff *skb;
        struct net *net = info->nl_net;
@@ -3018,12 +3326,12 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
        err = -ENOBUFS;
        seq = info->nlh ? info->nlh->nlmsg_seq : 0;
 
-       skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
+       skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
        if (!skb)
                goto errout;
 
        err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
-                               event, info->portid, seq, 0, 0, 0);
+                               event, info->portid, seq, 0, 0, nlm_flags);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
@@ -3365,6 +3673,7 @@ static struct notifier_block ip6_route_dev_notifier = {
 int __init ip6_route_init(void)
 {
        int ret;
+       int cpu;
 
        ret = -ENOMEM;
        ip6_dst_ops_template.kmem_cachep =
@@ -3424,6 +3733,13 @@ int __init ip6_route_init(void)
        if (ret)
                goto out_register_late_subsys;
 
+       for_each_possible_cpu(cpu) {
+               struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
+
+               INIT_LIST_HEAD(&ul->head);
+               spin_lock_init(&ul->lock);
+       }
+
 out:
        return ret;