These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / net / ipv4 / fib_semantics.c
index 8d695b6..d97268e 100644 (file)
@@ -42,6 +42,7 @@
 #include <net/ip_fib.h>
 #include <net/netlink.h>
 #include <net/nexthop.h>
+#include <net/lwtunnel.h>
 
 #include "fib_lookup.h"
 
@@ -56,8 +57,7 @@ static unsigned int fib_info_cnt;
 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-static DEFINE_SPINLOCK(fib_multipath_lock);
+u32 fib_multipath_secret __read_mostly;
 
 #define for_nexthops(fi) {                                             \
        int nhsel; const struct fib_nh *nh;                             \
@@ -208,6 +208,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
        change_nexthops(fi) {
                if (nexthop_nh->nh_dev)
                        dev_put(nexthop_nh->nh_dev);
+               lwtstate_put(nexthop_nh->nh_lwtstate);
                free_nh_exceptions(nexthop_nh);
                rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
                rt_fibinfo_free(&nexthop_nh->nh_rth_input);
@@ -266,7 +267,8 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 #ifdef CONFIG_IP_ROUTE_CLASSID
                    nh->nh_tclassid != onh->nh_tclassid ||
 #endif
-                   ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
+                   lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
+                   ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
                        return -1;
                onh++;
        } endfor_nexthops(fi);
@@ -318,7 +320,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
                    nfi->fib_type == fi->fib_type &&
                    memcmp(nfi->fib_metrics, fi->fib_metrics,
                           sizeof(u32) * RTAX_MAX) == 0 &&
-                   ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
+                   !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
                    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
                        return fi;
        }
@@ -366,6 +368,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
        payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
 
        if (fi->fib_nhs) {
+               size_t nh_encapsize = 0;
                /* Also handles the special case fib_nhs == 1 */
 
                /* each nexthop is packed in an attribute */
@@ -374,8 +377,21 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
                /* may contain flow and gateway attribute */
                nhsize += 2 * nla_total_size(4);
 
+               /* grab encap info */
+               for_nexthops(fi) {
+                       if (nh->nh_lwtstate) {
+                               /* RTA_ENCAP_TYPE */
+                               nh_encapsize += lwtunnel_get_encap_size(
+                                               nh->nh_lwtstate);
+                               /* RTA_ENCAP */
+                               nh_encapsize +=  nla_total_size(2);
+                       }
+               } endfor_nexthops(fi);
+
                /* all nexthops are packed in a nested attribute */
-               payload += nla_total_size(fi->fib_nhs * nhsize);
+               payload += nla_total_size((fi->fib_nhs * nhsize) +
+                                         nh_encapsize);
+
        }
 
        return payload;
@@ -421,13 +437,15 @@ static int fib_detect_death(struct fib_info *fi, int order,
        if (n) {
                state = n->nud_state;
                neigh_release(n);
+       } else {
+               return 0;
        }
        if (state == NUD_REACHABLE)
                return 0;
        if ((state & NUD_VALID) && order != dflt)
                return 0;
        if ((state & NUD_VALID) ||
-           (*last_idx < 0 && order > dflt)) {
+           (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
                *last_resort = fi;
                *last_idx = order;
        }
@@ -452,6 +470,9 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
                       int remaining, struct fib_config *cfg)
 {
+       struct net *net = cfg->fc_nlinfo.nl_net;
+       int ret;
+
        change_nexthops(fi) {
                int attrlen;
 
@@ -475,18 +496,130 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
                        if (nexthop_nh->nh_tclassid)
                                fi->fib_net->ipv4.fib_num_tclassid_users++;
 #endif
+                       nla = nla_find(attrs, attrlen, RTA_ENCAP);
+                       if (nla) {
+                               struct lwtunnel_state *lwtstate;
+                               struct net_device *dev = NULL;
+                               struct nlattr *nla_entype;
+
+                               nla_entype = nla_find(attrs, attrlen,
+                                                     RTA_ENCAP_TYPE);
+                               if (!nla_entype)
+                                       goto err_inval;
+                               if (cfg->fc_oif)
+                                       dev = __dev_get_by_index(net, cfg->fc_oif);
+                               ret = lwtunnel_build_state(dev, nla_get_u16(
+                                                          nla_entype),
+                                                          nla,  AF_INET, cfg,
+                                                          &lwtstate);
+                               if (ret)
+                                       goto errout;
+                               nexthop_nh->nh_lwtstate =
+                                       lwtstate_get(lwtstate);
+                       }
                }
 
                rtnh = rtnh_next(rtnh, &remaining);
        } endfor_nexthops(fi);
 
        return 0;
+
+err_inval:
+       ret = -EINVAL;
+
+errout:
+       return ret;
 }
 
-#endif
+static void fib_rebalance(struct fib_info *fi)
+{
+       int total;
+       int w;
+       struct in_device *in_dev;
+
+       if (fi->fib_nhs < 2)
+               return;
+
+       total = 0;
+       for_nexthops(fi) {
+               if (nh->nh_flags & RTNH_F_DEAD)
+                       continue;
+
+               in_dev = __in_dev_get_rtnl(nh->nh_dev);
+
+               if (in_dev &&
+                   IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+                   nh->nh_flags & RTNH_F_LINKDOWN)
+                       continue;
+
+               total += nh->nh_weight;
+       } endfor_nexthops(fi);
+
+       w = 0;
+       change_nexthops(fi) {
+               int upper_bound;
+
+               in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev);
+
+               if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
+                       upper_bound = -1;
+               } else if (in_dev &&
+                          IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+                          nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
+                       upper_bound = -1;
+               } else {
+                       w += nexthop_nh->nh_weight;
+                       upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+                                                           total) - 1;
+               }
+
+               atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
+       } endfor_nexthops(fi);
+
+       net_get_random_once(&fib_multipath_secret,
+                           sizeof(fib_multipath_secret));
+}
+
+static inline void fib_add_weight(struct fib_info *fi,
+                                 const struct fib_nh *nh)
+{
+       fi->fib_weight += nh->nh_weight;
+}
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+#define fib_rebalance(fi) do { } while (0)
+#define fib_add_weight(fi, nh) do { } while (0)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
+static int fib_encap_match(struct net *net, u16 encap_type,
+                          struct nlattr *encap,
+                          int oif, const struct fib_nh *nh,
+                          const struct fib_config *cfg)
+{
+       struct lwtunnel_state *lwtstate;
+       struct net_device *dev = NULL;
+       int ret, result = 0;
+
+       if (encap_type == LWTUNNEL_ENCAP_NONE)
+               return 0;
+
+       if (oif)
+               dev = __dev_get_by_index(net, oif);
+       ret = lwtunnel_build_state(dev, encap_type, encap,
+                                  AF_INET, cfg, &lwtstate);
+       if (!ret) {
+               result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
+               lwtstate_free(lwtstate);
+       }
+
+       return result;
+}
 
 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 {
+       struct net *net = cfg->fc_nlinfo.nl_net;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
        struct rtnexthop *rtnh;
        int remaining;
@@ -496,6 +629,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
                return 1;
 
        if (cfg->fc_oif || cfg->fc_gw) {
+               if (cfg->fc_encap) {
+                       if (fib_encap_match(net, cfg->fc_encap_type,
+                                           cfg->fc_encap, cfg->fc_oif,
+                                           fi->fib_nh, cfg))
+                           return 1;
+               }
                if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
                    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
                        return 0;
@@ -585,7 +724,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
                        struct fib_nh *nh)
 {
-       int err;
+       int err = 0;
        struct net *net;
        struct net_device *dev;
 
@@ -594,16 +733,20 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
                struct fib_result res;
 
                if (nh->nh_flags & RTNH_F_ONLINK) {
+                       unsigned int addr_type;
 
                        if (cfg->fc_scope >= RT_SCOPE_LINK)
                                return -EINVAL;
-                       if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
-                               return -EINVAL;
                        dev = __dev_get_by_index(net, nh->nh_oif);
                        if (!dev)
                                return -ENODEV;
                        if (!(dev->flags & IFF_UP))
                                return -ENETDOWN;
+                       addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw);
+                       if (addr_type != RTN_UNICAST)
+                               return -EINVAL;
+                       if (!netif_carrier_ok(dev))
+                               nh->nh_flags |= RTNH_F_LINKDOWN;
                        nh->nh_dev = dev;
                        dev_hold(dev);
                        nh->nh_scope = RT_SCOPE_LINK;
@@ -611,6 +754,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
                }
                rcu_read_lock();
                {
+                       struct fib_table *tbl = NULL;
                        struct flowi4 fl4 = {
                                .daddr = nh->nh_gw,
                                .flowi4_scope = cfg->fc_scope + 1,
@@ -621,7 +765,24 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
                        /* It is not necessary, but requires a bit of thinking */
                        if (fl4.flowi4_scope < RT_SCOPE_LINK)
                                fl4.flowi4_scope = RT_SCOPE_LINK;
-                       err = fib_lookup(net, &fl4, &res);
+
+                       if (cfg->fc_table)
+                               tbl = fib_get_table(net, cfg->fc_table);
+
+                       if (tbl)
+                               err = fib_table_lookup(tbl, &fl4, &res,
+                                                      FIB_LOOKUP_IGNORE_LINKSTATE |
+                                                      FIB_LOOKUP_NOREF);
+
+                       /* on error or if no table given do full lookup. This
+                        * is needed for example when nexthops are in the local
+                        * table rather than the given table
+                        */
+                       if (!tbl || err) {
+                               err = fib_lookup(net, &fl4, &res,
+                                                FIB_LOOKUP_IGNORE_LINKSTATE);
+                       }
+
                        if (err) {
                                rcu_read_unlock();
                                return err;
@@ -636,6 +797,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
                if (!dev)
                        goto out;
                dev_hold(dev);
+               if (!netif_carrier_ok(dev))
+                       nh->nh_flags |= RTNH_F_LINKDOWN;
                err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
        } else {
                struct in_device *in_dev;
@@ -654,6 +817,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
                nh->nh_dev = in_dev->dev;
                dev_hold(nh->nh_dev);
                nh->nh_scope = RT_SCOPE_HOST;
+               if (!netif_carrier_ok(nh->nh_dev))
+                       nh->nh_flags |= RTNH_F_LINKDOWN;
                err = 0;
        }
 out:
@@ -713,8 +878,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
                        struct hlist_head *dest;
                        unsigned int new_hash;
 
-                       hlist_del(&fi->fib_hash);
-
                        new_hash = fib_info_hashfn(fi);
                        dest = &new_info_hash[new_hash];
                        hlist_add_head(&fi->fib_hash, dest);
@@ -731,8 +894,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
                        struct hlist_head *ldest;
                        unsigned int new_hash;
 
-                       hlist_del(&fi->fib_lhash);
-
                        new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
                        ldest = &new_laddrhash[new_hash];
                        hlist_add_head(&fi->fib_lhash, ldest);
@@ -757,6 +918,74 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
        return nh->nh_saddr;
 }
 
+static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
+{
+       if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
+           fib_prefsrc != cfg->fc_dst) {
+               u32 tb_id = cfg->fc_table;
+               int rc;
+
+               if (tb_id == RT_TABLE_MAIN)
+                       tb_id = RT_TABLE_LOCAL;
+
+               rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+                                         fib_prefsrc, tb_id);
+
+               if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
+                       rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+                                                 fib_prefsrc, RT_TABLE_LOCAL);
+               }
+
+               if (rc != RTN_LOCAL)
+                       return false;
+       }
+       return true;
+}
+
+static int
+fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
+{
+       bool ecn_ca = false;
+       struct nlattr *nla;
+       int remaining;
+
+       if (!cfg->fc_mx)
+               return 0;
+
+       nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+               int type = nla_type(nla);
+               u32 val;
+
+               if (!type)
+                       continue;
+               if (type > RTAX_MAX)
+                       return -EINVAL;
+
+               if (type == RTAX_CC_ALGO) {
+                       char tmp[TCP_CA_NAME_MAX];
+
+                       nla_strlcpy(tmp, nla, sizeof(tmp));
+                       val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+                       if (val == TCP_CA_UNSPEC)
+                               return -EINVAL;
+               } else {
+                       val = nla_get_u32(nla);
+               }
+               if (type == RTAX_ADVMSS && val > 65535 - 40)
+                       val = 65535 - 40;
+               if (type == RTAX_MTU && val > 65535 - 15)
+                       val = 65535 - 15;
+               if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+                       return -EINVAL;
+               fi->fib_metrics[type - 1] = val;
+       }
+
+       if (ecn_ca)
+               fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+
+       return 0;
+}
+
 struct fib_info *fib_create_info(struct fib_config *cfg)
 {
        int err;
@@ -829,36 +1058,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
                        goto failure;
        } endfor_nexthops(fi)
 
-       if (cfg->fc_mx) {
-               struct nlattr *nla;
-               int remaining;
-
-               nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
-                       int type = nla_type(nla);
-
-                       if (type) {
-                               u32 val;
-
-                               if (type > RTAX_MAX)
-                                       goto err_inval;
-                               if (type == RTAX_CC_ALGO) {
-                                       char tmp[TCP_CA_NAME_MAX];
-
-                                       nla_strlcpy(tmp, nla, sizeof(tmp));
-                                       val = tcp_ca_get_key_by_name(tmp);
-                                       if (val == TCP_CA_UNSPEC)
-                                               goto err_inval;
-                               } else {
-                                       val = nla_get_u32(nla);
-                               }
-                               if (type == RTAX_ADVMSS && val > 65535 - 40)
-                                       val = 65535 - 40;
-                               if (type == RTAX_MTU && val > 65535 - 15)
-                                       val = 65535 - 15;
-                               fi->fib_metrics[type - 1] = val;
-                       }
-               }
-       }
+       err = fib_convert_metrics(fi, cfg);
+       if (err)
+               goto failure;
 
        if (cfg->fc_mp) {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -879,6 +1081,22 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
        } else {
                struct fib_nh *nh = fi->fib_nh;
 
+               if (cfg->fc_encap) {
+                       struct lwtunnel_state *lwtstate;
+                       struct net_device *dev = NULL;
+
+                       if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
+                               goto err_inval;
+                       if (cfg->fc_oif)
+                               dev = __dev_get_by_index(net, cfg->fc_oif);
+                       err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+                                                  cfg->fc_encap, AF_INET, cfg,
+                                                  &lwtstate);
+                       if (err)
+                               goto failure;
+
+                       nh->nh_lwtstate = lwtstate_get(lwtstate);
+               }
                nh->nh_oif = cfg->fc_oif;
                nh->nh_gw = cfg->fc_gw;
                nh->nh_flags = cfg->fc_flags;
@@ -924,24 +1142,29 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
                if (!nh->nh_dev)
                        goto failure;
        } else {
+               int linkdown = 0;
+
                change_nexthops(fi) {
                        err = fib_check_nh(cfg, fi, nexthop_nh);
                        if (err != 0)
                                goto failure;
+                       if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
+                               linkdown++;
                } endfor_nexthops(fi)
+               if (linkdown == fi->fib_nhs)
+                       fi->fib_flags |= RTNH_F_LINKDOWN;
        }
 
-       if (fi->fib_prefsrc) {
-               if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
-                   fi->fib_prefsrc != cfg->fc_dst)
-                       if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
-                               goto err_inval;
-       }
+       if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc))
+               goto err_inval;
 
        change_nexthops(fi) {
                fib_info_update_nh_saddr(net, nexthop_nh);
+               fib_add_weight(fi, nexthop_nh);
        } endfor_nexthops(fi)
 
+       fib_rebalance(fi);
+
 link_it:
        ofi = fib_find_info(fi);
        if (ofi) {
@@ -1027,17 +1250,27 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
            nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
                goto nla_put_failure;
        if (fi->fib_nhs == 1) {
+               struct in_device *in_dev;
+
                if (fi->fib_nh->nh_gw &&
                    nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
                        goto nla_put_failure;
                if (fi->fib_nh->nh_oif &&
                    nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
                        goto nla_put_failure;
+               if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
+                       in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev);
+                       if (in_dev &&
+                           IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+                               rtm->rtm_flags |= RTNH_F_DEAD;
+               }
 #ifdef CONFIG_IP_ROUTE_CLASSID
                if (fi->fib_nh[0].nh_tclassid &&
                    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
                        goto nla_put_failure;
 #endif
+               if (fi->fib_nh->nh_lwtstate)
+                       lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate);
        }
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (fi->fib_nhs > 1) {
@@ -1049,11 +1282,19 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
                        goto nla_put_failure;
 
                for_nexthops(fi) {
+                       struct in_device *in_dev;
+
                        rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
                        if (!rtnh)
                                goto nla_put_failure;
 
                        rtnh->rtnh_flags = nh->nh_flags & 0xFF;
+                       if (nh->nh_flags & RTNH_F_LINKDOWN) {
+                               in_dev = __in_dev_get_rtnl(nh->nh_dev);
+                               if (in_dev &&
+                                   IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+                                       rtnh->rtnh_flags |= RTNH_F_DEAD;
+                       }
                        rtnh->rtnh_hops = nh->nh_weight - 1;
                        rtnh->rtnh_ifindex = nh->nh_oif;
 
@@ -1065,6 +1306,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
                            nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
                                goto nla_put_failure;
 #endif
+                       if (nh->nh_lwtstate)
+                               lwtunnel_fill_encap(skb, nh->nh_lwtstate);
                        /* length of rtnetlink header + attributes */
                        rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
                } endfor_nexthops(fi);
@@ -1107,7 +1350,13 @@ int fib_sync_down_addr(struct net *net, __be32 local)
        return ret;
 }
 
-int fib_sync_down_dev(struct net_device *dev, int force)
+/* Event              force Flags           Description
+ * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
+ * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
+ * NETDEV_DOWN        1     LINKDOWN|DEAD   Last address removed
+ * NETDEV_UNREGISTER  1     LINKDOWN|DEAD   Device removed
+ */
+int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
 {
        int ret = 0;
        int scope = RT_SCOPE_NOWHERE;
@@ -1133,49 +1382,79 @@ int fib_sync_down_dev(struct net_device *dev, int force)
                                dead++;
                        else if (nexthop_nh->nh_dev == dev &&
                                 nexthop_nh->nh_scope != scope) {
-                               nexthop_nh->nh_flags |= RTNH_F_DEAD;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-                               spin_lock_bh(&fib_multipath_lock);
-                               fi->fib_power -= nexthop_nh->nh_power;
-                               nexthop_nh->nh_power = 0;
-                               spin_unlock_bh(&fib_multipath_lock);
-#endif
+                               switch (event) {
+                               case NETDEV_DOWN:
+                               case NETDEV_UNREGISTER:
+                                       nexthop_nh->nh_flags |= RTNH_F_DEAD;
+                                       /* fall through */
+                               case NETDEV_CHANGE:
+                                       nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
+                                       break;
+                               }
                                dead++;
                        }
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-                       if (force > 1 && nexthop_nh->nh_dev == dev) {
+                       if (event == NETDEV_UNREGISTER &&
+                           nexthop_nh->nh_dev == dev) {
                                dead = fi->fib_nhs;
                                break;
                        }
 #endif
                } endfor_nexthops(fi)
                if (dead == fi->fib_nhs) {
-                       fi->fib_flags |= RTNH_F_DEAD;
+                       switch (event) {
+                       case NETDEV_DOWN:
+                       case NETDEV_UNREGISTER:
+                               fi->fib_flags |= RTNH_F_DEAD;
+                               /* fall through */
+                       case NETDEV_CHANGE:
+                               fi->fib_flags |= RTNH_F_LINKDOWN;
+                               break;
+                       }
                        ret++;
                }
+
+               fib_rebalance(fi);
        }
 
        return ret;
 }
 
 /* Must be invoked inside of an RCU protected region.  */
-void fib_select_default(struct fib_result *res)
+void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 {
        struct fib_info *fi = NULL, *last_resort = NULL;
        struct hlist_head *fa_head = res->fa_head;
        struct fib_table *tb = res->table;
+       u8 slen = 32 - res->prefixlen;
        int order = -1, last_idx = -1;
-       struct fib_alias *fa;
+       struct fib_alias *fa, *fa1 = NULL;
+       u32 last_prio = res->fi->fib_priority;
+       u8 last_tos = 0;
 
        hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
                struct fib_info *next_fi = fa->fa_info;
 
+               if (fa->fa_slen != slen)
+                       continue;
+               if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+                       continue;
+               if (fa->tb_id != tb->tb_id)
+                       continue;
+               if (next_fi->fib_priority > last_prio &&
+                   fa->fa_tos == last_tos) {
+                       if (last_tos)
+                               continue;
+                       break;
+               }
+               if (next_fi->fib_flags & RTNH_F_DEAD)
+                       continue;
+               last_tos = fa->fa_tos;
+               last_prio = next_fi->fib_priority;
+
                if (next_fi->fib_scope != res->scope ||
                    fa->fa_type != RTN_UNICAST)
                        continue;
-
-               if (next_fi->fib_priority > res->fi->fib_priority)
-                       break;
                if (!next_fi->fib_nh[0].nh_gw ||
                    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
                        continue;
@@ -1185,10 +1464,11 @@ void fib_select_default(struct fib_result *res)
                if (!fi) {
                        if (next_fi != res->fi)
                                break;
+                       fa1 = fa;
                } else if (!fib_detect_death(fi, order, &last_resort,
-                                            &last_idx, tb->tb_default)) {
+                                            &last_idx, fa1->fa_default)) {
                        fib_result_assign(res, fi);
-                       tb->tb_default = order;
+                       fa1->fa_default = order;
                        goto out;
                }
                fi = next_fi;
@@ -1196,31 +1476,30 @@ void fib_select_default(struct fib_result *res)
        }
 
        if (order <= 0 || !fi) {
-               tb->tb_default = -1;
+               if (fa1)
+                       fa1->fa_default = -1;
                goto out;
        }
 
        if (!fib_detect_death(fi, order, &last_resort, &last_idx,
-                               tb->tb_default)) {
+                             fa1->fa_default)) {
                fib_result_assign(res, fi);
-               tb->tb_default = order;
+               fa1->fa_default = order;
                goto out;
        }
 
        if (last_idx >= 0)
                fib_result_assign(res, last_resort);
-       tb->tb_default = last_idx;
+       fa1->fa_default = last_idx;
 out:
        return;
 }
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
 /*
  * Dead device goes up. We wake up dead nexthops.
  * It takes sense only on multipath routes.
  */
-int fib_sync_up(struct net_device *dev)
+int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
 {
        struct fib_info *prev_fi;
        unsigned int hash;
@@ -1231,6 +1510,13 @@ int fib_sync_up(struct net_device *dev)
        if (!(dev->flags & IFF_UP))
                return 0;
 
+       if (nh_flags & RTNH_F_DEAD) {
+               unsigned int flags = dev_get_flags(dev);
+
+               if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+                       nh_flags |= RTNH_F_LINKDOWN;
+       }
+
        prev_fi = NULL;
        hash = fib_devindex_hashfn(dev->ifindex);
        head = &fib_info_devhash[hash];
@@ -1247,7 +1533,7 @@ int fib_sync_up(struct net_device *dev)
                prev_fi = fi;
                alive = 0;
                change_nexthops(fi) {
-                       if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
+                       if (!(nexthop_nh->nh_flags & nh_flags)) {
                                alive++;
                                continue;
                        }
@@ -1258,71 +1544,57 @@ int fib_sync_up(struct net_device *dev)
                            !__in_dev_get_rtnl(dev))
                                continue;
                        alive++;
-                       spin_lock_bh(&fib_multipath_lock);
-                       nexthop_nh->nh_power = 0;
-                       nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
-                       spin_unlock_bh(&fib_multipath_lock);
+                       nexthop_nh->nh_flags &= ~nh_flags;
                } endfor_nexthops(fi)
 
                if (alive > 0) {
-                       fi->fib_flags &= ~RTNH_F_DEAD;
+                       fi->fib_flags &= ~nh_flags;
                        ret++;
                }
+
+               fib_rebalance(fi);
        }
 
        return ret;
 }
 
-/*
- * The algorithm is suboptimal, but it provides really
- * fair weighted route distribution.
- */
-void fib_select_multipath(struct fib_result *res)
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+void fib_select_multipath(struct fib_result *res, int hash)
 {
        struct fib_info *fi = res->fi;
-       int w;
 
-       spin_lock_bh(&fib_multipath_lock);
-       if (fi->fib_power <= 0) {
-               int power = 0;
-               change_nexthops(fi) {
-                       if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
-                               power += nexthop_nh->nh_weight;
-                               nexthop_nh->nh_power = nexthop_nh->nh_weight;
-                       }
-               } endfor_nexthops(fi);
-               fi->fib_power = power;
-               if (power <= 0) {
-                       spin_unlock_bh(&fib_multipath_lock);
-                       /* Race condition: route has just become dead. */
-                       res->nh_sel = 0;
-                       return;
-               }
-       }
-
-
-       /* w should be random number [0..fi->fib_power-1],
-        * it is pretty bad approximation.
-        */
-
-       w = jiffies % fi->fib_power;
+       for_nexthops(fi) {
+               if (hash > atomic_read(&nh->nh_upper_bound))
+                       continue;
 
-       change_nexthops(fi) {
-               if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
-                   nexthop_nh->nh_power) {
-                       w -= nexthop_nh->nh_power;
-                       if (w <= 0) {
-                               nexthop_nh->nh_power--;
-                               fi->fib_power--;
-                               res->nh_sel = nhsel;
-                               spin_unlock_bh(&fib_multipath_lock);
-                               return;
-                       }
-               }
+               res->nh_sel = nhsel;
+               return;
        } endfor_nexthops(fi);
 
        /* Race condition: route has just become dead. */
        res->nh_sel = 0;
-       spin_unlock_bh(&fib_multipath_lock);
 }
 #endif
+
+void fib_select_path(struct net *net, struct fib_result *res,
+                    struct flowi4 *fl4, int mp_hash)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+       if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
+               if (mp_hash < 0)
+                       mp_hash = get_hash_from_flowi4(fl4) >> 1;
+
+               fib_select_multipath(res, mp_hash);
+       }
+       else
+#endif
+       if (!res->prefixlen &&
+           res->table->tb_num_default > 1 &&
+           res->type == RTN_UNICAST && !fl4->flowi4_oif)
+               fib_select_default(fl4, res);
+
+       if (!fl4->saddr)
+               fl4->saddr = FIB_RES_PREFSRC(net, *res);
+}
+EXPORT_SYMBOL_GPL(fib_select_path);