From: David Ahern <dsah...@gmail.com>

Allow the creation of nexthop groups which reference other nexthop
objects to create multipath routes.

TO-DO: Add mpath support to IPv6

Signed-off-by: David Ahern <dsah...@gmail.com>
---
 include/net/nexthop.h    |  77 +++++--
 net/ipv4/fib_semantics.c |   5 +-
 net/ipv4/nexthop.c       | 511 ++++++++++++++++++++++++++++++++++++++++++-----
 net/ipv4/route.c         |  16 +-
 4 files changed, 540 insertions(+), 69 deletions(-)

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 759bb39e4ea7..654b67192337 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -28,6 +28,23 @@
 
 struct nexthop;
 
+struct nh_grp_entry {
+       struct nexthop   *nh;
+       u32              weight;
+       atomic_t         upper_bound;
+
+       struct list_head nh_list;
+       struct nexthop   *nh_parent;  /* nexthop of group with this entry */
+};
+
+struct nh_group {
+       u16                     num_nh_set;     /* entries with nh != NULL */
+       u16                     num_nh;         /* allocated entries */
+       u8                      mpath:1,
+                               unused:7;
+       struct nh_grp_entry     nh_entries[];
+};
+
 struct nh_info {
        struct hlist_node       dev_hash;
        struct net              *net;
@@ -47,6 +64,7 @@ struct nh_info {
 
 struct nexthop {
        struct rb_node          rb_node;
+       struct list_head        grp_list;  /* nh group entries using this nh */
        struct list_head        fi_list;    /* v4 entries using nh */
        struct list_head        f6i_list;   /* v6 entries using nh */
 
@@ -54,12 +72,15 @@ struct nexthop {
 
        u8                      protocol;
        u8                      nh_flags;
+       u8                      is_group:1,
+                               unused:7;
 
        refcount_t              refcnt;
        struct rcu_head         rcu;
 
        union {
                struct nh_info  __rcu *nh_info;
+               struct nh_group __rcu *nh_grp;
        };
 };
 
@@ -81,6 +102,9 @@ struct nh_config {
                struct in6_addr ipv6;
        } gw;
 
+       struct nlattr   *nh_grp;
+       u16             nh_grp_type;
+
        u32             nlflags;
        struct nl_info  nlinfo;
 };
@@ -88,42 +112,61 @@ struct nh_config {
 void nexthop_get(struct nexthop *nh);
 void nexthop_put(struct nexthop *nh);
 
+static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2)
+{
+       return nh1 == nh2;
+}
+
 /* caller is holding rtnl; no reference taken to nexthop */
 struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
 
-static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2)
+/* called with rcu lock */
+static inline bool nexthop_is_multipath(const struct nexthop *nh)
 {
-       return nh1 == nh2;
+       if (nh->is_group) {
+               struct nh_group *nh_grp;
+
+               nh_grp = rcu_dereference(nh->nh_grp);
+               return !!nh_grp->mpath;
+       }
+       return false;
 }
 
+struct nexthop *nexthop_mpath_select(struct nexthop *nh, int nhsel);
+
+/* called with rcu lock */
 static inline int nexthop_num_path(struct nexthop *nh)
 {
+       if (nexthop_is_multipath(nh)) {
+               struct nh_group *nh_grp;
+
+               nh_grp = rcu_dereference(nh->nh_grp);
+               return nh_grp->num_nh_set;
+       }
+
        return 1;
 }
 
-/* called with rcu lock */
+void nexthop_select_path(struct net *net, struct fib_result *res, int hash);
+
 static inline bool nexthop_has_gw(struct nexthop *nh)
 {
-       struct nh_info *nhi;
-
-       nhi = rcu_dereference(nh->nh_info);
-       return !!nhi->has_gw;
+       return !!nh->nh_info->has_gw;
 }
 
-/* called with rcu lock */
 static inline bool nexthop_is_blackhole(struct nexthop *nh)
 {
-       struct nh_info *nhi;
-
-       nhi = rcu_dereference(nh->nh_info);
-       return !!nhi->reject_nh;
+       return !nexthop_is_multipath(nh) && !!nh->nh_info->reject_nh;
 }
 
 static inline struct fib_nh *nexthop_fib_nh(struct nexthop *nh, int nhsel)
 {
        struct nh_info *nhi;
 
-       nhi = rcu_dereference(nh->nh_info);
+       if (nexthop_is_multipath(nh))
+               nh = nexthop_mpath_select(nh, nhsel);
+
+       nhi = nh->nh_info;
        if (nhi->family == AF_INET ||
            nhi->family == AF_UNSPEC)  /* dev only re-uses IPv4 struct */
                return &nhi->fib_nh;
@@ -164,11 +207,11 @@ static inline __be32 fib_info_nh_gw(struct fib_info *fi)
  */
 static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
 {
-       struct nh_info *nhi;
+       if (nexthop_is_multipath(nh))
+               nh = nexthop_mpath_select(nh, 0);
 
-       nhi = rcu_dereference(nh->nh_info);
-       if (nhi->family == AF_INET6)
-               return &nhi->fib6_nh;
+       if (nh->nh_info->family == AF_INET6)
+               return &nh->nh_info->fib6_nh;
 
        return NULL;
 }
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c91cdafd40ec..0ddf14512bb3 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1821,7 +1821,10 @@ void fib_select_path(struct net *net, struct fib_result *res,
                goto check_saddr;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-       if (res->fi->fib_nhs > 1) {
+       if (res->fi->nh && nexthop_is_multipath(res->fi->nh)) {
+               h = fib_multipath_hash(net, fl4, skb, NULL);
+               nexthop_select_path(net, res, h);
+       } else if (res->fi->fib_nhs > 1) {
                h = fib_multipath_hash(net, fl4, skb, NULL);
                fib_select_multipath(res, h);
        }
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 1e77fa94e562..f0b4151c661a 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -35,6 +35,8 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
        [NHA_TABLE_ID]          = { .type = NLA_U32 },
        [NHA_BLACKHOLE]         = { .type = NLA_FLAG },
        [NHA_MASTER]            = { .type = NLA_U32 },
+       [NHA_GROUP_TYPE]        = { .type = NLA_U16 },
+       [NHA_GROUPS]            = { .type = NLA_FLAG },
 };
 
 static unsigned int nh_dev_hashfn(unsigned int val)
@@ -67,19 +69,35 @@ static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
 static void nexthop_free_rcu(struct rcu_head *head)
 {
        struct nexthop *nh = container_of(head, struct nexthop, rcu);
-       struct nh_info *nhi;
 
-       nhi = rcu_dereference_raw(nh->nh_info);
-       switch (nhi->family) {
-       case AF_INET:
-       case AF_UNSPEC:
-               fib_nh_release(nhi->net, &nhi->fib_nh);
-               break;
-       case AF_INET6:
-               fib6_nh_release(&nhi->fib6_nh);
-               break;
+       if (nh->is_group) {
+               struct nh_group *nh_grp;
+               int i;
+
+               nh_grp = rcu_dereference_raw(nh->nh_grp);
+               for (i = 0; i < nh_grp->num_nh; ++i) {
+                       if (!nh_grp->nh_entries[i].nh)
+                               continue;
+
+                       list_del(&nh_grp->nh_entries[i].nh_list);
+                       nexthop_put(nh_grp->nh_entries[i].nh);
+               }
+               kfree(nh_grp);
+       } else {
+               struct nh_info *nhi;
+
+               nhi = rcu_dereference_raw(nh->nh_info);
+               switch (nhi->family) {
+               case AF_INET:
+               case AF_UNSPEC:
+                       fib_nh_release(nhi->net, &nhi->fib_nh);
+                       break;
+               case AF_INET6:
+                       fib6_nh_release(&nhi->fib6_nh);
+                       break;
+               }
+               kfree(nhi);
        }
-       kfree(nhi);
 
        kfree(nh);
 }
@@ -89,6 +107,33 @@ static struct nexthop *nexthop_alloc(void)
        return kzalloc(sizeof(struct nexthop), GFP_KERNEL);
 }
 
+/* group nexthop: nh_group is a separate, variable-size allocation hung off
+ * nh->nh_grp; returns ERR_PTR(-ENOMEM) on failure, never NULL */
+static struct nexthop *nexthop_grp_alloc(u16 num_nh)
+{
+       size_t sz = sizeof(struct nh_group) +
+                   sizeof(struct nh_grp_entry) * num_nh;
+       struct nh_group *nh_grp;
+       struct nexthop *nh;
+
+       nh = nexthop_alloc();
+       if (!nh)
+               return ERR_PTR(-ENOMEM);
+
+       nh_grp = kzalloc(sz, GFP_KERNEL);
+       if (!nh_grp) {
+               kfree(nh);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       nh->is_group = 1;
+       nh_grp->num_nh = num_nh;
+       nh_grp->num_nh_set = num_nh;
+       rcu_assign_pointer(nh->nh_grp, nh_grp);
+
+       return nh;
+}
+
 static void nh_base_seq_inc(struct net *net)
 {
        while (++net->nexthop.seq == 0)
@@ -173,23 +218,166 @@ static size_t nh_nlmsg_size_ipv4(struct nh_info *nhi)
 
 static size_t nh_nlmsg_size(struct nexthop *nh)
 {
-       struct nh_info *nhi = rtnl_dereference(nh->nh_info);
        size_t sz = nla_total_size(4);    /* NHA_ID */
 
-       /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
-        * are mutually exclusive
-        */
-       sz += nla_total_size(4);  /* NHA_OIF */
+       if (nh->is_group) {
+               struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+               size_t sz2 = sizeof(struct nexthop_grp) * nh_grp->num_nh_set;
 
-       if (nhi->family == AF_INET)
-               sz += nh_nlmsg_size_ipv4(nhi);
+               sz += nla_total_size(sz2)   /* NHA_GROUP payload */
+                     + nla_total_size(2);  /* NHA_GROUP_TYPE */
+       } else {
+               struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 
-       else if (nhi->family == AF_INET6)
-               sz += nh_nlmsg_size_ipv6(nhi);
+               /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
+                * are mutually exclusive
+                */
+               sz += nla_total_size(4);  /* NHA_OIF */
+
+               if (nhi->family == AF_INET)
+                       sz += nh_nlmsg_size_ipv4(nhi);
+               else if (nhi->family == AF_INET6)
+                       sz += nh_nlmsg_size_ipv6(nhi);
+       }
 
        return sz;
 }
 
+static bool valid_group_nh(struct nexthop *nh, struct netlink_ext_ack *extack)
+{
+       if (nh->is_group) {
+               struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+
+               /* nested multipath (group within a group) is not
+                * supported
+                */
+               if (nh_grp->mpath) {
+                       NL_SET_ERR_MSG(extack,
+                                      "Multipath group can not be a nexthop within a group");
+                       return false;
+               }
+       } else {
+               struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+               if (nhi->reject_nh) {
+                       NL_SET_ERR_MSG(extack,
+                                      "Blackhole nexthop can not be used in a group");
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
+                              struct netlink_ext_ack *extack)
+{
+       unsigned int len = nla_len(tb[NHA_GROUP]);
+       struct nexthop_grp *nhg;
+       unsigned int i;
+
+       if (!len || len % sizeof(*nhg)) {  /* non-empty nexthop_grp array */
+               NL_SET_ERR_MSG(extack,
+                              "Invalid length for nexthop group attribute");
+               return -EINVAL;
+       }
+
+       /* convert len to number of nexthop ids */
+       len /= sizeof(*nhg);
+
+       nhg = nla_data(tb[NHA_GROUP]);
+       for (i = 0; i < len; ++i) {
+               struct nexthop *nh;
+
+               nh = nexthop_find_by_id(net, nhg->id);
+               if (!nh) {
+                       NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+                       return -EINVAL;
+               }
+               if (!valid_group_nh(nh, extack))
+                       return -EINVAL;
+
+               nhg += 1;
+       }
+
+       for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
+               if (!tb[i])
+                       continue;
+
+               NL_SET_ERR_MSG(extack,
+                              "No other attributes can be set in nexthop groups");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nh_grp)
+{
+       size_t len = nh_grp->num_nh_set * sizeof(struct nexthop_grp);
+       struct nexthop_grp *p;
+       struct nlattr *nla;
+       u16 group_type = 0;
+       int i;
+
+       if (nh_grp->mpath)
+               group_type = NEXTHOP_GRP_TYPE_MPATH;
+
+       if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
+               goto nla_put_failure;
+
+       nla = nla_reserve(skb, NHA_GROUP, len);
+       if (!nla)
+               goto nla_put_failure;
+
+       p = nla_data(nla);
+       for (i = 0; i < nh_grp->num_nh; ++i) {
+               if (!nh_grp->nh_entries[i].nh)
+                       continue;  /* slot vacated on nexthop removal */
+
+               p->id = nh_grp->nh_entries[i].nh->id;
+               p->weight = nh_grp->nh_entries[i].weight;
+               p += 1;
+       }
+
+       return 0;
+
+nla_put_failure:
+       return -EMSGSIZE;
+}
+
+static void nh_group_rebalance(struct nh_group *nhg)
+{
+       struct nh_grp_entry *nhge;
+       int total = 0;
+       int w = 0;
+       int i;
+
+       for (i = 0; i < nhg->num_nh; ++i) {
+               nhge = &nhg->nh_entries[i];
+
+               if (!nhge->nh)
+                       continue;
+
+               total += nhge->weight;
+       }
+
+       for (i = 0; i < nhg->num_nh; ++i) {
+               int upper_bound;
+
+               nhge = &nhg->nh_entries[i];
+               if (!nhge->nh) {
+                       upper_bound = -1;
+               } else {
+                       w += nhge->weight;
+                       upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+                                                           total) - 1;
+               }
+
+               atomic_set(&nhge->upper_bound, upper_bound);
+       }
+}
+
 static const struct net_device *nh_info_dev(const struct nh_info *nhi)
 {
        switch (nhi->family) {
@@ -219,8 +407,25 @@ bool nexthop_uses_dev(const struct nexthop *nh, const struct net_device *dev)
        const struct nh_info *nhi;
        bool dev_match = false;
 
-       nhi = rcu_dereference(nh->nh_info);
-       dev_match = nh_info_uses_dev(nhi, dev);
+       if (nh->is_group) {
+               const struct nh_group *nh_grp;
+               int i;
+
+               nh_grp = rcu_dereference(nh->nh_grp);
+               for (i = 0; i < nh_grp->num_nh; ++i) {
+                       const struct nh_grp_entry *nhge;
+
+                       nhge = &nh_grp->nh_entries[i];
+                       nhi = rcu_dereference(nhge->nh->nh_info);
+                       dev_match = nh_info_uses_dev(nhi, dev);
+                       if (dev_match)
+                               break;
+               }
+
+       } else {
+               nhi = rcu_dereference(nh->nh_info);
+               dev_match = nh_info_uses_dev(nhi, dev);
+       }
 
        return dev_match;
 }
@@ -249,6 +454,14 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
        if (nla_put_u32(skb, NHA_ID, nh->id))
                goto nla_put_failure;
 
+       if (nh->is_group) {
+               struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+
+               if (nla_put_nh_group(skb, nh_grp))
+                       goto nla_put_failure;
+               goto end;
+       }
+
        nhi = rtnl_dereference(nh->nh_info);
        if (nhi->reject_nh && nla_put_flag(skb, NHA_BLACKHOLE))
                goto nla_put_failure;
@@ -281,6 +494,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
                break;
        }
 
+end:
        nlmsg_end(skb, nlh);
        return 0;
 
@@ -315,6 +529,50 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
                rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
 }
 
+static void remove_nh_grp_entry(struct nh_grp_entry *nhge, bool rebalance)
+{
+       struct nh_group *nh_grp;
+
+       list_del(&nhge->nh_list);
+       nexthop_put(nhge->nh);
+       nhge->nh = NULL;
+
+       nh_grp = rtnl_dereference(nhge->nh_parent->nh_grp);
+       nh_grp->num_nh_set--;
+       if (rebalance)
+               nh_group_rebalance(nh_grp);
+}
+
+static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+                                      bool skip_fib, struct nl_info *nlinfo)
+{
+       struct nh_grp_entry *nhge, *tmp;
+
+       list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) {
+               struct nh_group *nh_grp;
+
+               remove_nh_grp_entry(nhge, true);
+
+               /* if this group has no more entries then remove it */
+               nh_grp = rtnl_dereference(nhge->nh_parent->nh_grp);
+               if (!nh_grp->num_nh_set)
+                       remove_nexthop(net, nhge->nh_parent, skip_fib,
+                                      nlinfo);
+       }
+}
+
+static void remove_nexthop_group(struct nexthop *nh)
+{
+       struct nh_group *nh_grp;
+       int i;
+
+       nh_grp = rtnl_dereference(nh->nh_grp);
+       for (i = 0; i < nh_grp->num_nh; ++i) {
+               if (nh_grp->nh_entries[i].nh)
+                       remove_nh_grp_entry(&nh_grp->nh_entries[i], false);
+       }
+}
+
 static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 {
        struct fib6_info *f6i, *tmp;
@@ -339,13 +597,19 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 static void __remove_nexthop(struct net *net, struct nexthop *nh,
                             bool skip_fib, struct nl_info *nlinfo)
 {
-       const struct net_device *dev;
-       struct nh_info *nhi;
+       if (nh->is_group) {
+               remove_nexthop_group(nh);
+       } else {
+               const struct net_device *dev;
+               struct nh_info *nhi;
 
-       nhi = rtnl_dereference(nh->nh_info);
-       dev = nh_info_dev(nhi);
-       if (dev)
-               hlist_del(&nhi->dev_hash);
+               nhi = rtnl_dereference(nh->nh_info);
+               dev = nh_info_dev(nhi);
+               if (dev)
+                       hlist_del(&nhi->dev_hash);
+
+               remove_nexthop_from_groups(net, nh, skip_fib, nlinfo);
+       }
        if (!skip_fib)
                __remove_nexthop_fib(net, nh);
 }
@@ -362,21 +626,46 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
 
        nexthop_put(nh);
 
-       nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
+       if (nlinfo)
+               nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
 }
 
 static int replace_nexthop(struct net *net, struct nexthop *old,
                           struct nexthop *new, struct netlink_ext_ack *extack)
 {
-       struct nh_info *oldi, *newi;
+       if (old->is_group) {
+               struct nh_group *oldg, *newg;
+               int i;
 
-       oldi = rtnl_dereference(old->nh_info);
-       newi = rtnl_dereference(new->nh_info);
-       rcu_assign_pointer(old->nh_info, newi);
-       rcu_assign_pointer(new->nh_info, oldi);
+               if (!new->is_group) {
+                       NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
+                       return -EINVAL;
+               }
+               oldg = rtnl_dereference(old->nh_grp);
+               newg = rtnl_dereference(new->nh_grp);
+               rcu_assign_pointer(old->nh_grp, newg);
+               rcu_assign_pointer(new->nh_grp, oldg);
+
+               /* update parents - used by nexthop code for cleanup */
+               for (i = 0; i < newg->num_nh; ++i)
+                       newg->nh_entries[i].nh_parent = old;
+               for (i = 0; i < oldg->num_nh; ++i)
+                       oldg->nh_entries[i].nh_parent = new;
+       } else {
+               struct nh_info *oldi, *newi;
 
-       newi->nh_parent = old;
-       oldi->nh_parent = new;
+               if (new->is_group) {
+                       NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
+                       return -EINVAL;
+               }
+               oldi = rtnl_dereference(old->nh_info);
+               newi = rtnl_dereference(new->nh_info);
+               rcu_assign_pointer(old->nh_info, newi);
+               rcu_assign_pointer(new->nh_info, oldi);
+
+               newi->nh_parent = old;
+               oldi->nh_parent = new;
+       }
 
        old->protocol = new->protocol;
        old->nh_flags = new->nh_flags;
@@ -491,10 +780,16 @@ int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg,
                      struct netlink_ext_ack *extack)
 {
        struct nexthop *nh = fi->nh;
-       struct nh_info *nhi;
 
-       nhi = rtnl_dereference(nh->nh_info);
-       if (nhi->family != AF_UNSPEC) {
+       if (nh->is_group) {
+               if (cfg->fc_scope == RT_SCOPE_HOST) {
+                       NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
+                       return -EINVAL;
+               }
+               return 0;
+       }
+
+       if (nh->nh_info->family != AF_UNSPEC) {
                if (nh->nh_flags & RTNH_F_ONLINK &&
                    cfg->fc_scope >= RT_SCOPE_LINK) {
                        NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
@@ -505,6 +800,57 @@ int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg,
        return 0;
 }
 
+void nexthop_select_path(struct net *net, struct fib_result *res, int hash)
+{
+       struct fib_info *fi = res->fi;
+       struct nexthop *nh = fi->nh;
+       struct nh_group *nh_grp;
+       bool first = false;
+       int i;
+
+       WARN_ON(!nh->is_group);
+
+       nh_grp = rcu_dereference(nh->nh_grp);
+       for (i = 0; i < nh_grp->num_nh; ++i) {
+               struct nh_grp_entry *nhge = &nh_grp->nh_entries[i];
+               struct fib_nh *fib_nh;
+
+               if (hash > atomic_read(&nhge->upper_bound))
+                       continue;
+
+               fib_nh = &nhge->nh->nh_info->fib_nh;
+
+               /* nexthops always check if it is good and does
+                * not rely on a sysctl for this behavior
+                */
+               if (fib_good_nh(fib_nh)) {
+                       res->nh = fib_nh;
+                       return;
+               }
+               if (!first) {
+                       res->nh = fib_nh;
+                       first = true;
+               }
+       }
+}
+
+struct nexthop *nexthop_mpath_select(struct nexthop *nh, int nhsel)
+{
+       struct nh_group *nh_grp;
+       int i, j = 0;
+
+       nh_grp = rcu_dereference(nh->nh_grp);
+       for (i = 0; i < nh_grp->num_nh; ++i) {
+               if (nh_grp->nh_entries[i].nh) {
+                       if (nhsel == j)
+                               return nh_grp->nh_entries[i].nh;
+                       ++j;
+               }
+       }
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_mpath_select);
+
 static int nh_check_attr(struct nhmsg *nhm, struct nlattr *tb[],
                         struct net *net, struct netlink_ext_ack *extack)
 {
@@ -557,6 +903,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
        if (tb[NHA_ID])
                cfg->nh_id = nla_get_u32(tb[NHA_ID]);
 
+       if (tb[NHA_GROUP]) {
+               cfg->nh_grp = tb[NHA_GROUP];
+
+               cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
+               if (tb[NHA_GROUP_TYPE])
+                       cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
+
+               if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
+                       NL_SET_ERR_MSG(extack, "Invalid group type");
+                       goto out;
+               }
+       }
+
        if (tb[NHA_OIF]) {
                cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
 
@@ -644,6 +1003,14 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
                goto out;
        }
 
+       if (tb[NHA_GROUP]) {
+               err = nh_check_attr_group(net, tb, extack);
+               if (err)
+                       goto out;
+
+               return 0;
+       }
+
        err = 0;
 out:
        return err;
@@ -791,7 +1158,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
        return err;
 }
 
-static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
+static int nh_create_ipv6(struct net *net, struct nexthop *nh,
                          struct nh_info *nhi, struct nh_config *cfg,
                          struct netlink_ext_ack *extack)
 {
@@ -856,10 +1223,47 @@ static int nh_create_unspec(struct net *net, struct nexthop *nh,
 
 static void nexthop_init_common(struct nexthop *nh)
 {
+       INIT_LIST_HEAD(&nh->grp_list);
        INIT_LIST_HEAD(&nh->fi_list);
        INIT_LIST_HEAD(&nh->f6i_list);
 }
 
+static struct nexthop *nexthop_create_group(struct net *net,
+                                           struct nh_config *cfg)
+{
+       struct nlattr *grps_attr = cfg->nh_grp;
+       struct nexthop_grp *entry = nla_data(grps_attr);
+       struct nh_group *nh_grp;
+       struct nexthop *nh;
+       int i;
+
+       nh = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry));
+       if (IS_ERR(nh))  /* allocator returns ERR_PTR, never NULL */
+               return nh;
+
+       nexthop_init_common(nh);
+
+       nh_grp = rtnl_dereference(nh->nh_grp);
+       for (i = 0; i < nh_grp->num_nh; ++i) {
+               struct nexthop *nhe;
+
+               nhe = nexthop_find_by_id(net, entry[i].id);
+               nexthop_get(nhe);
+
+               nh_grp->nh_entries[i].nh = nhe;
+               nh_grp->nh_entries[i].weight = entry[i].weight ? : 1;
+               list_add(&nh_grp->nh_entries[i].nh_list, &nhe->grp_list);
+               nh_grp->nh_entries[i].nh_parent = nh;
+       }
+
+       if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
+               nh_grp->mpath = 1;
+               nh_group_rebalance(nh_grp);
+       }
+
+       return nh;
+}
+
 static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
                                      struct netlink_ext_ack *extack)
 {
@@ -929,7 +1333,11 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
                }
        }
 
-       nh = nexthop_create(net, cfg, extack);
+       if (cfg->nh_grp)
+               nh = nexthop_create_group(net, cfg);
+       else
+               nh = nexthop_create(net, cfg, extack);
+
        if (IS_ERR(nh))
                return nh;
 
@@ -968,19 +1376,25 @@ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
        return err;
 }
 
-static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
+static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int group_filter,
                             int master_idx, u8 family)
 {
        const struct net_device *dev;
        const struct nh_info *nhi;
 
-       if (dev_idx || master_idx || family)
+       if (group_filter && !nh->is_group)
+               return true;
+
+       if ((dev_idx || master_idx || family) && nh->is_group)
                return true;
 
        nhi = rtnl_dereference(nh->nh_info);
-       if (family && nhi->family != family)
+       if (family && !nh->is_group && nhi->family != family)
                return true;
 
+       if (nh->is_group)
+               return false;
+
        dev = nh_info_dev(nhi);
        if (dev_idx && (!dev || dev->ifindex != dev_idx))
                return true;
@@ -998,7 +1412,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
 /* rtnl */
 static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
 {
-       int dev_filter_idx = 0, master_idx = 0;
+       int group_filter = 0, dev_filter_idx = 0, master_idx = 0;
        struct net *net = sock_net(skb->sk);
        struct rb_root *root = &net->nexthop.root;
        struct nlattr *tb[NHA_MAX + 1];
@@ -1010,6 +1424,9 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
 
        if (nlmsg_parse(cb->nlh, sizeof(*nhm), tb, NHA_MAX,
                        rtm_nh_policy, NULL) >= 0) {
+               if (tb[NHA_GROUPS])
+                       group_filter = 1;
+
                if (tb[NHA_OIF])
                        dev_filter_idx = nla_get_u32(tb[NHA_OIF]);
 
@@ -1027,8 +1444,8 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
                        goto cont;
 
                nh = rb_entry(node, struct nexthop, rb_node);
-               if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
-                                    nhm->nh_family))
+               if (nh_dump_filtered(nh, dev_filter_idx, group_filter,
+                                    master_idx, nhm->nh_family))
                        goto cont;
 
                err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1297c7c934a8..4c16715607e0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,6 +112,7 @@
 #include <net/secure_seq.h>
 #include <net/ip_tunnels.h>
 #include <net/l3mdev.h>
+#include <net/nexthop.h>
 
 #include "fib_lookup.h"
 
@@ -1887,10 +1888,17 @@ static int ip_mkroute_input(struct sk_buff *skb,
                            struct flow_keys *hkeys)
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-       if (res->fi && res->fi->fib_nhs > 1) {
-               int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
-
-               fib_select_multipath(res, h);
+       if (res->fi) {
+               struct net *net = res->fi->fib_net;
+               int h;
+
+               if (res->fi->nh && nexthop_is_multipath(res->fi->nh)) {
+                       h = fib_multipath_hash(net, NULL, skb, hkeys);
+                       nexthop_select_path(net, res, h);
+               } else if (res->fi->fib_nhs > 1) {
+                       h = fib_multipath_hash(net, NULL, skb, hkeys);
+                       fib_select_multipath(res, h);
+               }
        }
 #endif
 
-- 
2.11.0

Reply via email to