From: David Ahern <dsah...@gmail.com>

Allow the creation of nexthop groups, which reference other nexthop objects, in order to create multipath routes.
TO-DO: Add mpath support to IPv6 Signed-off-by: David Ahern <dsah...@gmail.com> --- include/net/nexthop.h | 77 +++++-- net/ipv4/fib_semantics.c | 5 +- net/ipv4/nexthop.c | 511 ++++++++++++++++++++++++++++++++++++++++++----- net/ipv4/route.c | 16 +- 4 files changed, 540 insertions(+), 69 deletions(-) diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 759bb39e4ea7..654b67192337 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -28,6 +28,23 @@ struct nexthop; +struct nh_grp_entry { + struct nexthop *nh; + u32 weight; + atomic_t upper_bound; + + struct list_head nh_list; + struct nexthop *nh_parent; /* nexthop of group with this entry */ +}; + +struct nh_group { + u16 num_nh_set; + u16 num_nh; + u8 mpath:1, + unused:7; + struct nh_grp_entry nh_entries[0]; +}; + struct nh_info { struct hlist_node dev_hash; struct net *net; @@ -47,6 +64,7 @@ struct nh_info { struct nexthop { struct rb_node rb_node; + struct list_head grp_list; /* nh group entries using this nh */ struct list_head fi_list; /* v4 entries using nh */ struct list_head f6i_list; /* v6 entries using nh */ @@ -54,12 +72,15 @@ struct nexthop { u8 protocol; u8 nh_flags; + u8 is_group:1, + unused:7; refcount_t refcnt; struct rcu_head rcu; union { struct nh_info __rcu *nh_info; + struct nh_group __rcu *nh_grp; }; }; @@ -81,6 +102,9 @@ struct nh_config { struct in6_addr ipv6; } gw; + struct nlattr *nh_grp; + u16 nh_grp_type; + u32 nlflags; struct nl_info nlinfo; }; @@ -88,42 +112,61 @@ struct nh_config { void nexthop_get(struct nexthop *nh); void nexthop_put(struct nexthop *nh); +static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2) +{ + return nh1 == nh2; +} + /* caller is holding rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); -static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2) +/* called with rcu lock */ +static inline bool nexthop_is_multipath(const struct nexthop *nh) { - return nh1 == nh2; 
+ if (nh->is_group) { + struct nh_group *nh_grp; + + nh_grp = rcu_dereference(nh->nh_grp); + return !!nh_grp->mpath; + } + return false; } +struct nexthop *nexthop_mpath_select(struct nexthop *nh, int nhsel); + +/* called with rcu lock */ static inline int nexthop_num_path(struct nexthop *nh) { + if (nexthop_is_multipath(nh)) { + struct nh_group *nh_grp; + + nh_grp = rcu_dereference(nh->nh_grp); + return nh_grp->num_nh_set; + } + return 1; } -/* called with rcu lock */ +void nexthop_select_path(struct net *net, struct fib_result *res, int hash); + static inline bool nexthop_has_gw(struct nexthop *nh) { - struct nh_info *nhi; - - nhi = rcu_dereference(nh->nh_info); - return !!nhi->has_gw; + return !!nh->nh_info->has_gw; } -/* called with rcu lock */ static inline bool nexthop_is_blackhole(struct nexthop *nh) { - struct nh_info *nhi; - - nhi = rcu_dereference(nh->nh_info); - return !!nhi->reject_nh; + return !nexthop_is_multipath(nh) && !!nh->nh_info->reject_nh; } static inline struct fib_nh *nexthop_fib_nh(struct nexthop *nh, int nhsel) { struct nh_info *nhi; - nhi = rcu_dereference(nh->nh_info); + if (nexthop_is_multipath(nh)) + nh = nexthop_mpath_select(nh, nhsel); + + nhi = nh->nh_info; if (nhi->family == AF_INET || nhi->family == AF_UNSPEC) /* dev only re-uses IPv4 struct */ return &nhi->fib_nh; @@ -164,11 +207,11 @@ static inline __be32 fib_info_nh_gw(struct fib_info *fi) */ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh) { - struct nh_info *nhi; + if (nexthop_is_multipath(nh)) + nh = nexthop_mpath_select(nh, 0); - nhi = rcu_dereference(nh->nh_info); - if (nhi->family == AF_INET6) - return &nhi->fib6_nh; + if (nh->nh_info->family == AF_INET6) + return &nh->nh_info->fib6_nh; return NULL; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c91cdafd40ec..0ddf14512bb3 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1821,7 +1821,10 @@ void fib_select_path(struct net *net, struct fib_result *res, goto 
check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi->fib_nhs > 1) { + if (res->fi->nh && nexthop_is_multipath(res->fi->nh)) { + h = fib_multipath_hash(net, fl4, skb, NULL); + nexthop_select_path(net, res, h); + } else if (res->fi->fib_nhs > 1) { h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h); } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 1e77fa94e562..f0b4151c661a 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -35,6 +35,8 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { [NHA_TABLE_ID] = { .type = NLA_U32 }, [NHA_BLACKHOLE] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, + [NHA_GROUP_TYPE] = { .type = NLA_U16 }, + [NHA_GROUPS] = { .type = NLA_FLAG }, }; static unsigned int nh_dev_hashfn(unsigned int val) @@ -67,19 +69,35 @@ static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) static void nexthop_free_rcu(struct rcu_head *head) { struct nexthop *nh = container_of(head, struct nexthop, rcu); - struct nh_info *nhi; - nhi = rcu_dereference_raw(nh->nh_info); - switch (nhi->family) { - case AF_INET: - case AF_UNSPEC: - fib_nh_release(nhi->net, &nhi->fib_nh); - break; - case AF_INET6: - fib6_nh_release(&nhi->fib6_nh); - break; + if (nh->is_group) { + struct nh_group *nh_grp; + int i; + + nh_grp = rcu_dereference_raw(nh->nh_grp); + for (i = 0; i < nh_grp->num_nh; ++i) { + if (!nh_grp->nh_entries[i].nh) + continue; + + list_del(&nh_grp->nh_entries[i].nh_list); + nexthop_put(nh_grp->nh_entries[i].nh); + } + kfree(nh_grp); + } else { + struct nh_info *nhi; + + nhi = rcu_dereference_raw(nh->nh_info); + switch (nhi->family) { + case AF_INET: + case AF_UNSPEC: + fib_nh_release(nhi->net, &nhi->fib_nh); + break; + case AF_INET6: + fib6_nh_release(&nhi->fib6_nh); + break; + } + kfree(nhi); } - kfree(nhi); kfree(nh); } @@ -89,6 +107,33 @@ static struct nexthop *nexthop_alloc(void) return kzalloc(sizeof(struct nexthop), GFP_KERNEL); } +/* nexthop for group has variable size and 
may not use the kmem_cache */ +static struct nexthop *nexthop_grp_alloc(u16 num_nh) +{ + size_t sz = offsetof(struct nexthop, nh_grp) + + sizeof(struct nh_group) + + sizeof(struct nh_grp_entry) * num_nh; + struct nh_group *nh_grp; + struct nexthop *nh; + + nh = nexthop_alloc(); + if (!nh) + return ERR_PTR(-ENOMEM); + + nh_grp = kzalloc(sz, GFP_KERNEL); + if (!nh_grp) { + kfree(nh); + return ERR_PTR(-ENOMEM); + } + + nh->is_group = 1; + nh_grp->num_nh = num_nh; + nh_grp->num_nh_set = num_nh; + rcu_assign_pointer(nh->nh_grp, nh_grp); + + return nh; +} + static void nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) @@ -173,23 +218,166 @@ static size_t nh_nlmsg_size_ipv4(struct nh_info *nhi) static size_t nh_nlmsg_size(struct nexthop *nh) { - struct nh_info *nhi = rtnl_dereference(nh->nh_info); size_t sz = nla_total_size(4); /* NHA_ID */ - /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE - * are mutually exclusive - */ - sz += nla_total_size(4); /* NHA_OIF */ + if (nh->is_group) { + struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp); + size_t sz2 = sizeof(struct nh_group) * nh_grp->num_nh_set; - if (nhi->family == AF_INET) - sz += nh_nlmsg_size_ipv4(nhi); + sz += nla_total_size(sz2) + + nla_total_size(2); /* NHA_GROUP_TYPE */ + } else { + struct nh_info *nhi = rtnl_dereference(nh->nh_info); - else if (nhi->family == AF_INET6) - sz += nh_nlmsg_size_ipv6(nhi); + /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE + * are mutually exclusive + */ + sz += nla_total_size(4); /* NHA_OIF */ + + if (nhi->family == AF_INET) + sz += nh_nlmsg_size_ipv4(nhi); + else if (nhi->family == AF_INET6) + sz += nh_nlmsg_size_ipv6(nhi); + } return sz; } +static bool valid_group_nh(struct nexthop *nh, struct netlink_ext_ack *extack) +{ + if (nh->is_group) { + struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp); + + /* nested multipath (group within a group) is not + * supported + */ + if (nh_grp->mpath) { + NL_SET_ERR_MSG(extack, + "Multipath group can not be a 
nexthop within a group"); + return false; + } + } else { + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + + if (nhi->reject_nh) { + NL_SET_ERR_MSG(extack, + "Blackhole nexthop can not be used in a group"); + return false; + } + } + + return true; +} + +static int nh_check_attr_group(struct net *net, struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + unsigned int len = nla_len(tb[NHA_GROUP]); + struct nexthop_grp *nhg; + int i; + + if (len & (sizeof(struct nh_group) - 1)) { + NL_SET_ERR_MSG(extack, + "Invalid length for nexthop group attribute"); + return -EINVAL; + } + + /* convert len to number of nexthop ids */ + len /= sizeof(*nhg); + + nhg = nla_data(tb[NHA_GROUP]); + for (i = 0; i < len; ++i) { + struct nexthop *nh; + + nh = nexthop_find_by_id(net, nhg->id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + return -EINVAL; + } + if (!valid_group_nh(nh, extack)) + return -EINVAL; + + nhg += 1; + } + + for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) { + if (!tb[i]) + continue; + + NL_SET_ERR_MSG(extack, + "No other attributes can be set in nexthop groups"); + return -EINVAL; + } + + return 0; +} + +static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nh_grp) +{ + size_t len = nh_grp->num_nh_set * sizeof(struct nh_group); + struct nexthop_grp *p; + struct nlattr *nla; + u16 group_type = 0; + int i; + + if (nh_grp->mpath) + group_type = NEXTHOP_GRP_TYPE_MPATH; + + if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) + goto nla_put_failure; + + nla = nla_reserve(skb, NHA_GROUP, len); + if (!nla) + goto nla_put_failure; + + p = nla_data(nla); + for (i = 0; i < nh_grp->num_nh; ++i) { + if (!nh_grp->nh_entries[i].nh) + continue; + + p->id = nh_grp->nh_entries[i].nh->id; + p->weight = nh_grp->nh_entries[i].weight; + p += 1; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static void nh_group_rebalance(struct nh_group *nhg) +{ + struct nh_grp_entry *nhge; + int total = 0; + int w = 0; + int i; + + for (i = 0; i < 
nhg->num_nh; ++i) { + nhge = &nhg->nh_entries[i]; + + if (!nhge->nh) + continue; + + total += nhge->weight; + } + + for (i = 0; i < nhg->num_nh; ++i) { + int upper_bound; + + nhge = &nhg->nh_entries[i]; + if (!nhge->nh) { + upper_bound = -1; + } else { + w += nhge->weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, + total) - 1; + } + + atomic_set(&nhge->upper_bound, upper_bound); + } +} + static const struct net_device *nh_info_dev(const struct nh_info *nhi) { switch (nhi->family) { @@ -219,8 +407,25 @@ bool nexthop_uses_dev(const struct nexthop *nh, const struct net_device *dev) const struct nh_info *nhi; bool dev_match = false; - nhi = rcu_dereference(nh->nh_info); - dev_match = nh_info_uses_dev(nhi, dev); + if (nh->is_group) { + const struct nh_group *nh_grp; + int i; + + nh_grp = rcu_dereference(nh->nh_grp); + for (i = 0; i < nh_grp->num_nh; ++i) { + const struct nh_grp_entry *nhge; + + nhge = &nh_grp->nh_entries[i]; + nhi = rcu_dereference(nhge->nh->nh_info); + dev_match = nh_info_uses_dev(nhi, dev); + if (dev_match) + break; + } + + } else { + nhi = rcu_dereference(nh->nh_info); + dev_match = nh_info_uses_dev(nhi, dev); + } return dev_match; } @@ -249,6 +454,14 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; + if (nh->is_group) { + struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp); + + if (nla_put_nh_group(skb, nh_grp)) + goto nla_put_failure; + goto end; + } + nhi = rtnl_dereference(nh->nh_info); if (nhi->reject_nh && nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; @@ -281,6 +494,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, break; } +end: nlmsg_end(skb, nlh); return 0; @@ -315,6 +529,50 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); } +static void remove_nh_grp_entry(struct nh_grp_entry *nhge, bool rebalance) +{ + struct nh_group *nh_grp; + + 
list_del(&nhge->nh_list); + nexthop_put(nhge->nh); + nhge->nh = NULL; + + nh_grp = rtnl_dereference(nhge->nh_parent->nh_grp); + nh_grp->num_nh_set--; + if (rebalance) + nh_group_rebalance(nh_grp); +} + +static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, + bool skip_fib, struct nl_info *nlinfo) +{ + struct nh_grp_entry *nhge, *tmp; + + list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) { + struct nh_group *nh_grp; + + remove_nh_grp_entry(nhge, true); + + /* if this group has no more entries then remove it */ + nh_grp = rtnl_dereference(nhge->nh_parent->nh_grp); + if (!nh_grp->num_nh_set) + remove_nexthop(net, nhge->nh_parent, skip_fib, + nlinfo); + } +} + +static void remove_nexthop_group(struct nexthop *nh) +{ + struct nh_group *nh_grp; + int i; + + nh_grp = rtnl_dereference(nh->nh_grp); + for (i = 0; i < nh_grp->num_nh; ++i) { + if (nh_grp->nh_entries[i].nh) + remove_nh_grp_entry(&nh_grp->nh_entries[i], false); + } +} + static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { struct fib6_info *f6i, *tmp; @@ -339,13 +597,19 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) static void __remove_nexthop(struct net *net, struct nexthop *nh, bool skip_fib, struct nl_info *nlinfo) { - const struct net_device *dev; - struct nh_info *nhi; + if (nh->is_group) { + remove_nexthop_group(nh); + } else { + const struct net_device *dev; + struct nh_info *nhi; - nhi = rtnl_dereference(nh->nh_info); - dev = nh_info_dev(nhi); - if (dev) - hlist_del(&nhi->dev_hash); + nhi = rtnl_dereference(nh->nh_info); + dev = nh_info_dev(nhi); + if (dev) + hlist_del(&nhi->dev_hash); + + remove_nexthop_from_groups(net, nh, skip_fib, nlinfo); + } if (!skip_fib) __remove_nexthop_fib(net, nh); } @@ -362,21 +626,46 @@ static void remove_nexthop(struct net *net, struct nexthop *nh, nexthop_put(nh); - nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); + if (nlinfo) + nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); } static int 
replace_nexthop(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { - struct nh_info *oldi, *newi; + if (old->is_group) { + struct nh_group *oldg, *newg; + int i; - oldi = rtnl_dereference(old->nh_info); - newi = rtnl_dereference(new->nh_info); - rcu_assign_pointer(old->nh_info, newi); - rcu_assign_pointer(new->nh_info, oldi); + if (!new->is_group) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); + return -EINVAL; + } + oldg = rtnl_dereference(old->nh_grp); + newg = rtnl_dereference(new->nh_grp); + rcu_assign_pointer(old->nh_grp, newg); + rcu_assign_pointer(new->nh_grp, oldg); + + /* update parents - used by nexthop code for cleanup */ + for (i = 0; i < newg->num_nh; ++i) + newg->nh_entries[i].nh_parent = old; + for (i = 0; i < oldg->num_nh; ++i) + oldg->nh_entries[i].nh_parent = new; + } else { + struct nh_info *oldi, *newi; - newi->nh_parent = old; - oldi->nh_parent = new; + if (new->is_group) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); + return -EINVAL; + } + oldi = rtnl_dereference(old->nh_info); + newi = rtnl_dereference(new->nh_info); + rcu_assign_pointer(old->nh_info, newi); + rcu_assign_pointer(new->nh_info, oldi); + + newi->nh_parent = old; + oldi->nh_parent = new; + } old->protocol = new->protocol; old->nh_flags = new->nh_flags; @@ -491,10 +780,16 @@ int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct nexthop *nh = fi->nh; - struct nh_info *nhi; - nhi = rtnl_dereference(nh->nh_info); - if (nhi->family != AF_UNSPEC) { + if (nh->is_group) { + if (cfg->fc_scope == RT_SCOPE_HOST) { + NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); + return -EINVAL; + } + return 0; + } + + if (nh->nh_info->family != AF_UNSPEC) { if (nh->nh_flags & RTNH_F_ONLINK && cfg->fc_scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); @@ -505,6 +800,57 @@ int 
fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg, return 0; } +void nexthop_select_path(struct net *net, struct fib_result *res, int hash) +{ + struct fib_info *fi = res->fi; + struct nexthop *nh = fi->nh; + struct nh_group *nh_grp; + bool first = false; + int i; + + WARN_ON(!nh->is_group); + + nh_grp = rcu_dereference(nh->nh_grp); + for (i = 0; i < nh_grp->num_nh; ++i) { + struct nh_grp_entry *nhge = &nh_grp->nh_entries[i]; + struct fib_nh *fib_nh; + + if (hash > atomic_read(&nhge->upper_bound)) + continue; + + fib_nh = &nhge->nh->nh_info->fib_nh; + + /* nexthops always check if it is good and does + * not rely on a sysctl for this behavior + */ + if (fib_good_nh(fib_nh)) { + res->nh = fib_nh; + return; + } + if (!first) { + res->nh = fib_nh; + first = true; + } + } +} + +struct nexthop *nexthop_mpath_select(struct nexthop *nh, int nhsel) +{ + struct nh_group *nh_grp; + int i, j = 0; + + nh_grp = rcu_dereference(nh->nh_grp); + for (i = 0; i < nh_grp->num_nh; ++i) { + if (nh_grp->nh_entries[i].nh) { + if (nhsel == j) + return nh_grp->nh_entries[i].nh; + ++j; + } + } + return NULL; +} +EXPORT_SYMBOL_GPL(nexthop_mpath_select); + static int nh_check_attr(struct nhmsg *nhm, struct nlattr *tb[], struct net *net, struct netlink_ext_ack *extack) { @@ -557,6 +903,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); + if (tb[NHA_GROUP]) { + cfg->nh_grp = tb[NHA_GROUP]; + + cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; + if (tb[NHA_GROUP_TYPE]) + cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); + + if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { + NL_SET_ERR_MSG(extack, "Invalid group type"); + goto out; + } + } + if (tb[NHA_OIF]) { cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); @@ -644,6 +1003,14 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, goto out; } + if (tb[NHA_GROUP]) { + err = nh_check_attr_group(net, tb, extack); + if (err) + goto out; + + return 0; + } + err = 
0; out: return err; @@ -791,7 +1158,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, return err; } -static int nh_create_ipv6(struct net *net, struct nexthop *nh, +static int nh_create_ipv6(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { @@ -856,10 +1223,47 @@ static int nh_create_unspec(struct net *net, struct nexthop *nh, static void nexthop_init_common(struct nexthop *nh) { + INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); } +static struct nexthop *nexthop_create_group(struct net *net, + struct nh_config *cfg) +{ + struct nlattr *grps_attr = cfg->nh_grp; + struct nexthop_grp *entry = nla_data(grps_attr); + struct nh_group *nh_grp; + struct nexthop *nh; + int i; + + nh = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry)); + if (!nh) + return ERR_PTR(-ENOMEM); + + nexthop_init_common(nh); + + nh_grp = rtnl_dereference(nh->nh_grp); + for (i = 0; i < nh_grp->num_nh; ++i) { + struct nexthop *nhe; + + nhe = nexthop_find_by_id(net, entry[i].id); + nexthop_get(nhe); + + nh_grp->nh_entries[i].nh = nhe; + nh_grp->nh_entries[i].weight = entry[i].weight ? 
: 1; + list_add(&nh_grp->nh_entries[i].nh_list, &nhe->grp_list); + nh_grp->nh_entries[i].nh_parent = nh; + } + + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { + nh_grp->mpath = 1; + nh_group_rebalance(nh_grp); + } + + return nh; +} + static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { @@ -929,7 +1333,11 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, } } - nh = nexthop_create(net, cfg, extack); + if (cfg->nh_grp) + nh = nexthop_create_group(net, cfg); + else + nh = nexthop_create(net, cfg, extack); + if (IS_ERR(nh)) return nh; @@ -968,19 +1376,25 @@ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } -static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, +static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int group_filter, int master_idx, u8 family) { const struct net_device *dev; const struct nh_info *nhi; - if (dev_idx || master_idx || family) + if (group_filter && !nh->is_group) + return true; + + if ((dev_idx || master_idx || family) && nh->is_group) return true; nhi = rtnl_dereference(nh->nh_info); - if (family && nhi->family != family) + if (family && !nh->is_group && nhi->family != family) return true; + if (nh->is_group) + return false; + dev = nh_info_dev(nhi); if (dev_idx && (!dev || dev->ifindex != dev_idx)) return true; @@ -998,7 +1412,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { - int dev_filter_idx = 0, master_idx = 0; + int group_filter = 0, dev_filter_idx = 0, master_idx = 0; struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.root; struct nlattr *tb[NHA_MAX + 1]; @@ -1010,6 +1424,9 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_parse(cb->nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, NULL) >= 0) { + if (tb[NHA_GROUPS]) + 
group_filter = 1; + if (tb[NHA_OIF]) dev_filter_idx = nla_get_u32(tb[NHA_OIF]); @@ -1027,8 +1444,8 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) goto cont; nh = rb_entry(node, struct nexthop, rb_node); - if (nh_dump_filtered(nh, dev_filter_idx, master_idx, - nhm->nh_family)) + if (nh_dump_filtered(nh, dev_filter_idx, group_filter, + master_idx, nhm->nh_family)) goto cont; err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1297c7c934a8..4c16715607e0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -112,6 +112,7 @@ #include <net/secure_seq.h> #include <net/ip_tunnels.h> #include <net/l3mdev.h> +#include <net/nexthop.h> #include "fib_lookup.h" @@ -1887,10 +1888,17 @@ static int ip_mkroute_input(struct sk_buff *skb, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1) { - int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); - - fib_select_multipath(res, h); + if (res->fi) { + struct net *net = res->fi->fib_net; + int h; + + if (res->fi->nh && nexthop_is_multipath(res->fi->nh)) { + h = fib_multipath_hash(net, NULL, skb, hkeys); + nexthop_select_path(net, res, h); + } else if (res->fi->fib_nhs > 1) { + h = fib_multipath_hash(net, NULL, skb, hkeys); + fib_select_multipath(res, h); + } } #endif -- 2.11.0