IPv6 returns multipath routes as a series of individual routes, making their display and handling by userspace different from, and more complicated than, IPv4. This puts the burden on the user to recognize that a route is part of a multipath route and to construct the multipath route internally if desired (e.g., libnl does this as of commit 29b71371e764). This patch addresses this difference, allowing multipath routes to be returned using the RTA_MULTIPATH attribute.
The end result is that IPv6 multipath routes can be treated and displayed in a format similar to IPv4: $ ip -6 ro ls vrf red 2001:db8::/120 metric 1024 nexthop via 2001:db8:1::62 dev eth1 weight 1 nexthop via 2001:db8:1::61 dev eth1 weight 1 nexthop via 2001:db8:1::60 dev eth1 weight 1 nexthop via 2001:db8:1::59 dev eth1 weight 1 2001:db8:1::/120 dev eth1 proto kernel metric 256 pref medium ... Suggested-by: Dinesh Dutt <dd...@cumulusnetworks.com> Signed-off-by: David Ahern <d...@cumulusnetworks.com> --- v3 - dropped user API to opt-in to change v2 - changed user api to opt in to new behavior from attribute appended to the request to requiring an rtmsg struct with the RTM_F_ALL_NEXTHOPS set include/net/netlink.h | 1 + net/ipv6/ip6_fib.c | 16 ++++++- net/ipv6/route.c | 127 +++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 126 insertions(+), 18 deletions(-) diff --git a/include/net/netlink.h b/include/net/netlink.h index d3938f11ae52..b239fcd33d80 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -229,6 +229,7 @@ struct nl_info { struct nlmsghdr *nlh; struct net *nl_net; u32 portid; + bool skip_notify; }; int netlink_rcv_skb(struct sk_buff *skb, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bcaf247232d7..2542794b2c64 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -318,6 +318,16 @@ static int fib6_dump_node(struct fib6_walker *w) w->leaf = rt; return 1; } + + /* if multipath routes are dumped in one route with + * the RTA_MULTIPATH attribute, then jump rt to point + * to the last sibling of this route (no need to dump + * the sibling routes again) + */ + if (rt->rt6i_nsiblings) + rt = list_last_entry(&rt->rt6i_siblings, + struct rt6_info, + rt6i_siblings); } w->leaf = NULL; return 0; @@ -871,7 +881,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); + if (!info->skip_notify) + 
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { @@ -897,7 +908,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, rt->rt6i_node = fn; rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); + if (!info->skip_notify) + inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 81e2b2a28806..747f333ae006 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3010,19 +3010,25 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list, static int ip6_route_multipath_add(struct fib6_config *cfg) { + struct rt6_info *rt, *rt_first = NULL; struct fib6_config r_cfg; struct rtnexthop *rtnh; - struct rt6_info *rt; struct rt6_nh *err_nh; struct rt6_nh *nh, *nh_safe; + __u16 nlflags; int remaining; int attrlen; int err = 1; int nhn = 0; + int append = cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_APPEND; int replace = (cfg->fc_nlinfo.nlh && (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); LIST_HEAD(rt6_nh_list); + nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE; + if (append) + nlflags |= NLM_F_APPEND; + remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; @@ -3065,9 +3071,20 @@ static int ip6_route_multipath_add(struct fib6_config *cfg) rtnh = rtnh_next(rtnh, &remaining); } + /* for route append want to send separate notifications on + * each add; for add and replace send one notification with + * all nexthops + */ + if (!append) + cfg->fc_nlinfo.skip_notify = 1; + err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc); + /* save reference to first route for notification */ + if (!rt_first && !err && !append) + rt_first = nh->rt6_info; + /* nh->rt6_info is used or freed at this point, reset to NULL*/ nh->rt6_info = NULL; if (err) { @@ -3089,9 +3106,21 @@ static int ip6_route_multipath_add(struct fib6_config *cfg) nhn++; } + if (rt_first) + inet6_rt_notify(RTM_NEWROUTE, rt_first, &cfg->fc_nlinfo, + nlflags); + goto cleanup; add_errout: + /* send notification for routes that were added so far so + * that the delete notifications sent by ip6_route_del in + * the next block are coherent + */ + if (rt_first) + inet6_rt_notify(RTM_NEWROUTE, rt_first, &cfg->fc_nlinfo, + nlflags); + /* Delete routes that were already added */ list_for_each_entry(nh, &rt6_nh_list, next) { if (err_nh == nh) @@ -3198,6 +3227,60 @@ static inline size_t rt6_nlmsg_size(struct rt6_info *rt) + lwtunnel_get_encap_size(rt->dst.lwtstate); } +static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, + unsigned int *flags) +{ + if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { + *flags |= RTNH_F_LINKDOWN; + if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + *flags |= RTNH_F_DEAD; + } + + if (rt->rt6i_flags & RTF_GATEWAY) { + if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) + goto nla_put_failure; + } + + if (rt->dst.dev && + nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) + goto nla_put_failure; + + if 
(rt->dst.lwtstate && + lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) +{ + struct rtnexthop *rtnh; + unsigned int flags = 0; + + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); + if (!rtnh) + goto nla_put_failure; + + rtnh->rtnh_hops = 0; + rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; + + if (rt6_nexthop_info(skb, rt, &flags) < 0) + goto nla_put_failure; + + rtnh->rtnh_flags = flags; + + /* length of rtnetlink header + attributes */ + rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct rt6_info *rt, struct in6_addr *dst, struct in6_addr *src, @@ -3249,11 +3332,6 @@ static int rt6_fill_node(struct net *net, else rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; - if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { - rtm->rtm_flags |= RTNH_F_LINKDOWN; - if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) - rtm->rtm_flags |= RTNH_F_DEAD; - } rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; if (rt->rt6i_flags & RTF_DYNAMIC) @@ -3317,17 +3395,36 @@ static int rt6_fill_node(struct net *net, if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; - if (rt->rt6i_flags & RTF_GATEWAY) { - if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) - goto nla_put_failure; - } - - if (rt->dst.dev && - nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) - goto nla_put_failure; if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) goto nla_put_failure; + /* For multipath routes, walk the siblings list and add + * each as a nexthop within RTA_MULTIPATH. If this route + * is an append, then only send the route that is added. 
+ */ + if (rt->rt6i_nsiblings && !(flags & NLM_F_APPEND)) { + struct rt6_info *sibling, *next_sibling; + struct nlattr *mp; + + mp = nla_nest_start(skb, RTA_MULTIPATH); + if (!mp) + goto nla_put_failure; + + if (rt6_add_nexthop(skb, rt) < 0) + goto nla_put_failure; + + list_for_each_entry_safe(sibling, next_sibling, + &rt->rt6i_siblings, rt6i_siblings) { + if (rt6_add_nexthop(skb, sibling) < 0) + goto nla_put_failure; + } + + nla_nest_end(skb, mp); + } else { + if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags) < 0) + goto nla_put_failure; + } + expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) @@ -3336,8 +3433,6 @@ static int rt6_fill_node(struct net *net, if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) goto nla_put_failure; - if (lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) - goto nla_put_failure; nlmsg_end(skb, nlh); return 0; -- 2.1.4