When multiple nexthops are available for a given route, the routing engine chooses a nexthop by computing the flow hash through get_hash_from_flowi6 and by taking that value modulo the number of nexthops. The resulting value indexes the nexthop to select. This method causes issues when a new nexthop is added or one is removed (e.g. link failure). In that case, the number of nexthops changes and potentially all the flows get re-routed to another nexthop.
This patch implements a consistent hash method to select the nexthop in case of ECMP. The idea is to generate N random numbers (__u32) for each nexthop, where N is configurable. In order to select a nexthop, we find the number that is directly higher than the flow hash (which is also __u32). The nexthop associated to the number is then selected. The lookup method performs a binary search over the sorted array of numbers, which yields a time complexity of O(log n), where n is the number of nexthops times the number of random values generated for each number. This feature can be enabled through the CONFIG_IPV6_MULTIPATH_CONSISTENT option and the number of random values generated for each nexthop is defined through CONFIG_IPV6_MPCONSIST_BUCKETSIZE. Signed-off-by: David Lebrun <david.leb...@uclouvain.be> --- include/net/ip6_fib.h | 20 ++++ net/ipv6/Kconfig | 26 +++++ net/ipv6/Makefile | 1 + net/ipv6/ip6_ecmp.c | 263 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/ip6_fib.c | 18 ++++ net/ipv6/route.c | 56 +++++++++++ 6 files changed, 384 insertions(+) create mode 100644 net/ipv6/ip6_ecmp.c diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index a74e2aa..e22417d 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -93,6 +93,16 @@ struct rt6key { struct fib6_table; +struct rt6_multi_nh { + __u32 key; + struct rt6_info *nh; +}; + +struct rt6_multi_map { + struct rt6_multi_nh *nhs; + unsigned int size; +}; + struct rt6_info { struct dst_entry dst; @@ -113,6 +123,9 @@ struct rt6_info { */ struct list_head rt6i_siblings; unsigned int rt6i_nsiblings; +#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT + struct rt6_multi_map *rt6i_nh_map; +#endif atomic_t rt6i_ref; @@ -302,4 +315,11 @@ static inline void fib6_rules_cleanup(void) return ; } #endif + +#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT +int fib6_mp_shrink(struct rt6_info *sref, struct rt6_info *rt); +int fib6_mp_extend(struct rt6_info *rt, struct rt6_multi_map *nh_map); +void fib6_mp_free(struct rt6_info *rt); +#endif + #endif diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index ec1267e..ebfae0d 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -324,4 +324,30 @@ config IPV6_SEG6_HMAC If unsure, say N. +config IPV6_MULTIPATH_CONSISTENT + bool "IPv6: enable consistent hashing for ECMP" + depends on IPV6 + ---help--- + Enable consistent hashing for Equal-Cost Multi-Path (ECMP) + route selection. By default, the nexthop is selected by taking + the flow hash modulo the number of nexthops. When a nexthop is + added or removed (e.g. link failure), all flows might change to + a different nexthop as the modulo changes. Enabling this option + allows to ensure that when a nexthop is removed, only the affected + flows are assigned to another nexthop, and they are balanced equally + across the remaining nexthops. When a nexthop is added, only a subset + of the flows are assigned to it. The lookup performance is O(log n) + where n is the number of nexthops times CONFIG_IPV6_MPCONSIST_BUCKETSIZE. + + If unsure, say N. + +config IPV6_MPCONSIST_BUCKETSIZE + int "IPv6: bucket size for ECMP consistent hashing" + default 10 + depends on IPV6_MULTIPATH_CONSISTENT + ---help--- + Define the number of hash entries generated for each ECMP nexthop. + A higher value increases the uniform distribution of flows across + nexthops, but also increases lookup performances logarithmically. + endif # IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index a9e9fec..6481c5d 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -25,6 +25,7 @@ ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o ipv6-$(CONFIG_NETLABEL) += calipso.o ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o +ipv6-$(CONFIG_IPV6_MULTIPATH_CONSISTENT) += ip6_ecmp.o ipv6-objs += $(ipv6-y) diff --git a/net/ipv6/ip6_ecmp.c b/net/ipv6/ip6_ecmp.c new file mode 100644 index 0000000..799a328 --- /dev/null +++ b/net/ipv6/ip6_ecmp.c @@ -0,0 +1,263 @@ +/* + * IPv6 Equal-Cost Multi-Path + * + * Author: + * David Lebrun <david.leb...@uclouvain.be> + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/net.h> +#include <linux/route.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/slab.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <net/lwtunnel.h> + +#include <net/ip6_fib.h> +#include <net/ip6_route.h> + +static int mphash_bucket_size = CONFIG_IPV6_MPCONSIST_BUCKETSIZE; + +void fib6_mp_free(struct rt6_info *rt) +{ + struct rt6_multi_map *nh_map = rt->rt6i_nh_map; + struct rt6_info *sibling; + + if (nh_map) { + list_for_each_entry(sibling, &rt->rt6i_siblings, + rt6i_siblings) { + sibling->rt6i_nh_map = NULL; + } + + rt->rt6i_nh_map = NULL; + + kfree(nh_map->nhs); + kfree(nh_map); + } +} + +static bool fib6_mp_key_exists(struct rt6_multi_nh *nhs, unsigned int size, + __u32 key) +{ + unsigned int i; + + for (i = 0; i < size; i++) { + if (nhs[i].key == key) + return true; + } + + return false; +} + +static void fib6_mp_populate(struct rt6_multi_nh *nhs, unsigned int offset, + unsigned int size, unsigned int fullsize, + struct rt6_info *rt) +{ + unsigned int i; + + for (i = offset; i < offset + size; i++) { + __u32 key; + + do { + get_random_bytes(&key, sizeof(key)); + } while (fib6_mp_key_exists(nhs, fullsize, key)); + + nhs[i].key = key; + nhs[i].nh = rt; + } +} + +static int fib6_mp_update(struct rt6_info *rt, struct rt6_multi_nh *nhs, + unsigned int size, bool check) +{ + struct rt6_multi_map *new_map; + struct rt6_info *sibling; + + new_map = kmalloc(sizeof(*new_map), GFP_ATOMIC); + if (!new_map) + return -ENOMEM; + + new_map->nhs = nhs; + new_map->size = size; + + list_for_each_entry(sibling, &rt->rt6i_siblings, rt6i_siblings) { + if (check) + WARN_ON(sibling->rt6i_nh_map); + sibling->rt6i_nh_map = new_map; + } + + rt->rt6i_nh_map = new_map; + + return 0; +} + +static void ___merge(struct rt6_multi_nh *nhs, struct rt6_multi_nh *helper, + unsigned int low, unsigned int middle, unsigned int high) +{ + unsigned int i, j, k; + + i = low; + j = middle + 1; + k = 0; + + while (i <= middle || j <= high) { + if (i <= middle && j <= high) { + if (nhs[i].key < nhs[j].key) + helper[k++] = nhs[i++]; + else + helper[k++] = nhs[j++]; + } else if (i <= middle) { + helper[k++] = nhs[i++]; + } else { + helper[k++] = nhs[j++]; + } + } + + for (i = 0; i < (high - low + 1); i++) + nhs[low + i] = helper[i]; +} + +static void ___mergesort(struct rt6_multi_nh *nhs, struct rt6_multi_nh *helper, + unsigned int low, unsigned int high) +{ + unsigned int middle; + + if (low > high) + return; + + middle = ((low + high) / 2); + + if (low < high) { + ___mergesort(nhs, helper, low, middle); + ___mergesort(nhs, helper, middle + 1, high); + } + + ___merge(nhs, helper, low, middle, high); +} + +static int fib6_mp_sort(struct rt6_multi_nh *nhs, unsigned int size) +{ + struct rt6_multi_nh *helper; + + helper = kmalloc_array(size, sizeof(*helper), GFP_ATOMIC); + if (!helper) + return -ENOMEM; + + ___mergesort(nhs, helper, 0, size - 1); + kfree(helper); + + return 0; +} + +int fib6_mp_extend(struct rt6_info *rt, struct rt6_multi_map *nh_map) +{ + struct rt6_multi_nh *tmp_nhs; + unsigned int size; + int err; + + if (!nh_map) { + struct rt6_info *sibling; + + size = mphash_bucket_size * 2; + tmp_nhs = kcalloc(size, sizeof(struct rt6_multi_nh), + GFP_ATOMIC); + if (!tmp_nhs) + return -ENOMEM; + + sibling = list_first_entry(&rt->rt6i_siblings, struct rt6_info, + rt6i_siblings); + + fib6_mp_populate(tmp_nhs, 0, mphash_bucket_size, size, sibling); + + fib6_mp_populate(tmp_nhs, mphash_bucket_size, + mphash_bucket_size, size, rt); + + err = fib6_mp_sort(tmp_nhs, size); + if (err) { + kfree(tmp_nhs); + return err; + } + + err = fib6_mp_update(rt, tmp_nhs, size, true); + if (err) { + kfree(tmp_nhs); + return err; + } + + return 0; + } + + size = nh_map->size + mphash_bucket_size; + tmp_nhs = __krealloc(nh_map->nhs, size * sizeof(*tmp_nhs), GFP_ATOMIC); + if (!tmp_nhs) + return -ENOMEM; + + memset(tmp_nhs + nh_map->size, 0, + mphash_bucket_size * sizeof(*tmp_nhs)); + + fib6_mp_populate(tmp_nhs, nh_map->size, mphash_bucket_size, size, rt); + + err = fib6_mp_sort(tmp_nhs, size); + if (err) { + kfree(tmp_nhs); + return err; + } + + err = fib6_mp_update(rt, tmp_nhs, size, false); + if (err) { + kfree(tmp_nhs); + return err; + } + + if (tmp_nhs != nh_map->nhs) + kfree(nh_map->nhs); + kfree(nh_map); + + return 0; +} + +int fib6_mp_shrink(struct rt6_info *sref, struct rt6_info *rt) +{ + struct rt6_multi_map *nh_map = sref->rt6i_nh_map; + struct rt6_multi_nh *tmp_nhs; + unsigned int size, i, j; + int err; + + WARN_ON(!nh_map); + if (!nh_map) + return -ENOENT; + + size = nh_map->size - mphash_bucket_size; + tmp_nhs = kcalloc(size, sizeof(*tmp_nhs), GFP_ATOMIC); + if (!tmp_nhs) + return -ENOMEM; + + for (i = 0, j = 0; i < nh_map->size; i++) { + if (nh_map->nhs[i].nh != rt) + tmp_nhs[j++] = nh_map->nhs[i]; + } + + err = fib6_mp_update(sref, tmp_nhs, size, false); + if (err) { + kfree(tmp_nhs); + return err; + } + + rt->rt6i_nh_map = NULL; + kfree(nh_map->nhs); + kfree(nh_map); + + return 0; +} diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ef54852..ad5f645 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -837,6 +837,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, } sibling = sibling->dst.rt6_next; } +#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT + fib6_mp_extend(rt, sibling->rt6i_nh_map); +#endif /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! @@ -900,6 +903,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->rt6i_nsiblings; +#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT + if (nsiblings) + fib6_mp_free(iter); +#endif fib6_purge_rt(iter, fn, info->nl_net); rt6_release(iter); @@ -1407,6 +1414,11 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, /* Remove this entry from other siblings */ if (rt->rt6i_nsiblings) { +#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT + struct rt6_info *sref = list_first_entry(&rt->rt6i_siblings, + struct rt6_info, + rt6i_siblings); +#endif struct rt6_info *sibling, *next_sibling; list_for_each_entry_safe(sibling, next_sibling, @@ -1414,6 +1426,12 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, sibling->rt6i_nsiblings--; rt->rt6i_nsiblings = 0; list_del_init(&rt->rt6i_siblings); +#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT + if (!sref->rt6i_nsiblings) + fib6_mp_free(sref); + else + fib6_mp_shrink(sref, rt); +#endif } /* Adjust walkers */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b317bb1..109f371 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -427,6 +427,60 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } +#ifdef CONFIG_IPV6_MULTIPATH_CONSISTENT + +static struct rt6_info *rt6_multipath_select(struct rt6_info *match, + struct flowi6 *fl6, int oif, + int strict) +{ + struct rt6_multi_map *nh_map = match->rt6i_nh_map; + __u32 hash = get_hash_from_flowi6(fl6); + unsigned int left, right, idx; + struct rt6_info *res = NULL; + + WARN_ON(!nh_map); + if (!nh_map) + return match; + + if (hash <= nh_map->nhs[0].key || + hash > nh_map->nhs[nh_map->size - 1].key) { + res = nh_map->nhs[0].nh; + goto skip_lookup; + } + + left = 0; + right = nh_map->size - 1; + + do { + struct rt6_multi_nh *nh1, *nh2; + + idx = (left + right) / 2; + nh1 = &nh_map->nhs[idx]; + nh2 = &nh_map->nhs[idx + 1]; + + if (hash < nh1->key && hash < nh2->key) + right = idx; + else if (hash > nh1->key && hash > nh2->key) + left = idx + 1; + else if (hash == nh1->key) + res = nh1->nh; + else + res = nh2->nh; + } while (left != right && !res); + + WARN_ON(!res); + if (!res) + return match; + +skip_lookup: + if (rt6_score_route(res, oif, strict) < 0) + res = match; + + return res; +} + +#else + /* Multipath route selection: * Hash based function using packet header and flowlabel. * Adapted from fib_info_hashfn() @@ -462,6 +516,8 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, return match; } +#endif + /* * Route lookup. Any table->tb6_lock is implied. */ -- 2.7.3