Introduce a new Netlink attribute, RTA_TUNNEL, which allows routes to set tunnel transmit metadata and to specify the tunnel endpoint or tunnel ID on a per-route basis. The route must point to a tunnel device that understands per-skb tunnel metadata and has been put into the respective mode.
Signed-off-by: Thomas Graf <tg...@suug.ch> --- include/net/ip_fib.h | 3 +++ include/net/ip_tunnels.h | 1 - include/net/route.h | 10 ++++++++ include/uapi/linux/rtnetlink.h | 16 ++++++++++++ net/ipv4/fib_frontend.c | 57 ++++++++++++++++++++++++++++++++++++++++++ net/ipv4/fib_semantics.c | 45 +++++++++++++++++++++++++++++++++ net/ipv4/route.c | 30 +++++++++++++++++++++- net/openvswitch/vport.h | 1 + 8 files changed, 161 insertions(+), 2 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 54271ed..1cd7cf8 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -22,6 +22,7 @@ #include <net/fib_rules.h> #include <net/inetpeer.h> #include <linux/percpu.h> +#include <net/ip_tunnels.h> struct fib_config { u8 fc_dst_len; @@ -44,6 +45,7 @@ struct fib_config { u32 fc_flow; u32 fc_nlflags; struct nl_info fc_nlinfo; + struct ip_tunnel_info fc_tunnel; }; struct fib_info; @@ -117,6 +119,7 @@ struct fib_info { #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_power; #endif + struct ip_tunnel_info *fib_tunnel; struct rcu_head rcu; struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].nh_dev diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index df8cfd3..b4ab930 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -9,7 +9,6 @@ #include <net/dsfield.h> #include <net/gro_cells.h> #include <net/inet_ecn.h> -#include <net/ip.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/flow.h> diff --git a/include/net/route.h b/include/net/route.h index 6ede321..dbda603 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -28,6 +28,7 @@ #include <net/inetpeer.h> #include <net/flow.h> #include <net/inet_sock.h> +#include <net/ip_tunnels.h> #include <linux/in_route.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> @@ -66,6 +67,7 @@ struct rtable { struct list_head rt_uncached; struct uncached_list *rt_uncached_list; + struct ip_tunnel_info *rt_tun_info; }; static inline bool rt_is_input_route(const 
struct rtable *rt) @@ -198,6 +200,8 @@ struct in_ifaddr; void fib_add_ifaddr(struct in_ifaddr *); void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *); +int fib_dump_tun_info(struct sk_buff *skb, struct ip_tunnel_info *tun_info); + static inline void ip_rt_put(struct rtable *rt) { /* dst_release() accepts a NULL parameter. @@ -317,9 +321,15 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) { + struct rtable *rt; + if (skb_shinfo(skb)->tun_info) return skb_shinfo(skb)->tun_info; + rt = skb_rtable(skb); + if (rt) + return rt->rt_tun_info; + return NULL; } diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 17fb02f..1f7aa68 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -286,6 +286,21 @@ enum rt_class_t { /* Routing message attributes */ +enum rta_tunnel_t { + RTA_TUN_UNSPEC, + RTA_TUN_ID, + RTA_TUN_DST, + RTA_TUN_SRC, + RTA_TUN_TTL, + RTA_TUN_TOS, + RTA_TUN_SPORT, + RTA_TUN_DPORT, + RTA_TUN_FLAGS, + __RTA_TUN_MAX, +}; + +#define RTA_TUN_MAX (__RTA_TUN_MAX - 1) + enum rtattr_type_t { RTA_UNSPEC, RTA_DST, @@ -308,6 +323,7 @@ enum rtattr_type_t { RTA_VIA, RTA_NEWDST, RTA_PREF, + RTA_TUNNEL, /* destination VTEP */ __RTA_MAX }; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 872494e..bfa77a6 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -580,6 +580,57 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) return -EINVAL; } +static const struct nla_policy tunnel_policy[RTA_TUN_MAX + 1] = { + [RTA_TUN_ID] = { .type = NLA_U64 }, + [RTA_TUN_DST] = { .type = NLA_U32 }, + [RTA_TUN_SRC] = { .type = NLA_U32 }, + [RTA_TUN_TTL] = { .type = NLA_U8 }, + [RTA_TUN_TOS] = { .type = NLA_U8 }, + [RTA_TUN_SPORT] = { .type = NLA_U16 }, + [RTA_TUN_DPORT] = { .type = NLA_U16 }, + [RTA_TUN_FLAGS] = { .type = NLA_U16 }, +}; + +static int parse_rta_tunnel(struct 
fib_config *cfg, struct nlattr *attr) +{ + struct nlattr *tb[RTA_TUN_MAX+1]; + int err; + + err = nla_parse_nested(tb, RTA_TUN_MAX, attr, tunnel_policy); + if (err < 0) + return err; + + if (tb[RTA_TUN_ID]) + cfg->fc_tunnel.key.tun_id = nla_get_u64(tb[RTA_TUN_ID]); + + if (tb[RTA_TUN_DST]) + cfg->fc_tunnel.key.ipv4_dst = nla_get_be32(tb[RTA_TUN_DST]); + + if (tb[RTA_TUN_SRC]) + cfg->fc_tunnel.key.ipv4_src = nla_get_be32(tb[RTA_TUN_SRC]); + + if (tb[RTA_TUN_TTL]) + cfg->fc_tunnel.key.ipv4_ttl = nla_get_u8(tb[RTA_TUN_TTL]); + + if (tb[RTA_TUN_TOS]) + cfg->fc_tunnel.key.ipv4_tos = nla_get_u8(tb[RTA_TUN_TOS]); + + if (tb[RTA_TUN_SPORT]) + cfg->fc_tunnel.key.tp_src = nla_get_be16(tb[RTA_TUN_SPORT]); + + if (tb[RTA_TUN_DPORT]) + cfg->fc_tunnel.key.tp_dst = nla_get_be16(tb[RTA_TUN_DPORT]); + + if (tb[RTA_TUN_FLAGS]) + cfg->fc_tunnel.key.tun_flags = nla_get_u16(tb[RTA_TUN_FLAGS]); + + cfg->fc_tunnel.mode = IP_TUNNEL_INFO_TX; + cfg->fc_tunnel.options = NULL; + cfg->fc_tunnel.options_len = 0; + + return 0; +} + const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, @@ -591,6 +642,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, + [RTA_TUNNEL] = { .type = NLA_NESTED }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, @@ -656,6 +708,11 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; + case RTA_TUNNEL: + err = parse_rta_tunnel(cfg, attr); + if (err < 0) + goto errout; + break; } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 28ec3c1..1e94c81 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -215,6 +215,9 @@ static void free_fib_info_rcu(struct rcu_head *head) if (fi->fib_metrics != (u32 *) dst_default_metrics) 
kfree(fi->fib_metrics); + + ip_tunnel_info_put(fi->fib_tunnel); + kfree(fi); } @@ -760,6 +763,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) struct fib_info *ofi; int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; + struct ip_tunnel_info *tun_info = NULL; if (cfg->fc_type > RTN_MAX) goto err_inval; @@ -856,6 +860,19 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } } + if (cfg->fc_tunnel.mode) { + /* TODO: Allow specification of options */ + tun_info = ip_tunnel_info_alloc(0, GFP_KERNEL); + if (!tun_info) { + err = -ENOMEM; + goto failure; + } + + memcpy(tun_info, &cfg->fc_tunnel, sizeof(*tun_info)); + ip_tunnel_info_get(tun_info); + fi->fib_tunnel = tun_info; + } + if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg); @@ -975,6 +992,8 @@ err_inval: err = -EINVAL; failure: + kfree(tun_info); + if (fi) { fi->fib_dead = 1; free_fib_info(fi); @@ -983,6 +1002,29 @@ failure: return ERR_PTR(err); } +int fib_dump_tun_info(struct sk_buff *skb, struct ip_tunnel_info *tun_info) +{ + struct nlattr *tun_attr; + + tun_attr = nla_nest_start(skb, RTA_TUNNEL); + if (!tun_attr) + return -ENOMEM; + + if (nla_put_u64(skb, RTA_TUN_ID, tun_info->key.tun_id) || + nla_put_be32(skb, RTA_TUN_DST, tun_info->key.ipv4_dst) || + nla_put_be32(skb, RTA_TUN_SRC, tun_info->key.ipv4_src) || + nla_put_u8(skb, RTA_TUN_TOS, tun_info->key.ipv4_tos) || + nla_put_u8(skb, RTA_TUN_TTL, tun_info->key.ipv4_ttl) || + nla_put_u16(skb, RTA_TUN_SPORT, tun_info->key.tp_src) || + nla_put_u16(skb, RTA_TUN_DPORT, tun_info->key.tp_dst) || + nla_put_u16(skb, RTA_TUN_FLAGS, tun_info->key.tun_flags)) + return -ENOMEM; + + nla_nest_end(skb, tun_attr); + + return 0; +} + int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) @@ -1068,6 +1110,9 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, 
nla_nest_end(skb, mp); } #endif + if (fi->fib_tunnel && fib_dump_tun_info(skb, fi->fib_tunnel)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6e8e1be..f53c62f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1356,6 +1356,8 @@ static void ipv4_dst_destroy(struct dst_entry *dst) list_del(&rt->rt_uncached); spin_unlock_bh(&ul->lock); } + + ip_tunnel_info_put(rt->rt_tun_info); } void rt_flush_dev(struct net_device *dev) @@ -1489,6 +1491,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + rth->rt_tun_info = NULL; if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1543,6 +1546,7 @@ static int __mkroute_input(struct sk_buff *skb, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { + struct fib_info *fi = res->fi; struct fib_nh_exception *fnhe; struct rtable *rth; int err; @@ -1590,7 +1594,7 @@ static int __mkroute_input(struct sk_buff *skb, } fnhe = find_exception(&FIB_RES_NH(*res), daddr); - if (do_cache) { + if (do_cache && !(fi && fi->fib_tunnel)) { if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); else @@ -1621,6 +1625,13 @@ static int __mkroute_input(struct sk_buff *skb, INIT_LIST_HEAD(&rth->rt_uncached); RT_CACHE_STAT_INC(in_slow_tot); + if (fi && fi->fib_tunnel) { + ip_tunnel_info_get(fi->fib_tunnel); + rth->rt_tun_info = fi->fib_tunnel; + } else { + rth->rt_tun_info = NULL; + } + rth->dst.input = ip_forward; rth->dst.output = ip_output; @@ -1794,6 +1805,7 @@ local_input: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + rth->rt_tun_info = NULL; RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; @@ -1940,6 +1952,11 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fnhe = NULL; do_cache &= fi != NULL; + + /* Force dst for flows with tunnel 
encapsulation */ + if (fi && fi->fib_tunnel) + goto add; + if (do_cache) { struct rtable __rcu **prth; struct fib_nh *nh = &FIB_RES_NH(*res); @@ -1984,6 +2001,13 @@ add: rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + if (fi && fi->fib_tunnel) { + ip_tunnel_info_get(fi->fib_tunnel); + rth->rt_tun_info = fi->fib_tunnel; + } else { + rth->rt_tun_info = NULL; + } + RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) @@ -2263,6 +2287,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); + rt->rt_tun_info = NULL; dst_free(new); } @@ -2394,6 +2419,9 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; + if (rt->rt_tun_info && fib_dump_tun_info(skb, rt->rt_tun_info)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 4750fb6..75d6824 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -27,6 +27,7 @@ #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/u64_stats_sync.h> +#include <net/route.h> #include "datapath.h" -- 2.3.5 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html