On Tue, Nov 20, 2018 at 6:23 AM Alexis Bauvin <abau...@scaleway.com> wrote: > > Creating a VXLAN device with is underlay in the non-default VRF makes > egress route lookup fail or incorrect since it will resolve in the > default VRF, and ingress fail because the socket listens in the default > VRF. > > This patch binds the underlying UDP tunnel socket to the l3mdev of the > lower device of the VXLAN device. This will listen in the proper VRF and > output traffic from said l3mdev, matching l3mdev routing rules and > looking up the correct routing table. > > When the VXLAN device does not have a lower device, or the lower device > is in the default VRF, the socket will not be bound to any interface, > keeping the previous behaviour. > > The underlay l3mdev is deduced from the VXLAN lower device > (IFLA_VXLAN_LINK). > > The l3mdev_master_upper_ifindex_by_index function has been added to > l3mdev. Its goal is to fetch the effective l3mdev of an interface which > is not a direct slave of said l3mdev. It handles the following example, > properly resolving the l3mdev of eth0 to vrf-blue: > > +----------+ +---------+ > | | | | > | vrf-blue | | vrf-red | > | | | | > +----+-----+ +----+----+ > | | > | | > +----+-----+ +----+----+ > | | | | > | br-blue | | br-red | > | | | | > +----+-----+ +---+-+---+ > | | | > | +-----+ +-----+ > | | | > +----+-----+ +------+----+ +----+----+ > | | lower device | | | | > | eth0 | <- - - - - - - | vxlan-red | | tap-red | (... more taps) > | | | | | | > +----------+ +-----------+ +---------+ > > Signed-off-by: Alexis Bauvin <abau...@scaleway.com> > Reviewed-by: Amine Kherbouche <akherbou...@scaleway.com> > Tested-by: Amine Kherbouche <akherbou...@scaleway.com> > --- > drivers/net/vxlan.c | 32 ++++++++++++++++++++++++-------- > include/net/l3mdev.h | 22 ++++++++++++++++++++++ > net/l3mdev/l3mdev.c | 18 ++++++++++++++++++ > 3 files changed, 64 insertions(+), 8 deletions(-) > > diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c > index 27bd586b94b0..a3de08122269 100644 > --- a/drivers/net/vxlan.c > +++ b/drivers/net/vxlan.c > @@ -212,7 +212,7 @@ static inline struct vxlan_rdst *first_remote_rtnl(struct > vxlan_fdb *fdb) > * and enabled unshareable flags. > */ > static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t > family, > - __be16 port, u32 flags) > + __be16 port, u32 flags, int ifindex) > { > struct vxlan_sock *vs; > > @@ -221,7 +221,8 @@ static struct vxlan_sock *vxlan_find_sock(struct net > *net, sa_family_t family, > hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { > if (inet_sk(vs->sock->sk)->inet_sport == port && > vxlan_get_sk_family(vs) == family && > - vs->flags == flags) > + vs->flags == flags && > + vs->sock->sk->sk_bound_dev_if == ifindex) > return vs; > } > return NULL; > @@ -261,7 +262,7 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, > int ifindex, > { > struct vxlan_sock *vs; > > - vs = vxlan_find_sock(net, family, port, flags); > + vs = vxlan_find_sock(net, family, port, flags, ifindex); > if (!vs) > return NULL; > > @@ -2172,6 +2173,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct > net_device *dev, > struct rtable *rt; > __be16 df = 0; > > + if (!ifindex) > + ifindex = sock4->sock->sk->sk_bound_dev_if; > + > rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos, > dst->sin.sin_addr.s_addr, > &local_ip.sin.sin_addr.s_addr, > @@ -2210,6 +2214,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct > net_device *dev, > } else { > struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); > > + if (!ifindex) > + ifindex = sock6->sock->sk->sk_bound_dev_if; > + > ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos, > label, &dst->sin6.sin6_addr, > &local_ip.sin6.sin6_addr, > @@ -2813,7 +2820,7 @@ static const struct ethtool_ops vxlan_ethtool_ops = { > }; > > static struct socket *vxlan_create_sock(struct net *net, bool ipv6, > - __be16 port, u32 flags) > + __be16 port, u32 flags, int ifindex) > { > struct socket *sock; > struct udp_port_cfg udp_conf; > @@ -2831,6 +2838,7 @@ static struct socket *vxlan_create_sock(struct net > *net, bool ipv6, > } > > udp_conf.local_udp_port = port; > + udp_conf.bind_ifindex = ifindex; > > /* Open UDP socket */ > err = udp_sock_create(net, &udp_conf, &sock); > @@ -2842,7 +2850,8 @@ static struct socket *vxlan_create_sock(struct net > *net, bool ipv6, > > /* Create new listen socket if needed */ > static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, > - __be16 port, u32 flags) > + __be16 port, u32 flags, > + int ifindex) > { > struct vxlan_net *vn = net_generic(net, vxlan_net_id); > struct vxlan_sock *vs; > @@ -2857,7 +2866,7 @@ static struct vxlan_sock *vxlan_socket_create(struct > net *net, bool ipv6, > for (h = 0; h < VNI_HASH_SIZE; ++h) > INIT_HLIST_HEAD(&vs->vni_list[h]); > > - sock = vxlan_create_sock(net, ipv6, port, flags); > + sock = vxlan_create_sock(net, ipv6, port, flags, ifindex); > if (IS_ERR(sock)) { > kfree(vs); > return ERR_CAST(sock); > @@ -2894,11 +2903,17 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, > bool ipv6) > struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); > struct vxlan_sock *vs = NULL; > struct vxlan_dev_node *node; > + int l3mdev_index; > + > + l3mdev_index = > + l3mdev_master_upper_ifindex_by_index(vxlan->net, > + > vxlan->cfg.remote_ifindex);
vxlan->cfg.remote_ifindex is optional, so we can avoid trying to derive the l3mdev_ifindex for cases where it is not present > > if (!vxlan->cfg.no_share) { > spin_lock(&vn->sock_lock); > vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, > - vxlan->cfg.dst_port, vxlan->cfg.flags); > + vxlan->cfg.dst_port, vxlan->cfg.flags, > + l3mdev_index); > if (vs && !refcount_inc_not_zero(&vs->refcnt)) { > spin_unlock(&vn->sock_lock); > return -EBUSY; > @@ -2907,7 +2922,8 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, > bool ipv6) > } > if (!vs) > vs = vxlan_socket_create(vxlan->net, ipv6, > - vxlan->cfg.dst_port, > vxlan->cfg.flags); > + vxlan->cfg.dst_port, > vxlan->cfg.flags, > + l3mdev_index); > if (IS_ERR(vs)) > return PTR_ERR(vs); > #if IS_ENABLED(CONFIG_IPV6) > diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h > index 3832099289c5..78fa0ac4613c 100644 > --- a/include/net/l3mdev.h > +++ b/include/net/l3mdev.h > @@ -101,6 +101,17 @@ struct net_device *l3mdev_master_dev_rcu(const struct > net_device *_dev) > return master; > } > > +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex); > +static inline > +int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex) > +{ > + rcu_read_lock(); > + ifindex = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex); > + rcu_read_unlock(); > + > + return ifindex; > +} > + > u32 l3mdev_fib_table_rcu(const struct net_device *dev); > u32 l3mdev_fib_table_by_index(struct net *net, int ifindex); > static inline u32 l3mdev_fib_table(const struct net_device *dev) > @@ -207,6 +218,17 @@ static inline int l3mdev_master_ifindex_by_index(struct > net *net, int ifindex) > return 0; > } > > +static inline > +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) > +{ > + return 0; > +} > +static inline > +int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex) > +{ > + return 0; > +} > + > static inline > struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) > { > diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c > index 8da86ceca33d..309dee76724e 100644 > --- a/net/l3mdev/l3mdev.c > +++ b/net/l3mdev/l3mdev.c > @@ -46,6 +46,24 @@ int l3mdev_master_ifindex_rcu(const struct net_device *dev) > } > EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu); > > +/** > + * l3mdev_master_upper_ifindex_by_index - get index of upper l3 master > + * device > + * @net: network namespace for device index lookup > + * @ifindex: targeted interface > + */ > +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) > +{ > + struct net_device *dev; > + > + dev = dev_get_by_index_rcu(net, ifindex); > + while (dev && !netif_is_l3_master(dev)) > + dev = netdev_master_upper_dev_get(dev); > + > + return dev ? dev->ifindex : 0; > +} > +EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu); > + > /** > * l3mdev_fib_table - get FIB table id associated with an L3 > * master interface > -- >