On Jul 18, 2013, at 5:22 PM, Pravin B Shelar <pshe...@nicira.com> wrote:
> Following patch restructures vxlan tunneling so that it is more > in sync with upstream vxlan tunneling code. > > Signed-off-by: Pravin Shelar <pshe...@nicira.com> > --- > v3-v2: > - Moved kernel version in flow_dissector check to top. > v1-v2: > - Added create flag to vxlan-port add. > - Moved rxhash functions to flow_dissector.c > --- > datapath/compat.h | 6 + > datapath/linux/Modules.mk | 6 +- > datapath/linux/compat/flow_dissector.c | 203 +++++++++++ > datapath/linux/compat/include/linux/in.h | 20 ++ > datapath/linux/compat/include/linux/skbuff.h | 22 ++ > datapath/linux/compat/include/net/flow_keys.h | 22 ++ > datapath/linux/compat/include/net/ip.h | 7 + > datapath/linux/compat/include/net/ipv6.h | 15 + > datapath/linux/compat/include/net/vxlan.h | 43 +++ > datapath/linux/compat/vxlan.c | 457 +++++++++++++++++++++++++ > datapath/vport-vxlan.c | 221 +++++-------- > 11 files changed, 877 insertions(+), 145 deletions(-) > create mode 100644 datapath/linux/compat/flow_dissector.c > create mode 100644 datapath/linux/compat/include/net/flow_keys.h > create mode 100644 datapath/linux/compat/include/net/vxlan.h > create mode 100644 datapath/linux/compat/vxlan.c > > diff --git a/datapath/compat.h b/datapath/compat.h > index a6a01d5..4dfd192 100644 > --- a/datapath/compat.h > +++ b/datapath/compat.h > @@ -100,4 +100,10 @@ static inline void skb_set_mark(struct sk_buff *skb, u32 > mark) > #define rt_dst(rt) (rt->u.dst) > #endif > > +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) > +#define inet_sport(sk) (inet_sk(sk)->sport) > +#else > +#define inet_sport(sk) (inet_sk(sk)->inet_sport) > +#endif > + > #endif /* compat.h */ > diff --git a/datapath/linux/Modules.mk b/datapath/linux/Modules.mk > index dcacc79..edaeabb 100644 > --- a/datapath/linux/Modules.mk > +++ b/datapath/linux/Modules.mk > @@ -3,6 +3,7 @@ openvswitch_sources += \ > linux/compat/dev-openvswitch.c \ > linux/compat/exthdrs_core.c \ > linux/compat/flex_array.c \ > + linux/compat/flow_dissector.c \ > linux/compat/gre.c \ > linux/compat/gso.c \ > linux/compat/genetlink-openvswitch.c \ > @@ -14,6 +15,7 @@ openvswitch_sources += \ > linux/compat/reciprocal_div.c \ > linux/compat/skbuff-openvswitch.c \ > linux/compat/time.c \ > + linux/compat/vxlan.c \ > linux/compat/workqueue.c > openvswitch_headers += \ > linux/compat/gso.h \ > @@ -65,6 +67,7 @@ openvswitch_headers += \ > linux/compat/include/linux/workqueue.h \ > linux/compat/include/net/checksum.h \ > linux/compat/include/net/dst.h \ > + linux/compat/include/net/flow_keys.h \ > linux/compat/include/net/genetlink.h \ > linux/compat/include/net/gre.h \ > linux/compat/include/net/inet_frag.h \ > @@ -76,4 +79,5 @@ openvswitch_headers += \ > linux/compat/include/net/protocol.h \ > linux/compat/include/net/route.h \ > linux/compat/include/net/sock.h \ > - linux/compat/include/net/netns/generic.h > + linux/compat/include/net/netns/generic.h \ > + linux/compat/include/net/vxlan.h > diff --git a/datapath/linux/compat/flow_dissector.c > b/datapath/linux/compat/flow_dissector.c > new file mode 100644 > index 0000000..c2078d6 > --- /dev/null > +++ b/datapath/linux/compat/flow_dissector.c > @@ -0,0 +1,203 @@ > + > +#include <linux/version.h> > +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) > +#include <linux/ip.h> > +#include <linux/ipv6.h> > +#include <linux/if_vlan.h> > +#include <net/ip.h> > +#include <net/ipv6.h> > +#include <linux/igmp.h> > +#include <linux/icmp.h> > +#include <linux/sctp.h> > +#include <linux/dccp.h> > +#include <linux/if_tunnel.h> > +#include <linux/if_pppox.h> > +#include <linux/ppp_defs.h> > +#include <net/flow_keys.h> > + This file appears to be missing license and copyright information. > + > +/* copy saddr & daddr, possibly using 64bit load/store > + * Equivalent to : flow->src = iph->saddr; > + * flow->dst = iph->daddr; > + */ > +static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct > iphdr *iph) > +{ > + BUILD_BUG_ON(offsetof(typeof(*flow), dst) != > + offsetof(typeof(*flow), src) + sizeof(flow->src)); > + memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); > +} > + > +static bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys > *flow) > +{ > + int poff, nhoff = skb_network_offset(skb); > + u8 ip_proto; > + __be16 proto = skb->protocol; > + > + memset(flow, 0, sizeof(*flow)); > + > +again: > + switch (proto) { > + case __constant_htons(ETH_P_IP): { > + const struct iphdr *iph; > + struct iphdr _iph; > +ip: > + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); > + if (!iph) > + return false; > + > + if (ip_is_fragment(iph)) > + ip_proto = 0; > + else > + ip_proto = iph->protocol; > + iph_to_flow_copy_addrs(flow, iph); > + nhoff += iph->ihl * 4; > + break; > + } > + case __constant_htons(ETH_P_IPV6): { > + const struct ipv6hdr *iph; > + struct ipv6hdr _iph; > +ipv6: > + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); > + if (!iph) > + return false; > + > + ip_proto = iph->nexthdr; > + flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); > + flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); > + nhoff += sizeof(struct ipv6hdr); > + break; > + } > + case __constant_htons(ETH_P_8021Q): { > + const struct vlan_hdr *vlan; > + struct vlan_hdr _vlan; > + > + vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); > + if (!vlan) > + return false; > + > + proto = vlan->h_vlan_encapsulated_proto; > + nhoff += sizeof(*vlan); > + goto again; > + } > + case __constant_htons(ETH_P_PPP_SES): { > + struct { > + struct pppoe_hdr hdr; > + __be16 proto; > + } *hdr, _hdr; > + hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); > + if (!hdr) > + return false; > + proto = hdr->proto; > + nhoff += PPPOE_SES_HLEN; > + switch (proto) { > + case __constant_htons(PPP_IP): > + goto ip; > + case __constant_htons(PPP_IPV6): > + goto ipv6; > + default: > + return false; > + } > + } > + default: > + return false; > + } > + > + switch (ip_proto) { > + case IPPROTO_GRE: { > + struct gre_hdr { > + __be16 flags; > + __be16 proto; > + } *hdr, _hdr; > + > + hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); > + if (!hdr) > + return false; > + /* > + * Only look inside GRE if version zero and no > + * routing > + */ > + if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) { > + proto = hdr->proto; > + nhoff += 4; > + if (hdr->flags & GRE_CSUM) > + nhoff += 4; > + if (hdr->flags & GRE_KEY) > + nhoff += 4; > + if (hdr->flags & GRE_SEQ) > + nhoff += 4; > + if (proto == htons(ETH_P_TEB)) { > + const struct ethhdr *eth; > + struct ethhdr _eth; > + > + eth = skb_header_pointer(skb, nhoff, > + sizeof(_eth), &_eth); > + if (!eth) > + return false; > + proto = eth->h_proto; > + nhoff += sizeof(*eth); > + } > + goto again; > + } > + break; > + } > + case IPPROTO_IPIP: > + goto again; > + default: > + break; > + } > + > + flow->ip_proto = ip_proto; > + poff = proto_ports_offset(ip_proto); > + if (poff >= 0) { > + __be32 *ports, _ports; > + > + nhoff += poff; > + ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports); > + if (ports) > + flow->ports = *ports; > + } > + > + flow->thoff = (u16) nhoff; > + > + return true; > +} > + > +static u32 hashrnd __read_mostly; > + > +static void init_hashrnd(void) > +{ > + if (likely(hashrnd)) > + return; > + get_random_bytes(&hashrnd, sizeof(hashrnd)); > +} > + > +u32 __skb_get_rxhash(struct sk_buff *skb) > +{ > + struct flow_keys keys; > + u32 hash; > + > + if (!skb_flow_dissect(skb, &keys)) > + return 0; > + > + /* get a consistent hash (same value on both flow directions) */ > + if (((__force u32)keys.dst < (__force u32)keys.src) || > + (((__force u32)keys.dst == (__force u32)keys.src) && > + ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { > + swap(keys.dst, keys.src); > + swap(keys.port16[0], keys.port16[1]); > + } > + > + init_hashrnd(); > + > + hash = jhash_3words((__force u32)keys.dst, > + (__force u32)keys.src, > + (__force u32)keys.ports, hashrnd); > + if (!hash) > + hash = 1; > + > +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34) > + skb->rxhash = hash; > +#endif > + return hash; > +} > +#endif > diff --git a/datapath/linux/compat/include/linux/in.h > b/datapath/linux/compat/include/linux/in.h > index f91a832..fa2e026 100644 > --- a/datapath/linux/compat/include/linux/in.h > +++ b/datapath/linux/compat/include/linux/in.h > @@ -3,6 +3,26 @@ > > #include_next <linux/in.h> > > +#include <linux/module.h> > +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) > +static inline int proto_ports_offset(int proto) > +{ > + switch (proto) { > + case IPPROTO_TCP: > + case IPPROTO_UDP: > + case IPPROTO_DCCP: > + case IPPROTO_ESP: /* SPI */ > + case IPPROTO_SCTP: > + case IPPROTO_UDPLITE: > + return 0; > + case IPPROTO_AH: /* SPI */ > + return 4; > + default: > + return -EINVAL; > + } > +} > +#endif > + > #ifndef HAVE_IPV4_IS_MULTICAST > > static inline bool ipv4_is_loopback(__be32 addr) > diff --git a/datapath/linux/compat/include/linux/skbuff.h > b/datapath/linux/compat/include/linux/skbuff.h > index d485b39..c9c103d 100644 > --- a/datapath/linux/compat/include/linux/skbuff.h > +++ b/datapath/linux/compat/include/linux/skbuff.h > @@ -251,4 +251,26 @@ static inline void skb_reset_mac_len(struct sk_buff *skb) > skb->mac_len = skb->network_header - skb->mac_header; > } > #endif > + > +static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) > +{ > + might_sleep_if(pri & __GFP_WAIT); > + > + if (skb_cloned(skb)) > + return pskb_expand_head(skb, 0, 0, pri); > + > + return 0; > +} > + > +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) > +extern u32 __skb_get_rxhash(struct sk_buff *skb); > +static inline __u32 skb_get_rxhash(struct sk_buff *skb) > +{ > +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34) > + if (!skb->rxhash) > +#endif > + return __skb_get_rxhash(skb); > +} > +#endif > + > #endif > diff --git a/datapath/linux/compat/include/net/flow_keys.h > b/datapath/linux/compat/include/net/flow_keys.h > new file mode 100644 > index 0000000..4de17d1 > --- /dev/null > +++ b/datapath/linux/compat/include/net/flow_keys.h > @@ -0,0 +1,22 @@ > +#ifndef _NET_FLOW_KEYS_WRAPPER_H > +#define _NET_FLOW_KEYS_WRAPPER_H > + > +#include <linux/version.h> > + > +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0) > +#include_next <net/flow_keys.h> > +#else > +struct flow_keys { > + /* (src,dst) must be grouped, in the same way than in IP header */ > + __be32 src; > + __be32 dst; > + union { > + __be32 ports; > + __be16 port16[2]; > + }; > + u16 thoff; > + u8 ip_proto; > +}; > +#endif > + > +#endif > diff --git a/datapath/linux/compat/include/net/ip.h > b/datapath/linux/compat/include/net/ip.h > index b18b968..1dccdea 100644 > --- a/datapath/linux/compat/include/net/ip.h > +++ b/datapath/linux/compat/include/net/ip.h > @@ -11,4 +11,11 @@ extern int ip_local_out(struct sk_buff *skb); > > #endif /* linux kernel < 2.6.25 */ > > +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0) > +static inline bool ip_is_fragment(const struct iphdr *iph) > +{ > + return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0; > +} > +#endif > + > #endif > diff --git a/datapath/linux/compat/include/net/ipv6.h > b/datapath/linux/compat/include/net/ipv6.h > index d1e3248..7ab234a 100644 > --- a/datapath/linux/compat/include/net/ipv6.h > +++ b/datapath/linux/compat/include/net/ipv6.h > @@ -23,4 +23,19 @@ enum { > extern int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, > int target, unsigned short *fragoff, int *fragflg); > > +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) > +static inline u32 ipv6_addr_hash(const struct in6_addr *a) > +{ > +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 > + const unsigned long *ul = (const unsigned long *)a; > + unsigned long x = ul[0] ^ ul[1]; > + > + return (u32)(x ^ (x >> 32)); > +#else > + return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^ > + a->s6_addr32[2] ^ a->s6_addr32[3]); > +#endif > +} > +#endif > + > #endif > diff --git a/datapath/linux/compat/include/net/vxlan.h > b/datapath/linux/compat/include/net/vxlan.h > new file mode 100644 > index 0000000..102bc0c > --- /dev/null > +++ b/datapath/linux/compat/include/net/vxlan.h > @@ -0,0 +1,43 @@ > +#ifndef __NET_VXLAN_WRAPPER_H > +#define __NET_VXLAN_WRAPPER_H 1 > + > +#include <linux/skbuff.h> > +#include <linux/netdevice.h> > +#include <linux/udp.h> > + > +/* per UDP socket information */ > +struct vxlan_sock { > + struct hlist_node hlist; > + struct rcu_head rcu; > + struct socket *sock; > + struct list_head handler_list; > +}; > + > +struct vxlan_handler; > +typedef int (vxlan_rcv_t)(struct vxlan_handler *vh, struct sk_buff *skb, > __be32 key); > + > +struct vxlan_handler { > + vxlan_rcv_t *rcv; > + struct list_head node; > + void *data; > + struct vxlan_sock *vs; > + atomic_t refcnt; > + struct rcu_head rcu; > + struct work_struct del_work; > + int priority; > +}; > + > +void vxlan_handler_put(struct vxlan_handler *vh); > + > +struct vxlan_handler *vxlan_handler_add(struct net *net, > + __be16 portno, vxlan_rcv_t *rcv, > + void *data, int priority, bool create); > + > +int vxlan_xmit_skb(struct net *net, struct vxlan_handler *vh, > + struct rtable *rt, struct sk_buff *skb, > + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, > + __be16 src_port, __be16 dst_port, __be32 vni); > + > +__be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb); > + > +#endif > diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c > new file mode 100644 > index 0000000..b41ecc2 > --- /dev/null > +++ b/datapath/linux/compat/vxlan.c > @@ -0,0 +1,457 @@ > +#include <linux/kernel.h> > +#include <linux/types.h> > +#include <linux/module.h> > +#include <linux/errno.h> > +#include <linux/slab.h> > +#include <linux/skbuff.h> > +#include <linux/rculist.h> > +#include <linux/netdevice.h> > +#include <linux/in.h> > +#include <linux/ip.h> > +#include <linux/udp.h> > +#include <linux/igmp.h> > +#include <linux/etherdevice.h> > +#include <linux/if_ether.h> > +#include <linux/if_vlan.h> > +#include <linux/hash.h> > +#include <linux/ethtool.h> > +#include <net/arp.h> > +#include <net/ndisc.h> > +#include <net/ip.h> > +#include <net/ip_tunnels.h> > +#include <net/icmp.h> > +#include <net/udp.h> > +#include <net/rtnetlink.h> > +#include <net/route.h> > +#include <net/dsfield.h> > +#include <net/inet_ecn.h> > +#include <net/net_namespace.h> > +#include <net/netns/generic.h> > +#include <net/vxlan.h> > + Same thing here, no license or copyright. > +#include "checksum.h" > +#include "compat.h" > +#include "gso.h" > +#include "vlan.h" > + > +#define PORT_HASH_BITS 8 > +#define PORT_HASH_SIZE (1<<PORT_HASH_BITS) > + > +#define VXLAN_N_VID (1u << 24) > +#define VXLAN_VID_MASK (VXLAN_N_VID - 1) > +/* IP header + UDP + VXLAN + Ethernet header */ > +#define VXLAN_HEADROOM (20 + 8 + 8 + 14) > +#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) > + > +#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required > value. */ > + > +/* VXLAN protocol header */ > +struct vxlanhdr { > + __be32 vx_flags; > + __be32 vx_vni; > +}; > + > +static int vxlan_net_id; > + > +/* per-network namespace private data for this module */ > +struct vxlan_net { > + struct hlist_head sock_list[PORT_HASH_SIZE]; > + struct mutex sock_lock; /* RTNL lock nests inside this lock. */ > +}; > + > +/* Socket hash table head */ > +static inline struct hlist_head *vs_head(struct net *net, __be16 port) > +{ > + struct vxlan_net *vn = net_generic(net, vxlan_net_id); > + > + return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; > +} > + > +/* Find VXLAN socket based on network namespace and UDP port */ > +static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port) > +{ > + struct vxlan_sock *vs; > + > + hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { > + if (inet_sport(vs->sock->sk) == port) > + return vs; > + } > + return NULL; > +} > + > +/* Callback from net/ipv4/udp.c to receive packets */ > +static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) > +{ > + struct vxlan_handler *vh; > + struct vxlan_sock *vs; > + struct vxlanhdr *vxh; > + > + /* Need Vxlan and inner Ethernet header to be present */ > + if (!pskb_may_pull(skb, VXLAN_HLEN)) > + goto error; > + > + /* Return packets with reserved bits set */ > + vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); > + if (vxh->vx_flags != htonl(VXLAN_FLAGS) || > + (vxh->vx_vni & htonl(0xff))) { > + printk("invalid vxlan flags=%#x vni=%#x\n", > + ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); > + goto error; > + } > + > + if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB))) > + goto drop; > + > + vs = vxlan_find_port(sock_net(sk), inet_sport(sk)); > + if (!vs) > + goto drop; > + > + list_for_each_entry_rcu(vh, &vs->handler_list, node) { > + if (vh->rcv(vh, skb, vxh->vx_vni) == PACKET_RCVD) > + return 0; > + } > + > +drop: > + /* Consume bad packet */ > + kfree_skb(skb); > + return 0; > + > +error: > + /* Return non vxlan pkt */ > + return 1; > +} > + > +static void vxlan_sock_put(struct sk_buff *skb) > +{ > + sock_put(skb->sk); > +} > + > +/* On transmit, associate with the tunnel socket */ > +static void vxlan_set_owner(struct sock *sk, struct sk_buff *skb) > +{ > + skb_orphan(skb); > + sock_hold(sk); > + skb->sk = sk; > + skb->destructor = vxlan_sock_put; > +} > + > +/* Compute source port for outgoing packet > + * first choice to use L4 flow hash since it will spread > + * better and maybe available from hardware > + * secondary choice is to use jhash on the Ethernet header > + */ > +__be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb) > +{ > + unsigned int range = (port_max - port_min) + 1; > + u32 hash; > + > + hash = skb_get_rxhash(skb); > + if (!hash) > + hash = jhash(skb->data, 2 * ETH_ALEN, > + (__force u32) skb->protocol); > + > + return htons((((u64) hash * range) >> 32) + port_min); > +} > + > +static void vxlan_gso(struct sk_buff *skb) > +{ > + int udp_offset = skb_transport_offset(skb); > + struct udphdr *uh; > + > + uh = udp_hdr(skb); > + uh->len = htons(skb->len - udp_offset); > + > + /* csum segment if tunnel sets skb with csum. */ > + if (unlikely(uh->check)) { > + struct iphdr *iph = ip_hdr(skb); > + > + uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, > + skb->len - udp_offset, > + IPPROTO_UDP, 0); > + uh->check = csum_fold(skb_checksum(skb, udp_offset, > + skb->len - udp_offset, 0)); > + > + if (uh->check == 0) > + uh->check = CSUM_MANGLED_0; > + > + } > + skb->ip_summed = CHECKSUM_NONE; > +} > + > +static int handle_offloads(struct sk_buff *skb) > +{ > + if (skb_is_gso(skb)) { > + OVS_GSO_CB(skb)->fix_segment = vxlan_gso; > + } else { > + if (skb->ip_summed != CHECKSUM_PARTIAL) > + skb->ip_summed = CHECKSUM_NONE; > + } > + return 0; > +} > + > +int vxlan_xmit_skb(struct net *net, struct vxlan_handler *vh, > + struct rtable *rt, struct sk_buff *skb, > + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, > + __be16 src_port, __be16 dst_port, __be32 vni) > +{ > + struct vxlanhdr *vxh; > + struct udphdr *uh; > + int min_headroom; > + int err; > + > + skb_reset_inner_headers(skb); > + > + min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len > + + VXLAN_HLEN + sizeof(struct iphdr) > + + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); > + > + /* Need space for new headers (invalidates iph ptr) */ > + err = skb_cow_head(skb, min_headroom); > + if (unlikely(err)) > + return err; > + > + if (unlikely(vlan_deaccel_tag(skb))) > + return -ENOMEM; > + > + vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); > + vxh->vx_flags = htonl(VXLAN_FLAGS); > + vxh->vx_vni = vni; > + > + __skb_push(skb, sizeof(*uh)); > + skb_reset_transport_header(skb); > + uh = udp_hdr(skb); > + > + uh->dest = dst_port; > + uh->source = src_port; > + > + uh->len = htons(skb->len); > + uh->check = 0; > + > + vxlan_set_owner(vh->vs->sock->sk, skb); > + > + err = handle_offloads(skb); > + if (err) > + return err; > + > + return iptunnel_xmit(net, rt, skb, src, dst, > + IPPROTO_UDP, tos, ttl, df); > +} > + > +static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) > +{ > + struct vxlan_sock *vs; > + struct sock *sk; > + struct sockaddr_in vxlan_addr = { > + .sin_family = AF_INET, > + .sin_addr.s_addr = htonl(INADDR_ANY), > + .sin_port = port, > + }; > + int rc; > + > + vs = kmalloc(sizeof(*vs), GFP_KERNEL); > + if (!vs) > + return ERR_PTR(-ENOMEM); > + > + /* Create UDP socket for encapsulation receive. */ > + rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vs->sock); > + if (rc < 0) { > + pr_debug("UDP socket create failed\n"); > + kfree(vs); > + return ERR_PTR(rc); > + } > + > + /* Put in proper namespace */ > + sk = vs->sock->sk; > + sk_change_net(sk, net); > + > + rc = kernel_bind(vs->sock, (struct sockaddr *) &vxlan_addr, > + sizeof(vxlan_addr)); > + if (rc < 0) { > + pr_debug("bind for UDP socket %pI4:%u (%d)\n", > + &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc); > + sk_release_kernel(sk); > + kfree(vs); > + return ERR_PTR(rc); > + } > + > + /* Disable multicast loopback */ > + inet_sk(sk)->mc_loop = 0; > + INIT_LIST_HEAD(&vs->handler_list); > + hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); > + > + /* Mark socket as an encapsulation socket. */ > + udp_sk(sk)->encap_type = 1; > + udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; > + udp_encap_enable(); > + > + return vs; > +} > + > +static void rcu_free_vs_callback(struct rcu_head *rcu) > +{ > + struct vxlan_sock *vs = container_of(rcu, struct vxlan_sock, rcu); > + > + kfree(vs); > +} > + > +static void vxlan_socket_del(struct vxlan_sock *vs) > +{ > + if (list_empty(&vs->handler_list)) { > + hlist_del_rcu(&vs->hlist); > + > + sk_release_kernel(vs->sock->sk); > + call_rcu(&vs->rcu, rcu_free_vs_callback); > + } > +} > + > +static int vxlan_init_module(void); > +static void vxlan_cleanup_module(void); > + > +static void rcu_free_vh_callback(struct rcu_head *rcu) > +{ > + struct vxlan_handler *vh = container_of(rcu, struct vxlan_handler, rcu); > + > + kfree(vh); > +} > + > +static void vh_del_work(struct work_struct *work) > +{ > + struct vxlan_handler *vh = container_of(work, struct vxlan_handler, > del_work); > + struct vxlan_sock *vs = vh->vs; > + struct net *net = sock_net(vs->sock->sk); > + struct vxlan_net *vn = net_generic(net, vxlan_net_id); > + > + mutex_lock(&vn->sock_lock); > + > + list_del_rcu(&vh->node); > + call_rcu(&vh->rcu, rcu_free_vh_callback); > + vxlan_socket_del(vs); > + > + mutex_unlock(&vn->sock_lock); > + > + vxlan_cleanup_module(); > +} > + > +struct vxlan_handler *vxlan_handler_add(struct net *net, > + __be16 portno, vxlan_rcv_t *rcv, > + void *data, int priority, bool create) > +{ > + struct vxlan_net *vn; > + struct vxlan_sock *vs; > + struct vxlan_handler *vh; > + struct vxlan_handler *new; > + int err; > + > + err = vxlan_init_module(); > + if (err) > + return ERR_PTR(err); > + > + vn = net_generic(net, vxlan_net_id); > + mutex_lock(&vn->sock_lock); > + /* Look to see if can reuse socket */ > + vs = vxlan_find_port(net, portno); > + if (!vs) { > + vs = vxlan_socket_create(net, portno); > + if (IS_ERR(vs)) { > + new = (void *) vs; > + goto out; > + } > + } > + > + /* Try existing vxlan hanlders for this socket. */ > + list_for_each_entry(vh, &vs->handler_list, node) { > + if (vh->rcv == rcv) { > + if (create) { > + vxlan_socket_del(vs); > + new = ERR_PTR(-EEXIST); > + goto out; > + } > + atomic_inc(&vh->refcnt); > + new = vh; > + goto out; > + } > + } > + > + new = kzalloc(sizeof(*new), GFP_KERNEL); > + if (!new) { > + vxlan_socket_del(vs); > + new = ERR_PTR(-ENOMEM); > + goto out; > + } > + > + new->rcv = rcv; > + new->vs = vs; > + atomic_set(&new->refcnt, 1); > + INIT_WORK(&new->del_work, vh_del_work); > + new->data = data; > + new->priority = priority; > + > + list_for_each_entry(vh, &vs->handler_list, node) { > + if (vh->priority > priority) { > + list_add_tail_rcu(&new->node, &vh->node); > + goto out; > + } > + } > + > + list_add_tail_rcu(&new->node, &vs->handler_list); > +out: > + mutex_unlock(&vn->sock_lock); > + return new; > +} > + > +void vxlan_handler_put(struct vxlan_handler *vh) > +{ > + BUG_ON(!vh->vs); > + > + if (atomic_dec_and_test(&vh->refcnt)) > + queue_work(&vh->del_work); > +} > + > +static __net_init int vxlan_init_net(struct net *net) > +{ > + struct vxlan_net *vn = net_generic(net, vxlan_net_id); > + unsigned int h; > + > + mutex_init(&vn->sock_lock); > + > + for (h = 0; h < PORT_HASH_SIZE; ++h) > + INIT_HLIST_HEAD(&vn->sock_list[h]); > + > + return 0; > +} > + > +static struct pernet_operations vxlan_net_ops = { > + .init = vxlan_init_net, > + .id = &vxlan_net_id, > + .size = sizeof(struct vxlan_net), > +}; > + > +static int refcnt; > +static DEFINE_MUTEX(init_lock); > +DEFINE_COMPAT_PNET_REG_FUNC(gen_device); > + > +static int vxlan_init_module(void) > +{ > + int err = 0; > + > + mutex_lock(&init_lock); > + if (refcnt) > + goto out; > + err = register_pernet_device(&vxlan_net_ops); > +out: > + if (!err) > + refcnt++; > + mutex_unlock(&init_lock); > + return err; > +} > + > +static void vxlan_cleanup_module(void) > +{ > + mutex_lock(&init_lock); > + refcnt--; > + if (refcnt) > + goto out; > + unregister_pernet_device(&vxlan_net_ops); > +out: > + mutex_unlock(&init_lock); > +} > diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c > index 7ff51fd..5546820 100644 > --- a/datapath/vport-vxlan.c > +++ b/datapath/vport-vxlan.c > @@ -31,164 +31,60 @@ > #include <net/icmp.h> > #include <net/ip.h> > #include <net/udp.h> > +#include <net/ip_tunnels.h> > +#include <net/udp.h> > +#include <net/rtnetlink.h> > +#include <net/route.h> > +#include <net/dsfield.h> > +#include <net/inet_ecn.h> > +#include <net/net_namespace.h> > +#include <net/netns/generic.h> > +#include <net/vxlan.h> > > #include "datapath.h" > #include "tunnel.h" > #include "vport.h" > > -#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. > */ > - > -/** > - * struct vxlanhdr - VXLAN header > - * @vx_flags: Must have the exact value %VXLAN_FLAGS. > - * @vx_vni: VXLAN Network Identifier (VNI) in top 24 bits, low 8 bits zeroed. > - */ > -struct vxlanhdr { > - __be32 vx_flags; > - __be32 vx_vni; > -}; > - > -#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) > +#define OVS_VXLAN_RCV_PRIORITY 8 > > /** > * struct vxlan_port - Keeps track of open UDP ports > - * @dst_port: vxlan UDP port no. > - * @list: list element in @vxlan_ports. > - * @vxlan_rcv_socket: The socket created for this port number. > + * @vh: vxlan_handler created for the port. > * @name: vport name. > */ > struct vxlan_port { > - __be16 dst_port; > - struct list_head list; > - struct socket *vxlan_rcv_socket; > + struct vxlan_handler *vh; > char name[IFNAMSIZ]; > }; > > -static LIST_HEAD(vxlan_ports); > - > static inline struct vxlan_port *vxlan_vport(const struct vport *vport) > { > return vport_priv(vport); > } > > -static struct vxlan_port *vxlan_find_port(struct net *net, __be16 port) > -{ > - struct vxlan_port *vxlan_port; > - > - list_for_each_entry_rcu(vxlan_port, &vxlan_ports, list) { > - > - if (vxlan_port->dst_port == port && > - net_eq(sock_net(vxlan_port->vxlan_rcv_socket->sk), net)) > - return vxlan_port; > - } > - > - return NULL; > -} > - > -static inline struct vxlanhdr *vxlan_hdr(const struct sk_buff *skb) > -{ > - return (struct vxlanhdr *)(udp_hdr(skb) + 1); > -} > - > -static void vxlan_build_header(const struct vport *vport, > - struct sk_buff *skb, > - int tunnel_hlen) > -{ > - struct vxlan_port *vxlan_port = vxlan_vport(vport); > - struct udphdr *udph = udp_hdr(skb); > - struct vxlanhdr *vxh = (struct vxlanhdr *)(udph + 1); > - const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key; > - > - udph->dest = vxlan_port->dst_port; > - udph->source = htons(ovs_tnl_get_src_port(skb)); > - udph->check = 0; > - udph->len = htons(skb->len - skb_transport_offset(skb)); > - > - vxh->vx_flags = htonl(VXLAN_FLAGS); > - vxh->vx_vni = htonl(be64_to_cpu(tun_key->tun_id) << 8); > -} > - > /* Called with rcu_read_lock and BH disabled. */ > -static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) > +static int vxlan_rcv(struct vxlan_handler *vh, struct sk_buff *skb, __be32 > vx_vni) > { > - struct vxlan_port *vxlan_vport; > - struct vxlanhdr *vxh; > + struct vport *vport = vh->data; > struct iphdr *iph; > struct ovs_key_ipv4_tunnel tun_key; > __be64 key; > > - vxlan_vport = vxlan_find_port(dev_net(skb->dev), udp_hdr(skb)->dest); > - if (unlikely(!vxlan_vport)) > - goto error; > - > - if (unlikely(!pskb_may_pull(skb, VXLAN_HLEN + ETH_HLEN))) > - goto error; > - > - vxh = vxlan_hdr(skb); > - if (unlikely(vxh->vx_flags != htonl(VXLAN_FLAGS) || > - vxh->vx_vni & htonl(0xff))) > - goto error; > - > - skb_pull_rcsum(skb, VXLAN_HLEN); > - > - key = cpu_to_be64(ntohl(vxh->vx_vni) >> 8); > - > /* Save outer tunnel values */ > iph = ip_hdr(skb); > + key = cpu_to_be64(ntohl(vx_vni) >> 8); > tnl_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); > > - ovs_tnl_rcv(vport_from_priv(vxlan_vport), skb, &tun_key); > - goto out; > - > -error: > - kfree_skb(skb); > -out: > - return 0; > -} > - > -/* Random value. Irrelevant as long as it's not 0 since we set the handler. > */ > -#define UDP_ENCAP_VXLAN 1 > -static int vxlan_socket_init(struct vxlan_port *vxlan_port, struct net *net) > -{ > - struct sockaddr_in sin; > - int err; > - > - err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, > - &vxlan_port->vxlan_rcv_socket); > - if (err) > - goto error; > - > - /* release net ref. */ > - sk_change_net(vxlan_port->vxlan_rcv_socket->sk, net); > - > - sin.sin_family = AF_INET; > - sin.sin_addr.s_addr = htonl(INADDR_ANY); > - sin.sin_port = vxlan_port->dst_port; > - > - err = kernel_bind(vxlan_port->vxlan_rcv_socket, (struct sockaddr *)&sin, > - sizeof(struct sockaddr_in)); > - if (err) > - goto error_sock; > - > - udp_sk(vxlan_port->vxlan_rcv_socket->sk)->encap_type = UDP_ENCAP_VXLAN; > - udp_sk(vxlan_port->vxlan_rcv_socket->sk)->encap_rcv = vxlan_rcv; > - > - udp_encap_enable(); > - > - return 0; > - > -error_sock: > - sk_release_kernel(vxlan_port->vxlan_rcv_socket->sk); > -error: > - pr_warn("cannot register vxlan protocol handler\n"); > - return err; > + ovs_vport_receive(vport, skb, &tun_key); > + return PACKET_RCVD; > } > > static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) > { > struct vxlan_port *vxlan_port = vxlan_vport(vport); > + __be16 dst_port = inet_sport(vxlan_port->vh->vs->sock->sk); > > - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, > ntohs(vxlan_port->dst_port))) > + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) > return -EMSGSIZE; > return 0; > } > @@ -197,9 +93,7 @@ static void vxlan_tnl_destroy(struct vport *vport) > { > struct vxlan_port *vxlan_port = vxlan_vport(vport); > > - list_del_rcu(&vxlan_port->list); > - /* Release socket */ > - sk_release_kernel(vxlan_port->vxlan_rcv_socket->sk); > + vxlan_handler_put(vxlan_port->vh); > > ovs_vport_deferred_free(vport); > } > @@ -209,10 +103,11 @@ static struct vport *vxlan_tnl_create(const struct > vport_parms *parms) > struct net *net = ovs_dp_get_net(parms->dp); > struct nlattr *options = parms->options; > struct vxlan_port *vxlan_port; > + struct vxlan_handler *vh; > struct vport *vport; > struct nlattr *a; > - int err; > u16 dst_port; > + int err; > > if (!options) { > err = -EINVAL; > @@ -227,41 +122,79 @@ static struct vport *vxlan_tnl_create(const struct > vport_parms *parms) > goto error; > } > > - /* Verify if we already have a socket created for this port */ > - if (vxlan_find_port(net, htons(dst_port))) { > - err = -EEXIST; > - goto error; > - } > - > vport = ovs_vport_alloc(sizeof(struct vxlan_port), > &ovs_vxlan_vport_ops, parms); > if (IS_ERR(vport)) > return vport; > > vxlan_port = vxlan_vport(vport); > - vxlan_port->dst_port = htons(dst_port); > strncpy(vxlan_port->name, parms->name, IFNAMSIZ); > > - err = vxlan_socket_init(vxlan_port, net); > - if (err) > - goto error_free; > + vh = vxlan_handler_add(net, htons(dst_port), vxlan_rcv, > + vport, OVS_VXLAN_RCV_PRIORITY, true); > + if (IS_ERR(vh)) { > + ovs_vport_free(vport); > + return (void *)vh; > + } > + vxlan_port->vh = vh; > > - list_add_tail_rcu(&vxlan_port->list, &vxlan_ports); > return vport; > > -error_free: > - ovs_vport_free(vport); > error: > return ERR_PTR(err); > } > > static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) > { > - if (unlikely(!OVS_CB(skb)->tun_key)) > - return -EINVAL; > + struct vxlan_port *vxlan_port = vxlan_vport(vport); > + __be16 dst_port = inet_sport(vxlan_port->vh->vs->sock->sk); > + struct net *net = ovs_dp_get_net(vport->dp); > + struct rtable *rt; > + __be16 src_port; > + __be32 saddr; > + __be16 df; > + int port_min; > + int port_max; > + int err; > + > + if (unlikely(!OVS_CB(skb)->tun_key)) { > + err = -EINVAL; > + goto error; > + } > > - return ovs_tnl_send(vport, skb, IPPROTO_UDP, > - VXLAN_HLEN, vxlan_build_header); > + forward_ip_summed(skb, true); > + > + /* Route lookup */ > + saddr = OVS_CB(skb)->tun_key->ipv4_src; > + rt = find_route(ovs_dp_get_net(vport->dp), > + &saddr, > + OVS_CB(skb)->tun_key->ipv4_dst, > + IPPROTO_UDP, > + OVS_CB(skb)->tun_key->ipv4_tos, > + skb_get_mark(skb)); > + if (IS_ERR(rt)) { > + err = PTR_ERR(rt); > + goto error; > + } > + > + df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? > + htons(IP_DF) : 0; > + > + skb->local_df = 1; > + > + inet_get_local_port_range(&port_min, &port_max); > + src_port = vxlan_src_port(port_min, port_max, skb); > + > + err = vxlan_xmit_skb(net, vxlan_port->vh, rt, skb, > + saddr, OVS_CB(skb)->tun_key->ipv4_dst, > + OVS_CB(skb)->tun_key->ipv4_tos, > + OVS_CB(skb)->tun_key->ipv4_ttl, df, > + src_port, dst_port, > + htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << > 8)); > + if (err < 0) > + ip_rt_put(rt); > +error: > + return err; > } > > static const char *vxlan_get_name(const struct vport *vport) > -- > 1.7.1 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > http://openvswitch.org/mailman/listinfo/dev _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev