The Stateless TCP Tunnel (STT) protocol encapsulates traffic in IPv4/TCP packets. STT uses the TCP segmentation offload available in most NICs. On packet transmit, the STT driver prepends an STT header along with a TCP header to the packet. For a GSO packet, the GSO parameters are set according to the tunnel configuration and the packet is handed over to the networking stack. This allows use of the segmentation offload available in NICs.
The protocol is documented at http://www.ietf.org/archive/id/draft-davie-stt-06.txt Signed-off-by: Pravin B Shelar <pshe...@nicira.com> Signed-off-by: Jesse Gross <je...@nicira.com> --- Fixed according to comments from Jesse. --- FAQ.md | 1 + NEWS | 1 + datapath/Modules.mk | 2 + datapath/linux/.gitignore | 2 + datapath/linux/Modules.mk | 2 + datapath/linux/compat/gso.h | 9 + datapath/linux/compat/include/linux/openvswitch.h | 1 + datapath/linux/compat/include/net/stt.h | 65 + datapath/linux/compat/stt.c | 1545 +++++++++++++++++++++ datapath/vport-stt.c | 231 +++ lib/dpif-netlink.c | 5 + lib/netdev-vport.c | 17 +- ofproto/ofproto-dpif-ipfix.c | 11 +- vswitchd/vswitch.xml | 29 +- 14 files changed, 1909 insertions(+), 12 deletions(-) create mode 100644 datapath/linux/compat/include/net/stt.h create mode 100644 datapath/linux/compat/stt.c create mode 100644 datapath/vport-stt.c diff --git a/FAQ.md b/FAQ.md index 21d4e7a..761cffb 100644 --- a/FAQ.md +++ b/FAQ.md @@ -209,6 +209,7 @@ A: Support for tunnels was added to the upstream Linux kernel module | VXLAN | 3.12 | Geneve | 3.18 | LISP | <not upstream> +| STT | <not upstream> If you are using a version of the kernel that is older than the one listed above, it is still possible to use that tunnel protocol. However, diff --git a/NEWS b/NEWS index 87460a7..6e5e070 100644 --- a/NEWS +++ b/NEWS @@ -77,6 +77,7 @@ Post-v2.3.0 numbers. OpenFlow is 6653 and OVSDB is 6640. - Support for DPDK vHost. - Support for outer UDP checksums in Geneve and VXLAN. + - Support for STT tunneling. 
v2.3.0 - 14 Aug 2014 diff --git a/datapath/Modules.mk b/datapath/Modules.mk index e9f4079..2a3acb0 100644 --- a/datapath/Modules.mk +++ b/datapath/Modules.mk @@ -9,6 +9,7 @@ both_modules = \ vport_geneve \ vport_gre \ vport_lisp \ + vport_stt \ vport_vxlan build_modules = $(both_modules) # Modules to build dist_modules = $(both_modules) # Modules to distribute @@ -28,6 +29,7 @@ vport_geneve_sources = vport-geneve.c vport_vxlan_sources = vport-vxlan.c vport_gre_sources = vport-gre.c vport_lisp_sources = vport-lisp.c +vport_stt_sources = vport-stt.c openvswitch_headers = \ compat.h \ diff --git a/datapath/linux/.gitignore b/datapath/linux/.gitignore index 69d6658..a4a930d 100644 --- a/datapath/linux/.gitignore +++ b/datapath/linux/.gitignore @@ -35,6 +35,7 @@ /random32.c /reciprocal_div.c /skbuff-openvswitch.c +/stt.c /table.c /time.c /tmp @@ -50,6 +51,7 @@ /vport-lisp.c /vport-netdev.c /vport-patch.c +/vport-stt.c /vport-vxlan.c /vport.c /vxlan.c diff --git a/datapath/linux/Modules.mk b/datapath/linux/Modules.mk index 7d9710d..b6e44f6 100644 --- a/datapath/linux/Modules.mk +++ b/datapath/linux/Modules.mk @@ -12,6 +12,7 @@ openvswitch_sources += \ linux/compat/net_namespace.c \ linux/compat/reciprocal_div.c \ linux/compat/skbuff-openvswitch.c \ + linux/compat/stt.c \ linux/compat/udp.c \ linux/compat/udp_tunnel.c \ linux/compat/vxlan.c \ @@ -76,5 +77,6 @@ openvswitch_headers += \ linux/compat/include/net/udp.h \ linux/compat/include/net/udp_tunnel.h \ linux/compat/include/net/sock.h \ + linux/compat/include/net/stt.h \ linux/compat/include/net/vxlan.h \ linux/compat/include/net/sctp/checksum.h diff --git a/datapath/linux/compat/gso.h b/datapath/linux/compat/gso.h index 337d13a..980b899 100644 --- a/datapath/linux/compat/gso.h +++ b/datapath/linux/compat/gso.h @@ -26,6 +26,15 @@ struct ovs_gso_cb { }; #define OVS_GSO_CB(skb) ((struct ovs_gso_cb *)(skb)->cb) +static inline void skb_clear_ovs_gso_cb(struct sk_buff *skb) +{ + OVS_GSO_CB(skb)->fix_segment = NULL; +} 
+#else +static inline void skb_clear_ovs_gso_cb(struct sk_buff *skb) +{ + +} #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h index 61c59a2..f53bc81 100644 --- a/datapath/linux/compat/include/linux/openvswitch.h +++ b/datapath/linux/compat/include/linux/openvswitch.h @@ -229,6 +229,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_GENEVE, /* Geneve tunnel. */ OVS_VPORT_TYPE_GRE64 = 104, /* GRE tunnel with 64-bit keys */ OVS_VPORT_TYPE_LISP = 105, /* LISP tunnel */ + OVS_VPORT_TYPE_STT = 106, /* STT tunnel */ __OVS_VPORT_TYPE_MAX }; diff --git a/datapath/linux/compat/include/net/stt.h b/datapath/linux/compat/include/net/stt.h new file mode 100644 index 0000000..68a1ff0 --- /dev/null +++ b/datapath/linux/compat/include/net/stt.h @@ -0,0 +1,65 @@ +#ifndef __NET_STT_H +#define __NET_STT_H 1 + +#include <linux/kconfig.h> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0) && IS_ENABLED(CONFIG_NETFILTER) +#include <net/ip_tunnels.h> +#define OVS_STT + +struct stthdr { + __u8 version; + __u8 flags; + __u8 l4_offset; + __u8 reserved; + __be16 mss; + __be16 vlan_tci; + __be64 key; +}; + +/* Padding after the end of the tunnel headers to provide alignment + * for inner packet IP header after 14 byte Ethernet header. + */ +#define STT_ETH_PAD 2 + +#define STT_BASE_HLEN (sizeof(struct stthdr) + STT_ETH_PAD) +#define STT_HEADER_LEN (sizeof(struct tcphdr) + STT_BASE_HLEN) + +static inline struct stthdr *stt_hdr(const struct sk_buff *skb) +{ + return (struct stthdr *)(skb_transport_header(skb) + + sizeof(struct tcphdr)); +} + +struct stt_sock; +typedef void (stt_rcv_t)(struct stt_sock *stt_sock, struct sk_buff *skb); + +/* @list: Per-net list of STT ports. 
+ * @rcv: The callback is called on STT packet recv, STT reassembly can generate + * multiple packets, in this case first packet has tunnel outer header, rest + * of the packets are inner packet segments with no stt header. + * @rcv_data: user data. + * @sock: Fake TCP socket for the STT port. + */ +struct stt_sock { + struct list_head list; + stt_rcv_t *rcv; + void *rcv_data; + struct socket *sock; + struct rcu_head rcu; +}; + +struct stt_sock *stt_sock_add(struct net *net, __be16 port, + stt_rcv_t *rcv, void *data); + +void stt_sock_release(struct stt_sock *stt_sock); + +int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt, + __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be64 tun_id); + +int stt_init_module(void); +void stt_cleanup_module(void); + +#endif +#endif /*ifdef__NET_STT_H */ diff --git a/datapath/linux/compat/stt.c b/datapath/linux/compat/stt.c new file mode 100644 index 0000000..f294ecd --- /dev/null +++ b/datapath/linux/compat/stt.c @@ -0,0 +1,1545 @@ +/* + * Stateless TCP Tunnel (STT) vport. + * + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <asm/unaligned.h> + +#include <linux/delay.h> +#include <linux/flex_array.h> +#include <linux/if.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/jhash.h> +#include <linux/list.h> +#include <linux/log2.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/percpu.h> +#include <linux/skbuff.h> +#include <linux/tcp.h> +#include <linux/workqueue.h> + +#include <net/icmp.h> +#include <net/inet_ecn.h> +#include <net/ip.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/sock.h> +#include <net/stt.h> +#include <net/tcp.h> +#include <net/udp.h> + +#include "gso.h" + +#ifdef OVS_STT +#define STT_VER 0 + +#define STT_CSUM_VERIFIED BIT(0) +#define STT_CSUM_PARTIAL BIT(1) +#define STT_PROTO_IPV4 BIT(2) +#define STT_PROTO_TCP BIT(3) +#define STT_PROTO_TYPES (STT_PROTO_IPV4 | STT_PROTO_TCP) + +#define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | \ + SKB_GSO_TCPV6) + +/* The length and offset of a fragment are encoded in the sequence number. + * STT_SEQ_LEN_SHIFT is the left shift needed to store the length. + * STT_SEQ_OFFSET_MASK is the mask to extract the offset. + */ +#define STT_SEQ_LEN_SHIFT 16 +#define STT_SEQ_OFFSET_MASK (BIT(STT_SEQ_LEN_SHIFT) - 1) + +/* The maximum amount of memory used to store packets waiting to be reassembled + * on a given CPU. Once this threshold is exceeded we will begin freeing the + * least recently used fragments. + */ +#define REASM_HI_THRESH (4 * 1024 * 1024) +/* The target for the high memory evictor. Once we have exceeded + * REASM_HI_THRESH, we will continue freeing fragments until we hit + * this limit. + */ +#define REASM_LO_THRESH (3 * 1024 * 1024) +/* The length of time a given packet has to be reassembled from the time the + * first fragment arrives. Once this limit is exceeded it becomes available + * for cleaning. + */ +#define FRAG_EXP_TIME (30 * HZ) +/* Number of hash entries. 
Each entry has only a single slot to hold a packet + * so if there are collisions, we will drop packets. This is allocated + * per-cpu and each entry consists of struct pkt_frag. + */ +#define FRAG_HASH_SHIFT 8 +#define FRAG_HASH_ENTRIES BIT(FRAG_HASH_SHIFT) +#define FRAG_HASH_SEGS ((sizeof(u32) * 8) / FRAG_HASH_SHIFT) + +#define CLEAN_PERCPU_INTERVAL (30 * HZ) + +struct pkt_key { + __be32 saddr; + __be32 daddr; + __be32 pkt_seq; + u32 mark; +}; + +struct pkt_frag { + struct sk_buff *skbs; + unsigned long timestamp; + struct list_head lru_node; + struct pkt_key key; +}; + +struct stt_percpu { + struct flex_array *frag_hash; + struct list_head frag_lru; + unsigned int frag_mem_used; + + /* Protect frags table. */ + spinlock_t lock; +}; + +struct first_frag { + struct sk_buff *last_skb; + unsigned int mem_used; + u16 tot_len; + u16 rcvd_len; + bool ecn_ce; +}; + +struct frag_skb_cb { + u16 offset; + + /* Only valid for the first skb in the chain. */ + struct first_frag first; +}; + +#define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb) + +/* per-network namespace private data for this module */ +struct stt_net { + struct list_head sock_list; +}; + +static int stt_net_id; + +static struct stt_percpu __percpu *stt_percpu_data __read_mostly; +static u32 frag_hash_seed __read_mostly; + +/* Protects sock-hash and refcounts. 
*/ +static DEFINE_MUTEX(stt_mutex); + +static int n_tunnels; +static DEFINE_PER_CPU(u32, pkt_seq_counter); + +static void clean_percpu(struct work_struct *work); +static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu); + +static struct stt_sock *stt_find_sock(struct net *net, __be16 port) +{ + struct stt_net *sn = net_generic(net, stt_net_id); + struct stt_sock *stt_sock; + + list_for_each_entry_rcu(stt_sock, &sn->sock_list, list) { + if (inet_sk(stt_sock->sock->sk)->inet_sport == port) + return stt_sock; + } + return NULL; +} + +static __be32 ack_seq(void) +{ +#if NR_CPUS <= 65536 + u32 pkt_seq, ack; + + pkt_seq = this_cpu_read(pkt_seq_counter); + ack = pkt_seq << ilog2(NR_CPUS) | smp_processor_id(); + this_cpu_inc(pkt_seq_counter); + + return (__force __be32)ack; +#else +#error "Support for greater than 64k CPUs not implemented" +#endif +} + +static int clear_gso(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + int err; + + if (shinfo->gso_type == 0 && shinfo->gso_size == 0 && + shinfo->gso_segs == 0) + return 0; + + err = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(err)) + return err; + + shinfo = skb_shinfo(skb); + shinfo->gso_type = 0; + shinfo->gso_size = 0; + shinfo->gso_segs = 0; + return 0; +} + +static struct sk_buff *normalize_frag_list(struct sk_buff *head, + struct sk_buff **skbp) +{ + struct sk_buff *skb = *skbp; + struct sk_buff *last; + + do { + struct sk_buff *frags; + + if (skb_shared(skb)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (unlikely(!nskb)) + return ERR_PTR(-ENOMEM); + + nskb->next = skb->next; + consume_skb(skb); + skb = nskb; + *skbp = skb; + } + + if (head) { + head->len -= skb->len; + head->data_len -= skb->len; + head->truesize -= skb->truesize; + } + + frags = skb_shinfo(skb)->frag_list; + if (frags) { + int err; + + err = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(err)) + return ERR_PTR(err); + + last = normalize_frag_list(skb, &frags); + if (IS_ERR(last)) + return last; + + 
skb_shinfo(skb)->frag_list = NULL; + last->next = skb->next; + skb->next = frags; + } else { + last = skb; + } + + skbp = &skb->next; + } while ((skb = skb->next)); + + return last; +} + +/* Takes a linked list of skbs, which potentially contain frag_list + * (whose members in turn potentially contain frag_lists, etc.) and + * converts them into a single linear linked list. + */ +static int straighten_frag_list(struct sk_buff **skbp) +{ + struct sk_buff *err_skb; + + err_skb = normalize_frag_list(NULL, skbp); + if (IS_ERR(err_skb)) + return PTR_ERR(err_skb); + + return 0; +} + +static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from) +{ + to->tstamp = from->tstamp; + to->priority = from->priority; + to->mark = from->mark; + to->vlan_tci = from->vlan_tci; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + to->vlan_proto = from->vlan_proto; +#endif + skb_copy_secmark(to, from); + to->protocol = from->protocol; +} + +static void update_headers(struct sk_buff *skb, bool head, + unsigned int l4_offset, unsigned int hdr_len, + bool ipv4, u32 tcp_seq) +{ + u16 old_len, new_len; + __be32 delta; + struct tcphdr *tcph; + int gso_size; + + if (ipv4) { + struct iphdr *iph = (struct iphdr *)(skb->data + ETH_HLEN); + + old_len = ntohs(iph->tot_len); + new_len = skb->len - ETH_HLEN; + iph->tot_len = htons(new_len); + + ip_send_check(iph); + } else { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + ETH_HLEN); + + old_len = ntohs(ip6h->payload_len); + new_len = skb->len - ETH_HLEN - sizeof(struct ipv6hdr); + ip6h->payload_len = htons(new_len); + } + + tcph = (struct tcphdr *)(skb->data + l4_offset); + if (!head) { + tcph->seq = htonl(tcp_seq); + tcph->cwr = 0; + } + + if (skb->next) { + tcph->fin = 0; + tcph->psh = 0; + } + + delta = htonl(~old_len + new_len); + tcph->check = ~csum_fold((__force __wsum)((__force u32)tcph->check + + (__force u32)delta)); + + gso_size = skb_shinfo(skb)->gso_size; + if (gso_size && skb->len - hdr_len <= gso_size) + 
BUG_ON(clear_gso(skb)); + + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)) + skb->data_len; +} + +static bool can_segment(struct sk_buff *head, bool ipv4, bool tcp, bool csum_partial) +{ + /* If no offloading is in use then we don't have enough information + * to process the headers. + */ + if (!csum_partial) + goto linearize; + + /* Handling UDP packets requires IP fragmentation, which means that + * the L4 checksum can no longer be calculated by hardware (since the + * fragments are in different packets. If we have to compute the + * checksum it's faster just to linearize and large UDP packets are + * pretty uncommon anyways, so it's not worth dealing with for now. + */ + if (!tcp) + goto linearize; + + if (ipv4) { + struct iphdr *iph = (struct iphdr *)(head->data + ETH_HLEN); + + /* It's difficult to get the IP IDs exactly right here due to + * varying segment sizes and potentially multiple layers of + * segmentation. IP ID isn't important when DF is set and DF + * is generally set for TCP packets, so just linearize if it's + * not. + */ + if (!(iph->frag_off & htons(IP_DF))) + goto linearize; + } else { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(head->data + ETH_HLEN); + + /* Jumbograms require more processing to update and we'll + * probably never see them, so just linearize. + */ + if (ip6h->payload_len == 0) + goto linearize; + } + return true; + +linearize: + return false; +} + +static int copy_headers(struct sk_buff *head, struct sk_buff *frag, + int hdr_len) +{ + u16 csum_start; + + if (skb_cloned(frag) || skb_headroom(frag) < hdr_len) { + int extra_head = hdr_len - skb_headroom(frag); + + extra_head = extra_head > 0 ? 
extra_head : 0; + if (unlikely(pskb_expand_head(frag, extra_head, 0, + GFP_ATOMIC))) + return -ENOMEM; + } + + memcpy(__skb_push(frag, hdr_len), head->data, hdr_len); + + csum_start = head->csum_start - skb_headroom(head); + frag->csum_start = skb_headroom(frag) + csum_start; + frag->csum_offset = head->csum_offset; + frag->ip_summed = head->ip_summed; + + skb_shinfo(frag)->gso_size = skb_shinfo(head)->gso_size; + skb_shinfo(frag)->gso_type = skb_shinfo(head)->gso_type; + skb_shinfo(frag)->gso_segs = 0; + + copy_skb_metadata(frag, head); + return 0; +} + +static int skb_list_linearize(struct sk_buff *head, gfp_t gfp_mask) +{ + struct sk_buff *skb; + int tlen = 0; + int err; + + skb = head; + while (skb) { + tlen += skb->len; + skb = skb->next; + } + + tlen -= skb_headlen(head); + err = pskb_expand_head(head, 0, tlen, gfp_mask); + if (err) + return err; + + if (!__pskb_pull_tail(head, (head->len - skb_headlen(head)))) + return -ENOMEM; + + skb = head->next; + while (skb) { + err = skb_copy_bits(skb, 0, skb_tail_pointer(head), skb->len); + if (err) + return err; + head->tail += skb->len; + skb = skb->next; + } + kfree_skb_list(head->next); + head->next = NULL; + head->len = tlen; + return 0; +} + +static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset) +{ + struct sk_buff *skb; + struct tcphdr *tcph; + int seg_len; + int hdr_len; + int tcp_len; + u32 seq; + + if (unlikely(!pskb_may_pull(head, l4_offset + sizeof(*tcph)))) + return -ENOMEM; + + tcph = (struct tcphdr *)(head->data + l4_offset); + tcp_len = tcph->doff * 4; + hdr_len = l4_offset + tcp_len; + + if (unlikely((tcp_len < sizeof(struct tcphdr)) || + (head->len < hdr_len))) + return -EINVAL; + + if (unlikely(!pskb_may_pull(head, hdr_len))) + return -ENOMEM; + + tcph = (struct tcphdr *)(head->data + l4_offset); + /* Update header of each segment. 
*/ + seq = ntohl(tcph->seq); + seg_len = head->len - hdr_len; + for (skb = head->next; skb; skb = skb->next) { + int err; + + seq += seg_len; + seg_len = skb->len; + err = copy_headers(head, skb, hdr_len); + if (err) + return err; + update_headers(skb, false, l4_offset, hdr_len, ipv4, seq); + } + update_headers(head, true, l4_offset, hdr_len, ipv4, 0); + return 0; +} + +static int coalesce_skb(struct sk_buff **headp) +{ + struct sk_buff *prev, *skb; + int err; + + err = straighten_frag_list(headp); + if (unlikely(err)) + return err; + + /* Coalesce skb list. */ + prev = *headp; + for (skb = (*headp)->next; skb; skb = skb->next) { + bool headstolen; + int delta; + + if (unlikely(skb_unclone(prev, GFP_ATOMIC))) + return -ENOMEM; + + if (!skb_try_coalesce(prev, skb, &headstolen, &delta)) { + prev = skb; + continue; + } + + prev->next = skb->next; + kfree_skb_partial(skb, headstolen); + skb = prev; + } + return 0; +} + +static int __try_to_segment(struct sk_buff *skb, bool csum_partial, + bool ipv4, bool tcp, int l4_offset) +{ + if (can_segment(skb, ipv4, tcp, csum_partial)) + return skb_list_segment(skb, ipv4, l4_offset); + else + return skb_list_linearize(skb, GFP_ATOMIC); +} + +static int try_to_segment(struct sk_buff *skb) +{ + struct stthdr *stth = stt_hdr(skb); + bool csum_partial = !!(stth->flags & STT_CSUM_PARTIAL); + bool ipv4 = !!(stth->flags & STT_PROTO_IPV4); + bool tcp = !!(stth->flags & STT_PROTO_TCP); + int l4_offset = stth->l4_offset; + + return __try_to_segment(skb, csum_partial, ipv4, tcp, l4_offset); +} + +static int segment_skb(struct sk_buff **headp, bool ipv4, bool tcp, int l4_offset) +{ + int err; + + err = coalesce_skb(headp); + if (err) + return err; + + if ((*headp)->next) + return __try_to_segment(*headp, true, ipv4, tcp, l4_offset); + return 0; +} + +static int __push_stt_header(struct sk_buff *skb, __be64 tun_id, + __be16 s_port, __be16 d_port, + __be32 saddr, __be32 dst, + __be16 l3_proto, u8 l4_proto, + int dst_mtu) +{ + int data_len = 
skb->len + sizeof(struct stthdr) + STT_ETH_PAD; + unsigned short encap_mss; + struct tcphdr *tcph; + struct stthdr *stth; + + skb_push(skb, STT_HEADER_LEN); + skb_reset_transport_header(skb); + tcph = tcp_hdr(skb); + memset(tcph, 0, STT_HEADER_LEN); + stth = stt_hdr(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + stth->flags |= STT_CSUM_PARTIAL; + + stth->l4_offset = skb->csum_start - + (skb_headroom(skb) + + STT_HEADER_LEN); + + if (l3_proto == htons(ETH_P_IP)) + stth->flags |= STT_PROTO_IPV4; + + if (l4_proto == IPPROTO_TCP) + stth->flags |= STT_PROTO_TCP; + + stth->mss = htons(skb_shinfo(skb)->gso_size); + } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + stth->flags |= STT_CSUM_VERIFIED; + } + + stth->vlan_tci = htons(skb->vlan_tci); + skb->vlan_tci = 0; + put_unaligned(tun_id, &stth->key); + + tcph->source = s_port; + tcph->dest = d_port; + tcph->doff = sizeof(struct tcphdr) / 4; + tcph->ack = 1; + tcph->psh = 1; + tcph->window = htons(USHRT_MAX); + tcph->seq = htonl(data_len << STT_SEQ_LEN_SHIFT); + tcph->ack_seq = ack_seq(); + tcph->check = ~tcp_v4_check(skb->len, saddr, dst, 0); + + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + + encap_mss = dst_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr); + if (data_len > encap_mss) { + if (unlikely(skb_unclone(skb, GFP_ATOMIC))) + return -EINVAL; + + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + skb_shinfo(skb)->gso_size = encap_mss; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(data_len, encap_mss); + } else { + if (unlikely(clear_gso(skb))) + return -EINVAL; + } + return 0; +} + +static struct sk_buff *push_stt_header(struct sk_buff *head, __be64 tun_id, + __be16 s_port, __be16 d_port, + __be32 saddr, __be32 dst, + __be16 l3_proto, u8 l4_proto, + int dst_mtu) +{ + struct sk_buff *skb; + + if (skb_shinfo(head)->frag_list) { + bool ipv4 = (l3_proto == htons(ETH_P_IP)); + bool tcp = (l4_proto == 
IPPROTO_TCP); + int l4_offset = skb_transport_offset(head); + + if (unlikely(segment_skb(&head, ipv4, tcp, l4_offset))) + goto error; + } + + for (skb = head; skb; skb = skb->next) { + if (__push_stt_header(skb, tun_id, s_port, d_port, saddr, dst, + l3_proto, l4_proto, dst_mtu)) + goto error; + } + + return head; +error: + kfree_skb_list(head); + return NULL; +} + +static bool stt_can_offload(struct sk_buff *skb, __be16 l3_proto, u8 l4_proto) +{ + if (skb_is_gso(skb) && skb->ip_summed != CHECKSUM_PARTIAL) { + int csum_offset; + __sum16 *csum; + int len; + + if (l4_proto == IPPROTO_TCP) + csum_offset = offsetof(struct tcphdr, check); + else if (l4_proto == IPPROTO_UDP) + csum_offset = offsetof(struct udphdr, check); + else + return false; + + len = skb->len - skb_transport_offset(skb); + csum = (__sum16 *)(skb_transport_header(skb) + csum_offset); + + if (l3_proto == htons(ETH_P_IP)) { + struct iphdr *iph = ip_hdr(skb); + + *csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + len, l4_proto, 0); + } else if (l3_proto == htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h = ipv6_hdr(skb); + + *csum = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + len, l4_proto, 0); + } else { + return false; + } + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = csum_offset; + skb->ip_summed = CHECKSUM_PARTIAL; + } + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + /* Assume receiver can only offload TCP/UDP over IPv4/6, + * and require 802.1Q VLANs to be accelerated. + */ + if (l3_proto != htons(ETH_P_IP) && + l3_proto != htons(ETH_P_IPV6)) + return false; + + if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP) + return false; + + /* L4 offset must fit in a 1-byte field. */ + if (skb->csum_start - skb_headroom(skb) > 255) + return false; + + if (skb_shinfo(skb)->gso_type & ~SUPPORTED_GSO_TYPES) + return false; + } + /* Total size of encapsulated packet must fit in 16 bits. 
*/ + if (skb->len + STT_HEADER_LEN + sizeof(struct iphdr) > 65535) + return false; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) + return false; +#endif + return true; +} + +static bool need_linearize(const struct sk_buff *skb) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + int i; + + if (unlikely(shinfo->frag_list)) + return true; + + /* Generally speaking we should linearize if there are paged frags. + * However, if all of the refcounts are 1 we know nobody else can + * change them from underneath us and we can skip the linearization. + */ + for (i = 0; i < shinfo->nr_frags; i++) + if (unlikely(page_count(skb_frag_page(&shinfo->frags[i])) > 1)) + return true; + + return false; +} + +static struct sk_buff *handle_offloads(struct sk_buff *skb, int min_headroom) +{ + int err; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) { + + min_headroom += VLAN_HLEN; + if (skb_headroom(skb) < min_headroom) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + 16); + + err = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(err)) + goto error; + } + + skb = __vlan_hwaccel_push_inside(skb); + if (!skb) { + err = -ENOMEM; + goto error; + } + } +#endif + + if (skb_is_gso(skb)) { + struct sk_buff *nskb; + char cb[sizeof(skb->cb)]; + + memcpy(cb, skb->cb, sizeof(cb)); + + nskb = __skb_gso_segment(skb, 0, false); + if (IS_ERR(nskb)) { + err = PTR_ERR(nskb); + goto error; + } + + consume_skb(skb); + skb = nskb; + while (nskb) { + memcpy(nskb->cb, cb, sizeof(cb)); + nskb = nskb->next; + } + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + /* Pages aren't locked and could change at any time. + * If this happens after we compute the checksum, the + * checksum will be wrong. We linearize now to avoid + * this problem. 
+ */ + if (unlikely(need_linearize(skb))) { + err = __skb_linearize(skb); + if (unlikely(err)) + goto error; + } + + err = skb_checksum_help(skb); + if (unlikely(err)) + goto error; + } + skb->ip_summed = CHECKSUM_NONE; + + return skb; +error: + kfree_skb(skb); + return ERR_PTR(err); +} + +static int skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src, + __be32 dst, __u8 tos, __u8 ttl, __be16 df) +{ + int len = 0; + + while (skb) { + struct sk_buff *next = skb->next; + + if (next) + dst_clone(&rt->dst); + + skb_clear_ovs_gso_cb(skb); + skb->next = NULL; + len += iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP, + tos, ttl, df, false); + + skb = next; + } + return len; +} + +static u8 parse_ipv6_l4_proto(struct sk_buff *skb) +{ + unsigned int nh_ofs = skb_network_offset(skb); + int payload_ofs; + struct ipv6hdr *nh; + uint8_t nexthdr; + __be16 frag_off; + + if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(*nh)))) + return 0; + + nh = ipv6_hdr(skb); + nexthdr = nh->nexthdr; + payload_ofs = (u8 *)(nh + 1) - skb->data; + + payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); + if (unlikely(payload_ofs < 0)) + return 0; + + return nexthdr; +} + +static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto) +{ + if (l3_proto == htons(ETH_P_IP)) { + unsigned int nh_ofs = skb_network_offset(skb); + + if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct iphdr)))) + return 0; + + return ip_hdr(skb)->protocol; + } else if (l3_proto == htons(ETH_P_IPV6)) { + return parse_ipv6_l4_proto(skb); + } + return 0; +} + +int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt, + __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be64 tun_id) +{ + struct ethhdr *eh = eth_hdr(skb); + int ret = 0, min_headroom; + __be16 inner_l3_proto; + u8 inner_l4_proto; + + inner_l3_proto = eh->h_proto; + inner_l4_proto = skb_get_l4_proto(skb, inner_l3_proto); + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + 
rt->dst.header_len + + STT_HEADER_LEN + sizeof(struct iphdr); + + if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + + 16); + + ret = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(ret)) + goto err_free_rt; + } + + if (!stt_can_offload(skb, inner_l3_proto, inner_l4_proto)) { + skb = handle_offloads(skb, min_headroom); + if (IS_ERR(skb)) { + ret = PTR_ERR(skb); + skb = NULL; + goto err_free_rt; + } + } + + while (skb) { + struct sk_buff *next_skb = skb->next; + + skb->next = NULL; + + if (next_skb) + dst_clone(&rt->dst); + + /* Push STT and TCP header. */ + skb = push_stt_header(skb, tun_id, src_port, dst_port, src, + dst, inner_l3_proto, inner_l4_proto, + dst_mtu(&rt->dst)); + if (unlikely(!skb)) + goto next; + + /* Push IP header. */ + ret += skb_list_xmit(rt, skb, src, dst, tos, ttl, df); + +next: + skb = next_skb; + } + + return ret; + +err_free_rt: + ip_rt_put(rt); + kfree_skb(skb); + return ret; +} +EXPORT_SYMBOL_GPL(stt_xmit_skb); + +static void free_frag(struct stt_percpu *stt_percpu, + struct pkt_frag *frag) +{ + stt_percpu->frag_mem_used -= FRAG_CB(frag->skbs)->first.mem_used; + kfree_skb_list(frag->skbs); + list_del(&frag->lru_node); + frag->skbs = NULL; +} + +static void evict_frags(struct stt_percpu *stt_percpu) +{ + while (!list_empty(&stt_percpu->frag_lru) && + stt_percpu->frag_mem_used > REASM_LO_THRESH) { + struct pkt_frag *frag; + + frag = list_first_entry(&stt_percpu->frag_lru, + struct pkt_frag, + lru_node); + free_frag(stt_percpu, frag); + } +} + +static bool pkt_key_match(struct net *net, + const struct pkt_frag *a, const struct pkt_key *b) +{ + return a->key.saddr == b->saddr && a->key.daddr == b->daddr && + a->key.pkt_seq == b->pkt_seq && a->key.mark == b->mark && + net_eq(dev_net(a->skbs->dev), net); +} + +static u32 pkt_key_hash(const struct net *net, const struct pkt_key *key) +{ + u32 initval = frag_hash_seed ^ 
(u32)(unsigned long)net ^ key->mark; + + return jhash_3words((__force u32)key->saddr, (__force u32)key->daddr, + (__force u32)key->pkt_seq, initval); +} + +static struct pkt_frag *lookup_frag(struct net *net, + struct stt_percpu *stt_percpu, + const struct pkt_key *key, u32 hash) +{ + struct pkt_frag *frag, *victim_frag = NULL; + int i; + + for (i = 0; i < FRAG_HASH_SEGS; i++) { + frag = flex_array_get(stt_percpu->frag_hash, + hash & (FRAG_HASH_ENTRIES - 1)); + + if (frag->skbs && + time_before(jiffies, frag->timestamp + FRAG_EXP_TIME) && + pkt_key_match(net, frag, key)) + return frag; + + if (!victim_frag || + (victim_frag->skbs && + (!frag->skbs || + time_before(frag->timestamp, victim_frag->timestamp)))) + victim_frag = frag; + + hash >>= FRAG_HASH_SHIFT; + } + + if (victim_frag->skbs) + free_frag(stt_percpu, victim_frag); + + return victim_frag; +} + +static struct sk_buff *reassemble(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + struct tcphdr *tcph = tcp_hdr(skb); + u32 seq = ntohl(tcph->seq); + struct stt_percpu *stt_percpu; + struct sk_buff *last_skb; + struct pkt_frag *frag; + struct pkt_key key; + int tot_len; + u32 hash; + + tot_len = seq >> STT_SEQ_LEN_SHIFT; + FRAG_CB(skb)->offset = seq & STT_SEQ_OFFSET_MASK; + + if (unlikely(skb->len == 0)) + goto out_free; + + if (unlikely(FRAG_CB(skb)->offset + skb->len > tot_len)) + goto out_free; + + if (tot_len == skb->len) + goto out; + + key.saddr = iph->saddr; + key.daddr = iph->daddr; + key.pkt_seq = tcph->ack_seq; + key.mark = skb->mark; + hash = pkt_key_hash(dev_net(skb->dev), &key); + + stt_percpu = per_cpu_ptr(stt_percpu_data, smp_processor_id()); + + spin_lock(&stt_percpu->lock); + + if (unlikely(stt_percpu->frag_mem_used + skb->truesize > REASM_HI_THRESH)) + evict_frags(stt_percpu); + + frag = lookup_frag(dev_net(skb->dev), stt_percpu, &key, hash); + if (!frag->skbs) { + frag->skbs = skb; + frag->key = key; + frag->timestamp = jiffies; + FRAG_CB(skb)->first.last_skb = skb; + 
FRAG_CB(skb)->first.mem_used = skb->truesize; + FRAG_CB(skb)->first.tot_len = tot_len; + FRAG_CB(skb)->first.rcvd_len = skb->len; + FRAG_CB(skb)->first.ecn_ce = INET_ECN_is_ce(iph->tos); + list_add_tail(&frag->lru_node, &stt_percpu->frag_lru); + stt_percpu->frag_mem_used += skb->truesize; + + skb = NULL; + goto unlock; + } + + /* Optimize for the common case where fragments are received in-order + * and not overlapping. + */ + last_skb = FRAG_CB(frag->skbs)->first.last_skb; + if (likely(FRAG_CB(last_skb)->offset + last_skb->len == + FRAG_CB(skb)->offset)) { + last_skb->next = skb; + FRAG_CB(frag->skbs)->first.last_skb = skb; + } else { + struct sk_buff *prev = NULL, *next; + + for (next = frag->skbs; next; next = next->next) { + if (FRAG_CB(next)->offset >= FRAG_CB(skb)->offset) + break; + prev = next; + } + + /* Overlapping fragments aren't allowed. We shouldn't start + * before the end of the previous fragment. + */ + if (prev && + FRAG_CB(prev)->offset + prev->len > FRAG_CB(skb)->offset) + goto unlock_free; + + /* We also shouldn't end after the beginning of the next + * fragment. 
+ */ + if (next && + FRAG_CB(skb)->offset + skb->len > FRAG_CB(next)->offset) + goto unlock_free; + + if (prev) { + prev->next = skb; + } else { + FRAG_CB(skb)->first = FRAG_CB(frag->skbs)->first; + frag->skbs = skb; + } + + if (next) + skb->next = next; + else + FRAG_CB(frag->skbs)->first.last_skb = skb; + } + + FRAG_CB(frag->skbs)->first.ecn_ce |= INET_ECN_is_ce(iph->tos); + FRAG_CB(frag->skbs)->first.rcvd_len += skb->len; + FRAG_CB(frag->skbs)->first.mem_used += skb->truesize; + stt_percpu->frag_mem_used += skb->truesize; + + if (FRAG_CB(frag->skbs)->first.tot_len == + FRAG_CB(frag->skbs)->first.rcvd_len) { + struct sk_buff *frag_head = frag->skbs; + + frag_head->tstamp = skb->tstamp; + + list_del(&frag->lru_node); + stt_percpu->frag_mem_used -= FRAG_CB(frag_head)->first.mem_used; + frag->skbs = NULL; + skb = frag_head; + } else { + list_move_tail(&frag->lru_node, &stt_percpu->frag_lru); + skb = NULL; + } + + goto unlock; + +unlock_free: + kfree_skb(skb); + skb = NULL; +unlock: + spin_unlock(&stt_percpu->lock); + return skb; +out_free: + kfree_skb(skb); + skb = NULL; +out: + return skb; +} + +static bool validate_checksum(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + + if (skb_csum_unnecessary(skb)) + return true; + + if (skb->ip_summed == CHECKSUM_COMPLETE && + !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum)) + return true; + + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len, + IPPROTO_TCP, 0); + + return __tcp_checksum_complete(skb) == 0; +} + +static bool set_offloads(struct sk_buff *skb) +{ + struct stthdr *stth = stt_hdr(skb); + unsigned short gso_type; + int l3_header_size; + int l4_header_size; + u16 csum_offset; + u8 proto_type; + + if (stth->vlan_tci) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), + ntohs(stth->vlan_tci)); + + if (!(stth->flags & STT_CSUM_PARTIAL)) { + if (stth->flags & STT_CSUM_VERIFIED) + skb->ip_summed = CHECKSUM_UNNECESSARY; + else + skb->ip_summed = CHECKSUM_NONE; + + return 
clear_gso(skb) == 0; + } + + proto_type = stth->flags & STT_PROTO_TYPES; + + switch (proto_type) { + case (STT_PROTO_IPV4 | STT_PROTO_TCP): + /* TCP/IPv4 */ + csum_offset = offsetof(struct tcphdr, check); + gso_type = SKB_GSO_TCPV4; + l3_header_size = sizeof(struct iphdr); + l4_header_size = sizeof(struct tcphdr); + break; + case STT_PROTO_TCP: + /* TCP/IPv6 */ + csum_offset = offsetof(struct tcphdr, check); + gso_type = SKB_GSO_TCPV6; + l3_header_size = sizeof(struct ipv6hdr); + l4_header_size = sizeof(struct tcphdr); + skb->protocol = htons(ETH_P_IPV6); + break; + case STT_PROTO_IPV4: + /* UDP/IPv4 */ + csum_offset = offsetof(struct udphdr, check); + gso_type = SKB_GSO_UDP; + l3_header_size = sizeof(struct iphdr); + l4_header_size = sizeof(struct udphdr); + break; + default: + /* UDP/IPv6 */ + csum_offset = offsetof(struct udphdr, check); + gso_type = SKB_GSO_UDP; + l3_header_size = sizeof(struct ipv6hdr); + l4_header_size = sizeof(struct udphdr); + skb->protocol = htons(ETH_P_IPV6); + } + + if (unlikely(stth->l4_offset < ETH_HLEN + l3_header_size)) + return false; + + if (unlikely(!pskb_may_pull(skb, stth->l4_offset + l4_header_size))) + return false; + + stth = stt_hdr(skb); + + skb->csum_start = skb_headroom(skb) + stth->l4_offset; + skb->csum_offset = csum_offset; + skb->ip_summed = CHECKSUM_PARTIAL; + + if (stth->mss) { + if (unlikely(skb_unclone(skb, GFP_ATOMIC))) + return false; + + skb_shinfo(skb)->gso_type = gso_type | SKB_GSO_DODGY; + skb_shinfo(skb)->gso_size = ntohs(stth->mss); + skb_shinfo(skb)->gso_segs = 0; + } else { + if (unlikely(clear_gso(skb))) + return false; + } + + return true; +} +static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb) +{ + int err; + + if (unlikely(!validate_checksum(skb))) + goto drop; + + skb = reassemble(skb); + if (!skb) + return; + + if (skb->next) { + err = coalesce_skb(&skb); + if (err) + goto drop; + } + + err = iptunnel_pull_header(skb, + sizeof(struct stthdr) + STT_ETH_PAD, + htons(ETH_P_TEB)); + if 
(unlikely(err)) + goto drop; + + if (unlikely(stt_hdr(skb)->version != 0)) + goto drop; + + if (unlikely(!set_offloads(skb))) + goto drop; + + if (skb->next) { + err = try_to_segment(skb); + if (unlikely(err)) + goto drop; + } + + stt_sock->rcv(stt_sock, skb); + return; +drop: + /* Consume bad packet */ + kfree_skb_list(skb); +} + +static void tcp_sock_release(struct socket *sock) +{ + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); +} + +static int tcp_sock_create4(struct net *net, __be16 port, + struct socket **sockp) +{ + struct sockaddr_in tcp_addr; + struct socket *sock = NULL; + int err; + + err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) + goto error; + + sk_change_net(sock->sk, net); + + memset(&tcp_addr, 0, sizeof(tcp_addr)); + tcp_addr.sin_family = AF_INET; + tcp_addr.sin_addr.s_addr = htonl(INADDR_ANY); + tcp_addr.sin_port = port; + err = kernel_bind(sock, (struct sockaddr *)&tcp_addr, + sizeof(tcp_addr)); + if (err < 0) + goto error; + + *sockp = sock; + return 0; + +error: + if (sock) + tcp_sock_release(sock); + *sockp = NULL; + return err; +} + +static void schedule_clean_percpu(void) +{ + schedule_delayed_work(&clean_percpu_wq, CLEAN_PERCPU_INTERVAL); +} + +static void clean_percpu(struct work_struct *work) +{ + int i; + + for_each_possible_cpu(i) { + struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i); + int j; + + for (j = 0; j < FRAG_HASH_ENTRIES; j++) { + struct pkt_frag *frag; + + frag = flex_array_get(stt_percpu->frag_hash, j); + if (!frag->skbs || + time_before(jiffies, frag->timestamp + FRAG_EXP_TIME)) + continue; + + spin_lock_bh(&stt_percpu->lock); + + if (frag->skbs && + time_after(jiffies, frag->timestamp + FRAG_EXP_TIME)) + free_frag(stt_percpu, frag); + + spin_unlock_bh(&stt_percpu->lock); + } + } + schedule_clean_percpu(); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) +#define FIRST_PARAM const struct nf_hook_ops *ops, +#else +#define FIRST_PARAM unsigned int 
hooknum, +#endif + +static unsigned int nf_ip_hook(FIRST_PARAM + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct stt_sock *stt_sock; + int ip_hdr_len; + + if (ip_hdr(skb)->protocol != IPPROTO_TCP) + return NF_ACCEPT; + + ip_hdr_len = ip_hdrlen(skb); + if (unlikely(!pskb_may_pull(skb, ip_hdr_len + sizeof(struct tcphdr)))) + return NF_ACCEPT; + + skb_set_transport_header(skb, ip_hdr_len); + + stt_sock = stt_find_sock(dev_net(skb->dev), tcp_hdr(skb)->dest); + if (unlikely(!stt_sock)) + return NF_ACCEPT; + + __skb_pull(skb, ip_hdr_len + sizeof(struct tcphdr)); + stt_rcv(stt_sock, skb); + return NF_STOLEN; +} + +static struct nf_hook_ops nf_hook_ops __read_mostly = { + .hook = nf_ip_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = INT_MAX, +}; + +static int stt_start(void) +{ + int err; + int i; + + if (n_tunnels) { + n_tunnels++; + return 0; + } + get_random_bytes(&frag_hash_seed, sizeof(u32)); + + stt_percpu_data = alloc_percpu(struct stt_percpu); + if (!stt_percpu_data) { + err = -ENOMEM; + goto error; + } + + for_each_possible_cpu(i) { + struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i); + struct flex_array *frag_hash; + + spin_lock_init(&stt_percpu->lock); + INIT_LIST_HEAD(&stt_percpu->frag_lru); + get_random_bytes(&per_cpu(pkt_seq_counter, i), sizeof(u32)); + + frag_hash = flex_array_alloc(sizeof(struct pkt_frag), + FRAG_HASH_ENTRIES, + GFP_KERNEL | __GFP_ZERO); + if (!frag_hash) { + err = -ENOMEM; + goto free_percpu; + } + stt_percpu->frag_hash = frag_hash; + + err = flex_array_prealloc(stt_percpu->frag_hash, 0, + FRAG_HASH_ENTRIES, + GFP_KERNEL | __GFP_ZERO); + if (err) + goto free_percpu; + } + err = nf_register_hook(&nf_hook_ops); + if (err) + goto free_percpu; + + schedule_clean_percpu(); + n_tunnels++; + return 0; + +free_percpu: + for_each_possible_cpu(i) { + struct stt_percpu *stt_percpu = 
per_cpu_ptr(stt_percpu_data, i); + + if (stt_percpu->frag_hash) + flex_array_free(stt_percpu->frag_hash); + } + + free_percpu(stt_percpu_data); + +error: + return err; +} + +static void stt_cleanup(void) +{ + int i; + + n_tunnels--; + if (n_tunnels) + return; + + cancel_delayed_work_sync(&clean_percpu_wq); + nf_unregister_hook(&nf_hook_ops); + + for_each_possible_cpu(i) { + struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i); + int j; + + for (j = 0; j < FRAG_HASH_ENTRIES; j++) { + struct pkt_frag *frag; + + frag = flex_array_get(stt_percpu->frag_hash, j); + kfree_skb_list(frag->skbs); + } + + flex_array_free(stt_percpu->frag_hash); + } + + free_percpu(stt_percpu_data); +} + +static struct stt_sock *stt_socket_create(struct net *net, __be16 port, + stt_rcv_t *rcv, void *data) +{ + struct stt_net *sn = net_generic(net, stt_net_id); + struct stt_sock *stt_sock; + struct socket *sock; + int err; + + stt_sock = kzalloc(sizeof(*stt_sock), GFP_KERNEL); + if (!stt_sock) + return ERR_PTR(-ENOMEM); + + err = tcp_sock_create4(net, port, &sock); + if (err) { + kfree(stt_sock); + return ERR_PTR(err); + } + + stt_sock->sock = sock; + stt_sock->rcv = rcv; + stt_sock->rcv_data = data; + + list_add_rcu(&stt_sock->list, &sn->sock_list); + + return stt_sock; +} + +static void __stt_sock_release(struct stt_sock *stt_sock) +{ + list_del_rcu(&stt_sock->list); + tcp_sock_release(stt_sock->sock); + kfree_rcu(stt_sock, rcu); +} + +struct stt_sock *stt_sock_add(struct net *net, __be16 port, + stt_rcv_t *rcv, void *data) +{ + struct stt_sock *stt_sock; + int err; + + err = stt_start(); + if (err) + return ERR_PTR(err); + + mutex_lock(&stt_mutex); + rcu_read_lock(); + stt_sock = stt_find_sock(net, port); + rcu_read_unlock(); + if (stt_sock) + stt_sock = ERR_PTR(-EBUSY); + else + stt_sock = stt_socket_create(net, port, rcv, data); + + mutex_unlock(&stt_mutex); + + if (IS_ERR(stt_sock)) + stt_cleanup(); + + return stt_sock; +} +EXPORT_SYMBOL_GPL(stt_sock_add); + +void 
stt_sock_release(struct stt_sock *stt_sock) +{ + mutex_lock(&stt_mutex); + if (stt_sock) { + __stt_sock_release(stt_sock); + stt_cleanup(); + } + mutex_unlock(&stt_mutex); +} +EXPORT_SYMBOL_GPL(stt_sock_release); + +static int stt_init_net(struct net *net) +{ + struct stt_net *sn = net_generic(net, stt_net_id); + + INIT_LIST_HEAD(&sn->sock_list); + return 0; +} + +static struct pernet_operations stt_net_ops = { + .init = stt_init_net, + .id = &stt_net_id, + .size = sizeof(struct stt_net), +}; + +int stt_init_module(void) +{ + return register_pernet_subsys(&stt_net_ops); +} +EXPORT_SYMBOL_GPL(stt_init_module); + +void stt_cleanup_module(void) +{ + unregister_pernet_subsys(&stt_net_ops); +} +EXPORT_SYMBOL_GPL(stt_cleanup_module); +#endif diff --git a/datapath/vport-stt.c b/datapath/vport-stt.c new file mode 100644 index 0000000..9a1c8a6 --- /dev/null +++ b/datapath/vport-stt.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/if_vlan.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/rculist.h> +#include <linux/udp.h> + +#include <net/icmp.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/stt.h> +#include <net/udp.h> + +#include "datapath.h" +#include "vport.h" + +#ifdef OVS_STT +static struct vport_ops ovs_stt_vport_ops; + +/** + * struct stt_port + * @stt_sock: The socket created for this port number. + * @name: vport name. 
+ */ +struct stt_port { + struct stt_sock *stt_sock; + char name[IFNAMSIZ]; +}; + +static inline struct stt_port *stt_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb) +{ + struct vport *vport = stt_sock->rcv_data; + struct stthdr *stth = stt_hdr(skb); + struct ovs_tunnel_info tun_info; + struct sk_buff *next; + + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), + tcp_hdr(skb)->source, tcp_hdr(skb)->dest, + get_unaligned(&stth->key), + TUNNEL_KEY | TUNNEL_CSUM, + NULL, 0); + do { + next = skb->next; + skb->next = NULL; + ovs_vport_receive(vport, skb, &tun_info); + } while ((skb = next)); +} + +static int stt_tnl_get_options(const struct vport *vport, + struct sk_buff *skb) +{ + struct stt_port *stt_port = stt_vport(vport); + struct inet_sock *sk = inet_sk(stt_port->stt_sock->sock->sk); + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport))) + return -EMSGSIZE; + return 0; +} + +static void stt_tnl_destroy(struct vport *vport) +{ + struct stt_port *stt_port = stt_vport(vport); + + stt_sock_release(stt_port->stt_sock); + ovs_vport_deferred_free(vport); +} + +static struct vport *stt_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct stt_port *stt_port; + struct stt_sock *stt_sock; + struct vport *vport; + struct nlattr *a; + int err; + u16 dst_port; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. 
*/ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct stt_port), + &ovs_stt_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + stt_port = stt_vport(vport); + strncpy(stt_port->name, parms->name, IFNAMSIZ); + + stt_sock = stt_sock_add(net, htons(dst_port), stt_rcv, vport); + if (IS_ERR(stt_sock)) { + ovs_vport_free(vport); + return ERR_CAST(stt_sock); + } + stt_port->stt_sock = stt_sock; + + return vport; +error: + return ERR_PTR(err); +} + +static int stt_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct stt_port *stt_port = stt_vport(vport); + __be16 dport = inet_sk(stt_port->stt_sock->sock->sk)->inet_sport; + const struct ovs_key_ipv4_tunnel *tun_key; + const struct ovs_tunnel_info *tun_info; + struct rtable *rt; + __be16 sport; + __be32 saddr; + __be16 df; + int err; + + tun_info = OVS_CB(skb)->egress_tun_info; + if (unlikely(!tun_info)) { + err = -EINVAL; + goto error; + } + + tun_key = &tun_info->tunnel; + /* Route lookup */ + saddr = tun_key->ipv4_src; + rt = find_route(ovs_dp_get_net(vport->dp), + &saddr, tun_key->ipv4_dst, + IPPROTO_TCP, tun_key->ipv4_tos, + skb->mark); + + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; + sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + skb->ignore_df = 1; + + return stt_xmit_skb(skb, rt, saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, tun_key->ipv4_ttl, + df, sport, dport, tun_key->tun_id); +error: + kfree_skb(skb); + return err; +} + +static const char *stt_tnl_get_name(const struct vport *vport) +{ + return stt_vport(vport)->name; +} + +static int stt_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *egress_tun_info) +{ + struct stt_port *stt_port = stt_vport(vport); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dport = inet_sk(stt_port->stt_sock->sock->sk)->inet_sport; + __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + + /* Get tp_src and tp_dst, refert to stt_build_header(). + */ + return ovs_tunnel_get_egress_info(egress_tun_info, + ovs_dp_get_net(vport->dp), + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, sport, dport); +} + +static struct vport_ops ovs_stt_vport_ops = { + .type = OVS_VPORT_TYPE_STT, + .create = stt_tnl_create, + .destroy = stt_tnl_destroy, + .get_name = stt_tnl_get_name, + .get_options = stt_tnl_get_options, + .send = stt_tnl_send, + .get_egress_tun_info = stt_get_egress_tun_info, + .owner = THIS_MODULE, +}; + +static int __init ovs_stt_tnl_init(void) +{ + int err; + + err = stt_init_module(); + if (err) + return err; + err = ovs_vport_ops_register(&ovs_stt_vport_ops); + if (err) + stt_cleanup_module(); + return err; +} + +static void __exit ovs_stt_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_stt_vport_ops); + stt_cleanup_module(); +} + +module_init(ovs_stt_tnl_init); +module_exit(ovs_stt_tnl_exit); + +MODULE_DESCRIPTION("OVS: STT switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-106"); +#endif diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index 93fd8a4..07da8a4 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -768,6 +768,9 @@ get_vport_type(const struct dpif_netlink_vport *vport) 
case OVS_VPORT_TYPE_LISP: return "lisp"; + case OVS_VPORT_TYPE_STT: + return "stt"; + case OVS_VPORT_TYPE_UNSPEC: case __OVS_VPORT_TYPE_MAX: break; @@ -787,6 +790,8 @@ netdev_to_ovs_vport_type(const struct netdev *netdev) return OVS_VPORT_TYPE_NETDEV; } else if (!strcmp(type, "internal")) { return OVS_VPORT_TYPE_INTERNAL; + } else if (strstr(type, "stt")) { + return OVS_VPORT_TYPE_STT; } else if (!strcmp(type, "geneve")) { return OVS_VPORT_TYPE_GENEVE; } else if (strstr(type, "gre64")) { diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index f228ac2..944a061 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -55,6 +55,7 @@ static struct vlog_rate_limit err_rl = VLOG_RATE_LIMIT_INIT(60, 5); #define GENEVE_DST_PORT 6081 #define VXLAN_DST_PORT 4789 #define LISP_DST_PORT 4341 +#define STT_DST_PORT 7471 #define VXLAN_HLEN (sizeof(struct eth_header) + \ sizeof(struct ip_header) + \ @@ -158,7 +159,7 @@ netdev_vport_needs_dst_port(const struct netdev *dev) return (class->get_config == get_tunnel_config && (!strcmp("geneve", type) || !strcmp("vxlan", type) || - !strcmp("lisp", type))); + !strcmp("lisp", type) || !strcmp("stt", type)) ); } const char * @@ -257,6 +258,8 @@ netdev_vport_construct(struct netdev *netdev_) dev->tnl_cfg.dst_port = htons(VXLAN_DST_PORT); } else if (!strcmp(type, "lisp")) { dev->tnl_cfg.dst_port = htons(LISP_DST_PORT); + } else if (!strcmp(type, "stt")) { + dev->tnl_cfg.dst_port = htons(STT_DST_PORT); } return 0; @@ -432,7 +435,7 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args) struct smap_node *node; has_csum = strstr(type, "gre") || strstr(type, "geneve") || - strstr(type, "vxlan"); + strstr(type, "stt") || strstr(type, "vxlan"); ipsec_mech_set = false; memset(&tnl_cfg, 0, sizeof tnl_cfg); @@ -449,6 +452,10 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args) tnl_cfg.dst_port = htons(LISP_DST_PORT); } + if (!strcmp(type, "stt")) { + tnl_cfg.dst_port = htons(STT_DST_PORT); + } + needs_dst_port = 
netdev_vport_needs_dst_port(dev_); tnl_cfg.ipsec = strstr(type, "ipsec"); tnl_cfg.dont_fragment = true; @@ -688,7 +695,8 @@ get_tunnel_config(const struct netdev *dev, struct smap *args) if ((!strcmp("geneve", type) && dst_port != GENEVE_DST_PORT) || (!strcmp("vxlan", type) && dst_port != VXLAN_DST_PORT) || - (!strcmp("lisp", type) && dst_port != LISP_DST_PORT)) { + (!strcmp("lisp", type) && dst_port != LISP_DST_PORT) || + (!strcmp("stt", type) && dst_port != STT_DST_PORT)) { smap_add_format(args, "dst_port", "%d", dst_port); } } @@ -1401,7 +1409,8 @@ netdev_vport_tunnel_register(void) TUNNEL_CLASS("vxlan", "vxlan_sys", netdev_vxlan_build_header, push_udp_header, netdev_vxlan_pop_header), - TUNNEL_CLASS("lisp", "lisp_sys", NULL, NULL, NULL) + TUNNEL_CLASS("lisp", "lisp_sys", NULL, NULL, NULL), + TUNNEL_CLASS("stt", "stt_sys", NULL, NULL, NULL), }; static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; diff --git a/ofproto/ofproto-dpif-ipfix.c b/ofproto/ofproto-dpif-ipfix.c index f73d8b4..8a931d6 100644 --- a/ofproto/ofproto-dpif-ipfix.c +++ b/ofproto/ofproto-dpif-ipfix.c @@ -48,8 +48,8 @@ static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER; * used to indicate the type of tunnel (0x01 = VxLAN, 0x02 = GRE) and the three * least significant bytes hold the value of the layer 2 overlay network * segment identifier: a 24-bit VxLAN tunnel's VNI or a 24-bit GRE tunnel's - * TNI. This is not compatible with GRE-64, as implemented in OVS, as its - * tunnel IDs are 64-bit. + * TNI. This is not compatible with GRE-64 or STT, as implemented in OVS, as + * their tunnel IDs are 64-bit. 
* * Two new enterprise information elements are defined which are similar to * laryerSegmentId but support 64-bit IDs: @@ -64,6 +64,7 @@ enum dpif_ipfix_tunnel_type { DPIF_IPFIX_TUNNEL_VXLAN = 0x01, DPIF_IPFIX_TUNNEL_GRE = 0x02, DPIF_IPFIX_TUNNEL_LISP = 0x03, + DPIF_IPFIX_TUNNEL_STT = 0x04, DPIF_IPFIX_TUNNEL_IPSEC_GRE = 0x05, DPIF_IPFIX_TUNNEL_GENEVE = 0x07, NUM_DPIF_IPFIX_TUNNEL @@ -299,7 +300,7 @@ static uint8_t tunnel_protocol[NUM_DPIF_IPFIX_TUNNEL] = { IPPROTO_UDP, /* DPIF_IPFIX_TUNNEL_VXLAN */ IPPROTO_GRE, /* DPIF_IPFIX_TUNNEL_GRE */ IPPROTO_UDP, /* DPIF_IPFIX_TUNNEL_LISP*/ - 0 , /* reserved */ + IPPROTO_TCP, /* DPIF_IPFIX_TUNNEL_STT*/ IPPROTO_GRE, /* DPIF_IPFIX_TUNNEL_IPSEC_GRE */ 0 , /* reserved */ IPPROTO_UDP, /* DPIF_IPFIX_TUNNEL_GENEVE*/ @@ -353,6 +354,7 @@ BUILD_ASSERT_DECL(sizeof(struct ipfix_data_record_aggregated_ip) == 32); * VxLAN: 24-bit VIN, * GRE: 32- or 64-bit key, * LISP: 24-bit instance ID + * STT: 64-bit key */ #define MAX_TUNNEL_KEY_LEN 8 @@ -607,6 +609,9 @@ dpif_ipfix_add_tunnel_port(struct dpif_ipfix *di, struct ofport *ofport, } else if (strcmp(type, "geneve") == 0) { dip->tunnel_type = DPIF_IPFIX_TUNNEL_GENEVE; dip->tunnel_key_length = 3; + } else if (strcmp(type, "stt") == 0) { + dip->tunnel_type = DPIF_IPFIX_TUNNEL_STT; + dip->tunnel_key_length = 8; } else { free(dip); goto out; diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 3256ce0..79b5606 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -1892,6 +1892,25 @@ </p> </dd> + <dt><code>stt</code></dt> + <dd> + The Stateless TCP Tunnel (STT) is particularly useful when tunnel + endpoints are in end-systems, as it utilizes the capabilities of + standard network interface cards to improve performance. STT utilizes + a TCP-like header inside the IP header. It is stateless, i.e., there is + no TCP connection state of any kind associated with the tunnel. 
The + TCP-like header is used to leverage the capabilities of existing + network interface cards, but should not be interpreted as implying + any sort of connection state between endpoints. + Since the STT protocol does not engage in the usual TCP 3-way handshake, + it will have difficulty traversing stateful firewalls. + The protocol is documented at + http://www.ietf.org/archive/id/draft-davie-stt-06.txt + + All traffic uses a default destination port of 7471. STT is only + available in the kernel datapath on kernel 3.5 or newer. + </dd> + <dt><code>patch</code></dt> <dd> A pair of virtual devices that act as a patch cable. @@ -1909,7 +1928,7 @@ These options apply to interfaces with <ref column="type"/> of <code>geneve</code>, <code>gre</code>, <code>ipsec_gre</code>, <code>gre64</code>, <code>ipsec_gre64</code>, <code>vxlan</code>, - and <code>lisp</code>. + <code>lisp</code> and <code>stt</code>. </p> <p> @@ -1998,8 +2017,8 @@ </li> <li> A positive 24-bit (for Geneve, VXLAN, and LISP), 32-bit (for GRE) - or 64-bit (for GRE64) number. The tunnel receives only packets - with the specified key. + or 64-bit (for GRE64 and STT) number. The tunnel receives only + packets with the specified key. </li> <li> The word <code>flow</code>. The tunnel accepts packets with any @@ -2025,8 +2044,8 @@ </li> <li> A positive 24-bit (for Geneve, VXLAN and LISP), 32-bit (for GRE) or - 64-bit (for GRE64) number. Packets sent through the tunnel will - have the specified key. + 64-bit (for GRE64 and STT) number. Packets sent through the tunnel + will have the specified key. </li> <li> The word <code>flow</code>. Packets sent through the tunnel will -- 1.9.1 _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev