This patch uses the userspace tunneling mechanism to implement the STT tunneling protocol.
Signed-off-by: Pravin B Shelar <pshe...@nicira.com> --- lib/netdev-vport.c | 6 +- lib/odp-util.c | 41 ++- lib/packets.h | 26 ++ lib/tnl-push-pop.c | 639 ++++++++++++++++++++++++++++++++++++++++++++++- lib/tnl-push-pop.h | 11 + tests/tunnel-push-pop.at | 27 ++ 6 files changed, 745 insertions(+), 5 deletions(-) diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index 8b41201..c70a596 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -907,8 +907,12 @@ netdev_vport_tunnel_register(void) TUNNEL_CLASS("vxlan", "vxlan_sys", NULL, netdev_vxlan_build_header, push_udp_header, netdev_vxlan_pop_header), + TUNNEL_CLASS("stt", "stt_sys", netdev_stt_class_init, + netdev_stt_build_header, + netdev_stt_push_header, + netdev_stt_pop_header), + TUNNEL_CLASS("lisp", "lisp_sys", NULL, NULL, NULL, NULL), - TUNNEL_CLASS("stt", "stt_sys", NULL, NULL, NULL, NULL), }; static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; diff --git a/lib/odp-util.c b/lib/odp-util.c index f16e113..bdc7391 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -85,6 +85,8 @@ static void format_geneve_opts(const struct geneve_opt *opt, static struct nlattr *generate_all_wildcard_mask(const struct attr_len_tbl tbl[], int max, struct ofpbuf *, const struct nlattr *key); +static void format_be64(struct ds *ds, const char *name, ovs_be64 key, + const ovs_be64 *mask, bool verbose); static void format_u128(struct ds *ds, const ovs_u128 *value, const ovs_u128 *mask, bool verbose); static int scan_u128(const char *s, ovs_u128 *value, ovs_u128 *mask); @@ -445,13 +447,29 @@ format_udp_tnl_push_header(struct ds *ds, const struct udp_header *udp) return udp + 1; } +static const void * +format_tcp_tnl_push_header(struct ds *ds, const struct tcp_header *tcp) +{ + ds_put_format(ds, "tcp(src=%"PRIu16",dst=%"PRIu16",seq=0x%"PRIx16"," + "ack=0x%"PRIx16",", ntohs(tcp->tcp_src), ntohs(tcp->tcp_dst), + ntohl(get_16aligned_be32(&tcp->tcp_seq)), + ntohl(get_16aligned_be32(&tcp->tcp_ack))); + + 
format_flags_masked(ds, "flags", packet_tcp_flag_to_string, + ntohs(tcp->tcp_ctl), TCP_FLAGS(OVS_BE16_MAX), + TCP_FLAGS(OVS_BE16_MAX)); + + ds_put_format(ds, ",csum=0x%"PRIx16",urg=0x%"PRIx16")", + ntohs(tcp->tcp_csum), ntohs(tcp->tcp_urg)); + return tcp + 1; +} + static void format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data) { const struct eth_header *eth; const void *l3; const void *l4; - const struct udp_header *udp; eth = (const struct eth_header *)data->header; @@ -491,11 +509,12 @@ format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data) l4 = (ip6 + 1); } - udp = (const struct udp_header *) l4; if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) { const struct vxlanhdr *vxh; + const struct udp_header *udp; + udp = (const struct udp_header *) l4; vxh = format_udp_tnl_push_header(ds, udp); ds_put_format(ds, "vxlan(flags=0x%"PRIx32",vni=0x%"PRIx32")", @@ -503,7 +522,9 @@ format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data) ntohl(get_16aligned_be32(&vxh->vx_vni)) >> 8); } else if (data->tnl_type == OVS_VPORT_TYPE_GENEVE) { const struct genevehdr *gnh; + const struct udp_header *udp; + udp = (const struct udp_header *) l4; gnh = format_udp_tnl_push_header(ds, udp); ds_put_format(ds, "geneve(%s%svni=0x%"PRIx32, @@ -541,6 +562,22 @@ format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data) options++; } ds_put_format(ds, ")"); + } else if (data->tnl_type == OVS_VPORT_TYPE_STT) { + const struct tcp_header *tcp; + const struct stthdr *stth; + + tcp = (const struct tcp_header *) l4; + stth = format_tcp_tnl_push_header(ds, tcp); + ds_put_format(ds, ",stt("); + format_be64(ds, "tun_id", get_32aligned_be64(&stth->key), NULL, false); + ds_put_format(ds, "ver=0x%"PRIx8",flags=0x%"PRIx8"," + "l4_offset=0x%"PRIx8",res=0x%"PRIx8"," + "mss=0x%"PRIx16",", + stth->version, stth->flags, + stth->l4_offset, stth->reserved, ntohs(stth->mss)); + + format_vlan_tci(ds, stth->vlan_tci, OVS_BE16_MAX, false); + 
ds_put_format(ds, ")"); } ds_put_format(ds, ")"); } diff --git a/lib/packets.h b/lib/packets.h index 2157657..5459b0f 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -1021,6 +1021,32 @@ struct vxlanhdr { #define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */ +/* STT header */ + +struct stthdr { + __u8 version; + __u8 flags; + __u8 l4_offset; + __u8 reserved; + ovs_be16 mss; + ovs_be16 vlan_tci; + ovs_32aligned_be64 key; +}; + +/* Padding after the end of the tunnel headers to provide alignment + * for inner packet IP header after 14 byte Ethernet header. + */ +#define STT_ETH_PAD 2 + +#define STT_BASE_HLEN (sizeof(struct stthdr) + STT_ETH_PAD) +#define STT_HEADER_LEN (sizeof(struct tcp_header) + STT_BASE_HLEN) + +#define STT_CSUM_VERIFIED (1 << 0) +#define STT_CSUM_PARTIAL (1 << 1) +#define STT_PROTO_IPV4 (1 << 2) +#define STT_PROTO_TCP (1 << 3) +#define STT_PROTO_TYPES (STT_PROTO_IPV4 | STT_PROTO_TCP) + void ipv6_format_addr(const struct in6_addr *addr, struct ds *); void ipv6_format_addr_bracket(const struct in6_addr *addr, struct ds *, bool bracket); diff --git a/lib/tnl-push-pop.c b/lib/tnl-push-pop.c index 86023c2..9440033 100644 --- a/lib/tnl-push-pop.c +++ b/lib/tnl-push-pop.c @@ -16,8 +16,6 @@ #include <config.h> -#include "netdev-vport.h" - #include <errno.h> #include <fcntl.h> #include <sys/socket.h> @@ -25,6 +23,10 @@ #include <netinet/ip6.h> #include <sys/ioctl.h> +#include <errno.h> +#include <stdlib.h> +#include <sys/time.h> + #include "byte-order.h" #include "csum.h" #include "daemon.h" @@ -32,20 +34,25 @@ #include "dpif.h" #include "dp-packet.h" #include "dynamic-string.h" +#include "entropy.h" #include "flow.h" #include "hash.h" #include "hmap.h" +#include "id-pool.h" #include "list.h" #include "netdev-provider.h" +#include "netdev-vport.h" #include "netdev-vport-private.h" #include "odp-netlink.h" #include "dp-packet.h" #include "ovs-router.h" #include "packets.h" #include "poll-loop.h" +#include "random.h" #include 
"route-table.h" #include "shash.h" #include "socket-util.h" +#include "timeval.h" #include "tnl-push-pop.h" #include "openvswitch/vlog.h" #include "unaligned.h" @@ -631,6 +638,634 @@ netdev_geneve_build_header(const struct netdev *netdev, return 0; } + +/* STT */ + +/* The maximum amount of memory used to store packets waiting to be reassembled + * on a given CPU. Once this threshold is exceeded we will begin freeing the + * least recently used fragments. + */ +#define REASM_HI_THRESH (4 * 1024 * 1024) +/* The target for the high memory evictor. Once we have exceeded + * REASM_HI_THRESH, we will continue freeing fragments until we hit + * this limit. + */ +#define REASM_LO_THRESH (3 * 1024 * 1024) +/* The length of time a given packet has to be reassembled from the time the + * first fragment arrives. Once this limit is exceeded it becomes available + * for cleaning. + */ + +#define FRAG_EXP_TIME frag_exp_time + +#define FRAG_HASH_SHIFT 8 +#define FRAG_HASH_ENTRIES (1 << FRAG_HASH_SHIFT) +#define FRAG_HASH_SEGS ((sizeof(uint32_t) * 8) / FRAG_HASH_SHIFT) + +/* The length and offset of a fragment are encoded in the sequence number. + * STT_SEQ_LEN_SHIFT is the left shift needed to store the length. + * STT_SEQ_OFFSET_MASK is the mask to extract the offset. + */ +#define STT_SEQ_LEN_SHIFT 16 +#define STT_SEQ_OFFSET_MASK ((1 << STT_SEQ_LEN_SHIFT) - 1) + +struct pkt_key { + struct in6_addr ipv6_src; + struct in6_addr ipv6_dst; + ovs_be32 pkt_seq; +}; + +struct pkt_frag { + struct dp_packet *pkts; + unsigned long timestamp; + struct ovs_list lru_node; + struct pkt_key key; +}; + +struct first_frag { + struct dp_packet *last_pkt; + unsigned int mem_used; + uint16_t tot_len; + uint16_t rcvd_len; + bool set_ecn_ce; +}; + +struct frag_packet_data { + uint16_t offset; + uint16_t pkt_size; + /* Only valid for the first packet in the chain. 
*/ + struct first_frag first; + struct dp_packet *next; +}; + +BUILD_ASSERT_DECL(DP_PACKET_CONTEXT_SIZE >= sizeof(struct frag_packet_data)); + +#define FRAG_DATA(packet) ((struct frag_packet_data *)(packet)->data) +#define STT_PACKET_DATA(pkt) ((unsigned char *)dp_packet_l4(pkt) + STT_HEADER_LEN) + +struct stt_reassemble { + struct pkt_frag frag_hash[FRAG_HASH_ENTRIES]; + struct ovs_list frag_lru; + unsigned int frag_mem_used; + uint32_t id; + uint32_t counter; +}; + +static struct ovs_mutex thread_is_lock; +static struct id_pool *id_ppol; +static uint32_t frag_hash_seed; +static ovsthread_key_t per_thread_reasm_data; +static void evict_frags(struct stt_reassemble *reasm, int mem_limit); +static uint64_t frag_exp_time; + +static struct stt_reassemble * +get_reasm() +{ + struct stt_reassemble *reasm; + uint32_t i; + bool res; + + reasm = ovsthread_getspecific(per_thread_reasm_data); + if (OVS_UNLIKELY(reasm)) { + return reasm; + } + + reasm = xmalloc_cacheline(sizeof(*reasm)); + list_init(&reasm->frag_lru); + reasm->counter = 0; + + ovs_mutex_lock(&thread_is_lock); + + for (i = 0; i < USHRT_MAX; i++) { + res = id_pool_alloc_id(id_ppol, &i); + + if (res) { + break; + } + } + if (res) { + reasm->id = i; + } else { + OVS_NOT_REACHED(); + } + ovs_mutex_unlock(&thread_is_lock); + ovsthread_setspecific(per_thread_reasm_data, reasm); + return reasm; +} + +static void +reasm_destructor(void *_reasm) +{ + struct stt_reassemble *reasm = _reasm; + + evict_frags(reasm, 0); + + ovs_mutex_lock(&thread_is_lock); + id_pool_free_id(id_ppol, reasm->id); + ovs_mutex_unlock(&thread_is_lock); +} + +int +netdev_stt_class_init(void) +{ + ovsthread_key_create(&per_thread_reasm_data, reasm_destructor); + frag_hash_seed = random(); + ovs_mutex_init(&thread_is_lock); + id_ppol = id_pool_create(0, USHRT_MAX); + frag_exp_time = 30 * OVS_HZ; + return 0; +} + +static bool pkt_key_match(const struct pkt_key *a, const struct pkt_key *b) +{ + return !memcmp(a, b, sizeof (*a)); +} + +static uint32_t 
pkt_key_hash(const struct pkt_key *key) +{ + return hash_3words(hash_bytes(&key, offsetof(struct pkt_key, pkt_seq), 0), + (uint32_t)(key->pkt_seq), frag_hash_seed); +} + +static inline void list_packet_delete(struct dp_packet *pkt) +{ + do { + struct dp_packet *next = FRAG_DATA(pkt)->next; + + dp_packet_delete(pkt); + pkt = next; + } while (pkt); +} + +static void free_frag(struct stt_reassemble *reasm, struct pkt_frag *frag) +{ + reasm->frag_mem_used -= FRAG_DATA(frag->pkts)->first.mem_used; + list_packet_delete(frag->pkts); + frag->pkts = NULL; +} + +static struct pkt_frag * +pkt_frag_from_node(const struct ovs_list *node) +{ + return CONTAINER_OF(node, struct pkt_frag, lru_node); +} + +static void evict_frags(struct stt_reassemble *reasm, int mem_limit) +{ + while (!list_is_empty(&reasm->frag_lru) && + reasm->frag_mem_used > mem_limit) { + struct pkt_frag *frag; + + frag = pkt_frag_from_node(list_pop_back(&reasm->frag_lru)); + free_frag(reasm, frag); + } + + /* Update Fragment cache expiration time. 
*/ + frag_exp_time = 30 * OVS_HZ; +} + +static struct pkt_frag * +lookup_frag(struct stt_reassemble *reasm, + const struct pkt_key *key, uint32_t hash) +{ + struct pkt_frag *frag, *victim_frag = NULL; + int i; + + for (i = 0; i < FRAG_HASH_SEGS; i++) { + frag = &reasm->frag_hash[hash & (FRAG_HASH_ENTRIES - 1)]; + + if (frag->pkts && + pkt_key_match(&frag->key, key) && + !time_before(cycles_counter(), frag->timestamp + FRAG_EXP_TIME)) { + return frag; + + } + if (!victim_frag || + (victim_frag->pkts && + (!frag->pkts || + time_before(frag->timestamp, victim_frag->timestamp)))) + victim_frag = frag; + + hash >>= FRAG_HASH_SHIFT; + } + + if (victim_frag->pkts) + free_frag(reasm, victim_frag); + + return victim_frag; +} + +static struct dp_packet * +packet_merge(struct dp_packet *pkt) +{ + struct dp_packet *next; + struct dp_packet *m; + + m = dp_packet_clone(pkt); + next = FRAG_DATA(pkt)->next; + dp_packet_delete(pkt); + + pkt = next; + while (pkt) { + void *data; + + data = dp_packet_put_uninit(m, FRAG_DATA(pkt)->pkt_size); + memcpy(data, STT_PACKET_DATA(pkt), FRAG_DATA(pkt)->pkt_size); + next = FRAG_DATA(pkt)->next; + dp_packet_delete(pkt); + pkt = next; + } + + return m; +} + +static struct dp_packet * +reassemble(struct dp_packet *packet) +{ + struct tcp_header *tcph = dp_packet_l4(packet); + int pkt_size = dp_packet_l4_size(packet) - sizeof(*tcph); + uint32_t seq = ntohl(get_16aligned_be32(&tcph->tcp_seq)); + struct stt_reassemble *reasm = get_reasm(); + struct dp_packet *last_pkt; + struct pkt_frag *frag; + struct pkt_key key; + uint32_t hash; + bool is_ipv6; + uint8_t tos; + int tot_len; + + tot_len = seq >> STT_SEQ_LEN_SHIFT; + FRAG_DATA(packet)->offset = seq & STT_SEQ_OFFSET_MASK; + FRAG_DATA(packet)->next = NULL; + FRAG_DATA(packet)->pkt_size = pkt_size; + + if (STT_BASE_HLEN > pkt_size) { + goto out_free; + } + + if (FRAG_DATA(packet)->offset + pkt_size > tot_len) { + goto out_free; + } + + if (tot_len == pkt_size) { + goto out; + } + + is_ipv6 = 
is_header_ipv6(dp_packet_data(packet)); + + if (is_ipv6) { + struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(packet); + + memcpy(&key.ipv6_src, &ip6->ip6_src.be16, sizeof ip6->ip6_src); + memcpy(&key.ipv6_dst, &ip6->ip6_dst.be16, sizeof ip6->ip6_dst); + } else { + struct ip_header *iph = dp_packet_l3(packet); + + in6_addr_set_mapped_ipv4(&key.ipv6_src, get_16aligned_be32(&iph->ip_src)); + in6_addr_set_mapped_ipv4(&key.ipv6_dst, get_16aligned_be32(&iph->ip_dst)); + } + key.pkt_seq = get_16aligned_be32(&tcph->tcp_ack); + hash = pkt_key_hash(&key); + + if (reasm->frag_mem_used + dp_packet_get_allocated(packet) > REASM_HI_THRESH) { + evict_frags(reasm, REASM_LO_THRESH); + } + + frag = lookup_frag(reasm, &key, hash); + if (!frag->pkts) { + frag->pkts = packet; + frag->key = key; + frag->timestamp = cycles_counter(); + FRAG_DATA(packet)->first.last_pkt = packet; + FRAG_DATA(packet)->first.mem_used = dp_packet_get_allocated(packet); + FRAG_DATA(packet)->first.tot_len = tot_len; + FRAG_DATA(packet)->first.rcvd_len = pkt_size; + FRAG_DATA(packet)->first.set_ecn_ce = false; + list_push_back(&reasm->frag_lru, &frag->lru_node); + reasm->frag_mem_used += dp_packet_get_allocated(packet); + + packet = NULL; + goto out; + } + + /* Optimize for the common case where fragments are received in-order + * and not overlapping. + */ + last_pkt = FRAG_DATA(frag->pkts)->first.last_pkt; + if (FRAG_DATA(last_pkt)->offset + FRAG_DATA(last_pkt)->pkt_size == + FRAG_DATA(packet)->offset) { + FRAG_DATA(last_pkt)->next = packet; + FRAG_DATA(frag->pkts)->first.last_pkt = packet; + } else { + struct dp_packet *prev = NULL, *next; + + for (next = frag->pkts; next; next = FRAG_DATA(next)->next) { + if (FRAG_DATA(next)->offset >= FRAG_DATA(packet)->offset) { + break; + } + prev = next; + } + + /* Overlapping fragments aren't allowed. We shouldn't start + * before the end of the previous fragment. 
+ */ + if (prev && + FRAG_DATA(prev)->offset + FRAG_DATA(prev)->pkt_size > FRAG_DATA(packet)->offset) { + goto unlock_free; + } + + /* We also shouldn't end after the beginning of the next + * fragment. + */ + if (next && + FRAG_DATA(packet)->offset + pkt_size > FRAG_DATA(next)->offset) { + goto unlock_free; + } + + if (prev) { + FRAG_DATA(prev)->next = packet; + } else { + FRAG_DATA(packet)->first = FRAG_DATA(frag->pkts)->first; + frag->pkts = packet; + } + + if (next) { + FRAG_DATA(packet)->next = next; + } else { + FRAG_DATA(frag->pkts)->first.last_pkt = packet; + } + } + + if (is_ipv6) { + struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(packet); + + tos = ntohl(get_16aligned_be32(&ip6->ip6_flow)) >> 20; + } else { + struct ip_header *iph = dp_packet_l3(packet); + tos = iph->ip_tos; + } + + FRAG_DATA(frag->pkts)->first.set_ecn_ce |= IP_ECN_is_ce(tos); + FRAG_DATA(frag->pkts)->first.rcvd_len += pkt_size; + FRAG_DATA(frag->pkts)->first.mem_used += dp_packet_get_allocated(packet); + reasm->frag_mem_used += dp_packet_get_allocated(packet); + + if (FRAG_DATA(frag->pkts)->first.tot_len == + FRAG_DATA(frag->pkts)->first.rcvd_len) { + struct dp_packet *frag_head = frag->pkts; + + if (FRAG_DATA(frag_head)->first.set_ecn_ce) { + IP_ECN_set_ce(frag_head, is_ipv6); + } + + list_remove(&frag->lru_node); + reasm->frag_mem_used -= FRAG_DATA(frag_head)->first.mem_used; + frag->pkts = NULL; + packet = packet_merge(frag_head); + } else { + list_remove(&frag->lru_node); + list_push_back(&reasm->frag_lru, &frag->lru_node); + packet = NULL; + } + goto out; + +unlock_free: + dp_packet_delete(packet); + packet = NULL; +out: + return packet; +out_free: + dp_packet_delete(packet); + return NULL; +} + +static bool +valid_tcp_checksum(struct dp_packet *packet) +{ + uint32_t csum; + + if (is_header_ipv6(dp_packet_data(packet))) { + csum = packet_csum_pseudoheader6(dp_packet_l3(packet)); + } else { + csum = packet_csum_pseudoheader(dp_packet_l3(packet)); + } + + csum = csum_continue(csum, 
dp_packet_l4(packet), dp_packet_l4_size(packet)); + if (csum_finish(csum)) { + return false; + } + return true; +} + +static void * +tcp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, + unsigned int *hlen) +{ + struct tcp_header *tcp; + + tcp = ip_extract_tnl_md(packet, tnl, hlen); + if (!tcp) { + return NULL; + } + + tnl->flags |= FLOW_TNL_F_CSUM; + tnl->tp_src = tcp->tcp_src; + tnl->tp_dst = tcp->tcp_dst; + return tcp + 1; +} + +static int +stt_extract_tnl_md(struct dp_packet *packet) +{ + struct pkt_metadata *md = &packet->md; + struct flow_tnl *tnl = &md->tunnel; + uint8_t flags, l4_offset; + struct stthdr *stth; + uint32_t hlen; + + pkt_metadata_init_tnl(md); + stth = tcp_extract_tnl_md(packet, tnl, &hlen); + if (!stth) { + return EINVAL; + } + + if (stth->version != 0) { + VLOG_WARN_RL(&err_rl, "invalid STT version = %d\n", stth->version); + return EINVAL; + } + flags = stth->flags; + l4_offset = stth->l4_offset; + + tnl->tun_id = get_32aligned_be64(&stth->key); + tnl->flags |= FLOW_TNL_F_KEY; + + dp_packet_reset_packet(packet, hlen + STT_HEADER_LEN); + + if (flags & STT_CSUM_PARTIAL) { + uint8_t proto_type; + uint16_t csum_offset; + int l3_header_size; + int l4_header_size; + uint32_t l4_csum; + ovs_be16 *csum_ptr; + + proto_type = stth->flags & STT_PROTO_TYPES; + if (proto_type == (STT_PROTO_IPV4 | STT_PROTO_TCP)) { + /* TCP/IPv4 */ + csum_offset = offsetof(struct tcp_header, tcp_csum); + l3_header_size = sizeof(struct ip_header); + l4_header_size = sizeof(struct tcp_header); + } else if (proto_type == STT_PROTO_TCP) { + /* TCP/IPv6 */ + csum_offset = offsetof(struct tcp_header, tcp_csum); + l3_header_size = sizeof(struct ovs_16aligned_ip6_hdr); + l4_header_size = sizeof(struct tcp_header); + } else if (proto_type == STT_PROTO_IPV4) { + /* UDP/IPv4 */ + csum_offset = offsetof(struct udp_header, udp_csum); + l3_header_size = sizeof(struct ip_header); + l4_header_size = sizeof(struct udp_header); + } else { + /* UDP/IPv6 */ + csum_offset = 
offsetof(struct udp_header, udp_csum); + l3_header_size = sizeof(struct ovs_16aligned_ip6_hdr); + l4_header_size = sizeof(struct udp_header); + } + + if (l4_offset < ETH_HEADER_LEN + l3_header_size) { + return EINVAL; + } + if (dp_packet_size(packet) < l4_offset + l4_header_size) { + return EINVAL; + } + csum_ptr = (ovs_be16 *) ((uint16_t *) dp_packet_data(packet) + (l4_offset >> 1)); + l4_csum = csum_continue(0, csum_ptr, dp_packet_size(packet) - l4_offset); + *(csum_ptr + (csum_offset >> 1)) = csum_finish(l4_csum); + } + + return 0; +} + +int +netdev_stt_pop_header(struct dp_packet **p_packet) +{ + struct dp_packet *packet = *p_packet; + struct dp_packet *reasm_pkt; + + if (!valid_tcp_checksum(packet)) { + return EINVAL; + } + + reasm_pkt = reassemble(packet); + *p_packet = reasm_pkt; + if (reasm_pkt) { + return stt_extract_tnl_md(reasm_pkt); + } + return 0; +} + +static void * +tcp_build_header(struct netdev_tunnel_config *tnl_cfg, + struct ovs_action_push_tnl *data, + unsigned int *hlen) +{ + struct ovs_16aligned_ip6_hdr *ip6; + struct tcp_header *tcp; + struct ip_header *ip; + bool is_ipv6; + + *hlen = sizeof(struct eth_header); + + is_ipv6 = is_header_ipv6(data->header); + + if (is_ipv6) { + ip6 = ipv6_hdr(data->header); + ip6->ip6_nxt = IPPROTO_TCP; + tcp = (struct tcp_header *) (ip6 + 1); + *hlen += IPV6_HEADER_LEN; + } else { + ip = ip_hdr(data->header); + ip->ip_proto = IPPROTO_TCP; + tcp = (struct tcp_header *) (ip + 1); + *hlen += IP_HEADER_LEN; + } + + tcp->tcp_dst = tnl_cfg->dst_port; + tcp->tcp_ctl = TCP_CTL(TCP_ACK | TCP_PSH, sizeof(struct tcp_header) >> 2); + tcp->tcp_winsz = htons(USHRT_MAX); + return tcp + 1; +} + +int +netdev_stt_build_header(const struct netdev *netdev, + struct ovs_action_push_tnl *data, + const struct flow *tnl_flow) +{ + struct netdev_vport *dev = netdev_vport_cast(netdev); + struct netdev_tunnel_config *tnl_cfg; + struct stthdr *stth; + unsigned int hlen; + + /* XXX: RCUfy tnl_cfg. 
*/ + ovs_mutex_lock(&dev->mutex); + tnl_cfg = &dev->tnl_cfg; + + stth = tcp_build_header(tnl_cfg, data, &hlen); + + stth->flags = STT_CSUM_VERIFIED; + stth->vlan_tci = 0; + put_32aligned_be64(&stth->key, tnl_flow->tunnel.tun_id); + + ovs_mutex_unlock(&dev->mutex); + data->header_len = hlen + STT_HEADER_LEN; + data->tnl_type = OVS_VPORT_TYPE_STT; + return 0; +} + +static uint32_t ack_seq(void) +{ + struct stt_reassemble *reasm = get_reasm(); + uint32_t ack; + + ack = reasm->counter << 16 | reasm->id; + reasm->counter++; + return ack; +} + +static void +push_tcp_header(struct dp_packet *packet, + const struct ovs_action_push_tnl *data) +{ + struct tcp_header *tcp; + uint32_t csum, stt_len; + int ip_tot_size; + + tcp = push_ip_header(packet, data->header, data->header_len, &ip_tot_size); + + /* set tcp src port */ + tcp->tcp_src = get_src_port(packet); + + stt_len = (ip_tot_size - sizeof(struct tcp_header)); + put_16aligned_be32(&tcp->tcp_seq, htonl(stt_len << STT_SEQ_LEN_SHIFT)); + put_16aligned_be32(&tcp->tcp_ack, htonl(ack_seq())); + + if (is_header_ipv6(dp_packet_data(packet))) { + csum = packet_csum_pseudoheader6(ipv6_hdr(dp_packet_data(packet))); + } else { + csum = packet_csum_pseudoheader(ip_hdr(dp_packet_data(packet))); + } + + csum = csum_continue(csum, tcp, ip_tot_size); + tcp->tcp_csum = csum_finish(csum); +} + +void +netdev_stt_push_header(struct dp_packet *packet, + const struct ovs_action_push_tnl *data) +{ + push_tcp_header(packet, data); +} + void netdev_vport_range(struct unixctl_conn *conn, int argc, const char *argv[], void *aux OVS_UNUSED) diff --git a/lib/tnl-push-pop.h b/lib/tnl-push-pop.h index be84ecd..c1f2a68 100644 --- a/lib/tnl-push-pop.h +++ b/lib/tnl-push-pop.h @@ -53,4 +53,15 @@ void netdev_vport_range(struct unixctl_conn *conn, int argc, const char *argv[], void *aux OVS_UNUSED); +int +netdev_stt_pop_header(struct dp_packet **packet); +int +netdev_stt_build_header(const struct netdev *netdev, + struct ovs_action_push_tnl *data, + const 
struct flow *tnl_flow); +void +netdev_stt_push_header(struct dp_packet *packet, + const struct ovs_action_push_tnl *data); +int +netdev_stt_class_init(void); #endif diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at index b04f4a6..b0363f2 100644 --- a/tests/tunnel-push-pop.at +++ b/tests/tunnel-push-pop.at @@ -12,6 +12,8 @@ AT_CHECK([ovs-vsctl add-port int-br t2 -- set Interface t2 type=vxlan \ options:remote_ip=1.1.2.93 options:out_key=flow options:csum=true ofport_request=4\ -- add-port int-br t4 -- set Interface t4 type=geneve \ options:remote_ip=flow options:key=123 ofport_request=5\ + -- add-port int-br t5 -- set Interface t5 type=stt \ + options:remote_ip=1.1.2.92 options:key=789 ofport_request=6 options:csum=true\ ], [0]) AT_CHECK([ovs-appctl dpif/show], [0], [dnl @@ -25,6 +27,7 @@ dummy@ovs-dummy: hit:0 missed:0 t2 2/4789: (vxlan: key=123, remote_ip=1.1.2.92) t3 4/4789: (vxlan: csum=true, out_key=flow, remote_ip=1.1.2.93) t4 5/6081: (geneve: key=123, remote_ip=flow) + t5 6/7471: (stt: csum=true, key=789, remote_ip=1.1.2.92) ]) dnl First setup dummy interface IP address, then add the route @@ -51,6 +54,7 @@ AT_CHECK([ovs-appctl tnl/ports/show |sort], [0], [dnl Listening ports: genev_sys_6081 (6081) gre_sys (3) +stt_sys_7471 (7471) vxlan_sys_4789 (4789) ]) @@ -71,6 +75,13 @@ AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=f8:bc:12:44:34: AT_CHECK([tail -1 stdout], [0], [Datapath actions: tnl_pop(6081) ]) +i + +dnl Check STT tunnel pop +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=1.1.2.92,dst=1.1.2.88,proto=6,tos=0,ttl=64,frag=no),tcp(src=51283,dst=7471)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: tnl_pop(7471) +]) dnl Check VXLAN tunnel push AT_CHECK([ovs-ofctl add-flow int-br action=2]) @@ -108,6 +119,13 @@ AT_CHECK([tail -1 stdout], [0], [Datapath actions: 
tnl_push(tnl_port(6081),header(size=58,type=5,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x40),udp(src=0,dst=6081,csum=0x0),geneve(crit,vni=0x7b,options({class=0xffff,type=0x80,len=4,0xa}))),out_port(100)) ]) +dnl Check STT tunnel push +AT_CHECK([ovs-ofctl add-flow int-br action=6]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: tnl_push(tnl_port(7471),header(size=72,type=106,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),ipv4(src=1.1.2.88,dst=1.1.2.92,proto=6,tos=0,ttl=64,frag=0x40),tcp(src=0,dst=7471,seq=0x0,ack=0x0,flags=psh|ack|0x5000,csum=0x0,urg=0x0),stt(tun_id=0x315,ver=0x0,flags=0x1,l4_offset=0x0,res=0x0,mss=0x0,vid=0,pcp=0,cfi=0)),out_port(100)) +]) + dnl Check decapsulation of GRE packet AT_CHECK([ovs-appctl netdev-dummy/receive p0 'aa55aa550000001b213cab6408004500007e79464000402fba550101025c0101025820006558000001c8fe71d883724fbeb6f4e1494a080045000054ba200000400184861e0000011e00000200004227e75400030af3195500000000f265010000000000101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637']) ovs-appctl time/warp 1000 @@ -124,6 +142,15 @@ AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port 3'], [0], [dnl port 3: rx pkts=1, bytes=98, drop=0, errs=0, frame=0, over=0, crc=0 ]) +dnl Check STT only accepts encapsulated Ethernet frames +AT_CHECK([ovs-ofctl del-flows int-br]) +AT_CHECK([ovs-appctl netdev-dummy/receive p0 'aa55aa550000001b213cab6408004500009c00004000400633a70101025c01010258204e1d2f007400000000001c5018ffff7cb8000000010000000000000000000000000315000066b2591a7347427c73ecf94a080045000054b15e400040017e950101045c010104580800a59658cb00018a7b8f560000000016f80a0000000000101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637']) +ovs-appctl time/warp 1000 + 
+AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port 6'], [0], [dnl + port 6: rx pkts=1, bytes=98, drop=0, errs=0, frame=0, over=0, crc=0 +]) + dnl Check decapsulation of Geneve packet with options AT_CAPTURE_FILE([ofctl_monitor.log]) AT_CHECK([ovs-ofctl monitor int-br 65534 --detach --no-chdir --pidfile 2> ofctl_monitor.log]) -- 1.8.3.1 _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev