Hi Daniele, some comments inline prefixed with [Antonio F]. Regards, Antonio
> -----Original Message----- > From: dev [mailto:dev-boun...@openvswitch.org] On Behalf Of Daniele Di > Proietto > Sent: Saturday, April 16, 2016 1:03 AM > To: dev@openvswitch.org > Subject: [ovs-dev] [PATCH v2 04/15] conntrack: New userspace connection > tracker. > > This commit adds the conntrack module. > > It is a connection tracker that resides entirely in userspace. Its > primary user will be the dpif-netdev datapath. > > The module main goal is to provide conntrack_execute(), which offers a > convenient interface to implement the datapath ct() action. > > The conntrack module uses two submodules to deal with the l4 protocol > details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP). > > The conntrack-tcp submodule implementation is adapted from FreeBSD's pf > subsystem, therefore it's BSD licensed. It has been slightly altered to > match the OVS coding style and to allow the pickup of already > established connections. > > Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com> > --- > COPYING | 1 + > debian/copyright.in | 4 + > lib/automake.mk | 5 + > lib/conntrack-other.c | 91 ++++++ > lib/conntrack-private.h | 77 +++++ > lib/conntrack-tcp.c | 476 +++++++++++++++++++++++++++ > lib/conntrack.c | 851 > ++++++++++++++++++++++++++++++++++++++++++++++++ > lib/conntrack.h | 144 ++++++++ > 8 files changed, 1649 insertions(+) > create mode 100644 lib/conntrack-other.c > create mode 100644 lib/conntrack-private.h > create mode 100644 lib/conntrack-tcp.c > create mode 100644 lib/conntrack.c > create mode 100644 lib/conntrack.h > > diff --git a/COPYING b/COPYING > index 308e3ea..afb98b9 100644 > --- a/COPYING > +++ b/COPYING > @@ -25,6 +25,7 @@ License, version 2. > The following files are licensed under the 2-clause BSD license. 
> include/windows/getopt.h > lib/getopt_long.c > + lib/conntrack-tcp.c > > The following files are licensed under the 3-clause BSD-license > include/windows/netinet/icmp6.h > diff --git a/debian/copyright.in b/debian/copyright.in > index 57d007a..a15f4dd 100644 > --- a/debian/copyright.in > +++ b/debian/copyright.in > @@ -21,6 +21,9 @@ Upstream Copyright Holders: > Copyright (c) 2014 Michael Chapman > Copyright (c) 2014 WindRiver, Inc. > Copyright (c) 2014 Avaya, Inc. > + Copyright (c) 2001 Daniel Hartmeier > + Copyright (c) 2002 - 2008 Henning Brauer > + Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org> > > License: > > @@ -90,6 +93,7 @@ License: > lib/getopt_long.c > include/windows/getopt.h > datapath-windows/ovsext/Conntrack-tcp.c > + lib/conntrack-tcp.c > > * The following files are licensed under the 3-clause BSD-license > > diff --git a/lib/automake.mk b/lib/automake.mk > index 1ec2115..ba30442 100644 > --- a/lib/automake.mk > +++ b/lib/automake.mk > @@ -47,6 +47,11 @@ lib_libopenvswitch_la_SOURCES = \ > lib/compiler.h \ > lib/connectivity.c \ > lib/connectivity.h \ > + lib/conntrack-private.h \ > + lib/conntrack-tcp.c \ > + lib/conntrack-other.c \ > + lib/conntrack.c \ > + lib/conntrack.h \ > lib/coverage.c \ > lib/coverage.h \ > lib/crc32c.c \ > diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c > new file mode 100644 > index 0000000..65d02a9 > --- /dev/null > +++ b/lib/conntrack-other.c > @@ -0,0 +1,91 @@ > +/* > + * Copyright (c) 2015, 2016 Nicira, Inc. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. 
> + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#include <config.h> > + > +#include "conntrack-private.h" > +#include "dp-packet.h" > + > +enum other_state { > + OTHERS_FIRST, > + OTHERS_MULTIPLE, > + OTHERS_BIDIR, > +}; > + > +struct conn_other { > + struct conn up; > + enum other_state state; > +}; > + > +static const long long other_timeouts[] = { > + [OTHERS_FIRST] = 60 * 1000, > + [OTHERS_MULTIPLE] = 60 * 1000, > + [OTHERS_BIDIR] = 30 * 1000, > +}; > + > +static struct conn_other * > +conn_other_cast(const struct conn *conn) > +{ > + return CONTAINER_OF(conn, struct conn_other, up); > +} > + > +static void > +update_expiration(struct conn_other *conn, long long now) > +{ > + conn->up.expiration = now + other_timeouts[conn->state]; > +} > + > +static enum ct_update_res > +other_conn_update(struct conn *conn_, struct dp_packet *pkt > OVS_UNUSED, > + bool reply, long long now) > +{ > + struct conn_other *conn = conn_other_cast(conn_); > + > + if (reply && conn->state != OTHERS_BIDIR) { > + conn->state = OTHERS_BIDIR; > + } else if (conn->state == OTHERS_FIRST) { > + conn->state = OTHERS_MULTIPLE; > + } > + > + update_expiration(conn, now); > + > + return CT_UPDATE_VALID; > +} > + > +static bool > +other_valid_new(struct dp_packet *pkt OVS_UNUSED) > +{ > + return true; > +} > + > +static struct conn * > +other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now) > +{ > + struct conn_other *conn; > + > + conn = xzalloc(sizeof(struct conn_other)); > + conn->state = OTHERS_FIRST; > + > + update_expiration(conn, now); > + > + return &conn->up; > +} > + > +struct ct_l4_proto ct_proto_other = { > + .new_conn = other_new_conn, > + .valid_new = other_valid_new, > + .conn_update = other_conn_update, > +}; > diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h > new file mode 100644 > index 0000000..e668c44 > --- /dev/null > +++ b/lib/conntrack-private.h > @@ -0,0 +1,77 @@ > +/* > + 
* Copyright (c) 2015, 2016 Nicira, Inc. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#ifndef CONNTRACK_PRIVATE_H > +#define CONNTRACK_PRIVATE_H 1 > + > +#include <sys/types.h> > +#include <netinet/in.h> > +#include <netinet/ip6.h> > + > +#include "hmap.h" > +#include "openvswitch/types.h" > +#include "packets.h" > +#include "unaligned.h" > + > +struct ct_addr { > + union { > + ovs_16aligned_be32 ipv4; > + union ovs_16aligned_in6_addr ipv6; > + ovs_be32 ipv4_aligned; > + struct in6_addr ipv6_aligned; > + }; > +}; > + > +struct ct_endpoint { > + struct ct_addr addr; > + ovs_be16 port; > +}; > + > +struct conn_key { > + struct ct_endpoint src; > + struct ct_endpoint dst; > + > + ovs_be16 dl_type; > + uint8_t nw_proto; > + uint16_t zone; > +}; > + > +struct conn { > + struct conn_key key; > + struct conn_key rev_key; > + long long expiration; > + struct hmap_node node; > + uint32_t mark; > + ovs_u128 label; > +}; > + > +enum ct_update_res { > + CT_UPDATE_INVALID, > + CT_UPDATE_VALID, > + CT_UPDATE_NEW, > +}; > + > +struct ct_l4_proto { > + struct conn *(*new_conn)(struct dp_packet *pkt, long long now); > + bool (*valid_new)(struct dp_packet *pkt); > + enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet > *pkt, > + bool reply, long long now); > +}; > + > +extern struct ct_l4_proto ct_proto_tcp; > +extern struct ct_l4_proto ct_proto_other; > + > +#endif /* conntrack-private.h */ > diff --git 
a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c > new file mode 100644 > index 0000000..4d80038 > --- /dev/null > +++ b/lib/conntrack-tcp.c > @@ -0,0 +1,476 @@ > +/*- > + * Copyright (c) 2001 Daniel Hartmeier > + * Copyright (c) 2002 - 2008 Henning Brauer > + * Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org> > + * Copyright (c) 2015, 2016 Nicira, Inc. > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * > + * - Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * - Redistributions in binary form must reproduce the above > + * copyright notice, this list of conditions and the following > + * disclaimer in the documentation and/or other materials provided > + * with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND > CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND > FITNESS > + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE > + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, > INDIRECT, > + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES > (INCLUDING, > + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR > SERVICES; > + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER > + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, > STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN > + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE > + * POSSIBILITY OF SUCH DAMAGE. > + * > + * Effort sponsored in part by the Defense Advanced Research Projects > + * Agency (DARPA) and Air Force Research Laboratory, Air Force > + * Materiel Command, USAF, under agreement number F30602-01-2-0537. 
> + * > + * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ > + */ > + > +#include <config.h> > + > +#include "conntrack-private.h" > +#include "ct-dpif.h" > +#include "dp-packet.h" > +#include "util.h" > + > +struct tcp_peer { > + enum ct_dpif_tcp_state state; > + uint32_t seqlo; /* Max sequence number sent */ > + uint32_t seqhi; /* Max the other end ACKd + win */ > + uint16_t max_win; /* largest window (pre scaling) */ > + uint8_t wscale; /* window scaling factor */ > +}; > + > +struct conn_tcp { > + struct conn up; > + struct tcp_peer peer[2]; > +}; > + > +enum { > + TCPOPT_EOL, > + TCPOPT_NOP, > + TCPOPT_WINDOW = 3, > +}; > + > +/* TCP sequence numbers are 32 bit integers operated > + * on with modular arithmetic. These macros can be > + * used to compare such integers. */ > +#define SEQ_LT(a,b) ((int)((a)-(b)) < 0) > +#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0) > +#define SEQ_GT(a,b) ((int)((a)-(b)) > 0) > +#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) > + > +#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b)) > +#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b)) > + > +static struct conn_tcp* > +conn_tcp_cast(const struct conn* conn) > +{ > + return CONTAINER_OF(conn, struct conn_tcp, up); > +} > + > +/* pf does this in in pf_normalize_tcp(), and it is called only if scrub > + * is enabled. We're not scrubbing, but this check seems reasonable. */ > +static bool > +tcp_invalid_flags(uint16_t flags) > +{ > + > + if (flags & TCP_SYN) { > + if (flags & TCP_RST) { > + return true; > + } > + if (flags & TCP_FIN) { > + /* Here pf removes the fin flag. 
We simply mark the packet as > + * invalid */ > + return true; > + } > + } else { > + /* Illegal packet */ > + if (!(flags & (TCP_ACK|TCP_RST))) { > + return true; > + } > + } > + > + if (!(flags & TCP_ACK)) { > + /* These flags are only valid if ACK is set */ > + if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) { > + return true; > + } > + } > + > + return false; > +} > + > +#define TCP_MAX_WSCALE 14 > +#define CT_WSCALE_FLAG 0x80 > +#define CT_WSCALE_UNKNOWN 0x40 > +#define CT_WSCALE_MASK 0xf > + > +static uint8_t > +tcp_get_wscale(const struct tcp_header *tcp) > +{ > + int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp; > + const uint8_t *opt = (const uint8_t *)(tcp + 1); > + uint8_t wscale = 0; > + uint8_t optlen; > + > + while (len >= 3) { > + if (*opt == TCPOPT_EOL) { > + break; > + } > + switch (*opt) { > + case TCPOPT_NOP: > + opt++; > + len--; > + break; > + case TCPOPT_WINDOW: > + wscale = MIN(opt[2], TCP_MAX_WSCALE); > + wscale |= CT_WSCALE_FLAG; > + /* fall through */ > + default: > + optlen = opt[1]; > + if (optlen < 2) { > + optlen = 2; > + } > + len -= optlen; > + opt += optlen; > + } > + } > + > + return wscale; > +} > + > +static uint32_t > +tcp_payload_length(struct dp_packet *pkt) > +{ > + return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt) > + - (char *) dp_packet_get_tcp_payload(pkt); > +} > + > +static void > +update_expiration(struct conn_tcp *conn, long long now, long long interval) > +{ > + conn->up.expiration = now + interval; > +} > + > + > +static enum ct_update_res > +tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool reply, > + long long now) > +{ > + struct conn_tcp *conn = conn_tcp_cast(conn_); > + struct tcp_header *tcp = dp_packet_l4(pkt); > + /* The peer that sent 'pkt' */ > + struct tcp_peer *src = &conn->peer[reply ? 1 : 0]; > + /* The peer that should receive 'pkt' */ > + struct tcp_peer *dst = &conn->peer[reply ? 
0 : 1]; > + uint8_t sws = 0, dws = 0; > + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); > + > + uint16_t win = ntohs(tcp->tcp_winsz); > + uint32_t ack, end, seq, orig_seq; > + uint32_t p_len = tcp_payload_length(pkt); > + int ackskew; > + > + if (tcp_invalid_flags(tcp_flags)) { > + return CT_UPDATE_INVALID; > + } > + > + if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) && > + dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 && > + src->state >= CT_DPIF_TCPS_FIN_WAIT_2) { > + src->state = dst->state = CT_DPIF_TCPS_CLOSED; > + return CT_UPDATE_NEW; > + } > + > + if (src->wscale & CT_WSCALE_FLAG > + && dst->wscale & CT_WSCALE_FLAG > + && !(tcp_flags & TCP_SYN)) { > + > + sws = src->wscale & CT_WSCALE_MASK; > + dws = dst->wscale & CT_WSCALE_MASK; > + > + } else if (src->wscale & CT_WSCALE_UNKNOWN > + && dst->wscale & CT_WSCALE_UNKNOWN > + && !(tcp_flags & TCP_SYN)) { > + > + sws = TCP_MAX_WSCALE; > + dws = TCP_MAX_WSCALE; > + } > + > + /* > + * Sequence tracking algorithm from Guido van Rooij's paper: > + * http://www.madison-gurkha.com/publications/tcp_filtering/ > + * tcp_filtering.ps > + */ > + > + orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq)); > + if (src->state < CT_DPIF_TCPS_SYN_SENT) { > + /* First packet from this end. 
Set its state */ > + > + ack = ntohl(get_16aligned_be32(&tcp->tcp_ack)); > + > + end = seq + p_len; > + if (tcp_flags & TCP_SYN) { > + end++; > + if (dst->wscale & CT_WSCALE_FLAG) { > + src->wscale = tcp_get_wscale(tcp); > + if (src->wscale & CT_WSCALE_FLAG) { > + /* Remove scale factor from initial window */ > + sws = src->wscale & CT_WSCALE_MASK; > + win = DIV_ROUND_UP((uint32_t) win, 1 << sws); > + dws = dst->wscale & CT_WSCALE_MASK; > + } else { > + /* fixup other window */ > + dst->max_win <<= dst->wscale & > + CT_WSCALE_MASK; > + /* in case of a retrans SYN|ACK */ > + dst->wscale = 0; > + } > + } > + } > + if (tcp_flags & TCP_FIN) { > + end++; > + } > + > + src->seqlo = seq; > + src->state = CT_DPIF_TCPS_SYN_SENT; > + /* > + * May need to slide the window (seqhi may have been set by > + * the crappy stack check or if we picked up the connection > + * after establishment) > + */ > + if (src->seqhi == 1 || > + SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) { > + src->seqhi = end + MAX(1, dst->max_win << dws); > + } > + if (win > src->max_win) { > + src->max_win = win; > + } > + > + } else { > + ack = ntohl(get_16aligned_be32(&tcp->tcp_ack)); > + end = seq + p_len; > + if (tcp_flags & TCP_SYN) { > + end++; > + } > + if (tcp_flags & TCP_FIN) { > + end++; > + } > + } > + > + if ((tcp_flags & TCP_ACK) == 0) { > + /* Let it pass through the ack skew check */ > + ack = dst->seqlo; > + } else if ((ack == 0 > + && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST)) > + /* broken tcp stacks do not set ack */) { > + /* Many stacks (ours included) will set the ACK number in an > + * FIN|ACK if the SYN times out -- no sequence to ACK. 
*/ > + ack = dst->seqlo; > + } > + > + if (seq == end) { > + /* Ease sequencing restrictions on no data packets */ > + seq = src->seqlo; > + end = seq; > + } > + > + ackskew = dst->seqlo - ack; > +#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge > factor */ > + if (SEQ_GEQ(src->seqhi, end) > + /* Last octet inside other's window space */ > + && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) > + /* Retrans: not more than one window back */ > + && (ackskew >= -MAXACKWINDOW) > + /* Acking not more than one reassembled fragment backwards */ > + && (ackskew <= (MAXACKWINDOW << sws)) > + /* Acking not more than one window forward */ > + && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo > + || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == > src->seqlo))) { > + /* Require an exact/+1 sequence match on resets when possible */ > + > + /* update max window */ > + if (src->max_win < win) { > + src->max_win = win; > + } > + /* synchronize sequencing */ > + if (SEQ_GT(end, src->seqlo)) { > + src->seqlo = end; > + } > + /* slide the window of what the other end can send */ > + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { > + dst->seqhi = ack + MAX((win << sws), 1); > + } > + > + /* update states */ > + if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) { > + src->state = CT_DPIF_TCPS_SYN_SENT; > + } > + if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) { > + src->state = CT_DPIF_TCPS_CLOSING; > + } > + if (tcp_flags & TCP_ACK) { > + if (dst->state == CT_DPIF_TCPS_SYN_SENT) { > + dst->state = CT_DPIF_TCPS_ESTABLISHED; > + } else if (dst->state == CT_DPIF_TCPS_CLOSING) { > + dst->state = CT_DPIF_TCPS_FIN_WAIT_2; > + } > + } > + if (tcp_flags & TCP_RST) { > + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; > + } > + > + if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2 > + && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) { > + update_expiration(conn, now, 30 * 1000); > + } else if (src->state >= CT_DPIF_TCPS_CLOSING > + && dst->state >= 
CT_DPIF_TCPS_CLOSING) { > + update_expiration(conn, now, 45 * 1000); > + } else if (src->state < CT_DPIF_TCPS_ESTABLISHED > + || dst->state < CT_DPIF_TCPS_ESTABLISHED) { > + update_expiration(conn, now, 30 * 1000); > + } else if (src->state >= CT_DPIF_TCPS_CLOSING > + || dst->state >= CT_DPIF_TCPS_CLOSING) { > + update_expiration(conn, now, 15 * 60 * 1000); > + } else { > + update_expiration(conn, now, 24 * 60 * 60 * 1000); > + } > + } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT > + || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 > + || src->state >= CT_DPIF_TCPS_FIN_WAIT_2) > + && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) > + /* Within a window forward of the originating packet */ > + && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { > + /* Within a window backward of the originating packet */ > + > + /* > + * This currently handles three situations: > + * 1) Stupid stacks will shotgun SYNs before their peer > + * replies. > + * 2) When PF catches an already established stream (the > + * firewall rebooted, the state table was flushed, routes > + * changed...) > + * 3) Packets get funky immediately after the connection > + * closes (this should catch Solaris spurious ACK|FINs > + * that web servers like to spew after a close) > + * > + * This must be a little more careful than the above code > + * since packet floods will also be caught here. We don't > + * update the TTL here to mitigate the damage of a packet > + * flood and so the same code can handle awkward establishment > + * and a loosened connection close. > + * In the establishment case, a correct peer response will > + * validate the connection, go through the normal state code > + * and keep updating the state TTL. 
> + */ > + > + /* update max window */ > + if (src->max_win < win) { > + src->max_win = win; > + } > + /* synchronize sequencing */ > + if (SEQ_GT(end, src->seqlo)) { > + src->seqlo = end; > + } > + /* slide the window of what the other end can send */ > + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { > + dst->seqhi = ack + MAX((win << sws), 1); > + } > + > + /* > + * Cannot set dst->seqhi here since this could be a shotgunned > + * SYN and not an already established connection. > + */ > + > + if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) { > + src->state = CT_DPIF_TCPS_CLOSING; > + } > + > + if (tcp_flags & TCP_RST) { > + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; > + } > + } else { > + return CT_UPDATE_INVALID; > + } > + > + return CT_UPDATE_VALID; > +} > + > +static bool > +tcp_valid_new(struct dp_packet *pkt) > +{ > + struct tcp_header *tcp = dp_packet_l4(pkt); > + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); > + > + if (tcp_invalid_flags(tcp_flags)) { > + return false; > + } > + > + /* A syn+ack is not allowed to create a connection. We want to allow > + * totally new connections (syn) or already established, not partially > + * open (syn+ack). 
*/ > + if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) { > + return false; > + } > + > + return true; > +} > + > +static struct conn * > +tcp_new_conn(struct dp_packet *pkt, long long now) > +{ > + struct conn_tcp* newconn = NULL; > + struct tcp_header *tcp = dp_packet_l4(pkt); > + struct tcp_peer *src, *dst; > + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); > + > + newconn = xzalloc(sizeof(struct conn_tcp)); > + > + src = &newconn->peer[0]; > + dst = &newconn->peer[1]; > + > + src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq)); > + src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1; > + > + if (tcp_flags & TCP_SYN) { > + src->seqhi++; > + src->wscale = tcp_get_wscale(tcp); > + } else { > + src->wscale = CT_WSCALE_UNKNOWN; > + dst->wscale = CT_WSCALE_UNKNOWN; > + } > + src->max_win = MAX(ntohs(tcp->tcp_winsz), 1); > + if (src->wscale & CT_WSCALE_MASK) { > + /* Remove scale factor from initial window */ > + uint8_t sws = src->wscale & CT_WSCALE_MASK; > + src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws); > + } > + if (tcp_flags & TCP_FIN) { > + src->seqhi++; > + } > + dst->seqhi = 1; > + dst->max_win = 1; > + src->state = CT_DPIF_TCPS_SYN_SENT; > + dst->state = CT_DPIF_TCPS_CLOSED; > + > + update_expiration(newconn, now, 30 * 1000); > + > + return &newconn->up; > +} > + > +struct ct_l4_proto ct_proto_tcp = { > + .new_conn = tcp_new_conn, > + .valid_new = tcp_valid_new, > + .conn_update = tcp_conn_update, > +}; > diff --git a/lib/conntrack.c b/lib/conntrack.c > new file mode 100644 > index 0000000..840335b > --- /dev/null > +++ b/lib/conntrack.c > @@ -0,0 +1,851 @@ > +/* > + * Copyright (c) 2015, 2016 Nicira, Inc. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. 
> + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#include <config.h> > +#include "conntrack.h" > + > +#include <errno.h> > +#include <sys/types.h> > +#include <netinet/in.h> > +#include <netinet/icmp6.h> > + > +#include "bitmap.h" > +#include "conntrack-private.h" > +#include "dp-packet.h" > +#include "flow.h" > +#include "hmap.h" > +#include "netdev.h" > +#include "odp-netlink.h" > +#include "openvswitch/vlog.h" > +#include "ovs-rcu.h" > +#include "random.h" > +#include "timeval.h" > + > +VLOG_DEFINE_THIS_MODULE(conntrack); > + > +struct conn_lookup_ctx { > + struct conn_key key; > + struct conn *conn; > + uint32_t hash; > + bool reply; > + bool related; > +}; > + > +static bool conn_key_extract(struct conntrack *, struct dp_packet *, > + struct conn_lookup_ctx *, uint16_t zone); > +static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis); > +static void conn_key_reverse(struct conn_key *); > +static void conn_key_lookup(struct conntrack *ct, > + struct conn_lookup_ctx *ctx, > + unsigned bucket, > + long long now); > +static bool valid_new(struct dp_packet *pkt, struct conn_key *); > +static struct conn *new_conn(struct dp_packet *pkt, struct conn_key *, > + long long now); > +static void delete_conn(struct conn *); > +static enum ct_update_res conn_update(struct conn *, struct dp_packet*, > + bool reply, long long now); > +static bool conn_expired(struct conn *, long long now); > +static void set_mark(struct dp_packet *, struct conn *, > + uint32_t val, uint32_t mask); > +static void set_label(struct dp_packet *, struct conn *, > + const struct 
ovs_key_ct_labels *val, > + const struct ovs_key_ct_labels *mask); > + > +static struct ct_l4_proto *l4_protos[] = { > + [IPPROTO_TCP] = &ct_proto_tcp, > + [IPPROTO_UDP] = &ct_proto_other, > + [IPPROTO_ICMP] = &ct_proto_other, > +}; > + > +/* Initializes the connection tracker 'ct'. The caller is responbile for > + * calling 'conntrack_destroy()', when the instance is not needed anymore */ > +void > +conntrack_init(struct conntrack *ct) > +{ > + unsigned i; > + > + for (i = 0; i < CONNTRACK_BUCKETS; i++) { > + ct_lock_init(&ct->locks[i]); > + ct_lock_lock(&ct->locks[i]); > + hmap_init(&ct->connections[i]); > + ct_lock_unlock(&ct->locks[i]); > + } > + ct->hash_basis = random_uint32(); > + ct->purge_bucket = 0; > + ct->purge_inner_bucket = 0; > + ct->purge_inner_offset = 0; > +} > + > +/* Destroys the connection tracker 'ct' and frees all the allocated memory. > */ > +void > +conntrack_destroy(struct conntrack *ct) > +{ > + unsigned i; > + > + for (i = 0; i < CONNTRACK_BUCKETS; i++) { > + struct conn *conn, *next; > + > + ct_lock_lock(&ct->locks[i]); > + HMAP_FOR_EACH_SAFE(conn, next, node, &ct->connections[i]) { > + hmap_remove(&ct->connections[i], &conn->node); > + delete_conn(conn); > + } > + hmap_destroy(&ct->connections[i]); > + ct_lock_unlock(&ct->locks[i]); > + ct_lock_destroy(&ct->locks[i]); > + } > +} > + > +static unsigned hash_to_bucket(uint32_t hash) > +{ > + /* Extracts the most significant bits in hash. The least significant bits > + * are already used internally by the hmap implementation. 
*/ > + BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && > CONNTRACK_BUCKETS_SHIFT >= 1); > + > + return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % > CONNTRACK_BUCKETS; > +} > + > +static void > +write_ct_md(struct dp_packet *pkt, uint8_t state, uint16_t zone, > + uint32_t mark, ovs_u128 label) > +{ > + pkt->md.ct_state = state | CS_TRACKED; > + pkt->md.ct_zone = zone; > + pkt->md.ct_mark = mark; > + pkt->md.ct_label = label; > +} > + > +static struct conn * > +conn_not_found(struct conntrack *ct, struct dp_packet *pkt, > + struct conn_lookup_ctx *ctx, uint8_t *state, bool commit, > + long long now) > +{ > + unsigned bucket = hash_to_bucket(ctx->hash); > + struct conn *nc = NULL; > + > + if (!valid_new(pkt, &ctx->key)) { > + *state |= CS_INVALID; > + return nc; > + } > + > + *state |= CS_NEW; > + > + if (commit) { > + nc = new_conn(pkt, &ctx->key, now); > + > + memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key); > + > + conn_key_reverse(&nc->rev_key); > + hmap_insert(&ct->connections[bucket], &nc->node, ctx->hash); > + } > + > + return nc; > +} > + > +static struct conn * > +process_one(struct conntrack *ct, struct dp_packet *pkt, > + struct conn_lookup_ctx *ctx, uint16_t zone, > + bool commit, long long now) > +{ > + unsigned bucket = hash_to_bucket(ctx->hash); > + struct conn *conn = ctx->conn; > + uint8_t state = 0; > + > + if (conn) { > + if (ctx->related) { > + state |= CS_RELATED; > + if (ctx->reply) { > + state |= CS_REPLY_DIR; > + } > + } else { > + enum ct_update_res res; > + > + res = conn_update(conn, pkt, ctx->reply, now); > + > + switch (res) { > + case CT_UPDATE_VALID: > + state |= CS_ESTABLISHED; > + if (ctx->reply) { > + state |= CS_REPLY_DIR; > + } > + break; > + case CT_UPDATE_INVALID: > + state |= CS_INVALID; > + break; > + case CT_UPDATE_NEW: > + hmap_remove(&ct->connections[bucket], &conn->node); > + delete_conn(conn); > + conn = conn_not_found(ct, pkt, ctx, &state, commit, now); > + break; [Antonio F] I see that conn_update can return just 
CT_UPDATE_VALID/INVALID/NEW but should we consider a 'default' case here, eg default: state |= CS_INVALID; break; ? > + } > + } > + > + pkt->md.ct_label = conn->label; > + pkt->md.ct_mark = conn->mark; [Antonio F] Can we skip the previous 2 lines as they are implemented into write_ct_md ? > + write_ct_md(pkt, state, zone, conn->mark, conn->label); > + } else { > + conn = conn_not_found(ct, pkt, ctx, &state, commit, now); > + write_ct_md(pkt, state, zone, 0, (ovs_u128) {{0}}); > + } > + > + return conn; > +} > + > +/* Sends a group of 'cnt' packets ('pkts') through the connection tracker > + * 'ct'. If 'commit' is true, the packets are allowed to create new entries > + * in the connection tables. 'setmark', if not NULL, should point to a two > + * elements array containing a value and a mask to set the connection mark. > + * 'setlabel' behaves similarly for the connection label.*/ > +int > +conntrack_execute(struct conntrack *ct, struct dp_packet **pkts, size_t cnt, > + bool commit, uint16_t zone, const uint32_t *setmark, > + const struct ovs_key_ct_labels *setlabel, > + const char *helper) > +{ > +#if !defined(__CHECKER__) && !defined(_WIN32) > + const size_t KEY_ARRAY_SIZE = cnt; > +#else > + enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST }; > +#endif > + struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE]; > + int8_t bucket_list[CONNTRACK_BUCKETS]; > + struct { > + unsigned bucket; > + unsigned long maps; > + } arr[KEY_ARRAY_SIZE]; > + long long now = time_msec(); > + size_t i = 0; > + uint8_t arrcnt = 0; > + > + BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= > NETDEV_MAX_BURST); > + > + if (helper) { > + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); > + > + VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper); > + /* Continue without the helper */ > + } > + > + memset(bucket_list, INT8_C(-1), sizeof bucket_list); > + for (i = 0; i < cnt; i++) { > + unsigned bucket; > + > + if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) { > + 
write_ct_md(pkts[i], CS_INVALID, zone, 0, (ovs_u128){{0}}); > + continue; > + } > + > + bucket = hash_to_bucket(ctxs[i].hash); > + if (bucket_list[bucket] == INT8_C(-1)) { > + bucket_list[bucket] = arrcnt; > + > + arr[arrcnt].maps = 0; > + ULLONG_SET1(arr[arrcnt].maps, i); > + arr[arrcnt++].bucket = bucket; > + } else { > + ULLONG_SET1(arr[bucket_list[bucket]].maps, i); > + arr[bucket_list[bucket]].maps |= 1UL << i; > + } > + } > + > + for (i = 0; i < arrcnt; i++) { > + size_t j; > + > + ct_lock_lock(&ct->locks[arr[i].bucket]); > + > + ULLONG_FOR_EACH_1(j, arr[i].maps) { > + struct conn *conn; > + > + conn_key_lookup(ct, &ctxs[j], arr[i].bucket, now); > + > + conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now); > + > + if (conn && setmark) { > + set_mark(pkts[j], conn, setmark[0], setmark[1]); > + } > + > + if (conn && setlabel) { > + set_label(pkts[j], conn, &setlabel[0], &setlabel[1]); > + } > + } > + ct_lock_unlock(&ct->locks[arr[i].bucket]); > + } > + > + return 0; > +} > + > +static void > +set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t > mask) > +{ > + pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask)); > + conn->mark = pkt->md.ct_mark; > +} > + > +static void > +set_label(struct dp_packet *pkt, struct conn *conn, > + const struct ovs_key_ct_labels *val, > + const struct ovs_key_ct_labels *mask) > +{ > + ovs_u128 v, m; > + > + memcpy(&v, val, sizeof v); > + memcpy(&m, mask, sizeof m); > + > + pkt->md.ct_label.u64.lo = v.u64.lo > + | (pkt->md.ct_label.u64.lo & ~(m.u64.lo)); > + pkt->md.ct_label.u64.hi = v.u64.hi > + | (pkt->md.ct_label.u64.hi & ~(m.u64.hi)); > + conn->label = pkt->md.ct_label; > +} > + > +#define CONNTRACK_PURGE_NUM 256 > + > +static void > +sweep_bucket(struct hmap *bucket, uint32_t *inner_bucket, > + uint32_t *inner_offset, unsigned *left, long long now) > +{ > + while (*left != 0) { > + struct hmap_node *node; > + struct conn *conn; > + > + node = hmap_at_position(bucket, inner_bucket, inner_offset); > + 
> + if (!node) { > + hmap_shrink(bucket); > + break; > + } > + > + INIT_CONTAINER(conn, node, node); > + if (conn_expired(conn, now)) { > + hmap_remove(bucket, &conn->node); > + delete_conn(conn); > + (*left)--; > + } > + } > +} > + > +/* Cleans up old connection entries. Should be called periodically. */ > +void > +conntrack_run(struct conntrack *ct) > +{ > + unsigned bucket = hash_to_bucket(ct->purge_bucket); > + uint32_t inner_bucket = ct->purge_inner_bucket, > + inner_offset = ct->purge_inner_offset; > + unsigned left = CONNTRACK_PURGE_NUM; > + long long now = time_msec(); > + > + while (bucket < CONNTRACK_BUCKETS) { > + ct_lock_lock(&ct->locks[bucket]); > + sweep_bucket(&ct->connections[bucket], > + &inner_bucket, &inner_offset, > + &left, now); > + ct_lock_unlock(&ct->locks[bucket]); > + > + if (left == 0) { > + break; > + } else { > + bucket++; > + } > + } > + > + ct->purge_bucket = bucket; > + ct->purge_inner_bucket = inner_bucket; > + ct->purge_inner_offset = inner_offset; > +} > + > +/* Key extraction */ > + > +/* The function stores a pointer to the first byte after the header in > + * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is > + * not interested in the header's tail, meaning that the header has > + * already been parsed (e.g. by flow_extract): we take this as a hint to > + * save a few checks. 
*/ > +static inline bool > +extract_l3_ipv4(struct conn_key *key, const void *data, size_t size, > + const char **new_data) > +{ > + const struct ip_header *ip = data; > + > + if (new_data) { > + size_t ip_len; > + > + if (OVS_UNLIKELY(size < IP_HEADER_LEN)) { > + return false; > + } > + ip_len = IP_IHL(ip->ip_ihl_ver) * 4; > + > + if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) { > + return false; > + } > + if (OVS_UNLIKELY(size < ip_len)) { > + return false; > + } > + > + *new_data = (char *) data + ip_len; > + } > + > + if (IP_IS_FRAGMENT(ip->ip_frag_off)) { > + return false; > + } > + > + key->src.addr.ipv4 = ip->ip_src; > + key->dst.addr.ipv4 = ip->ip_dst; > + key->nw_proto = ip->ip_proto; > + > + return true; > +} > + > +/* The function stores a pointer to the first byte after the header in > + * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is > + * not interested in the header's tail, meaning that the header has > + * already been parsed (e.g. by flow_extract): we take this as a hint to > + * save a few checks. 
*/ > +static inline bool > +extract_l3_ipv6(struct conn_key *key, const void *data, size_t size, > + const char **new_data) > +{ > + const struct ovs_16aligned_ip6_hdr *ip6 = data; > + uint8_t nw_proto = ip6->ip6_nxt; > + uint8_t nw_frag = 0; > + > + if (new_data) { > + if (OVS_UNLIKELY(size < sizeof *ip6)) { > + return false; > + } > + } > + > + data = ip6 + 1; > + size -= sizeof *ip6; > + > + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) { > + return false; > + } > + > + if (new_data) { > + *new_data = data; > + } > + > + if (nw_frag) { > + return false; > + } > + > + key->src.addr.ipv6 = ip6->ip6_src; > + key->dst.addr.ipv6 = ip6->ip6_dst; > + key->nw_proto = nw_proto; > + > + return true; > +} > + > +static inline bool > +check_l4_tcp(const void *data, size_t size) > +{ > + const struct tcp_header *tcp = data; > + size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4; > + > + if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= size)) { > + return true; > + } > + > + return false; > +} > + > +static inline bool > +extract_l4_tcp(struct conn_key *key, const void *data, size_t size) > +{ > + const struct tcp_header *tcp = data; > + > + if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) { > + return false; > + } > + > + key->src.port = tcp->tcp_src; > + key->dst.port = tcp->tcp_dst; > + > + return true; > +} > + > +static inline bool > +extract_l4_udp(struct conn_key *key, const void *data, size_t size) > +{ > + const struct udp_header *udp = data; > + > + if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) { > + return false; > + } > + > + key->src.port = udp->udp_src; > + key->dst.port = udp->udp_dst; > + > + return true; > +} > + > +static inline bool extract_l4(struct conn_key *key, const void *data, > + size_t size, bool *related); > + > +/* If 'related' is not NULL and the function is processing an ICMP > + * error packet, extract the l3 and l4 fields from the nested header > + * instead and set *related to true. 
If 'related' is NULL we're > + * already processing a nested header and no such recursion is > + * possible */ > +static inline int > +extract_l4_icmp(struct conn_key *key, const void *data, size_t size, > + bool *related) > +{ > + const struct icmp_header *icmp = data; > + > + if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) { > + return false; > + } > + > + switch (icmp->icmp_type) { > + case ICMP4_ECHO_REQUEST: > + case ICMP4_ECHO_REPLY: > + case ICMP4_TIMESTAMP: > + case ICMP4_TIMESTAMPREPLY: > + case ICMP4_INFOREQUEST: > + case ICMP4_INFOREPLY: > + /* Separate ICMP connection: identified using id */ > + key->src.port = key->dst.port = icmp->icmp_fields.echo.id; > + break; > + case ICMP4_DST_UNREACH: > + case ICMP4_TIME_EXCEEDED: > + case ICMP4_PARAM_PROB: > + case ICMP4_SOURCEQUENCH: > + case ICMP4_REDIRECT: { > + /* ICMP packet part of another connection. We should > + * extract the key from embedded packet header */ > + struct conn_key inner_key; > + const char *l3 = (const char *) (icmp + 1); > + const char *tail = (const char *) data + size; > + const char *l4; > + bool ok; > + > + if (!related) { > + return false; > + } > + *related = true; > + > + memset(&inner_key, 0, sizeof inner_key); > + inner_key.dl_type = htons(ETH_TYPE_IP); > + ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4); > + if (!ok) { > + return false; > + } > + > + /* pf doesn't do this, but it seems a good idea */ > + if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned > + || inner_key.dst.addr.ipv4_aligned != > key->src.addr.ipv4_aligned) { > + return false; > + } > + > + key->src = inner_key.src; > + key->dst = inner_key.dst; > + key->nw_proto = inner_key.nw_proto; > + > + ok = extract_l4(key, l4, tail - l4, NULL); > + if (ok) { > + conn_key_reverse(key); > + } > + return ok; > + } > + default: > + return false; > + } > + > + return true; > +} > + > +/* If 'related' is not NULL and the function is processing an ICMP > + * error packet, extract the l3 and l4 fields from the 
nested header > + * instead and set *related to true. If 'related' is NULL we're > + * already processing a nested header and no such recursion is > + * possible */ > +static inline bool > +extract_l4_icmp6(struct conn_key *key, const void *data, size_t size, > + bool *related) > +{ > + const struct icmp6_header *icmp6 = data; > + > + /* All the messages that we support need at least 4 bytes after > + * the header */ > + if (size < sizeof *icmp6 + 4) { > + return false; > + } > + > + switch (icmp6->icmp6_type) { > + case ICMP6_ECHO_REQUEST: > + case ICMP6_ECHO_REPLY: > + /* Separate ICMP connection: identified using id */ > + key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1); > + break; > + case ICMP6_DST_UNREACH: > + case ICMP6_PACKET_TOO_BIG: > + case ICMP6_TIME_EXCEEDED: > + case ICMP6_PARAM_PROB:{ > + /* ICMP packet part of another connection. We should > + * extract the key from embedded packet header */ > + struct conn_key inner_key; > + const char *l3 = (const char *) icmp6 + 8; > + const char *tail = (const char *) data + size; > + const char *l4 = NULL; > + bool ok; > + > + if (!related) { > + return false; > + } > + *related = true; > + > + memset(&inner_key, 0, sizeof inner_key); > + inner_key.dl_type = htons(ETH_TYPE_IPV6); > + ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4); > + if (!ok) { > + return false; > + } > + > + /* pf doesn't do this, but it seems a good idea */ > + if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned, > + &key->dst.addr.ipv6_aligned) > + || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned, > + &key->src.addr.ipv6_aligned)) { > + return false; > + } > + > + key->src = inner_key.src; > + key->dst = inner_key.dst; > + key->nw_proto = inner_key.nw_proto; > + > + ok = extract_l4(key, l4, tail - l4, NULL); > + if (ok) { > + conn_key_reverse(key); > + } > + return ok; > + } > + default: > + return false; > + } > + > + return true; > +} > + > +/* Extract l4 fields into 'key', which must already contain valid l3 > + * 
members. If 'related' is not NULL and an ICMP error packet is being > + * processed, the function will extract the key from the packet nested > + * in the ICMP payload and set '*related' to true. If 'related' is NULL, > + * nested parsing isn't allowed. This is necessary to limit the > + * recursion level. */ > +static inline bool > +extract_l4(struct conn_key *key, const void *data, size_t size, bool > *related) > +{ > + if (key->nw_proto == IPPROTO_TCP) { > + return extract_l4_tcp(key, data, size) > + && (!related || check_l4_tcp(data, size)); > + } else if (key->nw_proto == IPPROTO_UDP) { > + return extract_l4_udp(key, data, size); > + } else if (key->dl_type == htons(ETH_TYPE_IP) > + && key->nw_proto == IPPROTO_ICMP) { > + return extract_l4_icmp(key, data, size, related); > + } else if (key->dl_type == htons(ETH_TYPE_IPV6) > + && key->nw_proto == IPPROTO_ICMPV6) { > + return extract_l4_icmp6(key, data, size, related); > + } else { > + return false; > + } > +} > + > +static bool > +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, > + struct conn_lookup_ctx *ctx, uint16_t zone) > +{ > + const struct eth_header *l2 = dp_packet_l2(pkt); > + const struct ip_header *l3 = dp_packet_l3(pkt); > + const char *l4 = dp_packet_l4(pkt); > + const char *tail = dp_packet_tail(pkt); > + bool ok; > + > + memset(ctx, 0, sizeof *ctx); > + > + if (!l2 || !l3 || !l4) { > + return false; > + } > + > + ctx->key.zone = zone; > + > + /* XXX In this function we parse the packet (again, it has already > + * gone through miniflow_extract()) for two reasons: > + * > + * 1) To extract the l3 addresses and l4 ports. > + * We already have the l3 and l4 headers' pointers. Extracting > + * the l3 addresses and the l4 ports is really cheap, since they > + * can be found at fixed locations. > + * 2) To extract the l3 and l4 types. > + * Extracting the l3 and l4 types (especially the l3[1]) on the > + * other hand is quite expensive, because they're not at a > + * fixed location. 
> + * > + * Here's a way to avoid (2) with the help of the datapath. > + * The datapath doesn't keep the packet's extracted flow[2], so > + * using that is not an option. We could use the packet's matching > + * megaflow, but we have to make sure that the l3 and l4 types > + * are unwildcarded. This means either: > + * > + * a) dpif-netdev unwildcards the l3 (and l4) types when a new flow > + * is installed if the actions contains ct(). This is what the > + * kernel datapath does. It is not so straightforward, though. > + * > + * b) ofproto-dpif-xlate unwildcards the l3 (and l4) types when > + * translating a ct() action. This is already done in different > + * actions and since both the userspace and the kernel datapath > + * would benefit from it, it seems an appropriate place to do > + * it. > + * > + * --- > + * [1] A simple benchmark (running only the connection tracker > + * over and over on the same packets) shows that if the > + * l3 type is already provided we are 15% faster (running the > + * connection tracker over a couple of DPDK devices with a > + * stream of UDP 64-bytes packets shows that we are 4% faster). > + * > + * [2] The reasons for this are that keeping the flow increases > + * (slightly) the cache footprint and increases computation > + * time as we move the packet around. Most importantly, the flow > + * should be updated by the actions and this can be slow, as > + * we use a sparse representation (miniflow). > + * > + */ > + ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2); > + if (ctx->key.dl_type == htons(ETH_TYPE_IP)) { [Antonio F] In case we specify a flow with VLAN + CT like add_flow brX table=0,in_port=1,vlan_tci=0xf123,tcp,ct_state=-trk,action=ct(commit,zone=9),2 should we also consider ETH_TYPE_VLAN in the previous conditional statement? 
> + ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL); > + } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) { > + ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL); > + } else { > + ok = false; > + } > + > + if (ok) { > + if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related)) { > + ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis); > + return true; > + } > + } > + > + return false; > +} > + > +/* Symmetric */ > +static uint32_t > +conn_key_hash(const struct conn_key *key, uint32_t basis) > +{ > + uint32_t hsrc, hdst, hash; > + int i; > + > + hsrc = hdst = basis; > + > + for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) { > + hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]); > + hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]); > + } > + > + hash = hsrc ^ hdst; > + > + hash = hash_words((uint32_t *) &key->dst + 1, > + (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1), > + hash); [Antonio F] Is that to involve dl_type, nw_proto and zone in the hash computation? 
> + > + return hash; > +} > + > +static void > +conn_key_reverse(struct conn_key *key) > +{ > + struct ct_endpoint tmp; > + tmp = key->src; > + key->src = key->dst; > + key->dst = tmp; > +} > + > +static void > +conn_key_lookup(struct conntrack *ct, > + struct conn_lookup_ctx *ctx, > + unsigned bucket, > + long long now) > +{ > + struct conn *conn, *found = NULL; > + uint32_t hash = ctx->hash; > + bool reply; > + > + HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ct- > >connections[bucket]) { > + if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))) { > + found = conn; > + reply = false; > + break; > + } > + if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))) { > + found = conn; > + reply = true; > + break; > + } > + } > + > + if (found) { > + if (conn_expired(found, now)) { > + found = NULL; > + } else { > + ctx->reply = reply; > + } > + } > + > + ctx->conn = found; > +} > + > +static enum ct_update_res > +conn_update(struct conn *conn, struct dp_packet *pkt, bool reply, > + long long now) > +{ > + return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, reply, > now); > +} > + > +static bool > +conn_expired(struct conn *conn, long long now) > +{ > + return now >= conn->expiration; > +} > + > +static bool > +valid_new(struct dp_packet *pkt, struct conn_key *key) > +{ > + return l4_protos[key->nw_proto]->valid_new(pkt); > +} > + > +static struct conn * > +new_conn(struct dp_packet *pkt, struct conn_key *key, long long now) > +{ > + struct conn *newconn; > + > + newconn = l4_protos[key->nw_proto]->new_conn(pkt, now); > + > + if (newconn) { > + newconn->key = *key; > + } > + > + return newconn; > +} > + > +static void > +delete_conn(struct conn *conn) > +{ > + free(conn); > +} > diff --git a/lib/conntrack.h b/lib/conntrack.h > new file mode 100644 > index 0000000..8561273 > --- /dev/null > +++ b/lib/conntrack.h > @@ -0,0 +1,144 @@ > +/* > + * Copyright (c) 2015, 2016 Nicira, Inc. 
> + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#ifndef CONNTRACK_H > +#define CONNTRACK_H 1 > + > +#include <stdbool.h> > + > +#include "hmap.h" > +#include "netdev-dpdk.h" > +#include "odp-netlink.h" > +#include "openvswitch/thread.h" > +#include "openvswitch/types.h" > + > + > +struct dp_packet; > + > +/* Userspace connection tracker > + * ============================ > + * > + * This is a connection tracking module that keeps all the state in > userspace. > + * > + * Usage > + * ===== > + * > + * struct conntrack ct; > + * > + * Initialization: > + * > + * conntrack_init(&ct); > + * > + * It is necessary to periodically issue a call to > + * > + * conntrack_run(&ct); > + * > + * to allow the module to clean up expired connections. > + * > + * To send a group of packets through the connection tracker: > + * > + * conntrack_execute(&ct, pkts, n_pkts, ...); > + * > + * Thread-safety > + * ============= > + * > + * conntrack_execute() can be called by multiple threads simultaneously. 
> + */ > + > +struct conntrack; > + > +void conntrack_init(struct conntrack *); > +void conntrack_run(struct conntrack *); > +void conntrack_destroy(struct conntrack *); > + > +int conntrack_execute(struct conntrack *, struct dp_packet **, size_t, > + bool commit, uint16_t zone, const uint32_t *setmark, > + const struct ovs_key_ct_labels *setlabel, > + const char *helper); > + > +/* struct ct_lock is a standard mutex or a spinlock when using DPDK */ > + > +#ifdef DPDK_NETDEV > +struct OVS_LOCKABLE ct_lock { > + rte_spinlock_t lock; > +}; > + > +static inline void ct_lock_init(struct ct_lock *lock) > +{ > + rte_spinlock_init(&lock->lock); > +} > + > +static inline void ct_lock_lock(struct ct_lock *lock) > + OVS_ACQUIRES(lock) > + OVS_NO_THREAD_SAFETY_ANALYSIS > +{ > + rte_spinlock_lock(&lock->lock); > +} > + > +static inline void ct_lock_unlock(struct ct_lock *lock) > + OVS_RELEASES(lock) > + OVS_NO_THREAD_SAFETY_ANALYSIS > +{ > + rte_spinlock_unlock(&lock->lock); > +} > + > +static inline void ct_lock_destroy(struct ct_lock *lock OVS_UNUSED) > +{ > +} > +#else > +struct OVS_LOCKABLE ct_lock { > + struct ovs_mutex lock; > +}; > + > +static inline void ct_lock_init(struct ct_lock *lock) > +{ > + ovs_mutex_init(&lock->lock); > +} > + > +static inline void ct_lock_lock(struct ct_lock *lock) > + OVS_ACQUIRES(lock) > + OVS_NO_THREAD_SAFETY_ANALYSIS > +{ > + ovs_mutex_lock(&lock->lock); > +} > + > +static inline void ct_lock_unlock(struct ct_lock *lock) > + OVS_RELEASES(lock) > + OVS_NO_THREAD_SAFETY_ANALYSIS > +{ > + ovs_mutex_unlock(&lock->lock); > +} > + > +static inline void ct_lock_destroy(struct ct_lock *lock) > +{ > + ovs_mutex_destroy(&lock->lock); > +} > +#endif > + > +#define CONNTRACK_BUCKETS_SHIFT 8 > +#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT) > + > +struct conntrack { > + /* Each lock guards a 'connections' bucket */ > + struct ct_lock locks[CONNTRACK_BUCKETS]; > + struct hmap connections[CONNTRACK_BUCKETS] OVS_GUARDED; > + uint32_t hash_basis; 
> + unsigned purge_bucket; > + uint32_t purge_inner_bucket; > + uint32_t purge_inner_offset; > +}; > + > +#endif /* conntrack.h */ > -- > 2.1.4 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > http://openvswitch.org/mailman/listinfo/dev _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev