Hi Daniele, just a comment below. Apart from that, it looks good to me, thanks.
Acked-by: Antonio Fischetti <antonio.fische...@intel.com> > -----Original Message----- > From: dev [mailto:dev-boun...@openvswitch.org] On Behalf Of Daniele > Di Proietto > Sent: Tuesday, May 17, 2016 1:56 AM > To: dev@openvswitch.org > Subject: [ovs-dev] [PATCH v3 04/16] conntrack: New userspace > connection tracker. > > This commit adds the conntrack module. > > It is a connection tracker that resides entirely in userspace. Its > primary user will be the dpif-netdev datapath. > > The module main goal is to provide conntrack_execute(), which offers > a > convenient interface to implement the datapath ct() action. > > The conntrack module uses two submodules to deal with the l4 protocol > details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP). > > The conntrack-tcp submodule implementation is adapted from FreeBSD's > pf > subsystem, therefore it's BSD licensed. It has been slightly altered > to > match the OVS coding style and to allow the pickup of already > established connections. > > Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com> > --- > COPYING | 1 + > debian/copyright.in | 4 + > include/openvswitch/types.h | 4 + > lib/automake.mk | 5 + > lib/conntrack-other.c | 85 +++++ > lib/conntrack-private.h | 88 +++++ > lib/conntrack-tcp.c | 463 +++++++++++++++++++++++ > lib/conntrack.c | 883 > ++++++++++++++++++++++++++++++++++++++++++++ > lib/conntrack.h | 151 ++++++++ > lib/util.h | 9 + > 10 files changed, 1693 insertions(+) > create mode 100644 lib/conntrack-other.c > create mode 100644 lib/conntrack-private.h > create mode 100644 lib/conntrack-tcp.c > create mode 100644 lib/conntrack.c > create mode 100644 lib/conntrack.h > > diff --git a/COPYING b/COPYING > index 308e3ea..afb98b9 100644 > --- a/COPYING > +++ b/COPYING > @@ -25,6 +25,7 @@ License, version 2. > The following files are licensed under the 2-clause BSD license. 
> include/windows/getopt.h > lib/getopt_long.c > + lib/conntrack-tcp.c > > The following files are licensed under the 3-clause BSD-license > include/windows/netinet/icmp6.h > diff --git a/debian/copyright.in b/debian/copyright.in > index 57d007a..a15f4dd 100644 > --- a/debian/copyright.in > +++ b/debian/copyright.in > @@ -21,6 +21,9 @@ Upstream Copyright Holders: > Copyright (c) 2014 Michael Chapman > Copyright (c) 2014 WindRiver, Inc. > Copyright (c) 2014 Avaya, Inc. > + Copyright (c) 2001 Daniel Hartmeier > + Copyright (c) 2002 - 2008 Henning Brauer > + Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org> > > License: > > @@ -90,6 +93,7 @@ License: > lib/getopt_long.c > include/windows/getopt.h > datapath-windows/ovsext/Conntrack-tcp.c > + lib/conntrack-tcp.c > > * The following files are licensed under the 3-clause BSD-license > > diff --git a/include/openvswitch/types.h > b/include/openvswitch/types.h > index 5f3347d..d7e94a6 100644 > --- a/include/openvswitch/types.h > +++ b/include/openvswitch/types.h > @@ -107,6 +107,10 @@ static const ovs_u128 OVS_U128_MAX = { { > UINT32_MAX, UINT32_MAX, > UINT32_MAX, UINT32_MAX } }; > static const ovs_be128 OVS_BE128_MAX OVS_UNUSED = { { OVS_BE32_MAX, > OVS_BE32_MAX, > OVS_BE32_MAX, > OVS_BE32_MAX } }; > +static const ovs_u128 OVS_U128_MIN OVS_UNUSED = { {0, 0, 0, 0} }; > +static const ovs_u128 OVS_BE128_MIN OVS_UNUSED = { {0, 0, 0, 0} }; > + > +#define OVS_U128_ZERO OVS_U128_MIN > > /* A 64-bit value, in network byte order, that is only aligned on a > 32-bit > * boundary. 
*/ > diff --git a/lib/automake.mk b/lib/automake.mk > index affbb5c..df8b07d 100644 > --- a/lib/automake.mk > +++ b/lib/automake.mk > @@ -47,6 +47,11 @@ lib_libopenvswitch_la_SOURCES = \ > lib/compiler.h \ > lib/connectivity.c \ > lib/connectivity.h \ > + lib/conntrack-private.h \ > + lib/conntrack-tcp.c \ > + lib/conntrack-other.c \ > + lib/conntrack.c \ > + lib/conntrack.h \ > lib/coverage.c \ > lib/coverage.h \ > lib/crc32c.c \ > diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c > new file mode 100644 > index 0000000..295cb2c > --- /dev/null > +++ b/lib/conntrack-other.c > @@ -0,0 +1,85 @@ > +/* > + * Copyright (c) 2015, 2016 Nicira, Inc. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, > software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions > and > + * limitations under the License. 
> + */ > + > +#include <config.h> > + > +#include "conntrack-private.h" > +#include "dp-packet.h" > + > +enum other_state { > + OTHERS_FIRST, > + OTHERS_MULTIPLE, > + OTHERS_BIDIR, > +}; > + > +struct conn_other { > + struct conn up; > + enum other_state state; > +}; > + > +static const enum ct_timeout other_timeouts[] = { > + [OTHERS_FIRST] = CT_TM_OTHER_FIRST, > + [OTHERS_MULTIPLE] = CT_TM_OTHER_MULTIPLE, > + [OTHERS_BIDIR] = CT_TM_OTHER_BIDIR, > +}; > + > +static struct conn_other * > +conn_other_cast(const struct conn *conn) > +{ > + return CONTAINER_OF(conn, struct conn_other, up); > +} > + > +static enum ct_update_res > +other_conn_update(struct conn *conn_, struct dp_packet *pkt > OVS_UNUSED, > + bool reply, long long now) > +{ > + struct conn_other *conn = conn_other_cast(conn_); > + > + if (reply && conn->state != OTHERS_BIDIR) { > + conn->state = OTHERS_BIDIR; > + } else if (conn->state == OTHERS_FIRST) { > + conn->state = OTHERS_MULTIPLE; > + } > + > + update_expiration(conn_, other_timeouts[conn->state], now); > + > + return CT_UPDATE_VALID; > +} > + > +static bool > +other_valid_new(struct dp_packet *pkt OVS_UNUSED) > +{ > + return true; > +} > + > +static struct conn * > +other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now) > +{ > + struct conn_other *conn; > + > + conn = xzalloc(sizeof *conn); > + conn->state = OTHERS_FIRST; > + > + update_expiration(&conn->up, other_timeouts[conn->state], now); > + > + return &conn->up; > +} > + > +struct ct_l4_proto ct_proto_other = { > + .new_conn = other_new_conn, > + .valid_new = other_valid_new, > + .conn_update = other_conn_update, > +}; > diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h > new file mode 100644 > index 0000000..d3e0099 > --- /dev/null > +++ b/lib/conntrack-private.h > @@ -0,0 +1,88 @@ > +/* > + * Copyright (c) 2015, 2016 Nicira, Inc. 
> + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, > software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions > and > + * limitations under the License. > + */ > + > +#ifndef CONNTRACK_PRIVATE_H > +#define CONNTRACK_PRIVATE_H 1 > + > +#include <sys/types.h> > +#include <netinet/in.h> > +#include <netinet/ip6.h> > + > +#include "conntrack.h" > +#include "hmap.h" > +#include "openvswitch/list.h" > +#include "openvswitch/types.h" > +#include "packets.h" > +#include "unaligned.h" > + > +struct ct_addr { > + union { > + ovs_16aligned_be32 ipv4; > + union ovs_16aligned_in6_addr ipv6; > + ovs_be32 ipv4_aligned; > + struct in6_addr ipv6_aligned; > + }; > +}; > + > +struct ct_endpoint { > + struct ct_addr addr; > + ovs_be16 port; > +}; > + > +struct conn_key { > + struct ct_endpoint src; > + struct ct_endpoint dst; > + > + ovs_be16 dl_type; > + uint8_t nw_proto; > + uint16_t zone; > +}; > + > +struct conn { > + struct conn_key key; > + struct conn_key rev_key; > + long long expiration; > + struct ovs_list exp_node; > + struct hmap_node node; > + uint32_t mark; > + ovs_u128 label; > +}; > + > +enum ct_update_res { > + CT_UPDATE_INVALID, > + CT_UPDATE_VALID, > + CT_UPDATE_NEW, > +}; > + > +struct ct_l4_proto { > + struct conn *(*new_conn)(struct dp_packet *pkt, long long now); > + bool (*valid_new)(struct dp_packet *pkt); > + enum ct_update_res (*conn_update)(struct conn *conn, struct > dp_packet *pkt, > + bool reply, long long now); > +}; > + > +extern struct ct_l4_proto ct_proto_tcp; > +extern struct ct_l4_proto ct_proto_other; > + > 
+extern long long ct_timeout_val[]; > + > +static inline void > +update_expiration(struct conn *conn, enum ct_timeout tm, long long > now) > +{ > + conn->expiration = now + ct_timeout_val[tm]; > +} > + > +#endif /* conntrack-private.h */ > diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c > new file mode 100644 > index 0000000..4d3d9c3 > --- /dev/null > +++ b/lib/conntrack-tcp.c > @@ -0,0 +1,463 @@ > +/*- > + * Copyright (c) 2001 Daniel Hartmeier > + * Copyright (c) 2002 - 2008 Henning Brauer > + * Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org> > + * Copyright (c) 2015, 2016 Nicira, Inc. > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or > without > + * modification, are permitted provided that the following > conditions > + * are met: > + * > + * - Redistributions of source code must retain the above > copyright > + * notice, this list of conditions and the following > disclaimer. > + * - Redistributions in binary form must reproduce the above > + * copyright notice, this list of conditions and the following > + * disclaimer in the documentation and/or other materials > provided > + * with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND > CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS > + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE > + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, > INDIRECT, > + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES > (INCLUDING, > + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; > + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER > + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, > STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN > + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE > + * POSSIBILITY OF SUCH DAMAGE. > + * > + * Effort sponsored in part by the Defense Advanced Research > Projects > + * Agency (DARPA) and Air Force Research Laboratory, Air Force > + * Materiel Command, USAF, under agreement number F30602-01-2-0537. > + * > + * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ > + */ > + > +#include <config.h> > + > +#include "conntrack-private.h" > +#include "ct-dpif.h" > +#include "dp-packet.h" > +#include "util.h" > + > +struct tcp_peer { > + enum ct_dpif_tcp_state state; > + uint32_t seqlo; /* Max sequence number > sent */ > + uint32_t seqhi; /* Max the other end ACKd > + win */ > + uint16_t max_win; /* largest window (pre > scaling) */ > + uint8_t wscale; /* window scaling factor > */ > +}; > + > +struct conn_tcp { > + struct conn up; > + struct tcp_peer peer[2]; > +}; > + > +enum { > + TCPOPT_EOL, > + TCPOPT_NOP, > + TCPOPT_WINDOW = 3, > +}; > + > +/* TCP sequence numbers are 32 bit integers operated > + * on with modular arithmetic. These macros can be > + * used to compare such integers. 
*/ > +#define SEQ_LT(a,b) INT_MOD_LT(a, b) > +#define SEQ_LEQ(a,b) INT_MOD_LEQ(a, b) > +#define SEQ_GT(a,b) INT_MOD_GT(a, b) > +#define SEQ_GEQ(a,b) INT_MOD_GEQ(a, b) > + > +#define SEQ_MIN(a, b) INT_MOD_MIN(a, b) > +#define SEQ_MAX(a, b) INT_MOD_MAX(a, b) > + > +static struct conn_tcp* > +conn_tcp_cast(const struct conn* conn) > +{ > + return CONTAINER_OF(conn, struct conn_tcp, up); > +} > + > +/* pf does this in in pf_normalize_tcp(), and it is called only if > scrub > + * is enabled. We're not scrubbing, but this check seems > reasonable. */ > +static bool > +tcp_invalid_flags(uint16_t flags) > +{ > + > + if (flags & TCP_SYN) { > + if (flags & TCP_RST || flags & TCP_FIN) { > + return true; > + } > + } else { > + /* Illegal packet */ > + if (!(flags & (TCP_ACK|TCP_RST))) { > + return true; > + } > + } > + > + if (!(flags & TCP_ACK)) { > + /* These flags are only valid if ACK is set */ > + if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & > TCP_URG)) { > + return true; > + } > + } > + > + return false; > +} > + > +#define TCP_MAX_WSCALE 14 > +#define CT_WSCALE_FLAG 0x80 > +#define CT_WSCALE_UNKNOWN 0x40 > +#define CT_WSCALE_MASK 0xf > + > +static uint8_t > +tcp_get_wscale(const struct tcp_header *tcp) > +{ > + int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp; > + const uint8_t *opt = (const uint8_t *)(tcp + 1); > + uint8_t wscale = 0; > + uint8_t optlen; > + > + while (len >= 3) { > + switch (*opt) { > + case TCPOPT_EOL: > + return wscale; > + case TCPOPT_NOP: > + opt++; > + len--; > + break; > + case TCPOPT_WINDOW: > + wscale = MIN(opt[2], TCP_MAX_WSCALE); > + wscale |= CT_WSCALE_FLAG; > + /* fall through */ > + default: > + optlen = opt[1]; > + if (optlen < 2) { > + optlen = 2; > + } > + len -= optlen; > + opt += optlen; > + } > + } > + > + return wscale; > +} > + > +static uint32_t > +tcp_payload_length(struct dp_packet *pkt) > +{ > + return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt) > + - (char *) dp_packet_get_tcp_payload(pkt); > +} > 
+ > +static enum ct_update_res > +tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool > reply, > + long long now) > +{ > + struct conn_tcp *conn = conn_tcp_cast(conn_); > + struct tcp_header *tcp = dp_packet_l4(pkt); > + /* The peer that sent 'pkt' */ > + struct tcp_peer *src = &conn->peer[reply ? 1 : 0]; > + /* The peer that should receive 'pkt' */ > + struct tcp_peer *dst = &conn->peer[reply ? 0 : 1]; > + uint8_t sws = 0, dws = 0; > + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); > + > + uint16_t win = ntohs(tcp->tcp_winsz); > + uint32_t ack, end, seq, orig_seq; > + uint32_t p_len = tcp_payload_length(pkt); > + int ackskew; > + > + if (tcp_invalid_flags(tcp_flags)) { > + return CT_UPDATE_INVALID; > + } > + > + if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) && > + dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 && > + src->state >= CT_DPIF_TCPS_FIN_WAIT_2) { > + src->state = dst->state = CT_DPIF_TCPS_CLOSED; > + return CT_UPDATE_NEW; > + } > + > + if (src->wscale & CT_WSCALE_FLAG > + && dst->wscale & CT_WSCALE_FLAG > + && !(tcp_flags & TCP_SYN)) { > + > + sws = src->wscale & CT_WSCALE_MASK; > + dws = dst->wscale & CT_WSCALE_MASK; > + > + } else if (src->wscale & CT_WSCALE_UNKNOWN > + && dst->wscale & CT_WSCALE_UNKNOWN > + && !(tcp_flags & TCP_SYN)) { > + > + sws = TCP_MAX_WSCALE; > + dws = TCP_MAX_WSCALE; > + } > + > + /* > + * Sequence tracking algorithm from Guido van Rooij's paper: > + * http://www.madison-gurkha.com/publications/tcp_filtering/ > + * tcp_filtering.ps > + */ > + > + orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq)); > + if (src->state < CT_DPIF_TCPS_SYN_SENT) { > + /* First packet from this end. 
Set its state */ > + > + ack = ntohl(get_16aligned_be32(&tcp->tcp_ack)); > + > + end = seq + p_len; > + if (tcp_flags & TCP_SYN) { > + end++; > + if (dst->wscale & CT_WSCALE_FLAG) { > + src->wscale = tcp_get_wscale(tcp); > + if (src->wscale & CT_WSCALE_FLAG) { > + /* Remove scale factor from initial window */ > + sws = src->wscale & CT_WSCALE_MASK; > + win = DIV_ROUND_UP((uint32_t) win, 1 << sws); > + dws = dst->wscale & CT_WSCALE_MASK; > + } else { > + /* fixup other window */ > + dst->max_win <<= dst->wscale & > + CT_WSCALE_MASK; > + /* in case of a retrans SYN|ACK */ > + dst->wscale = 0; > + } > + } > + } > + if (tcp_flags & TCP_FIN) { > + end++; > + } > + > + src->seqlo = seq; > + src->state = CT_DPIF_TCPS_SYN_SENT; > + /* > + * May need to slide the window (seqhi may have been set by > + * the crappy stack check or if we picked up the connection > + * after establishment) > + */ > + if (src->seqhi == 1 || > + SEQ_GEQ(end + MAX(1, dst->max_win << dws), src- > >seqhi)) { > + src->seqhi = end + MAX(1, dst->max_win << dws); > + } > + if (win > src->max_win) { > + src->max_win = win; > + } > + > + } else { > + ack = ntohl(get_16aligned_be32(&tcp->tcp_ack)); > + end = seq + p_len; > + if (tcp_flags & TCP_SYN) { > + end++; > + } > + if (tcp_flags & TCP_FIN) { > + end++; > + } > + } > + > + if ((tcp_flags & TCP_ACK) == 0) { > + /* Let it pass through the ack skew check */ > + ack = dst->seqlo; > + } else if ((ack == 0 > + && (tcp_flags & (TCP_ACK|TCP_RST)) == > (TCP_ACK|TCP_RST)) > + /* broken tcp stacks do not set ack */) { > + /* Many stacks (ours included) will set the ACK number in an > + * FIN|ACK if the SYN times out -- no sequence to ACK. 
*/ > + ack = dst->seqlo; > + } > + > + if (seq == end) { > + /* Ease sequencing restrictions on no data packets */ > + seq = src->seqlo; > + end = seq; > + } > + > + ackskew = dst->seqlo - ack; > +#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary > fudge factor */ > + if (SEQ_GEQ(src->seqhi, end) > + /* Last octet inside other's window space */ > + && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) > + /* Retrans: not more than one window back */ > + && (ackskew >= -MAXACKWINDOW) > + /* Acking not more than one reassembled fragment backwards > */ > + && (ackskew <= (MAXACKWINDOW << sws)) > + /* Acking not more than one window forward */ > + && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo > + || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src- > >seqlo))) { > + /* Require an exact/+1 sequence match on resets when > possible */ > + > + /* update max window */ > + if (src->max_win < win) { > + src->max_win = win; > + } > + /* synchronize sequencing */ > + if (SEQ_GT(end, src->seqlo)) { > + src->seqlo = end; > + } > + /* slide the window of what the other end can send */ > + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { > + dst->seqhi = ack + MAX((win << sws), 1); > + } > + > + /* update states */ > + if (tcp_flags & TCP_SYN && src->state < > CT_DPIF_TCPS_SYN_SENT) { > + src->state = CT_DPIF_TCPS_SYN_SENT; > + } > + if (tcp_flags & TCP_FIN && src->state < > CT_DPIF_TCPS_CLOSING) { > + src->state = CT_DPIF_TCPS_CLOSING; > + } > + if (tcp_flags & TCP_ACK) { > + if (dst->state == CT_DPIF_TCPS_SYN_SENT) { > + dst->state = CT_DPIF_TCPS_ESTABLISHED; > + } else if (dst->state == CT_DPIF_TCPS_CLOSING) { > + dst->state = CT_DPIF_TCPS_FIN_WAIT_2; > + } > + } > + if (tcp_flags & TCP_RST) { > + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; > + } > + > + if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2 > + && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) { > + update_expiration(conn_, CT_TM_TCP_CLOSED, now); > + } else if (src->state >= CT_DPIF_TCPS_CLOSING > + && 
dst->state >= CT_DPIF_TCPS_CLOSING) { > + update_expiration(conn_, CT_TM_TCP_FIN_WAIT, now); > + } else if (src->state < CT_DPIF_TCPS_ESTABLISHED > + || dst->state < CT_DPIF_TCPS_ESTABLISHED) { > + update_expiration(conn_, now, CT_TM_TCP_OPENING); > + } else if (src->state >= CT_DPIF_TCPS_CLOSING > + || dst->state >= CT_DPIF_TCPS_CLOSING) { > + update_expiration(conn_, now, CT_TM_TCP_CLOSING); > + } else { > + update_expiration(conn_, now, CT_TM_TCP_ESTABLISHED); > + } > + } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT > + || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 > + || src->state >= CT_DPIF_TCPS_FIN_WAIT_2) > + && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) > + /* Within a window forward of the originating packet > */ > + && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { > + /* Within a window backward of the originating packet > */ > + > + /* > + * This currently handles three situations: > + * 1) Stupid stacks will shotgun SYNs before their peer > + * replies. > + * 2) When PF catches an already established stream (the > + * firewall rebooted, the state table was flushed, > routes > + * changed...) > + * 3) Packets get funky immediately after the connection > + * closes (this should catch Solaris spurious ACK|FINs > + * that web servers like to spew after a close) > + * > + * This must be a little more careful than the above code > + * since packet floods will also be caught here. We don't > + * update the TTL here to mitigate the damage of a packet > + * flood and so the same code can handle awkward > establishment > + * and a loosened connection close. > + * In the establishment case, a correct peer response will > + * validate the connection, go through the normal state code > + * and keep updating the state TTL. 
> + */ > + > + /* update max window */ > + if (src->max_win < win) { > + src->max_win = win; > + } > + /* synchronize sequencing */ > + if (SEQ_GT(end, src->seqlo)) { > + src->seqlo = end; > + } > + /* slide the window of what the other end can send */ > + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { > + dst->seqhi = ack + MAX((win << sws), 1); > + } > + > + /* > + * Cannot set dst->seqhi here since this could be a > shotgunned > + * SYN and not an already established connection. > + */ > + > + if (tcp_flags & TCP_FIN && src->state < > CT_DPIF_TCPS_CLOSING) { > + src->state = CT_DPIF_TCPS_CLOSING; > + } > + > + if (tcp_flags & TCP_RST) { > + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; > + } > + } else { > + return CT_UPDATE_INVALID; > + } > + > + return CT_UPDATE_VALID; > +} > + > +static bool > +tcp_valid_new(struct dp_packet *pkt) > +{ > + struct tcp_header *tcp = dp_packet_l4(pkt); > + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); > + > + if (tcp_invalid_flags(tcp_flags)) { > + return false; > + } > + > + /* A syn+ack is not allowed to create a connection. We want to > allow > + * totally new connections (syn) or already established, not > partially > + * open (syn+ack). 
*/ > + if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) { > + return false; > + } > + > + return true; > +} > + > +static struct conn * > +tcp_new_conn(struct dp_packet *pkt, long long now) > +{ > + struct conn_tcp* newconn = NULL; > + struct tcp_header *tcp = dp_packet_l4(pkt); > + struct tcp_peer *src, *dst; > + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); > + > + newconn = xzalloc(sizeof *newconn); > + > + src = &newconn->peer[0]; > + dst = &newconn->peer[1]; > + > + src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq)); > + src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1; > + > + if (tcp_flags & TCP_SYN) { > + src->seqhi++; > + src->wscale = tcp_get_wscale(tcp); > + } else { > + src->wscale = CT_WSCALE_UNKNOWN; > + dst->wscale = CT_WSCALE_UNKNOWN; > + } > + src->max_win = MAX(ntohs(tcp->tcp_winsz), 1); > + if (src->wscale & CT_WSCALE_MASK) { > + /* Remove scale factor from initial window */ > + uint8_t sws = src->wscale & CT_WSCALE_MASK; > + src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << > sws); > + } > + if (tcp_flags & TCP_FIN) { > + src->seqhi++; > + } > + dst->seqhi = 1; > + dst->max_win = 1; > + src->state = CT_DPIF_TCPS_SYN_SENT; > + dst->state = CT_DPIF_TCPS_CLOSED; > + > + update_expiration(&newconn->up, now, CT_TM_TCP_FIRST_PACKET); > + > + return &newconn->up; > +} > + > +struct ct_l4_proto ct_proto_tcp = { > + .new_conn = tcp_new_conn, > + .valid_new = tcp_valid_new, > + .conn_update = tcp_conn_update, > +}; > diff --git a/lib/conntrack.c b/lib/conntrack.c > new file mode 100644 > index 0000000..e282485 > --- /dev/null > +++ b/lib/conntrack.c > @@ -0,0 +1,883 @@ > +/* > + * Copyright (c) 2015, 2016 Nicira, Inc. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. 
> + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, > software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions > and > + * limitations under the License. > + */ > + > +#include <config.h> > +#include "conntrack.h" > + > +#include <errno.h> > +#include <sys/types.h> > +#include <netinet/in.h> > +#include <netinet/icmp6.h> > + > +#include "bitmap.h" > +#include "conntrack-private.h" > +#include "coverage.h" > +#include "csum.h" > +#include "dp-packet.h" > +#include "flow.h" > +#include "hmap.h" > +#include "netdev.h" > +#include "odp-netlink.h" > +#include "openvswitch/vlog.h" > +#include "ovs-rcu.h" > +#include "random.h" > +#include "timeval.h" > + > +VLOG_DEFINE_THIS_MODULE(conntrack); > + > +COVERAGE_DEFINE(conntrack_new_full); > + > +struct conn_lookup_ctx { > + struct conn_key key; > + struct conn *conn; > + uint32_t hash; > + bool reply; > + bool related; > +}; > + > +static bool conn_key_extract(struct conntrack *, struct dp_packet *, > + struct conn_lookup_ctx *, uint16_t > zone); > +static uint32_t conn_key_hash(const struct conn_key *, uint32_t > basis); > +static void conn_key_reverse(struct conn_key *); > +static void conn_key_lookup(struct conntrack_bucket *ctb, > + struct conn_lookup_ctx *ctx, > + long long now); > +static bool valid_new(struct dp_packet *pkt, struct conn_key *); > +static struct conn *new_conn(struct dp_packet *pkt, struct conn_key > *, > + long long now); > +static void delete_conn(struct conn *); > +static enum ct_update_res conn_update(struct conn *, struct > dp_packet*, > + bool reply, long long now); > +static bool conn_expired(struct conn *, long long now); > +static void set_mark(struct dp_packet *, struct conn *, > + uint32_t val, 
uint32_t mask); > +static void set_label(struct dp_packet *, struct conn *, > + const struct ovs_key_ct_labels *val, > + const struct ovs_key_ct_labels *mask); > + > +static struct ct_l4_proto *l4_protos[] = { > + [IPPROTO_TCP] = &ct_proto_tcp, > + [IPPROTO_UDP] = &ct_proto_other, > + [IPPROTO_ICMP] = &ct_proto_other, > + [IPPROTO_ICMPV6] = &ct_proto_other, > +}; > + > +long long ct_timeout_val[] = { > +#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL, > + CT_TIMEOUTS > +#undef CT_TIMEOUT > +}; > + > +/* If the total number of connections goes above this value, no new > connections > + * are accepted */ > +#define DEFAULT_N_CONN_LIMIT 3000000 > + > +/* Initializes the connection tracker 'ct'. The caller is > responbile for > + * calling 'conntrack_destroy()', when the instance is not needed > anymore */ > +void > +conntrack_init(struct conntrack *ct) > +{ > + unsigned i; > + > + for (i = 0; i < CONNTRACK_BUCKETS; i++) { > + struct conntrack_bucket *ctb = &ct->buckets[i]; > + > + ct_lock_init(&ctb->lock); > + ct_lock_lock(&ctb->lock); > + hmap_init(&ctb->connections); > + ct_lock_unlock(&ctb->lock); > + } > + ct->hash_basis = random_uint32(); > + atomic_count_init(&ct->n_conn, 0); > + atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT); > +} > + > +/* Destroys the connection tracker 'ct' and frees all the allocated > memory. */ > +void > +conntrack_destroy(struct conntrack *ct) > +{ > + unsigned i; > + > + for (i = 0; i < CONNTRACK_BUCKETS; i++) { > + struct conntrack_bucket *ctb = &ct->buckets[i]; > + struct conn *conn; > + > + ct_lock_lock(&ctb->lock); > + HMAP_FOR_EACH_POP(conn, node, &ctb->connections) { > + atomic_count_dec(&ct->n_conn); > + delete_conn(conn); > + } > + hmap_destroy(&ctb->connections); > + ct_lock_unlock(&ctb->lock); > + ct_lock_destroy(&ctb->lock); > + } > +} > + > +static unsigned hash_to_bucket(uint32_t hash) > +{ > + /* Extracts the most significant bits in hash. 
The least > significant bits > + * are already used internally by the hmap implementation. */ > + BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && > CONNTRACK_BUCKETS_SHIFT >= 1); > + > + return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % > CONNTRACK_BUCKETS; > +} > + > +static void > +write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone, > + uint32_t mark, ovs_u128 label) > +{ > + pkt->md.ct_state = state | CS_TRACKED; > + pkt->md.ct_zone = zone; > + pkt->md.ct_mark = mark; > + pkt->md.ct_label = label; > +} > + > +static struct conn * > +conn_not_found(struct conntrack *ct, struct dp_packet *pkt, > + struct conn_lookup_ctx *ctx, uint16_t *state, bool > commit, > + long long now) > +{ > + unsigned bucket = hash_to_bucket(ctx->hash); > + struct conn *nc = NULL; > + > + if (!valid_new(pkt, &ctx->key)) { > + *state |= CS_INVALID; > + return nc; > + } > + > + *state |= CS_NEW; > + > + if (commit) { > + unsigned int n_conn_limit; > + > + atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit); > + > + if (atomic_count_get(&ct->n_conn) >= n_conn_limit) { > + COVERAGE_INC(conntrack_new_full); > + return nc; > + } > + > + nc = new_conn(pkt, &ctx->key, now); > + > + memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key); > + > + conn_key_reverse(&nc->rev_key); > + hmap_insert(&ct->buckets[bucket].connections, &nc->node, > ctx->hash); > + atomic_count_inc(&ct->n_conn); > + } > + > + return nc; > +} > + > +static struct conn * > +process_one(struct conntrack *ct, struct dp_packet *pkt, > + struct conn_lookup_ctx *ctx, uint16_t zone, > + bool commit, long long now) > +{ > + unsigned bucket = hash_to_bucket(ctx->hash); > + struct conn *conn = ctx->conn; > + uint16_t state = 0; > + > + if (conn) { > + if (ctx->related) { > + state |= CS_RELATED; > + if (ctx->reply) { > + state |= CS_REPLY_DIR; > + } > + } else { > + enum ct_update_res res; > + > + res = conn_update(conn, pkt, ctx->reply, now); > + > + switch (res) { > + case CT_UPDATE_VALID: > + state |= CS_ESTABLISHED; > + 
if (ctx->reply) { > + state |= CS_REPLY_DIR; > + } > + break; > + case CT_UPDATE_INVALID: > + state |= CS_INVALID; > + break; > + case CT_UPDATE_NEW: > + hmap_remove(&ct->buckets[bucket].connections, &conn- > >node); > + atomic_count_dec(&ct->n_conn); > + delete_conn(conn); > + conn = conn_not_found(ct, pkt, ctx, &state, commit, > now); > + break; > + } [Antonio F] Sorry to repeat, but I'd prefer to add the 'default' case here. I mean something like: default: state |= CS_INVALID; break; I know that if we add new items to enum ct_update_res we can get a warning from the compiler, but I wouldn't rely on that. > + } > + } else { > + conn = conn_not_found(ct, pkt, ctx, &state, commit, now); > + } > + > + write_ct_md(pkt, state, zone, conn ? conn->mark : 0, > + conn ? conn->label : OVS_U128_ZERO); > + > + return conn; > +} > + > +/* Sends a group of 'cnt' packets ('pkts') through the connection > tracker > + * 'ct'. If 'commit' is true, the packets are allowed to create new > entries > + * in the connection tables. 'setmark', if not NULL, should point > to a two > + * elements array containing a value and a mask to set the > connection mark. 
> + * 'setlabel' behaves similarly for the connection label.*/ > +int > +conntrack_execute(struct conntrack *ct, struct dp_packet **pkts, > size_t cnt, > + bool commit, uint16_t zone, const uint32_t > *setmark, > + const struct ovs_key_ct_labels *setlabel, > + const char *helper) > +{ > +#if !defined(__CHECKER__) && !defined(_WIN32) > + const size_t KEY_ARRAY_SIZE = cnt; > +#else > + enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST }; > +#endif > + struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE]; > + int8_t bucket_list[CONNTRACK_BUCKETS]; > + struct { > + unsigned bucket; > + unsigned long maps; > + } arr[KEY_ARRAY_SIZE]; > + long long now = time_msec(); > + size_t i = 0; > + uint8_t arrcnt = 0; > + > + BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= > NETDEV_MAX_BURST); > + > + if (helper) { > + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, > 5); > + > + VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", > helper); > + /* Continue without the helper */ > + } > + > + memset(bucket_list, INT8_C(-1), sizeof bucket_list); > + for (i = 0; i < cnt; i++) { > + unsigned bucket; > + > + if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) { > + write_ct_md(pkts[i], CS_INVALID, zone, 0, > OVS_U128_ZERO); > + continue; > + } > + > + bucket = hash_to_bucket(ctxs[i].hash); > + if (bucket_list[bucket] == INT8_C(-1)) { > + bucket_list[bucket] = arrcnt; > + > + arr[arrcnt].maps = 0; > + ULLONG_SET1(arr[arrcnt].maps, i); > + arr[arrcnt++].bucket = bucket; > + } else { > + ULLONG_SET1(arr[bucket_list[bucket]].maps, i); > + arr[bucket_list[bucket]].maps |= 1UL << i; > + } > + } > + > + for (i = 0; i < arrcnt; i++) { > + struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket]; > + size_t j; > + > + ct_lock_lock(&ctb->lock); > + > + ULLONG_FOR_EACH_1(j, arr[i].maps) { > + struct conn *conn; > + > + conn_key_lookup(ctb, &ctxs[j], now); > + > + conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, > now); > + > + if (conn && setmark) { > + set_mark(pkts[j], conn, setmark[0], 
setmark[1]); > + } > + > + if (conn && setlabel) { > + set_label(pkts[j], conn, &setlabel[0], > &setlabel[1]); > + } > + } > + ct_lock_unlock(&ctb->lock); > + } > + > + return 0; > +} > + > +static void > +set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, > uint32_t mask) > +{ > + pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask)); > + conn->mark = pkt->md.ct_mark; > +} > + > +static void > +set_label(struct dp_packet *pkt, struct conn *conn, > + const struct ovs_key_ct_labels *val, > + const struct ovs_key_ct_labels *mask) > +{ > + ovs_u128 v, m; > + > + memcpy(&v, val, sizeof v); > + memcpy(&m, mask, sizeof m); > + > + pkt->md.ct_label.u64.lo = v.u64.lo > + | (pkt->md.ct_label.u64.lo & > ~(m.u64.lo)); > + pkt->md.ct_label.u64.hi = v.u64.hi > + | (pkt->md.ct_label.u64.hi & > ~(m.u64.hi)); > + conn->label = pkt->md.ct_label; > +} > + > +/* Key extraction */ > + > +/* The function stores a pointer to the first byte after the header > in > + * '*new_data', if 'new_data' is not NULL. If it is NULL, the > caller is > + * not interested in the header's tail, meaning that the header has > + * already been parsed (e.g. by flow_extract): we take this as a > hint to > + * save a few checks. If 'validate_checksum' is true, the function > returns > + * false if the IPv4 checksum is invalid. 
*/ > +static inline bool > +extract_l3_ipv4(struct conn_key *key, const void *data, size_t size, > + const char **new_data, bool validate_checksum) > +{ > + const struct ip_header *ip = data; > + size_t ip_len; > + > + if (new_data) { > + if (OVS_UNLIKELY(size < IP_HEADER_LEN)) { > + return false; > + } > + } > + > + ip_len = IP_IHL(ip->ip_ihl_ver) * 4; > + > + if (new_data) { > + if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) { > + return false; > + } > + if (OVS_UNLIKELY(size < ip_len)) { > + return false; > + } > + > + *new_data = (char *) data + ip_len; > + } > + > + if (IP_IS_FRAGMENT(ip->ip_frag_off)) { > + return false; > + } > + > + if (validate_checksum && csum(data, ip_len) != 0) { > + return false; > + } > + > + key->src.addr.ipv4 = ip->ip_src; > + key->dst.addr.ipv4 = ip->ip_dst; > + key->nw_proto = ip->ip_proto; > + > + return true; > +} > + > +/* The function stores a pointer to the first byte after the header > in > + * '*new_data', if 'new_data' is not NULL. If it is NULL, the > caller is > + * not interested in the header's tail, meaning that the header has > + * already been parsed (e.g. by flow_extract): we take this as a > hint to > + * save a few checks. 
*/ > +static inline bool > +extract_l3_ipv6(struct conn_key *key, const void *data, size_t size, > + const char **new_data) > +{ > + const struct ovs_16aligned_ip6_hdr *ip6 = data; > + uint8_t nw_proto = ip6->ip6_nxt; > + uint8_t nw_frag = 0; > + > + if (new_data) { > + if (OVS_UNLIKELY(size < sizeof *ip6)) { > + return false; > + } > + } > + > + data = ip6 + 1; > + size -= sizeof *ip6; > + > + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) { > + return false; > + } > + > + if (new_data) { > + *new_data = data; > + } > + > + if (nw_frag) { > + return false; > + } > + > + key->src.addr.ipv6 = ip6->ip6_src; > + key->dst.addr.ipv6 = ip6->ip6_dst; > + key->nw_proto = nw_proto; > + > + return true; > +} > + > +static inline bool > +checksum_valid(const struct conn_key *key, const void *data, size_t > size, > + const void *l3) > +{ > + uint32_t csum = 0; > + > + if (key->dl_type == htons(ETH_TYPE_IP)) { > + csum = packet_csum_pseudoheader(l3); > + } else if (key->dl_type == htons(ETH_TYPE_IPV6)) { > + csum = packet_csum_pseudoheader6(l3); > + } else { > + return false; > + } > + > + csum = csum_continue(csum, data, size); > + > + return csum_finish(csum) == 0; > +} > + > +static inline bool > +check_l4_tcp(const struct conn_key *key, const void *data, size_t > size, > + const void *l3) > +{ > + const struct tcp_header *tcp = data; > + size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4; > + > + if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) { > + return false; > + } > + > + return checksum_valid(key, data, size, l3); > +} > + > +static inline bool > +check_l4_udp(const struct conn_key *key, const void *data, size_t > size, > + const void *l3) > +{ > + const struct udp_header *udp = data; > + size_t udp_len = ntohs(udp->udp_len); > + > + if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) { > + return false; > + } > + > + /* Validation must be skipped if checksum is 0 on IPv4 packets > */ > + return (udp->udp_csum == 0 && key->dl_type == > 
htons(ETH_TYPE_IP)) > + || checksum_valid(key, data, size, l3); > +} > + > +static inline bool > +check_l4_icmp(const void *data, size_t size) > +{ > + return csum(data, size) == 0; > +} > + > +static inline bool > +check_l4_icmp6(const struct conn_key *key, const void *data, size_t > size, > + const void *l3) > +{ > + return checksum_valid(key, data, size, l3); > +} > + > +static inline bool > +extract_l4_tcp(struct conn_key *key, const void *data, size_t size) > +{ > + const struct tcp_header *tcp = data; > + > + if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) { > + return false; > + } > + > + key->src.port = tcp->tcp_src; > + key->dst.port = tcp->tcp_dst; > + > + /* Port 0 is invalid */ > + return key->src.port && key->dst.port; > +} > + > +static inline bool > +extract_l4_udp(struct conn_key *key, const void *data, size_t size) > +{ > + const struct udp_header *udp = data; > + > + if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) { > + return false; > + } > + > + key->src.port = udp->udp_src; > + key->dst.port = udp->udp_dst; > + > + /* Port 0 is invalid */ > + return key->src.port && key->dst.port; > +} > + > +static inline bool extract_l4(struct conn_key *key, const void > *data, > + size_t size, bool *related, const void > *l3); > + > +/* If 'related' is not NULL and the function is processing an ICMP > + * error packet, extract the l3 and l4 fields from the nested header > + * instead and set *related to true. 
If 'related' is NULL we're > + * already processing a nested header and no such recursion is > + * possible */ > +static inline int > +extract_l4_icmp(struct conn_key *key, const void *data, size_t size, > + bool *related) > +{ > + const struct icmp_header *icmp = data; > + > + if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) { > + return false; > + } > + > + switch (icmp->icmp_type) { > + case ICMP4_ECHO_REQUEST: > + case ICMP4_ECHO_REPLY: > + case ICMP4_TIMESTAMP: > + case ICMP4_TIMESTAMPREPLY: > + case ICMP4_INFOREQUEST: > + case ICMP4_INFOREPLY: > + /* Separate ICMP connection: identified using id */ > + key->src.port = key->dst.port = icmp->icmp_fields.echo.id; > + break; > + case ICMP4_DST_UNREACH: > + case ICMP4_TIME_EXCEEDED: > + case ICMP4_PARAM_PROB: > + case ICMP4_SOURCEQUENCH: > + case ICMP4_REDIRECT: { > + /* ICMP packet part of another connection. We should > + * extract the key from embedded packet header */ > + struct conn_key inner_key; > + const char *l3 = (const char *) (icmp + 1); > + const char *tail = (const char *) data + size; > + const char *l4; > + bool ok; > + > + if (!related) { > + return false; > + } > + *related = true; > + > + memset(&inner_key, 0, sizeof inner_key); > + inner_key.dl_type = htons(ETH_TYPE_IP); > + ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false); > + if (!ok) { > + return false; > + } > + > + /* pf doesn't do this, but it seems a good idea */ > + if (inner_key.src.addr.ipv4_aligned != key- > >dst.addr.ipv4_aligned > + || inner_key.dst.addr.ipv4_aligned != key- > >src.addr.ipv4_aligned) { > + return false; > + } > + > + key->src = inner_key.src; > + key->dst = inner_key.dst; > + key->nw_proto = inner_key.nw_proto; > + > + ok = extract_l4(key, l4, tail - l4, NULL, l3); > + if (ok) { > + conn_key_reverse(key); > + } > + return ok; > + } > + default: > + return false; > + } > + > + return true; > +} > + > +/* If 'related' is not NULL and the function is processing an ICMP > + * error packet, extract the l3 and l4 
fields from the nested header > + * instead and set *related to true. If 'related' is NULL we're > + * already processing a nested header and no such recursion is > + * possible */ > +static inline bool > +extract_l4_icmp6(struct conn_key *key, const void *data, size_t > size, > + bool *related) > +{ > + const struct icmp6_header *icmp6 = data; > + > + /* All the messages that we support need at least 4 bytes after > + * the header */ > + if (size < sizeof *icmp6 + 4) { > + return false; > + } > + > + switch (icmp6->icmp6_type) { > + case ICMP6_ECHO_REQUEST: > + case ICMP6_ECHO_REPLY: > + /* Separate ICMP connection: identified using id */ > + key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1); > + break; > + case ICMP6_DST_UNREACH: > + case ICMP6_PACKET_TOO_BIG: > + case ICMP6_TIME_EXCEEDED: > + case ICMP6_PARAM_PROB: { > + /* ICMP packet part of another connection. We should > + * extract the key from embedded packet header */ > + struct conn_key inner_key; > + const char *l3 = (const char *) icmp6 + 8; > + const char *tail = (const char *) data + size; > + const char *l4 = NULL; > + bool ok; > + > + if (!related) { > + return false; > + } > + *related = true; > + > + memset(&inner_key, 0, sizeof inner_key); > + inner_key.dl_type = htons(ETH_TYPE_IPV6); > + ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4); > + if (!ok) { > + return false; > + } > + > + /* pf doesn't do this, but it seems a good idea */ > + if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned, > + &key->dst.addr.ipv6_aligned) > + || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned, > + &key->src.addr.ipv6_aligned)) { > + return false; > + } > + > + key->src = inner_key.src; > + key->dst = inner_key.dst; > + key->nw_proto = inner_key.nw_proto; > + > + ok = extract_l4(key, l4, tail - l4, NULL, l3); > + if (ok) { > + conn_key_reverse(key); > + } > + return ok; > + } > + default: > + return false; > + } > + > + return true; > +} > + > +/* Extract l4 fields into 'key', which must already 
contain valid l3 > + * members. > + * > + * If 'related' is not NULL and an ICMP error packet is being > + * processed, the function will extract the key from the packet > nested > + * in the ICMP paylod and set '*related' to true. > + * > + * If 'related' is NULL, it means that we're already parsing a > header nested > + * in an ICMP error. In this case, we skip checksum and length > validation. */ > +static inline bool > +extract_l4(struct conn_key *key, const void *data, size_t size, bool > *related, > + const void *l3) > +{ > + if (key->nw_proto == IPPROTO_TCP) { > + return (!related || check_l4_tcp(key, data, size, l3)) > + && extract_l4_tcp(key, data, size); > + } else if (key->nw_proto == IPPROTO_UDP) { > + return (!related || check_l4_udp(key, data, size, l3)) > + && extract_l4_udp(key, data, size); > + } else if (key->dl_type == htons(ETH_TYPE_IP) > + && key->nw_proto == IPPROTO_ICMP) { > + return (!related || check_l4_icmp(data, size)) > + && extract_l4_icmp(key, data, size, related); > + } else if (key->dl_type == htons(ETH_TYPE_IPV6) > + && key->nw_proto == IPPROTO_ICMPV6) { > + return (!related || check_l4_icmp6(key, data, size, l3)) > + && extract_l4_icmp6(key, data, size, related); > + } else { > + return false; > + } > +} > + > +static bool > +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, > + struct conn_lookup_ctx *ctx, uint16_t zone) > +{ > + const struct eth_header *l2 = dp_packet_l2(pkt); > + const struct ip_header *l3 = dp_packet_l3(pkt); > + const char *l4 = dp_packet_l4(pkt); > + const char *tail = dp_packet_tail(pkt); > + bool ok; > + > + memset(ctx, 0, sizeof *ctx); > + > + if (!l2 || !l3 || !l4) { > + return false; > + } > + > + ctx->key.zone = zone; > + > + /* XXX In this function we parse the packet (again, it has > already > + * gone through miniflow_extract()) for two reasons: > + * > + * 1) To extract the l3 addresses and l4 ports. > + * We already have the l3 and l4 headers' pointers. 
> Extracting > + * the l3 addresses and the l4 ports is really cheap, since > they > + * can be found at fixed locations. > + * 2) To extract the l3 and l4 types. > + * Extracting the l3 and l4 types (especially the l3[1]) on > the > + * other hand is quite expensive, because they're not at a > + * fixed location. > + * > + * Here's a way to avoid (2) with the help of the datapath. > + * The datapath doesn't keep the packet's extracted flow[2], so > + * using that is not an option. We could use the packet's > matching > + * megaflow for l3 type (it's always unwildcarded), and for l4 > type > + * (we have to unwildcard it first). This means either: > + * > + * a) dpif-netdev passes the matching megaflow to > dp_execute_cb(), which > + * is used to extract the l3 type. Unfortunately, > dp_execute_cb() is > + * used also in dpif_netdev_execute(), which doesn't have a > matching > + * megaflow. > + * > + * b) We define an alternative OVS_ACTION_ATTR_CT, used only by > the > + * userspace datapath, which includes l3 (and l4) type. The > + * alternative action could be generated by ofproto-dpif > specifically > + * for the userspace datapath. Having a different interface > for > + * userspace and kernel doesn't seem very clean, though. > + * > + * --- > + * [1] A simple benchmark (running only the connection tracker > + * over and over on the same packets) shows that if the > + * l3 type is already provided we are 15% faster (running > the > + * connection tracker over a couple of DPDK devices with a > + * stream of UDP 64-bytes packets shows that we are 4% > faster). > + * > + * [2] The reasons for this are that keeping the flow increases > + * (slightly) the cache footprint and increases computation > + * time as we move the packet around. Most importantly, the > flow > + * should be updated by the actions and this can be slow, as > + * we use a sparse representation (miniflow). 
> + * > + */ > + ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2); > + if (ctx->key.dl_type == htons(ETH_TYPE_IP)) { > + ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, > NULL, true); > + } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) { > + ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, > NULL); > + } else { > + ok = false; > + } > + > + if (ok) { > + if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) > { > + ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis); > + return true; > + } > + } > + > + return false; > +} > + > +/* Symmetric */ > +static uint32_t > +conn_key_hash(const struct conn_key *key, uint32_t basis) > +{ > + uint32_t hsrc, hdst, hash; > + int i; > + > + hsrc = hdst = basis; > + > + /* Hash the source and destination tuple */ > + for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) { > + hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]); > + hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]); > + } > + > + /* Even if source and destination are swapped the hash will be > the same. */ > + hash = hsrc ^ hdst; > + > + /* Hash the rest of the key(L3 and L4 types and zone). 
*/ > + hash = hash_words((uint32_t *) &key->dst + 1, > + (uint32_t *) (key + 1) - (uint32_t *) (&key- > >dst + 1), > + hash); > + > + return hash; > +} > + > +static void > +conn_key_reverse(struct conn_key *key) > +{ > + struct ct_endpoint tmp; > + tmp = key->src; > + key->src = key->dst; > + key->dst = tmp; > +} > + > +static void > +conn_key_lookup(struct conntrack_bucket *ctb, > + struct conn_lookup_ctx *ctx, > + long long now) > +{ > + uint32_t hash = ctx->hash; > + struct conn *conn; > + > + ctx->conn = NULL; > + > + HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) { > + if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key)) > + && !conn_expired(conn, now)) { > + ctx->conn = conn; > + ctx->reply = false; > + break; > + } > + if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn- > >rev_key)) > + && !conn_expired(conn, now)) { > + ctx->conn = conn; > + ctx->reply = true; > + break; > + } > + } > +} > + > +static enum ct_update_res > +conn_update(struct conn *conn, struct dp_packet *pkt, bool reply, > + long long now) > +{ > + return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, > reply, now); > +} > + > +static bool > +conn_expired(struct conn *conn, long long now) > +{ > + return now >= conn->expiration; > +} > + > +static bool > +valid_new(struct dp_packet *pkt, struct conn_key *key) > +{ > + return l4_protos[key->nw_proto]->valid_new(pkt); > +} > + > +static struct conn * > +new_conn(struct dp_packet *pkt, struct conn_key *key, long long now) > +{ > + struct conn *newconn; > + > + newconn = l4_protos[key->nw_proto]->new_conn(pkt, now); > + > + if (newconn) { > + newconn->key = *key; > + } > + > + return newconn; > +} > + > +static void > +delete_conn(struct conn *conn) > +{ > + free(conn); > +} > diff --git a/lib/conntrack.h b/lib/conntrack.h > new file mode 100644 > index 0000000..d905bd2 > --- /dev/null > +++ b/lib/conntrack.h > @@ -0,0 +1,151 @@ > +/* > + * Copyright (c) 2015, 2016 Nicira, Inc. 
> + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, > software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions > and > + * limitations under the License. > + */ > + > +#ifndef CONNTRACK_H > +#define CONNTRACK_H 1 > + > +#include <stdbool.h> > + > +#include "hmap.h" > +#include "odp-netlink.h" > +#include "openvswitch/thread.h" > +#include "openvswitch/types.h" > +#include "ovs-atomic.h" > + > + > +struct dp_packet; > + > +/* Userspace connection tracker > + * ============================ > + * > + * This is a connection tracking module that keeps all the state in > userspace. > + * > + * Usage > + * ===== > + * > + * struct conntract ct; > + * > + * Initialization: > + * > + * conntrack_init(&ct); > + * > + * It is necessary to periodically issue a call to > + * > + * conntrack_run(&ct); > + * > + * to allow the module to clean up expired connections. > + * > + * To send a group of packets through the connection tracker: > + * > + * conntrack_execute(&ct, pkts, n_pkts, ...); > + * > + * Thread-safety > + * ============= > + * > + * conntrack_execute() can be called by multiple threads > simultaneoulsy. 
> + */ > + > +struct conntrack; > + > +void conntrack_init(struct conntrack *); > +void conntrack_run(struct conntrack *); > +void conntrack_destroy(struct conntrack *); > + > +int conntrack_execute(struct conntrack *, struct dp_packet **, > size_t, > + bool commit, uint16_t zone, const uint32_t > *setmark, > + const struct ovs_key_ct_labels *setlabel, > + const char *helper); > + > +/* 'struct ct_lock' is a wrapper for an adaptive mutex. It's useful > to try > + * different types of locks (e.g. spinlocks) */ > + > +struct OVS_LOCKABLE ct_lock { > + struct ovs_mutex lock; > +}; > + > +static inline void ct_lock_init(struct ct_lock *lock) > +{ > + ovs_mutex_init_adaptive(&lock->lock); > +} > + > +static inline void ct_lock_lock(struct ct_lock *lock) > + OVS_ACQUIRES(lock) > + OVS_NO_THREAD_SAFETY_ANALYSIS > +{ > + ovs_mutex_lock(&lock->lock); > +} > + > +static inline void ct_lock_unlock(struct ct_lock *lock) > + OVS_RELEASES(lock) > + OVS_NO_THREAD_SAFETY_ANALYSIS > +{ > + ovs_mutex_unlock(&lock->lock); > +} > + > +static inline void ct_lock_destroy(struct ct_lock *lock) > +{ > + ovs_mutex_destroy(&lock->lock); > +} > + > +/* Timeouts: all the possible timeout states passed to > update_expiration() > + * are listed here. The name will be prefix by CT_TM_ and the value > is in > + * milliseconds */ > +#define CT_TIMEOUTS \ > + CT_TIMEOUT(TCP_FIRST_PACKET, 30 * 1000) \ > + CT_TIMEOUT(TCP_OPENING, 30 * 1000) \ > + CT_TIMEOUT(TCP_ESTABLISHED, 24 * 60 * 60 * 1000) \ > + CT_TIMEOUT(TCP_CLOSING, 15 * 60 * 1000) \ > + CT_TIMEOUT(TCP_FIN_WAIT, 45 * 1000) \ > + CT_TIMEOUT(TCP_CLOSED, 30 * 1000) \ > + CT_TIMEOUT(OTHER_FIRST, 60 * 1000) \ > + CT_TIMEOUT(OTHER_MULTIPLE, 60 * 1000) \ > + CT_TIMEOUT(OTHER_BIDIR, 30 * 1000) \ > + > +enum ct_timeout { > +#define CT_TIMEOUT(NAME, VALUE) CT_TM_##NAME, > + CT_TIMEOUTS > +#undef CT_TIMEOUT > + N_CT_TM > +}; > + > +/* Locking: > + * > + * The connections are kept in different buckets, which are > completely > + * independent. 
The connection bucket is determined by the hash of > its key. > + * */ > +struct conntrack_bucket { > + struct ct_lock lock; > + struct hmap connections OVS_GUARDED; > +}; > + > +#define CONNTRACK_BUCKETS_SHIFT 8 > +#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT) > + > +struct conntrack { > + /* Independent buckets containing the connections */ > + struct conntrack_bucket buckets[CONNTRACK_BUCKETS]; > + > + /* Salt for hashing a connection key. */ > + uint32_t hash_basis; > + > + /* Number of connections currently in the connection tracker. */ > + atomic_count n_conn; > + /* Connections limit. When this limit is reached, no new > connection > + * will be accepted. */ > + atomic_uint n_conn_limit; > +}; > + > +#endif /* conntrack.h */ > diff --git a/lib/util.h b/lib/util.h > index 7be4a30..ad31e74 100644 > --- a/lib/util.h > +++ b/lib/util.h > @@ -69,6 +69,15 @@ ovs_prefetch_range(const void *start, size_t size) > #define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) > #endif > > +/* Comparisons for ints with modular arithmetic */ > +#define INT_MOD_LT(a,b) ((int) ((a)-(b)) < 0) > +#define INT_MOD_LEQ(a,b) ((int) ((a)-(b)) <= 0) > +#define INT_MOD_GT(a,b) ((int) ((a)-(b)) > 0) > +#define INT_MOD_GEQ(a,b) ((int) ((a)-(b)) >= 0) > + > +#define INT_MOD_MIN(a, b) ((INT_MOD_LT(a, b)) ? (a) : (b)) > +#define INT_MOD_MAX(a, b) ((INT_MOD_GT(a, b)) ? (a) : (b)) > + > #define OVS_NOT_REACHED() abort() > > /* Use "%"PRIuSIZE to format size_t with printf(). */ > -- > 2.1.4 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > http://openvswitch.org/mailman/listinfo/dev _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev