On 13/06/2016 14:11, "Joe Stringer" <j...@ovn.org> wrote:
>On 10 June 2016 at 15:47, Daniele Di Proietto <diproiet...@vmware.com> wrote:
>> This commit adds the conntrack module.
>>
>> It is a connection tracker that resides entirely in userspace. Its
>> primary user will be the dpif-netdev datapath.
>>
>> The module main goal is to provide conntrack_execute(), which offers a
>> convenient interface to implement the datapath ct() action.
>>
>> The conntrack module uses two submodules to deal with the l4 protocol
>> details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP).
>>
>> The conntrack-tcp submodule implementation is adapted from FreeBSD's pf
>> subsystem, therefore it's BSD licensed. It has been slightly altered to
>> match the OVS coding style and to allow the pickup of already
>> established connections.
>>
>> Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com>
>> Acked-by: Antonio Fischetti <antonio.fische...@intel.com>
>
>I've got a bunch of pretty trivial comments below, but it seems
>otherwise fine to me. Thanks!
>
>Acked-by: Joe Stringer <j...@ovn.org>
Thank you for the review and all the feedback; this was a big patch!
>
>> ---
>> COPYING | 1 +
>> debian/copyright.in | 4 +
>> include/openvswitch/types.h | 4 +
>> lib/automake.mk | 5 +
>> lib/conntrack-other.c | 85 +++++
>> lib/conntrack-private.h | 88 +++++
>> lib/conntrack-tcp.c | 462 +++++++++++++++++++++++
>> lib/conntrack.c | 890 ++++++++++++++++++++++++++++++++++++++++++++
>> lib/conntrack.h | 150 ++++++++
>> lib/util.h | 9 +
>> 10 files changed, 1698 insertions(+)
>> create mode 100644 lib/conntrack-other.c
>> create mode 100644 lib/conntrack-private.h
>> create mode 100644 lib/conntrack-tcp.c
>> create mode 100644 lib/conntrack.c
>> create mode 100644 lib/conntrack.h
>>
>> diff --git a/COPYING b/COPYING
>> index 308e3ea..afb98b9 100644
>> --- a/COPYING
>> +++ b/COPYING
>> @@ -25,6 +25,7 @@ License, version 2.
>> The following files are licensed under the 2-clause BSD license.
>> include/windows/getopt.h
>> lib/getopt_long.c
>> + lib/conntrack-tcp.c
>>
>> The following files are licensed under the 3-clause BSD-license
>> include/windows/netinet/icmp6.h
>> diff --git a/debian/copyright.in b/debian/copyright.in
>> index 57d007a..a15f4dd 100644
>> --- a/debian/copyright.in
>> +++ b/debian/copyright.in
>> @@ -21,6 +21,9 @@ Upstream Copyright Holders:
>> Copyright (c) 2014 Michael Chapman
>> Copyright (c) 2014 WindRiver, Inc.
>> Copyright (c) 2014 Avaya, Inc.
>> + Copyright (c) 2001 Daniel Hartmeier
>> + Copyright (c) 2002 - 2008 Henning Brauer
>> + Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org>
>>
>> License:
>>
>> @@ -90,6 +93,7 @@ License:
>> lib/getopt_long.c
>> include/windows/getopt.h
>> datapath-windows/ovsext/Conntrack-tcp.c
>> + lib/conntrack-tcp.c
>>
>> * The following files are licensed under the 3-clause BSD-license
>>
>> diff --git a/include/openvswitch/types.h b/include/openvswitch/types.h
>> index bc94145..35bde0a 100644
>> --- a/include/openvswitch/types.h
>> +++ b/include/openvswitch/types.h
>> @@ -107,6 +107,10 @@ static const ovs_u128 OVS_U128_MAX = { { UINT32_MAX, UINT32_MAX,
>> UINT32_MAX, UINT32_MAX } };
>> static const ovs_be128 OVS_BE128_MAX OVS_UNUSED = { { OVS_BE32_MAX, OVS_BE32_MAX,
>> OVS_BE32_MAX, OVS_BE32_MAX } };
>> +static const ovs_u128 OVS_U128_MIN OVS_UNUSED = { {0, 0, 0, 0} };
>> +static const ovs_be128 OVS_BE128_MIN OVS_UNUSED = { {0, 0, 0, 0} };
>> +
>> +#define OVS_U128_ZERO OVS_U128_MIN
>>
>> /* A 64-bit value, in network byte order, that is only aligned on a 32-bit
>> * boundary. */
>> diff --git a/lib/automake.mk b/lib/automake.mk
>> index eabc0e7..4ba1a2c 100644
>> --- a/lib/automake.mk
>> +++ b/lib/automake.mk
>> @@ -49,6 +49,11 @@ lib_libopenvswitch_la_SOURCES = \
>> lib/compiler.h \
>> lib/connectivity.c \
>> lib/connectivity.h \
>> + lib/conntrack-private.h \
>> + lib/conntrack-tcp.c \
>> + lib/conntrack-other.c \
>> + lib/conntrack.c \
>> + lib/conntrack.h \
>> lib/coverage.c \
>> lib/coverage.h \
>> lib/crc32c.c \
>> diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c
>> new file mode 100644
>> index 0000000..295cb2c
>> --- /dev/null
>> +++ b/lib/conntrack-other.c
>> @@ -0,0 +1,85 @@
>> +/*
>> + * Copyright (c) 2015, 2016 Nicira, Inc.
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at:
>> + *
>> + * http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +#include <config.h>
>> +
>> +#include "conntrack-private.h"
>> +#include "dp-packet.h"
>> +
>> +enum other_state {
>> + OTHERS_FIRST,
>> + OTHERS_MULTIPLE,
>> + OTHERS_BIDIR,
>> +};
>> +
>> +struct conn_other {
>> + struct conn up;
>> + enum other_state state;
>> +};
>> +
>> +static const enum ct_timeout other_timeouts[] = {
>> + [OTHERS_FIRST] = CT_TM_OTHER_FIRST,
>> + [OTHERS_MULTIPLE] = CT_TM_OTHER_MULTIPLE,
>> + [OTHERS_BIDIR] = CT_TM_OTHER_BIDIR,
>> +};
>> +
>> +static struct conn_other *
>> +conn_other_cast(const struct conn *conn)
>> +{
>> + return CONTAINER_OF(conn, struct conn_other, up);
>> +}
>> +
>> +static enum ct_update_res
>> +other_conn_update(struct conn *conn_, struct dp_packet *pkt OVS_UNUSED,
>> + bool reply, long long now)
>> +{
>> + struct conn_other *conn = conn_other_cast(conn_);
>> +
>> + if (reply && conn->state != OTHERS_BIDIR) {
>> + conn->state = OTHERS_BIDIR;
>> + } else if (conn->state == OTHERS_FIRST) {
>> + conn->state = OTHERS_MULTIPLE;
>> + }
>> +
>> + update_expiration(conn_, other_timeouts[conn->state], now);
>> +
>> + return CT_UPDATE_VALID;
>> +}
>> +
>> +static bool
>> +other_valid_new(struct dp_packet *pkt OVS_UNUSED)
>> +{
>> + return true;
>> +}
>> +
>> +static struct conn *
>> +other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now)
>> +{
>> + struct conn_other *conn;
>> +
>> + conn = xzalloc(sizeof *conn);
>> + conn->state = OTHERS_FIRST;
>> +
>> + update_expiration(&conn->up, other_timeouts[conn->state], now);
>> +
>> + return &conn->up;
>> +}
>> +
>> +struct ct_l4_proto ct_proto_other = {
>> + .new_conn = other_new_conn,
>> + .valid_new = other_valid_new,
>> + .conn_update = other_conn_update,
>> +};
>> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
>> new file mode 100644
>> index 0000000..d3e0099
>> --- /dev/null
>> +++ b/lib/conntrack-private.h
>> @@ -0,0 +1,88 @@
>> +/*
>> + * Copyright (c) 2015, 2016 Nicira, Inc.
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at:
>> + *
>> + * http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +#ifndef CONNTRACK_PRIVATE_H
>> +#define CONNTRACK_PRIVATE_H 1
>> +
>> +#include <sys/types.h>
>> +#include <netinet/in.h>
>> +#include <netinet/ip6.h>
>> +
>> +#include "conntrack.h"
>> +#include "hmap.h"
>> +#include "openvswitch/list.h"
>> +#include "openvswitch/types.h"
>> +#include "packets.h"
>> +#include "unaligned.h"
>> +
>> +struct ct_addr {
>> + union {
>> + ovs_16aligned_be32 ipv4;
>> + union ovs_16aligned_in6_addr ipv6;
>> + ovs_be32 ipv4_aligned;
>> + struct in6_addr ipv6_aligned;
>> + };
>> +};
>> +
>> +struct ct_endpoint {
>> + struct ct_addr addr;
>> + ovs_be16 port;
>> +};
>> +
>> +struct conn_key {
>> + struct ct_endpoint src;
>> + struct ct_endpoint dst;
>> +
>> + ovs_be16 dl_type;
>> + uint8_t nw_proto;
>> + uint16_t zone;
>> +};
>> +
>> +struct conn {
>> + struct conn_key key;
>> + struct conn_key rev_key;
>> + long long expiration;
>> + struct ovs_list exp_node;
>> + struct hmap_node node;
>> + uint32_t mark;
>> + ovs_u128 label;
>> +};
>> +
>> +enum ct_update_res {
>> + CT_UPDATE_INVALID,
>> + CT_UPDATE_VALID,
>> + CT_UPDATE_NEW,
>> +};
>> +
>> +struct ct_l4_proto {
>> + struct conn *(*new_conn)(struct dp_packet *pkt, long long now);
>> + bool (*valid_new)(struct dp_packet *pkt);
>> + enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet *pkt,
>> + bool reply, long long now);
>> +};
>> +
>> +extern struct ct_l4_proto ct_proto_tcp;
>> +extern struct ct_l4_proto ct_proto_other;
>> +
>> +extern long long ct_timeout_val[];
>> +
>> +static inline void
>> +update_expiration(struct conn *conn, enum ct_timeout tm, long long now)
>> +{
>> + conn->expiration = now + ct_timeout_val[tm];
>> +}
>> +
>> +#endif /* conntrack-private.h */
>> diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c
>> new file mode 100644
>> index 0000000..b574eeb
>> --- /dev/null
>> +++ b/lib/conntrack-tcp.c
>> @@ -0,0 +1,462 @@
>> +/*-
>> + * Copyright (c) 2001 Daniel Hartmeier
>> + * Copyright (c) 2002 - 2008 Henning Brauer
>> + * Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org>
>> + * Copyright (c) 2015, 2016 Nicira, Inc.
>> + * All rights reserved.
>> + *
>> + * Redistribution and use in source and binary forms, with or without
>> + * modification, are permitted provided that the following conditions
>> + * are met:
>> + *
>> + * - Redistributions of source code must retain the above copyright
>> + * notice, this list of conditions and the following disclaimer.
>> + * - Redistributions in binary form must reproduce the above
>> + * copyright notice, this list of conditions and the following
>> + * disclaimer in the documentation and/or other materials provided
>> + * with the distribution.
>> + *
>> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
>> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
>> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
>> + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
>> + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
>> + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
>> + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
>> + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
>> + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
>> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
>> + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
>> + * POSSIBILITY OF SUCH DAMAGE.
>> + *
>> + * Effort sponsored in part by the Defense Advanced Research Projects
>> + * Agency (DARPA) and Air Force Research Laboratory, Air Force
>> + * Materiel Command, USAF, under agreement number F30602-01-2-0537.
>> + *
>> + * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
>> + */
>> +
>> +#include <config.h>
>> +
>> +#include "conntrack-private.h"
>> +#include "ct-dpif.h"
>> +#include "dp-packet.h"
>> +#include "util.h"
>> +
>> +struct tcp_peer {
>> + enum ct_dpif_tcp_state state;
>> + uint32_t seqlo; /* Max sequence number sent */
>> + uint32_t seqhi; /* Max the other end ACKd + win */
>> + uint16_t max_win; /* largest window (pre scaling) */
>> + uint8_t wscale; /* window scaling factor */
>> +};
>> +
>> +struct conn_tcp {
>> + struct conn up;
>> + struct tcp_peer peer[2];
>> +};
>> +
>> +enum {
>> + TCPOPT_EOL,
>> + TCPOPT_NOP,
>> + TCPOPT_WINDOW = 3,
>> +};
>> +
>> +/* TCP sequence numbers are 32 bit integers operated
>> + * on with modular arithmetic. These macros can be
>> + * used to compare such integers. */
>> +#define SEQ_LT(a,b) INT_MOD_LT(a, b)
>> +#define SEQ_LEQ(a,b) INT_MOD_LEQ(a, b)
>> +#define SEQ_GT(a,b) INT_MOD_GT(a, b)
>> +#define SEQ_GEQ(a,b) INT_MOD_GEQ(a, b)
>> +
>> +#define SEQ_MIN(a, b) INT_MOD_MIN(a, b)
>> +#define SEQ_MAX(a, b) INT_MOD_MAX(a, b)
>
>It would appear only GT and GEQ of these are used (also for
>INT_MOD_*)... Were the others used upstream originally? Just want to
>double-check.
Good point. Most of them aren't used; SEQ_LT() is used by extra features
not ported here, such as fragment reassembly.
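To make the wraparound point concrete, a quick sketch (not from the patch):

    /* TCP sequence numbers wrap around, so a plain '<' misbehaves near the
     * wrap point, while the modular comparison still gives the right
     * answer. */
    uint32_t a = 0xfffffff0;         /* just before the wrap */
    uint32_t b = 0x00000010;         /* 0x20 bytes "after" a, post-wrap */

    /* a < b             -> false, the plain comparison is fooled      */
    /* INT_MOD_LT(a, b)  -> (int) (a - b) == -32, i.e. < 0, so true    */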
>
>> +
>> +static struct conn_tcp*
>> +conn_tcp_cast(const struct conn* conn)
>> +{
>> + return CONTAINER_OF(conn, struct conn_tcp, up);
>> +}
>> +
>> +/* pf does this in pf_normalize_tcp(), and it is called only if scrub
>> + * is enabled. We're not scrubbing, but this check seems reasonable. */
>> +static bool
>> +tcp_invalid_flags(uint16_t flags)
>> +{
>> +
>> + if (flags & TCP_SYN) {
>> + if (flags & TCP_RST || flags & TCP_FIN) {
>> + return true;
>> + }
>> + } else {
>> + /* Illegal packet */
>> + if (!(flags & (TCP_ACK|TCP_RST))) {
>> + return true;
>> + }
>> + }
>> +
>> + if (!(flags & TCP_ACK)) {
>> + /* These flags are only valid if ACK is set */
>> + if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {
>> + return true;
>> + }
>> + }
>> +
>> + return false;
>> +}
>> +
>> +#define TCP_MAX_WSCALE 14
>> +#define CT_WSCALE_FLAG 0x80
>> +#define CT_WSCALE_UNKNOWN 0x40
>> +#define CT_WSCALE_MASK 0xf
>> +
>> +static uint8_t
>> +tcp_get_wscale(const struct tcp_header *tcp)
>> +{
>> + int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;
>> + const uint8_t *opt = (const uint8_t *)(tcp + 1);
>> + uint8_t wscale = 0;
>> + uint8_t optlen;
>> +
>> + while (len >= 3) {
>> + switch (*opt) {
>> + case TCPOPT_EOL:
>> + return wscale;
>> + case TCPOPT_NOP:
>> + opt++;
>> + len--;
>> + break;
>> + case TCPOPT_WINDOW:
>> + wscale = MIN(opt[2], TCP_MAX_WSCALE);
>> + wscale |= CT_WSCALE_FLAG;
>> + /* fall through */
>> + default:
>> + optlen = opt[1];
>> + if (optlen < 2) {
>> + optlen = 2;
>> + }
>> + len -= optlen;
>> + opt += optlen;
>> + }
>> + }
>> +
>> + return wscale;
>> +}
>> +
>> +static uint32_t
>> +tcp_payload_length(struct dp_packet *pkt)
>> +{
>> + return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)
>> + - (char *) dp_packet_get_tcp_payload(pkt);
>> +}
>
>Is there a reason this exists independently of dp_packet_l4_size()?
It's similar, but where dp_packet_l4_size() measures from dp_packet_l4() (the
start of the TCP header), tcp_payload_length() measures from
dp_packet_get_tcp_payload() (the end of the TCP header, i.e. the start of the
payload).
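Roughly, assuming the l4 offset points at the TCP header, the relationship is
(just a sketch, not code from the patch):

    struct tcp_header *tcp = dp_packet_l4(pkt);

    /* dp_packet_l4_size() counts from the start of the TCP header, while
     * tcp_payload_length() counts from its end (options included), so the
     * two differ exactly by the TCP header length. */
    uint32_t payload_len = dp_packet_l4_size(pkt)
                           - TCP_OFFSET(tcp->tcp_ctl) * 4;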
>> +static enum ct_update_res
>> +tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool reply,
>> + long long now)
>> +{
>> + struct conn_tcp *conn = conn_tcp_cast(conn_);
>> + struct tcp_header *tcp = dp_packet_l4(pkt);
>> + /* The peer that sent 'pkt' */
>> + struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
>> + /* The peer that should receive 'pkt' */
>> + struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
>> + uint8_t sws = 0, dws = 0;
>> + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
>> +
>> + uint16_t win = ntohs(tcp->tcp_winsz);
>> + uint32_t ack, end, seq, orig_seq;
>> + uint32_t p_len = tcp_payload_length(pkt);
>> + int ackskew;
>> +
>> + if (tcp_invalid_flags(tcp_flags)) {
>> + return CT_UPDATE_INVALID;
>> + }
>> +
>> + if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN)
>> + && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
>> + && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
>> + src->state = dst->state = CT_DPIF_TCPS_CLOSED;
>> + return CT_UPDATE_NEW;
>> + }
>
>Minor nits: We usually put spaces around infix binary and ternary
>operators (for instance, |) and ampersands are usually aligned with
>the other expressions at the same level of nesting. I didn't bother
>checking the rest of the file for this type of style deviation, but
>based on your diff tree it looks like you're familiar with all the
>locations already.
You're right; thanks for pointing that out. I found another spot where the
indentation was wrong and fixed both.
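For reference, the fixed condition should look roughly like this (sketch):

    if (((tcp_flags & (TCP_SYN | TCP_ACK)) == TCP_SYN)
        && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
        && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
        src->state = dst->state = CT_DPIF_TCPS_CLOSED;
        return CT_UPDATE_NEW;
    }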
>
>> +
>> + if (src->wscale & CT_WSCALE_FLAG
>> + && dst->wscale & CT_WSCALE_FLAG
>> + && !(tcp_flags & TCP_SYN)) {
>> +
>> + sws = src->wscale & CT_WSCALE_MASK;
>> + dws = dst->wscale & CT_WSCALE_MASK;
>> +
>> + } else if (src->wscale & CT_WSCALE_UNKNOWN
>> + && dst->wscale & CT_WSCALE_UNKNOWN
>> + && !(tcp_flags & TCP_SYN)) {
>> +
>> + sws = TCP_MAX_WSCALE;
>> + dws = TCP_MAX_WSCALE;
>> + }
>> +
>> + /*
>> + * Sequence tracking algorithm from Guido van Rooij's paper:
>> + * http://www.madison-gurkha.com/publications/tcp_filtering/
>> + * tcp_filtering.ps
>> + */
>> +
>> + orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
>> + if (src->state < CT_DPIF_TCPS_SYN_SENT) {
>> + /* First packet from this end. Set its state */
>> +
>> + ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
>> +
>> + end = seq + p_len;
>> + if (tcp_flags & TCP_SYN) {
>> + end++;
>> + if (dst->wscale & CT_WSCALE_FLAG) {
>> + src->wscale = tcp_get_wscale(tcp);
>> + if (src->wscale & CT_WSCALE_FLAG) {
>> + /* Remove scale factor from initial window */
>> + sws = src->wscale & CT_WSCALE_MASK;
>> + win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
>> + dws = dst->wscale & CT_WSCALE_MASK;
>> + } else {
>> + /* fixup other window */
>> + dst->max_win <<= dst->wscale & CT_WSCALE_MASK;
>> + /* in case of a retrans SYN|ACK */
>> + dst->wscale = 0;
>> + }
>> + }
>> + }
>> + if (tcp_flags & TCP_FIN) {
>> + end++;
>> + }
>> +
>> + src->seqlo = seq;
>> + src->state = CT_DPIF_TCPS_SYN_SENT;
>> + /*
>> + * May need to slide the window (seqhi may have been set by
>> + * the crappy stack check or if we picked up the connection
>> + * after establishment)
>> + */
>> + if (src->seqhi == 1
>> + || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
>> + src->seqhi = end + MAX(1, dst->max_win << dws);
>> + }
>> + if (win > src->max_win) {
>> + src->max_win = win;
>> + }
>> +
>> + } else {
>> + ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
>> + end = seq + p_len;
>> + if (tcp_flags & TCP_SYN) {
>> + end++;
>> + }
>> + if (tcp_flags & TCP_FIN) {
>> + end++;
>> + }
>> + }
>> +
>> + if ((tcp_flags & TCP_ACK) == 0) {
>> + /* Let it pass through the ack skew check */
>> + ack = dst->seqlo;
>> + } else if ((ack == 0
>> + && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
>> + /* broken tcp stacks do not set ack */) {
>> + /* Many stacks (ours included) will set the ACK number in an
>> + * FIN|ACK if the SYN times out -- no sequence to ACK. */
>> + ack = dst->seqlo;
>> + }
>> +
>> + if (seq == end) {
>> + /* Ease sequencing restrictions on no data packets */
>> + seq = src->seqlo;
>> + end = seq;
>> + }
>> +
>> + ackskew = dst->seqlo - ack;
>> +#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
>> + if (SEQ_GEQ(src->seqhi, end)
>> + /* Last octet inside other's window space */
>> + && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
>> + /* Retrans: not more than one window back */
>> + && (ackskew >= -MAXACKWINDOW)
>> + /* Acking not more than one reassembled fragment backwards */
>> + && (ackskew <= (MAXACKWINDOW << sws))
>> + /* Acking not more than one window forward */
>> + && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
>> + || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo))) {
>> + /* Require an exact/+1 sequence match on resets when possible */
>> +
>> + /* update max window */
>> + if (src->max_win < win) {
>> + src->max_win = win;
>> + }
>> + /* synchronize sequencing */
>> + if (SEQ_GT(end, src->seqlo)) {
>> + src->seqlo = end;
>> + }
>> + /* slide the window of what the other end can send */
>> + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
>> + dst->seqhi = ack + MAX((win << sws), 1);
>> + }
>> +
>> + /* update states */
>> + if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
>> + src->state = CT_DPIF_TCPS_SYN_SENT;
>> + }
>> + if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
>> + src->state = CT_DPIF_TCPS_CLOSING;
>> + }
>> + if (tcp_flags & TCP_ACK) {
>> + if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
>> + dst->state = CT_DPIF_TCPS_ESTABLISHED;
>> + } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
>> + dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
>> + }
>> + }
>> + if (tcp_flags & TCP_RST) {
>> + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
>> + }
>> +
>> + if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
>> + && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
>> + update_expiration(conn_, CT_TM_TCP_CLOSED, now);
>> + } else if (src->state >= CT_DPIF_TCPS_CLOSING
>> + && dst->state >= CT_DPIF_TCPS_CLOSING) {
>> + update_expiration(conn_, CT_TM_TCP_FIN_WAIT, now);
>> + } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
>> + || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
>> + update_expiration(conn_, CT_TM_TCP_OPENING, now);
>> + } else if (src->state >= CT_DPIF_TCPS_CLOSING
>> + || dst->state >= CT_DPIF_TCPS_CLOSING) {
>> + update_expiration(conn_, CT_TM_TCP_CLOSING, now);
>> + } else {
>> + update_expiration(conn_, CT_TM_TCP_ESTABLISHED, now);
>> + }
>> + } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
>> + || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
>> + || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
>> + && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
>> + /* Within a window forward of the originating packet */
>> + && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
>> + /* Within a window backward of the originating packet */
>> +
>> + /*
>> + * This currently handles three situations:
>> + * 1) Stupid stacks will shotgun SYNs before their peer
>> + * replies.
>> + * 2) When PF catches an already established stream (the
>> + * firewall rebooted, the state table was flushed, routes
>> + * changed...)
>> + * 3) Packets get funky immediately after the connection
>> + * closes (this should catch Solaris spurious ACK|FINs
>> + * that web servers like to spew after a close)
>> + *
>> + * This must be a little more careful than the above code
>> + * since packet floods will also be caught here. We don't
>> + * update the TTL here to mitigate the damage of a packet
>> + * flood and so the same code can handle awkward establishment
>> + * and a loosened connection close.
>> + * In the establishment case, a correct peer response will
>> + * validate the connection, go through the normal state code
>> + * and keep updating the state TTL.
>> + */
>> +
>> + /* update max window */
>> + if (src->max_win < win) {
>> + src->max_win = win;
>> + }
>> + /* synchronize sequencing */
>> + if (SEQ_GT(end, src->seqlo)) {
>> + src->seqlo = end;
>> + }
>> + /* slide the window of what the other end can send */
>> + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
>> + dst->seqhi = ack + MAX((win << sws), 1);
>> + }
>> +
>> + /*
>> + * Cannot set dst->seqhi here since this could be a shotgunned
>> + * SYN and not an already established connection.
>> + */
>> +
>> + if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
>> + src->state = CT_DPIF_TCPS_CLOSING;
>> + }
>> +
>> + if (tcp_flags & TCP_RST) {
>> + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
>> + }
>> + } else {
>> + return CT_UPDATE_INVALID;
>> + }
>> +
>> + return CT_UPDATE_VALID;
>> +}
>> +
>> +static bool
>> +tcp_valid_new(struct dp_packet *pkt)
>> +{
>> + struct tcp_header *tcp = dp_packet_l4(pkt);
>> + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
>> +
>> + if (tcp_invalid_flags(tcp_flags)) {
>> + return false;
>> + }
>> +
>> + /* A syn+ack is not allowed to create a connection. We want to allow
>> + * totally new connections (syn) or already established, not partially
>> + * open (syn+ack). */
>> + if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {
>> + return false;
>> + }
>> +
>> + return true;
>> +}
>> +
>> +static struct conn *
>> +tcp_new_conn(struct dp_packet *pkt, long long now)
>> +{
>> + struct conn_tcp* newconn = NULL;
>> + struct tcp_header *tcp = dp_packet_l4(pkt);
>> + struct tcp_peer *src, *dst;
>> + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
>> +
>> + newconn = xzalloc(sizeof *newconn);
>> +
>> + src = &newconn->peer[0];
>> + dst = &newconn->peer[1];
>> +
>> + src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));
>> + src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;
>> +
>> + if (tcp_flags & TCP_SYN) {
>> + src->seqhi++;
>> + src->wscale = tcp_get_wscale(tcp);
>> + } else {
>> + src->wscale = CT_WSCALE_UNKNOWN;
>> + dst->wscale = CT_WSCALE_UNKNOWN;
>> + }
>> + src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);
>> + if (src->wscale & CT_WSCALE_MASK) {
>> + /* Remove scale factor from initial window */
>> + uint8_t sws = src->wscale & CT_WSCALE_MASK;
>> + src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);
>> + }
>> + if (tcp_flags & TCP_FIN) {
>> + src->seqhi++;
>> + }
>> + dst->seqhi = 1;
>> + dst->max_win = 1;
>> + src->state = CT_DPIF_TCPS_SYN_SENT;
>> + dst->state = CT_DPIF_TCPS_CLOSED;
>> +
>> + update_expiration(&newconn->up, CT_TM_TCP_FIRST_PACKET, now);
>> +
>> + return &newconn->up;
>> +}
>> +
>> +struct ct_l4_proto ct_proto_tcp = {
>> + .new_conn = tcp_new_conn,
>> + .valid_new = tcp_valid_new,
>> + .conn_update = tcp_conn_update,
>> +};
>> diff --git a/lib/conntrack.c b/lib/conntrack.c
>> new file mode 100644
>> index 0000000..96935bc
>> --- /dev/null
>> +++ b/lib/conntrack.c
>> @@ -0,0 +1,890 @@
>> +/*
>> + * Copyright (c) 2015, 2016 Nicira, Inc.
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at:
>> + *
>> + * http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +#include <config.h>
>> +#include "conntrack.h"
>> +
>> +#include <errno.h>
>> +#include <sys/types.h>
>> +#include <netinet/in.h>
>> +#include <netinet/icmp6.h>
>> +
>> +#include "bitmap.h"
>> +#include "conntrack-private.h"
>> +#include "coverage.h"
>> +#include "csum.h"
>> +#include "dp-packet.h"
>> +#include "flow.h"
>> +#include "hmap.h"
>> +#include "netdev.h"
>> +#include "odp-netlink.h"
>> +#include "openvswitch/vlog.h"
>> +#include "ovs-rcu.h"
>> +#include "random.h"
>> +#include "timeval.h"
>> +
>> +VLOG_DEFINE_THIS_MODULE(conntrack);
>> +
>> +COVERAGE_DEFINE(conntrack_new_full);
>
>Could just be 'conntrack_full' or 'conntrack_limit_reached'?
Agreed, 'conntrack_full' sounds better.
>
>> +
>> +struct conn_lookup_ctx {
>> + struct conn_key key;
>> + struct conn *conn;
>> + uint32_t hash;
>> + bool reply;
>> + bool related;
>> +};
>> +
>> +static bool conn_key_extract(struct conntrack *, struct dp_packet *,
>> + struct conn_lookup_ctx *, uint16_t zone);
>> +static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
>> +static void conn_key_reverse(struct conn_key *);
>> +static void conn_key_lookup(struct conntrack_bucket *ctb,
>> + struct conn_lookup_ctx *ctx,
>> + long long now);
>> +static bool valid_new(struct dp_packet *pkt, struct conn_key *);
>> +static struct conn *new_conn(struct dp_packet *pkt, struct conn_key *,
>> + long long now);
>> +static void delete_conn(struct conn *);
>> +static enum ct_update_res conn_update(struct conn *, struct dp_packet*,
>> + bool reply, long long now);
>> +static bool conn_expired(struct conn *, long long now);
>> +static void set_mark(struct dp_packet *, struct conn *,
>> + uint32_t val, uint32_t mask);
>> +static void set_label(struct dp_packet *, struct conn *,
>> + const struct ovs_key_ct_labels *val,
>> + const struct ovs_key_ct_labels *mask);
>> +
>> +static struct ct_l4_proto *l4_protos[] = {
>> + [IPPROTO_TCP] = &ct_proto_tcp,
>> + [IPPROTO_UDP] = &ct_proto_other,
>> + [IPPROTO_ICMP] = &ct_proto_other,
>> + [IPPROTO_ICMPV6] = &ct_proto_other,
>> +};
>> +
>> +long long ct_timeout_val[] = {
>> +#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
>> + CT_TIMEOUTS
>> +#undef CT_TIMEOUT
>> +};
>> +
>> +/* If the total number of connections goes above this value, no new connections
>> + * are accepted */
>> +#define DEFAULT_N_CONN_LIMIT 3000000
>
>I see you're a fan of base-10 ;-)
>
>> +
>> +/* Initializes the connection tracker 'ct'. The caller is responsibile for
>
>s/responsibile/responsible
Somehow I got it wrong twice; thanks for spotting this! Fixed.
>> + * calling 'conntrack_destroy()', when the instance is not needed anymore */
>> +void
>> +conntrack_init(struct conntrack *ct)
>> +{
>> + unsigned i;
>> +
>> + for (i = 0; i < CONNTRACK_BUCKETS; i++) {
>> + struct conntrack_bucket *ctb = &ct->buckets[i];
>> +
>> + ct_lock_init(&ctb->lock);
>> + ct_lock_lock(&ctb->lock);
>> + hmap_init(&ctb->connections);
>> + ct_lock_unlock(&ctb->lock);
>> + }
>> + ct->hash_basis = random_uint32();
>> + atomic_count_init(&ct->n_conn, 0);
>> + atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
>> +}
>> +
>> +/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
>> +void
>> +conntrack_destroy(struct conntrack *ct)
>> +{
>> + unsigned i;
>> +
>> + for (i = 0; i < CONNTRACK_BUCKETS; i++) {
>> + struct conntrack_bucket *ctb = &ct->buckets[i];
>> + struct conn *conn;
>> +
>> + ct_lock_lock(&ctb->lock);
>> + HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
>> + atomic_count_dec(&ct->n_conn);
>> + delete_conn(conn);
>> + }
>> + hmap_destroy(&ctb->connections);
>> + ct_lock_unlock(&ctb->lock);
>> + ct_lock_destroy(&ctb->lock);
>> + }
>> +}
>> +
>> +static unsigned hash_to_bucket(uint32_t hash)
>> +{
>> + /* Extracts the most significant bits in hash. The least significant bits
>> + * are already used internally by the hmap implementation. */
>> + BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
>> +
>> + return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
>> +}
>> +
>> +static void
>> +write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
>> + uint32_t mark, ovs_u128 label)
>> +{
>> + pkt->md.ct_state = state | CS_TRACKED;
>> + pkt->md.ct_zone = zone;
>> + pkt->md.ct_mark = mark;
>> + pkt->md.ct_label = label;
>> +}
>> +
>> +static struct conn *
>> +conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>> + struct conn_lookup_ctx *ctx, uint16_t *state, bool commit,
>> + long long now)
>> +{
>> + unsigned bucket = hash_to_bucket(ctx->hash);
>> + struct conn *nc = NULL;
>> +
>> + if (!valid_new(pkt, &ctx->key)) {
>> + *state |= CS_INVALID;
>> + return nc;
>> + }
>> +
>> + *state |= CS_NEW;
>> +
>> + if (commit) {
>> + unsigned int n_conn_limit;
>> +
>> + atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
>> +
>> + if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
>> + COVERAGE_INC(conntrack_new_full);
>> + return nc;
>> + }
>> +
>> + nc = new_conn(pkt, &ctx->key, now);
>> +
>> + memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);
>> +
>> + conn_key_reverse(&nc->rev_key);
>> + hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
>> + atomic_count_inc(&ct->n_conn);
>> + }
>> +
>> + return nc;
>> +}
>> +
>> +static struct conn *
>> +process_one(struct conntrack *ct, struct dp_packet *pkt,
>> + struct conn_lookup_ctx *ctx, uint16_t zone,
>> + bool commit, long long now)
>> +{
>> + unsigned bucket = hash_to_bucket(ctx->hash);
>> + struct conn *conn = ctx->conn;
>> + uint16_t state = 0;
>> +
>> + if (conn) {
>> + if (ctx->related) {
>> + state |= CS_RELATED;
>> + if (ctx->reply) {
>> + state |= CS_REPLY_DIR;
>> + }
>> + } else {
>> + enum ct_update_res res;
>> +
>> + res = conn_update(conn, pkt, ctx->reply, now);
>> +
>> + switch (res) {
>> + case CT_UPDATE_VALID:
>> + state |= CS_ESTABLISHED;
>> + if (ctx->reply) {
>> + state |= CS_REPLY_DIR;
>> + }
>> + break;
>> + case CT_UPDATE_INVALID:
>> + state |= CS_INVALID;
>> + break;
>> + case CT_UPDATE_NEW:
>> + hmap_remove(&ct->buckets[bucket].connections, &conn->node);
>> + atomic_count_dec(&ct->n_conn);
>> + delete_conn(conn);
>> + conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
>> + break;
>> + default:
>> + OVS_NOT_REACHED();
>> + }
>> + }
>> + } else {
>> + conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
>> + }
>> +
>> + write_ct_md(pkt, state, zone, conn ? conn->mark : 0,
>> + conn ? conn->label : OVS_U128_ZERO);
>> +
>> + return conn;
>> +}
>> +
>> +/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
>> + * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
>> + * the l3 and l4 offsets properly set.
>> + *
>
>Whitespace.
Ok
>
>> + * If 'commit' is true, the packets are allowed to create new entries in the
>> + * connection tables. 'setmark', if not NULL, should point to a two-element
>> + * array containing a value and a mask to set the connection mark.
>> + * 'setlabel' behaves similarly for the connection label. */
>> +int
>> +conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
>> + bool commit, uint16_t zone, const uint32_t *setmark,
>> + const struct ovs_key_ct_labels *setlabel,
>> + const char *helper)
>> +{
>> + struct dp_packet **pkts = pkt_batch->packets;
>> + size_t cnt = pkt_batch->count;
>> +#if !defined(__CHECKER__) && !defined(_WIN32)
>> + const size_t KEY_ARRAY_SIZE = cnt;
>> +#else
>> + enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };
>> +#endif
>> + struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];
>> + int8_t bucket_list[CONNTRACK_BUCKETS];
>> + struct {
>> + unsigned bucket;
>> + unsigned long maps;
>> + } arr[KEY_ARRAY_SIZE];
>> + long long now = time_msec();
>> + size_t i = 0;
>> + uint8_t arrcnt = 0;
>> +
>> + BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);
>> +
>> + if (helper) {
>> + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
>> +
>> + VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
>> + /* Continue without the helper */
>> + }
>> +
>> + memset(bucket_list, INT8_C(-1), sizeof bucket_list);
>> + for (i = 0; i < cnt; i++) {
>> + unsigned bucket;
>> +
>> + if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {
>> + write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO);
>> + continue;
>> + }
>> +
>> + bucket = hash_to_bucket(ctxs[i].hash);
>> + if (bucket_list[bucket] == INT8_C(-1)) {
>> + bucket_list[bucket] = arrcnt;
>> +
>> + arr[arrcnt].maps = 0;
>> + ULLONG_SET1(arr[arrcnt].maps, i);
>> + arr[arrcnt++].bucket = bucket;
>> + } else {
>> + ULLONG_SET1(arr[bucket_list[bucket]].maps, i);
>> + }
>> + }
>> +
>> + for (i = 0; i < arrcnt; i++) {
>> + struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket];
>> + size_t j;
>> +
>> + ct_lock_lock(&ctb->lock);
>> +
>> + ULLONG_FOR_EACH_1(j, arr[i].maps) {
>> + struct conn *conn;
>> +
>> + conn_key_lookup(ctb, &ctxs[j], now);
>> +
>> + conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);
>> +
>> + if (conn && setmark) {
>> + set_mark(pkts[j], conn, setmark[0], setmark[1]);
>> + }
>> +
>> + if (conn && setlabel) {
>> + set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
>> + }
>> + }
>> + ct_lock_unlock(&ctb->lock);
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static void
>> +set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
>> +{
>> + pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
>> + conn->mark = pkt->md.ct_mark;
>> +}
>> +
>> +static void
>> +set_label(struct dp_packet *pkt, struct conn *conn,
>> + const struct ovs_key_ct_labels *val,
>> + const struct ovs_key_ct_labels *mask)
>> +{
>> + ovs_u128 v, m;
>> +
>> + memcpy(&v, val, sizeof v);
>> + memcpy(&m, mask, sizeof m);
>> +
>> + pkt->md.ct_label.u64.lo = v.u64.lo
>> + | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
>> + pkt->md.ct_label.u64.hi = v.u64.hi
>> + | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
>> + conn->label = pkt->md.ct_label;
>> +}
>> +
>> +/* Key extraction */
>> +
>> +/* The function stores a pointer to the first byte after the header in
>> + * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
>> + * not interested in the header's tail, meaning that the header has
>> + * already been parsed (e.g. by flow_extract): we take this as a hint to
>> + * save a few checks. If 'validate_checksum' is true, the function returns
>> + * false if the IPv4 checksum is invalid. */
>> +static inline bool
>> +extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
>> + const char **new_data, bool validate_checksum)
>> +{
>> + const struct ip_header *ip = data;
>> + size_t ip_len;
>> +
>> + if (new_data) {
>> + if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
>> + return false;
>> + }
>> + }
>> +
>> + ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
>> +
>> + if (new_data) {
>> + if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
>> + return false;
>> + }
>> + if (OVS_UNLIKELY(size < ip_len)) {
>> + return false;
>> + }
>> +
>> + *new_data = (char *) data + ip_len;
>> + }
>> +
>> + if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
>> + return false;
>> + }
>> +
>> + if (validate_checksum && csum(data, ip_len) != 0) {
>> + return false;
>> + }
>> +
>> + key->src.addr.ipv4 = ip->ip_src;
>> + key->dst.addr.ipv4 = ip->ip_dst;
>> + key->nw_proto = ip->ip_proto;
>> +
>> + return true;
>> +}
>> +
>> +/* The function stores a pointer to the first byte after the header in
>> + * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
>> + * not interested in the header's tail, meaning that the header has
>> + * already been parsed (e.g. by flow_extract): we take this as a hint to
>> + * save a few checks. */
>> +static inline bool
>> +extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
>> + const char **new_data)
>> +{
>> + const struct ovs_16aligned_ip6_hdr *ip6 = data;
>> + uint8_t nw_proto = ip6->ip6_nxt;
>> + uint8_t nw_frag = 0;
>> +
>> + if (new_data) {
>> + if (OVS_UNLIKELY(size < sizeof *ip6)) {
>> + return false;
>> + }
>> + }
>> +
>> + data = ip6 + 1;
>> + size -= sizeof *ip6;
>> +
>> + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
>> + return false;
>> + }
>> +
>> + if (new_data) {
>> + *new_data = data;
>> + }
>> +
>> + if (nw_frag) {
>> + return false;
>> + }
>> +
>> + key->src.addr.ipv6 = ip6->ip6_src;
>> + key->dst.addr.ipv6 = ip6->ip6_dst;
>> + key->nw_proto = nw_proto;
>> +
>> + return true;
>> +}
>> +
>> +static inline bool
>> +checksum_valid(const struct conn_key *key, const void *data, size_t size,
>> + const void *l3)
>> +{
>> + uint32_t csum = 0;
>> +
>> + if (key->dl_type == htons(ETH_TYPE_IP)) {
>> + csum = packet_csum_pseudoheader(l3);
>> + } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
>> + csum = packet_csum_pseudoheader6(l3);
>> + } else {
>> + return false;
>> + }
>> +
>> + csum = csum_continue(csum, data, size);
>> +
>> + return csum_finish(csum) == 0;
>> +}
>> +
>> +static inline bool
>> +check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
>> + const void *l3)
>> +{
>> + const struct tcp_header *tcp = data;
>> + size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
>> +
>> + if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
>> + return false;
>> + }
>> +
>> + return checksum_valid(key, data, size, l3);
>> +}
>> +
>> +static inline bool
>> +check_l4_udp(const struct conn_key *key, const void *data, size_t size,
>> + const void *l3)
>> +{
>> + const struct udp_header *udp = data;
>> + size_t udp_len = ntohs(udp->udp_len);
>> +
>> + if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
>> + return false;
>> + }
>> +
>> + /* Validation must be skipped if checksum is 0 on IPv4 packets */
>> + return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
>> + || checksum_valid(key, data, size, l3);
>> +}
>> +
>> +static inline bool
>> +check_l4_icmp(const void *data, size_t size)
>> +{
>> + return csum(data, size) == 0;
>> +}
>> +
>> +static inline bool
>> +check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
>> + const void *l3)
>> +{
>> + return checksum_valid(key, data, size, l3);
>> +}
>> +
>> +static inline bool
>> +extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
>> +{
>> + const struct tcp_header *tcp = data;
>> +
>> + if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
>> + return false;
>> + }
>> +
>> + key->src.port = tcp->tcp_src;
>> + key->dst.port = tcp->tcp_dst;
>> +
>> + /* Port 0 is invalid */
>> + return key->src.port && key->dst.port;
>> +}
>> +
>> +static inline bool
>> +extract_l4_udp(struct conn_key *key, const void *data, size_t size)
>> +{
>> + const struct udp_header *udp = data;
>> +
>> + if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
>> + return false;
>> + }
>> +
>> + key->src.port = udp->udp_src;
>> + key->dst.port = udp->udp_dst;
>> +
>> + /* Port 0 is invalid */
>> + return key->src.port && key->dst.port;
>> +}
>> +
>> +static inline bool extract_l4(struct conn_key *key, const void *data,
>> + size_t size, bool *related, const void *l3);
>> +
>> +/* If 'related' is not NULL and the function is processing an ICMP
>> + * error packet, extract the l3 and l4 fields from the nested header
>> + * instead and set *related to true. If 'related' is NULL we're
>> + * already processing a nested header and no such recursion is
>> + * possible */
>> +static inline int
>> +extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
>> + bool *related)
>> +{
>> + const struct icmp_header *icmp = data;
>> +
>> + if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
>> + return false;
>> + }
>> +
>> + switch (icmp->icmp_type) {
>> + case ICMP4_ECHO_REQUEST:
>> + case ICMP4_ECHO_REPLY:
>> + case ICMP4_TIMESTAMP:
>> + case ICMP4_TIMESTAMPREPLY:
>> + case ICMP4_INFOREQUEST:
>> + case ICMP4_INFOREPLY:
>> + /* Separate ICMP connection: identified using id */
>> + key->src.port = key->dst.port = icmp->icmp_fields.echo.id;
>> + break;
>> + case ICMP4_DST_UNREACH:
>> + case ICMP4_TIME_EXCEEDED:
>> + case ICMP4_PARAM_PROB:
>> + case ICMP4_SOURCEQUENCH:
>> + case ICMP4_REDIRECT: {
>> + /* ICMP packet part of another connection. We should
>> + * extract the key from embedded packet header */
>> + struct conn_key inner_key;
>> + const char *l3 = (const char *) (icmp + 1);
>> + const char *tail = (const char *) data + size;
>> + const char *l4;
>> + bool ok;
>> +
>> + if (!related) {
>> + return false;
>> + }
>> + *related = true;
>
>I don't think there's any bug, but I notice that some information is
>returned back like this, before all validation is done. Strictly
>speaking I suppose this should only be modified if we're about to
>return OK? (Also for IPv6)
Agreed; I moved the assignment so that it happens only just before returning
true, once all the checks have passed.
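I.e. something along these lines in extract_l4_icmp() (and the same for the
ICMPv6 variant); a sketch, not the exact final diff:

    if (!related) {
        return false;
    }

    /* ... length and address validation on the embedded packet ... */

    ok = extract_l4(key, l4, tail - l4, NULL, l3);
    if (ok) {
        conn_key_reverse(key);
        *related = true;    /* Set only once everything checked out. */
    }
    return ok;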
>
>> +
>> + memset(&inner_key, 0, sizeof inner_key);
>> + inner_key.dl_type = htons(ETH_TYPE_IP);
>> + ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
>> + if (!ok) {
>> + return false;
>> + }
>> +
>> + /* pf doesn't do this, but it seems a good idea */
>> + if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
>> + || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
>> + return false;
>> + }
>
>Interesting, I guess that in the case that a NAT exists between us and
>the host sending the ICMP response, the inner IPs should be translated
>along with the outer IPs. This check would filter out ICMP responses
>that have traversed a naive NAT that doesn't modify the ICMP payload.
>I think this makes sense.
>
>> + key->src = inner_key.src;
>> + key->dst = inner_key.dst;
>> + key->nw_proto = inner_key.nw_proto;
>> +
>> + ok = extract_l4(key, l4, tail - l4, NULL, l3);
>> + if (ok) {
>> + conn_key_reverse(key);
>> + }
>> + return ok;
>> + }
>> + default:
>> + return false;
>> + }
>> +
>> + return true;
>> +}
>> +
>> +/* If 'related' is not NULL and the function is processing an ICMP
>> + * error packet, extract the l3 and l4 fields from the nested header
>> + * instead and set *related to true. If 'related' is NULL we're
>> + * already processing a nested header and no such recursion is
>> + * possible */
>> +static inline bool
>> +extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
>> + bool *related)
>> +{
>> + const struct icmp6_header *icmp6 = data;
>> +
>> + /* All the messages that we support need at least 4 bytes after
>> + * the header */
>> + if (size < sizeof *icmp6 + 4) {
>> + return false;
>> + }
>> +
>> + switch (icmp6->icmp6_type) {
>> + case ICMP6_ECHO_REQUEST:
>> + case ICMP6_ECHO_REPLY:
>> + /* Separate ICMP connection: identified using id */
>> + key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1);
>> + break;
>> + case ICMP6_DST_UNREACH:
>> + case ICMP6_PACKET_TOO_BIG:
>> + case ICMP6_TIME_EXCEEDED:
>> + case ICMP6_PARAM_PROB: {
>> + /* ICMP packet part of another connection. We should
>> + * extract the key from embedded packet header */
>> + struct conn_key inner_key;
>> + const char *l3 = (const char *) icmp6 + 8;
>> + const char *tail = (const char *) data + size;
>> + const char *l4 = NULL;
>> + bool ok;
>> +
>> + if (!related) {
>> + return false;
>> + }
>> + *related = true;
>> +
>> + memset(&inner_key, 0, sizeof inner_key);
>> + inner_key.dl_type = htons(ETH_TYPE_IPV6);
>> + ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
>> + if (!ok) {
>> + return false;
>> + }
>> +
>> + /* pf doesn't do this, but it seems a good idea */
>> + if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
>> + &key->dst.addr.ipv6_aligned)
>> + || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
>> + &key->src.addr.ipv6_aligned)) {
>> + return false;
>> + }
>> +
>> + key->src = inner_key.src;
>> + key->dst = inner_key.dst;
>> + key->nw_proto = inner_key.nw_proto;
>> +
>> + ok = extract_l4(key, l4, tail - l4, NULL, l3);
>> + if (ok) {
>> + conn_key_reverse(key);
>> + }
>> + return ok;
>> + }
>> + default:
>> + return false;
>> + }
>> +
>> + return true;
>> +}
>> +
>> +/* Extract l4 fields into 'key', which must already contain valid l3
>> + * members.
>> + *
>> + * If 'related' is not NULL and an ICMP error packet is being
>> + * processed, the function will extract the key from the packet nested
>> + * in the ICMP paylod and set '*related' to true.
>> + *
>> + * If 'related' is NULL, it means that we're already parsing a header nested
>> + * in an ICMP error. In this case, we skip checksum and length validation. */
>> +static inline bool
>> +extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
>> + const void *l3)
>> +{
>> + if (key->nw_proto == IPPROTO_TCP) {
>> + return (!related || check_l4_tcp(key, data, size, l3))
>> + && extract_l4_tcp(key, data, size);
>> + } else if (key->nw_proto == IPPROTO_UDP) {
>> + return (!related || check_l4_udp(key, data, size, l3))
>> + && extract_l4_udp(key, data, size);
>> + } else if (key->dl_type == htons(ETH_TYPE_IP)
>> + && key->nw_proto == IPPROTO_ICMP) {
>> + return (!related || check_l4_icmp(data, size))
>> + && extract_l4_icmp(key, data, size, related);
>> + } else if (key->dl_type == htons(ETH_TYPE_IPV6)
>> + && key->nw_proto == IPPROTO_ICMPV6) {
>> + return (!related || check_l4_icmp6(key, data, size, l3))
>> + && extract_l4_icmp6(key, data, size, related);
>> + } else {
>> + return false;
>> + }
>> +}
>> +
>> +static bool
>> +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,
>> + struct conn_lookup_ctx *ctx, uint16_t zone)
>> +{
>> + const struct eth_header *l2 = dp_packet_l2(pkt);
>> + const struct ip_header *l3 = dp_packet_l3(pkt);
>> + const char *l4 = dp_packet_l4(pkt);
>> + const char *tail = dp_packet_tail(pkt);
>> + bool ok;
>> +
>> + memset(ctx, 0, sizeof *ctx);
>> +
>> + if (!l2 || !l3 || !l4) {
>> + return false;
>> + }
>> +
>> + ctx->key.zone = zone;
>> +
>> + /* XXX In this function we parse the packet (again, it has already
>> + * gone through miniflow_extract()) for two reasons:
>> + *
>> + * 1) To extract the l3 addresses and l4 ports.
>> + * We already have the l3 and l4 headers' pointers. Extracting
>> + * the l3 addresses and the l4 ports is really cheap, since they
>> + * can be found at fixed locations.
>> + * 2) To extract the l3 and l4 types.
>> + * Extracting the l3 and l4 types (especially the l3[1]) on the
>> + * other hand is quite expensive, because they're not at a
>> + * fixed location.
>> + *
>> + * Here's a way to avoid (2) with the help of the datapath.
>> + * The datapath doesn't keep the packet's extracted flow[2], so
>> + * using that is not an option. We could use the packet's matching
>> + * megaflow for l3 type (it's always unwildcarded), and for l4 type
>> + * (we have to unwildcard it first). This means either:
>> + *
>> + * a) dpif-netdev passes the matching megaflow to dp_execute_cb(), which
>> + * is used to extract the l3 type. Unfortunately, dp_execute_cb() is
>> + * used also in dpif_netdev_execute(), which doesn't have a matching
>> + * megaflow.
>> + *
>> + * b) We define an alternative OVS_ACTION_ATTR_CT, used only by the
>> + * userspace datapath, which includes l3 (and l4) type. The
>> + * alternative action could be generated by ofproto-dpif specifically
>> + * for the userspace datapath. Having a different interface for
>> + * userspace and kernel doesn't seem very clean, though.
>> + *
>> + * ---
>> + * [1] A simple benchmark (running only the connection tracker
>> + * over and over on the same packets) shows that if the
>> + * l3 type is already provided we are 15% faster (running the
>> + * connection tracker over a couple of DPDK devices with a
>> + * stream of UDP 64-bytes packets shows that we are 4% faster).
>> + *
>> + * [2] The reasons for this are that keeping the flow increases
>> + * (slightly) the cache footprint and increases computation
>> + * time as we move the packet around. Most importantly, the flow
>> + * should be updated by the actions and this can be slow, as
>> + * we use a sparse representation (miniflow).
>> + *
>> + */
>> + ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2);
>> + if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
>> + ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, true);
>> + } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
>> + ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
>> + } else {
>> + ok = false;
>> + }
>> +
>> + if (ok) {
>> + if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
>> + ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
>> + return true;
>> + }
>> + }
>> +
>> + return false;
>> +}
>> +
>> +/* Symmetric */
>> +static uint32_t
>> +conn_key_hash(const struct conn_key *key, uint32_t basis)
>> +{
>> + uint32_t hsrc, hdst, hash;
>> + int i;
>> +
>> + hsrc = hdst = basis;
>> +
>> + /* Hash the source and destination tuple */
>> + for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
>> + hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
>> + hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
>> + }
>> +
>> + /* Even if source and destination are swapped the hash will be the same. */
>> + hash = hsrc ^ hdst;
>> +
>> + /* Hash the rest of the key(L3 and L4 types and zone). */
>> + hash = hash_words((uint32_t *) &key->dst + 1,
>> + (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
>> + hash);
>
>Do you think it's likely that 'struct conn_key' will be rearranged or
>new elements added? Perhaps the "+ 1" could break in such a case. Is
>it worth documenting above the struct that some functions, such as
>conn_key_hash(), rely on the ordering of those elements?
Good point; I added a comment above struct conn_key.
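Something along these lines right above the struct (the exact wording may
differ):

    /* Changing the order of these fields, or adding new ones, requires
     * updating conn_key_hash(), which relies on the current layout to hash
     * everything after 'dst' as one block of uint32_t words. */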
>> +
>> + return hash;
>> +}
>> +
>> +static void
>> +conn_key_reverse(struct conn_key *key)
>> +{
>> + struct ct_endpoint tmp;
>> + tmp = key->src;
>> + key->src = key->dst;
>> + key->dst = tmp;
>> +}
>> +
>> +static void
>> +conn_key_lookup(struct conntrack_bucket *ctb,
>> + struct conn_lookup_ctx *ctx,
>> + long long now)
>> +{
>> + uint32_t hash = ctx->hash;
>> + struct conn *conn;
>> +
>> + ctx->conn = NULL;
>> +
>> + HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
>> + if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))
>> + && !conn_expired(conn, now)) {
>> + ctx->conn = conn;
>> + ctx->reply = false;
>> + break;
>> + }
>> + if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))
>> + && !conn_expired(conn, now)) {
>> + ctx->conn = conn;
>> + ctx->reply = true;
>> + break;
>> + }
>> + }
>> +}
>> +
>> +static enum ct_update_res
>> +conn_update(struct conn *conn, struct dp_packet *pkt, bool reply,
>> + long long now)
>> +{
>> + return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, reply, now);
>> +}
>> +
>> +static bool
>> +conn_expired(struct conn *conn, long long now)
>> +{
>> + return now >= conn->expiration;
>> +}
>> +
>> +static bool
>> +valid_new(struct dp_packet *pkt, struct conn_key *key)
>> +{
>> + return l4_protos[key->nw_proto]->valid_new(pkt);
>> +}
>> +
>> +static struct conn *
>> +new_conn(struct dp_packet *pkt, struct conn_key *key, long long now)
>> +{
>> + struct conn *newconn;
>> +
>> + newconn = l4_protos[key->nw_proto]->new_conn(pkt, now);
>> +
>> + if (newconn) {
>> + newconn->key = *key;
>> + }
>> +
>> + return newconn;
>> +}
>> +
>> +static void
>> +delete_conn(struct conn *conn)
>> +{
>> + free(conn);
>> +}
>> diff --git a/lib/conntrack.h b/lib/conntrack.h
>> new file mode 100644
>> index 0000000..54731bd
>> --- /dev/null
>> +++ b/lib/conntrack.h
>> @@ -0,0 +1,150 @@
>> +/*
>> + * Copyright (c) 2015, 2016 Nicira, Inc.
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at:
>> + *
>> + * http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +#ifndef CONNTRACK_H
>> +#define CONNTRACK_H 1
>> +
>> +#include <stdbool.h>
>> +
>> +#include "hmap.h"
>> +#include "odp-netlink.h"
>> +#include "openvswitch/thread.h"
>> +#include "openvswitch/types.h"
>> +#include "ovs-atomic.h"
>> +
>> +/* Userspace connection tracker
>> + * ============================
>> + *
>> + * This is a connection tracking module that keeps all the state in userspace.
>> + *
>> + * Usage
>> + * =====
>> + *
>> + * struct conntract ct;
>
>"conntrack", maybe? :-)
Maybe!
>
>> + *
>> + * Initialization:
>> + *
>> + * conntrack_init(&ct);
>> + *
>> + * It is necessary to periodically issue a call to
>> + *
>> + * conntrack_run(&ct);
>> + *
>> + * to allow the module to clean up expired connections.
>> + *
>> + * To send a group of packets through the connection tracker:
>> + *
>> + * conntrack_execute(&ct, pkts, n_pkts, ...);
>> + *
>> + * Thread-safety
>> + * =============
>> + *
>> + * conntrack_execute() can be called by multiple threads simultaneously.
>> + */
>> +
>> +struct dp_packet_batch;
>> +
>> +struct conntrack;
>> +
>> +void conntrack_init(struct conntrack *);
>> +void conntrack_run(struct conntrack *);
>> +void conntrack_destroy(struct conntrack *);
>> +
>> +int conntrack_execute(struct conntrack *, struct dp_packet_batch *,
>> + bool commit, uint16_t zone, const uint32_t *setmark,
>> + const struct ovs_key_ct_labels *setlabel,
>> + const char *helper);
>> +
>> +/* 'struct ct_lock' is a wrapper for an adaptive mutex. It's useful to try
>> + * different types of locks (e.g. spinlocks) */
>> +
>> +struct OVS_LOCKABLE ct_lock {
>> + struct ovs_mutex lock;
>> +};
>> +
>> +static inline void ct_lock_init(struct ct_lock *lock)
>> +{
>> + ovs_mutex_init_adaptive(&lock->lock);
>> +}
>> +
>> +static inline void ct_lock_lock(struct ct_lock *lock)
>> + OVS_ACQUIRES(lock)
>> + OVS_NO_THREAD_SAFETY_ANALYSIS
>
>Doesn't the latter contradict the former?
>
>If Clang understands our code, we should use OVS_ACQUIRES. If it can't
>figure it out and it's beyond reasonable changes for us to teach it,
>we should use NO_THREAD_SAFETY_ANALYSIS. Unless there's a specific
>meaning that Clang interprets from both of these?
>
>> +{
>> + ovs_mutex_lock(&lock->lock);
>> +}
>> +
>> +static inline void ct_lock_unlock(struct ct_lock *lock)
>> + OVS_RELEASES(lock)
>> + OVS_NO_THREAD_SAFETY_ANALYSIS
>
>ditto.
When implementing a locking API, we need to describe the external interface,
i.e. OVS_ACQUIRES(lock), and then ask clang not to look into the
implementation, i.e. OVS_NO_THREAD_SAFETY_ANALYSIS. This is how
ovs_mutex_lock_at() is implemented, the difference being that these are inline
functions, so both annotations must be provided on the function definition.
We could replace OVS_NO_THREAD_SAFETY_ANALYSIS with another
OVS_ACQUIRES(lock->lock), but that would defeat the purpose of struct ct_lock,
which is to eventually allow using a DPDK rte_spinlock instead of a mutex.
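For example, a hypothetical DPDK spinlock variant would keep the same
interface and annotations (sketch only, not part of this patch):

    #include <rte_spinlock.h>

    struct OVS_LOCKABLE ct_lock {
        rte_spinlock_t lock;
    };

    static inline void ct_lock_lock(struct ct_lock *lock)
        OVS_ACQUIRES(lock)
        OVS_NO_THREAD_SAFETY_ANALYSIS
    {
        rte_spinlock_lock(&lock->lock);
    }

    static inline void ct_lock_unlock(struct ct_lock *lock)
        OVS_RELEASES(lock)
        OVS_NO_THREAD_SAFETY_ANALYSIS
    {
        rte_spinlock_unlock(&lock->lock);
    }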
>
>> +{
>> + ovs_mutex_unlock(&lock->lock);
>> +}
>> +
>> +static inline void ct_lock_destroy(struct ct_lock *lock)
>> +{
>> + ovs_mutex_destroy(&lock->lock);
>> +}
>> +
>> +/* Timeouts: all the possible timeout states passed to update_expiration()
>> + * are listed here. The name will be prefixed by CT_TM_ and the value is in
>> + * milliseconds */
>> +#define CT_TIMEOUTS \
>> + CT_TIMEOUT(TCP_FIRST_PACKET, 30 * 1000) \
>> + CT_TIMEOUT(TCP_OPENING, 30 * 1000) \
>> + CT_TIMEOUT(TCP_ESTABLISHED, 24 * 60 * 60 * 1000) \
>> + CT_TIMEOUT(TCP_CLOSING, 15 * 60 * 1000) \
>> + CT_TIMEOUT(TCP_FIN_WAIT, 45 * 1000) \
>> + CT_TIMEOUT(TCP_CLOSED, 30 * 1000) \
>> + CT_TIMEOUT(OTHER_FIRST, 60 * 1000) \
>> + CT_TIMEOUT(OTHER_MULTIPLE, 60 * 1000) \
>> + CT_TIMEOUT(OTHER_BIDIR, 30 * 1000) \
>> +
>> +enum ct_timeout {
>> +#define CT_TIMEOUT(NAME, VALUE) CT_TM_##NAME,
>> + CT_TIMEOUTS
>> +#undef CT_TIMEOUT
>> + N_CT_TM
>> +};
>> +
>> +/* Locking:
>> + *
>> + * The connections are kept in different buckets, which are completely
>> + * independent. The connection bucket is determined by the hash of its key.
>> + * */
>> +struct conntrack_bucket {
>> + struct ct_lock lock;
>> + struct hmap connections OVS_GUARDED;
>> +};
>> +
>> +#define CONNTRACK_BUCKETS_SHIFT 8
>> +#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)
>> +
>> +struct conntrack {
>> + /* Independent buckets containing the connections */
>> + struct conntrack_bucket buckets[CONNTRACK_BUCKETS];
>> +
>> + /* Salt for hashing a connection key. */
>> + uint32_t hash_basis;
>> +
>> + /* Number of connections currently in the connection tracker. */
>> + atomic_count n_conn;
>> + /* Connections limit. When this limit is reached, no new connection
>> + * will be accepted. */
>> + atomic_uint n_conn_limit;
>> +};
>> +
>> +#endif /* conntrack.h */
>> diff --git a/lib/util.h b/lib/util.h
>> index 7be4a30..ad31e74 100644
>> --- a/lib/util.h
>> +++ b/lib/util.h
>> @@ -69,6 +69,15 @@ ovs_prefetch_range(const void *start, size_t size)
>> #define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
>> #endif
>>
>> +/* Comparisons for ints with modular arithmetic */
>> +#define INT_MOD_LT(a,b) ((int) ((a)-(b)) < 0)
>> +#define INT_MOD_LEQ(a,b) ((int) ((a)-(b)) <= 0)
>> +#define INT_MOD_GT(a,b) ((int) ((a)-(b)) > 0)
>> +#define INT_MOD_GEQ(a,b) ((int) ((a)-(b)) >= 0)
>> +
>> +#define INT_MOD_MIN(a, b) ((INT_MOD_LT(a, b)) ? (a) : (b))
>> +#define INT_MOD_MAX(a, b) ((INT_MOD_GT(a, b)) ? (a) : (b))
>> +
>> #define OVS_NOT_REACHED() abort()
>>
>> /* Use "%"PRIuSIZE to format size_t with printf(). */
>> --
>> 2.8.1
>>
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev