Hi Daniele, just a comment below. Apart from that, it looks good to me, thanks.
Acked-by: Antonio Fischetti <antonio.fische...@intel.com> > -----Original Message----- > From: dev [mailto:dev-boun...@openvswitch.org] On Behalf Of Daniele > Di Proietto > Sent: Tuesday, May 17, 2016 1:56 AM > To: dev@openvswitch.org > Subject: [ovs-dev] [PATCH v3 04/16] conntrack: New userspace > connection tracker. > > This commit adds the conntrack module. > > It is a connection tracker that resides entirely in userspace. Its > primary user will be the dpif-netdev datapath. > > The module main goal is to provide conntrack_execute(), which offers > a > convenient interface to implement the datapath ct() action. > > The conntrack module uses two submodules to deal with the l4 protocol > details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP). > > The conntrack-tcp submodule implementation is adapted from FreeBSD's > pf > subsystem, therefore it's BSD licensed. It has been slightly altered > to > match the OVS coding style and to allow the pickup of already > established connections. > > Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com> > --- > COPYING | 1 + > debian/copyright.in | 4 + > include/openvswitch/types.h | 4 + > lib/automake.mk | 5 + > lib/conntrack-other.c | 85 +++++ > lib/conntrack-private.h | 88 +++++ > lib/conntrack-tcp.c | 463 +++++++++++++++++++++++ > lib/conntrack.c | 883 > ++++++++++++++++++++++++++++++++++++++++++++ > lib/conntrack.h | 151 ++++++++ > lib/util.h | 9 + > 10 files changed, 1693 insertions(+) > create mode 100644 lib/conntrack-other.c > create mode 100644 lib/conntrack-private.h > create mode 100644 lib/conntrack-tcp.c > create mode 100644 lib/conntrack.c > create mode 100644 lib/conntrack.h > > diff --git a/COPYING b/COPYING > index 308e3ea..afb98b9 100644 > --- a/COPYING > +++ b/COPYING > @@ -25,6 +25,7 @@ License, version 2. > The following files are licensed under the 2-clause BSD license. 
> include/windows/getopt.h > lib/getopt_long.c > + lib/conntrack-tcp.c > > The following files are licensed under the 3-clause BSD-license > include/windows/netinet/icmp6.h > diff --git a/debian/copyright.in b/debian/copyright.in > index 57d007a..a15f4dd 100644 > --- a/debian/copyright.in > +++ b/debian/copyright.in > @@ -21,6 +21,9 @@ Upstream Copyright Holders: > Copyright (c) 2014 Michael Chapman > Copyright (c) 2014 WindRiver, Inc. > Copyright (c) 2014 Avaya, Inc. > + Copyright (c) 2001 Daniel Hartmeier > + Copyright (c) 2002 - 2008 Henning Brauer > + Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org> > > License: > > @@ -90,6 +93,7 @@ License: > lib/getopt_long.c > include/windows/getopt.h > datapath-windows/ovsext/Conntrack-tcp.c > + lib/conntrack-tcp.c > > * The following files are licensed under the 3-clause BSD-license > > diff --git a/include/openvswitch/types.h > b/include/openvswitch/types.h > index 5f3347d..d7e94a6 100644 > --- a/include/openvswitch/types.h > +++ b/include/openvswitch/types.h > @@ -107,6 +107,10 @@ static const ovs_u128 OVS_U128_MAX = { { > UINT32_MAX, UINT32_MAX, > UINT32_MAX, UINT32_MAX } }; > static const ovs_be128 OVS_BE128_MAX OVS_UNUSED = { { OVS_BE32_MAX, > OVS_BE32_MAX, > OVS_BE32_MAX, > OVS_BE32_MAX } }; > +static const ovs_u128 OVS_U128_MIN OVS_UNUSED = { {0, 0, 0, 0} }; > +static const ovs_u128 OVS_BE128_MIN OVS_UNUSED = { {0, 0, 0, 0} }; > + > +#define OVS_U128_ZERO OVS_U128_MIN > > /* A 64-bit value, in network byte order, that is only aligned on a > 32-bit > * boundary. 
*/ > diff --git a/lib/automake.mk b/lib/automake.mk > index affbb5c..df8b07d 100644 > --- a/lib/automake.mk > +++ b/lib/automake.mk > @@ -47,6 +47,11 @@ lib_libopenvswitch_la_SOURCES = \ > lib/compiler.h \ > lib/connectivity.c \ > lib/connectivity.h \ > + lib/conntrack-private.h \ > + lib/conntrack-tcp.c \ > + lib/conntrack-other.c \ > + lib/conntrack.c \ > + lib/conntrack.h \ > lib/coverage.c \ > lib/coverage.h \ > lib/crc32c.c \ > diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c > new file mode 100644 > index 0000000..295cb2c > --- /dev/null > +++ b/lib/conntrack-other.c > @@ -0,0 +1,85 @@ > +/* > + * Copyright (c) 2015, 2016 Nicira, Inc. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, > software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions > and > + * limitations under the License. 
> + */ > + > +#include <config.h> > + > +#include "conntrack-private.h" > +#include "dp-packet.h" > + > +enum other_state { > + OTHERS_FIRST, > + OTHERS_MULTIPLE, > + OTHERS_BIDIR, > +}; > + > +struct conn_other { > + struct conn up; > + enum other_state state; > +}; > + > +static const enum ct_timeout other_timeouts[] = { > + [OTHERS_FIRST] = CT_TM_OTHER_FIRST, > + [OTHERS_MULTIPLE] = CT_TM_OTHER_MULTIPLE, > + [OTHERS_BIDIR] = CT_TM_OTHER_BIDIR, > +}; > + > +static struct conn_other * > +conn_other_cast(const struct conn *conn) > +{ > + return CONTAINER_OF(conn, struct conn_other, up); > +} > + > +static enum ct_update_res > +other_conn_update(struct conn *conn_, struct dp_packet *pkt > OVS_UNUSED, > + bool reply, long long now) > +{ > + struct conn_other *conn = conn_other_cast(conn_); > + > + if (reply && conn->state != OTHERS_BIDIR) { > + conn->state = OTHERS_BIDIR; > + } else if (conn->state == OTHERS_FIRST) { > + conn->state = OTHERS_MULTIPLE; > + } > + > + update_expiration(conn_, other_timeouts[conn->state], now); > + > + return CT_UPDATE_VALID; > +} > + > +static bool > +other_valid_new(struct dp_packet *pkt OVS_UNUSED) > +{ > + return true; > +} > + > +static struct conn * > +other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now) > +{ > + struct conn_other *conn; > + > + conn = xzalloc(sizeof *conn); > + conn->state = OTHERS_FIRST; > + > + update_expiration(&conn->up, other_timeouts[conn->state], now); > + > + return &conn->up; > +} > + > +struct ct_l4_proto ct_proto_other = { > + .new_conn = other_new_conn, > + .valid_new = other_valid_new, > + .conn_update = other_conn_update, > +}; > diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h > new file mode 100644 > index 0000000..d3e0099 > --- /dev/null > +++ b/lib/conntrack-private.h > @@ -0,0 +1,88 @@ > +/* > + * Copyright (c) 2015, 2016 Nicira, Inc. 
> + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, > software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions > and > + * limitations under the License. > + */ > + > +#ifndef CONNTRACK_PRIVATE_H > +#define CONNTRACK_PRIVATE_H 1 > + > +#include <sys/types.h> > +#include <netinet/in.h> > +#include <netinet/ip6.h> > + > +#include "conntrack.h" > +#include "hmap.h" > +#include "openvswitch/list.h" > +#include "openvswitch/types.h" > +#include "packets.h" > +#include "unaligned.h" > + > +struct ct_addr { > + union { > + ovs_16aligned_be32 ipv4; > + union ovs_16aligned_in6_addr ipv6; > + ovs_be32 ipv4_aligned; > + struct in6_addr ipv6_aligned; > + }; > +}; > + > +struct ct_endpoint { > + struct ct_addr addr; > + ovs_be16 port; > +}; > + > +struct conn_key { > + struct ct_endpoint src; > + struct ct_endpoint dst; > + > + ovs_be16 dl_type; > + uint8_t nw_proto; > + uint16_t zone; > +}; > + > +struct conn { > + struct conn_key key; > + struct conn_key rev_key; > + long long expiration; > + struct ovs_list exp_node; > + struct hmap_node node; > + uint32_t mark; > + ovs_u128 label; > +}; > + > +enum ct_update_res { > + CT_UPDATE_INVALID, > + CT_UPDATE_VALID, > + CT_UPDATE_NEW, > +}; > + > +struct ct_l4_proto { > + struct conn *(*new_conn)(struct dp_packet *pkt, long long now); > + bool (*valid_new)(struct dp_packet *pkt); > + enum ct_update_res (*conn_update)(struct conn *conn, struct > dp_packet *pkt, > + bool reply, long long now); > +}; > + > +extern struct ct_l4_proto ct_proto_tcp; > +extern struct ct_l4_proto ct_proto_other; > + > 
+extern long long ct_timeout_val[]; > + > +static inline void > +update_expiration(struct conn *conn, enum ct_timeout tm, long long > now) > +{ > + conn->expiration = now + ct_timeout_val[tm]; > +} > + > +#endif /* conntrack-private.h */ > diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c > new file mode 100644 > index 0000000..4d3d9c3 > --- /dev/null > +++ b/lib/conntrack-tcp.c > @@ -0,0 +1,463 @@ > +/*- > + * Copyright (c) 2001 Daniel Hartmeier > + * Copyright (c) 2002 - 2008 Henning Brauer > + * Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org> > + * Copyright (c) 2015, 2016 Nicira, Inc. > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or > without > + * modification, are permitted provided that the following > conditions > + * are met: > + * > + * - Redistributions of source code must retain the above > copyright > + * notice, this list of conditions and the following > disclaimer. > + * - Redistributions in binary form must reproduce the above > + * copyright notice, this list of conditions and the following > + * disclaimer in the documentation and/or other materials > provided > + * with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND > CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS > + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE > + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, > INDIRECT, > + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES > (INCLUDING, > + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; > + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER > + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, > STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN > + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE > + * POSSIBILITY OF SUCH DAMAGE. > + * > + * Effort sponsored in part by the Defense Advanced Research > Projects > + * Agency (DARPA) and Air Force Research Laboratory, Air Force > + * Materiel Command, USAF, under agreement number F30602-01-2-0537. > + * > + * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ > + */ > + > +#include <config.h> > + > +#include "conntrack-private.h" > +#include "ct-dpif.h" > +#include "dp-packet.h" > +#include "util.h" > + > +struct tcp_peer { > + enum ct_dpif_tcp_state state; > + uint32_t seqlo; /* Max sequence number > sent */ > + uint32_t seqhi; /* Max the other end ACKd > + win */ > + uint16_t max_win; /* largest window (pre > scaling) */ > + uint8_t wscale; /* window scaling factor > */ > +}; > + > +struct conn_tcp { > + struct conn up; > + struct tcp_peer peer[2]; > +}; > + > +enum { > + TCPOPT_EOL, > + TCPOPT_NOP, > + TCPOPT_WINDOW = 3, > +}; > + > +/* TCP sequence numbers are 32 bit integers operated > + * on with modular arithmetic. These macros can be > + * used to compare such integers. 
*/ > +#define SEQ_LT(a,b) INT_MOD_LT(a, b) > +#define SEQ_LEQ(a,b) INT_MOD_LEQ(a, b) > +#define SEQ_GT(a,b) INT_MOD_GT(a, b) > +#define SEQ_GEQ(a,b) INT_MOD_GEQ(a, b) > + > +#define SEQ_MIN(a, b) INT_MOD_MIN(a, b) > +#define SEQ_MAX(a, b) INT_MOD_MAX(a, b) > + > +static struct conn_tcp* > +conn_tcp_cast(const struct conn* conn) > +{ > + return CONTAINER_OF(conn, struct conn_tcp, up); > +} > + > +/* pf does this in in pf_normalize_tcp(), and it is called only if > scrub > + * is enabled. We're not scrubbing, but this check seems > reasonable. */ > +static bool > +tcp_invalid_flags(uint16_t flags) > +{ > + > + if (flags & TCP_SYN) { > + if (flags & TCP_RST || flags & TCP_FIN) { > + return true; > + } > + } else { > + /* Illegal packet */ > + if (!(flags & (TCP_ACK|TCP_RST))) { > + return true; > + } > + } > + > + if (!(flags & TCP_ACK)) { > + /* These flags are only valid if ACK is set */ > + if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & > TCP_URG)) { > + return true; > + } > + } > + > + return false; > +} > + > +#define TCP_MAX_WSCALE 14 > +#define CT_WSCALE_FLAG 0x80 > +#define CT_WSCALE_UNKNOWN 0x40 > +#define CT_WSCALE_MASK 0xf > + > +static uint8_t > +tcp_get_wscale(const struct tcp_header *tcp) > +{ > + int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp; > + const uint8_t *opt = (const uint8_t *)(tcp + 1); > + uint8_t wscale = 0; > + uint8_t optlen; > + > + while (len >= 3) { > + switch (*opt) { > + case TCPOPT_EOL: > + return wscale; > + case TCPOPT_NOP: > + opt++; > + len--; > + break; > + case TCPOPT_WINDOW: > + wscale = MIN(opt[2], TCP_MAX_WSCALE); > + wscale |= CT_WSCALE_FLAG; > + /* fall through */ > + default: > + optlen = opt[1]; > + if (optlen < 2) { > + optlen = 2; > + } > + len -= optlen; > + opt += optlen; > + } > + } > + > + return wscale; > +} > + > +static uint32_t > +tcp_payload_length(struct dp_packet *pkt) > +{ > + return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt) > + - (char *) dp_packet_get_tcp_payload(pkt); > +} > 
+ > +static enum ct_update_res > +tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool > reply, > + long long now) > +{ > + struct conn_tcp *conn = conn_tcp_cast(conn_); > + struct tcp_header *tcp = dp_packet_l4(pkt); > + /* The peer that sent 'pkt' */ > + struct tcp_peer *src = &conn->peer[reply ? 1 : 0]; > + /* The peer that should receive 'pkt' */ > + struct tcp_peer *dst = &conn->peer[reply ? 0 : 1]; > + uint8_t sws = 0, dws = 0; > + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); > + > + uint16_t win = ntohs(tcp->tcp_winsz); > + uint32_t ack, end, seq, orig_seq; > + uint32_t p_len = tcp_payload_length(pkt); > + int ackskew; > + > + if (tcp_invalid_flags(tcp_flags)) { > + return CT_UPDATE_INVALID; > + } > + > + if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) && > + dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 && > + src->state >= CT_DPIF_TCPS_FIN_WAIT_2) { > + src->state = dst->state = CT_DPIF_TCPS_CLOSED; > + return CT_UPDATE_NEW; > + } > + > + if (src->wscale & CT_WSCALE_FLAG > + && dst->wscale & CT_WSCALE_FLAG > + && !(tcp_flags & TCP_SYN)) { > + > + sws = src->wscale & CT_WSCALE_MASK; > + dws = dst->wscale & CT_WSCALE_MASK; > + > + } else if (src->wscale & CT_WSCALE_UNKNOWN > + && dst->wscale & CT_WSCALE_UNKNOWN > + && !(tcp_flags & TCP_SYN)) { > + > + sws = TCP_MAX_WSCALE; > + dws = TCP_MAX_WSCALE; > + } > + > + /* > + * Sequence tracking algorithm from Guido van Rooij's paper: > + * http://www.madison-gurkha.com/publications/tcp_filtering/ > + * tcp_filtering.ps > + */ > + > + orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq)); > + if (src->state < CT_DPIF_TCPS_SYN_SENT) { > + /* First packet from this end. 
Set its state */ > + > + ack = ntohl(get_16aligned_be32(&tcp->tcp_ack)); > + > + end = seq + p_len; > + if (tcp_flags & TCP_SYN) { > + end++; > + if (dst->wscale & CT_WSCALE_FLAG) { > + src->wscale = tcp_get_wscale(tcp); > + if (src->wscale & CT_WSCALE_FLAG) { > + /* Remove scale factor from initial window */ > + sws = src->wscale & CT_WSCALE_MASK; > + win = DIV_ROUND_UP((uint32_t) win, 1 << sws); > + dws = dst->wscale & CT_WSCALE_MASK; > + } else { > + /* fixup other window */ > + dst->max_win <<= dst->wscale & > + CT_WSCALE_MASK; > + /* in case of a retrans SYN|ACK */ > + dst->wscale = 0; > + } > + } > + } > + if (tcp_flags & TCP_FIN) { > + end++; > + } > + > + src->seqlo = seq; > + src->state = CT_DPIF_TCPS_SYN_SENT; > + /* > + * May need to slide the window (seqhi may have been set by > + * the crappy stack check or if we picked up the connection > + * after establishment) > + */ > + if (src->seqhi == 1 || > + SEQ_GEQ(end + MAX(1, dst->max_win << dws), src- > >seqhi)) { > + src->seqhi = end + MAX(1, dst->max_win << dws); > + } > + if (win > src->max_win) { > + src->max_win = win; > + } > + > + } else { > + ack = ntohl(get_16aligned_be32(&tcp->tcp_ack)); > + end = seq + p_len; > + if (tcp_flags & TCP_SYN) { > + end++; > + } > + if (tcp_flags & TCP_FIN) { > + end++; > + } > + } > + > + if ((tcp_flags & TCP_ACK) == 0) { > + /* Let it pass through the ack skew check */ > + ack = dst->seqlo; > + } else if ((ack == 0 > + && (tcp_flags & (TCP_ACK|TCP_RST)) == > (TCP_ACK|TCP_RST)) > + /* broken tcp stacks do not set ack */) { > + /* Many stacks (ours included) will set the ACK number in an > + * FIN|ACK if the SYN times out -- no sequence to ACK. 
*/ > + ack = dst->seqlo; > + } > + > + if (seq == end) { > + /* Ease sequencing restrictions on no data packets */ > + seq = src->seqlo; > + end = seq; > + } > + > + ackskew = dst->seqlo - ack; > +#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary > fudge factor */ > + if (SEQ_GEQ(src->seqhi, end) > + /* Last octet inside other's window space */ > + && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) > + /* Retrans: not more than one window back */ > + && (ackskew >= -MAXACKWINDOW) > + /* Acking not more than one reassembled fragment backwards > */ > + && (ackskew <= (MAXACKWINDOW << sws)) > + /* Acking not more than one window forward */ > + && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo > + || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src- > >seqlo))) { > + /* Require an exact/+1 sequence match on resets when > possible */ > + > + /* update max window */ > + if (src->max_win < win) { > + src->max_win = win; > + } > + /* synchronize sequencing */ > + if (SEQ_GT(end, src->seqlo)) { > + src->seqlo = end; > + } > + /* slide the window of what the other end can send */ > + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { > + dst->seqhi = ack + MAX((win << sws), 1); > + } > + > + /* update states */ > + if (tcp_flags & TCP_SYN && src->state < > CT_DPIF_TCPS_SYN_SENT) { > + src->state = CT_DPIF_TCPS_SYN_SENT; > + } > + if (tcp_flags & TCP_FIN && src->state < > CT_DPIF_TCPS_CLOSING) { > + src->state = CT_DPIF_TCPS_CLOSING; > + } > + if (tcp_flags & TCP_ACK) { > + if (dst->state == CT_DPIF_TCPS_SYN_SENT) { > + dst->state = CT_DPIF_TCPS_ESTABLISHED; > + } else if (dst->state == CT_DPIF_TCPS_CLOSING) { > + dst->state = CT_DPIF_TCPS_FIN_WAIT_2; > + } > + } > + if (tcp_flags & TCP_RST) { > + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; > + } > + > + if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2 > + && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) { > + update_expiration(conn_, CT_TM_TCP_CLOSED, now); > + } else if (src->state >= CT_DPIF_TCPS_CLOSING > + && 
dst->state >= CT_DPIF_TCPS_CLOSING) { > + update_expiration(conn_, CT_TM_TCP_FIN_WAIT, now); > + } else if (src->state < CT_DPIF_TCPS_ESTABLISHED > + || dst->state < CT_DPIF_TCPS_ESTABLISHED) { > + update_expiration(conn_, now, CT_TM_TCP_OPENING); > + } else if (src->state >= CT_DPIF_TCPS_CLOSING > + || dst->state >= CT_DPIF_TCPS_CLOSING) { > + update_expiration(conn_, now, CT_TM_TCP_CLOSING); > + } else { > + update_expiration(conn_, now, CT_TM_TCP_ESTABLISHED); > + } > + } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT > + || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 > + || src->state >= CT_DPIF_TCPS_FIN_WAIT_2) > + && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) > + /* Within a window forward of the originating packet > */ > + && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { > + /* Within a window backward of the originating packet > */ > + > + /* > + * This currently handles three situations: > + * 1) Stupid stacks will shotgun SYNs before their peer > + * replies. > + * 2) When PF catches an already established stream (the > + * firewall rebooted, the state table was flushed, > routes > + * changed...) > + * 3) Packets get funky immediately after the connection > + * closes (this should catch Solaris spurious ACK|FINs > + * that web servers like to spew after a close) > + * > + * This must be a little more careful than the above code > + * since packet floods will also be caught here. We don't > + * update the TTL here to mitigate the damage of a packet > + * flood and so the same code can handle awkward > establishment > + * and a loosened connection close. > + * In the establishment case, a correct peer response will > + * validate the connection, go through the normal state code > + * and keep updating the state TTL. 
> + */ > + > + /* update max window */ > + if (src->max_win < win) { > + src->max_win = win; > + } > + /* synchronize sequencing */ > + if (SEQ_GT(end, src->seqlo)) { > + src->seqlo = end; > + } > + /* slide the window of what the other end can send */ > + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { > + dst->seqhi = ack + MAX((win << sws), 1); > + } > + > + /* > + * Cannot set dst->seqhi here since this could be a > shotgunned > + * SYN and not an already established connection. > + */ > + > + if (tcp_flags & TCP_FIN && src->state < > CT_DPIF_TCPS_CLOSING) { > + src->state = CT_DPIF_TCPS_CLOSING; > + } > + > + if (tcp_flags & TCP_RST) { > + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; > + } > + } else { > + return CT_UPDATE_INVALID; > + } > + > + return CT_UPDATE_VALID; > +} > + > +static bool > +tcp_valid_new(struct dp_packet *pkt) > +{ > + struct tcp_header *tcp = dp_packet_l4(pkt); > + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); > + > + if (tcp_invalid_flags(tcp_flags)) { > + return false; > + } > + > + /* A syn+ack is not allowed to create a connection. We want to > allow > + * totally new connections (syn) or already established, not > partially > + * open (syn+ack). 
*/ > + if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) { > + return false; > + } > + > + return true; > +} > + > +static struct conn * > +tcp_new_conn(struct dp_packet *pkt, long long now) > +{ > + struct conn_tcp* newconn = NULL; > + struct tcp_header *tcp = dp_packet_l4(pkt); > + struct tcp_peer *src, *dst; > + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); > + > + newconn = xzalloc(sizeof *newconn); > + > + src = &newconn->peer[0]; > + dst = &newconn->peer[1]; > + > + src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq)); > + src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1; > + > + if (tcp_flags & TCP_SYN) { > + src->seqhi++; > + src->wscale = tcp_get_wscale(tcp); > + } else { > + src->wscale = CT_WSCALE_UNKNOWN; > + dst->wscale = CT_WSCALE_UNKNOWN; > + } > + src->max_win = MAX(ntohs(tcp->tcp_winsz), 1); > + if (src->wscale & CT_WSCALE_MASK) { > + /* Remove scale factor from initial window */ > + uint8_t sws = src->wscale & CT_WSCALE_MASK; > + src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << > sws); > + } > + if (tcp_flags & TCP_FIN) { > + src->seqhi++; > + } > + dst->seqhi = 1; > + dst->max_win = 1; > + src->state = CT_DPIF_TCPS_SYN_SENT; > + dst->state = CT_DPIF_TCPS_CLOSED; > + > + update_expiration(&newconn->up, now, CT_TM_TCP_FIRST_PACKET); > + > + return &newconn->up; > +} > + > +struct ct_l4_proto ct_proto_tcp = { > + .new_conn = tcp_new_conn, > + .valid_new = tcp_valid_new, > + .conn_update = tcp_conn_update, > +}; > diff --git a/lib/conntrack.c b/lib/conntrack.c > new file mode 100644 > index 0000000..e282485 > --- /dev/null > +++ b/lib/conntrack.c > @@ -0,0 +1,883 @@ > +/* > + * Copyright (c) 2015, 2016 Nicira, Inc. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. 
> + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, > software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions > and > + * limitations under the License. > + */ > + > +#include <config.h> > +#include "conntrack.h" > + > +#include <errno.h> > +#include <sys/types.h> > +#include <netinet/in.h> > +#include <netinet/icmp6.h> > + > +#include "bitmap.h" > +#include "conntrack-private.h" > +#include "coverage.h" > +#include "csum.h" > +#include "dp-packet.h" > +#include "flow.h" > +#include "hmap.h" > +#include "netdev.h" > +#include "odp-netlink.h" > +#include "openvswitch/vlog.h" > +#include "ovs-rcu.h" > +#include "random.h" > +#include "timeval.h" > + > +VLOG_DEFINE_THIS_MODULE(conntrack); > + > +COVERAGE_DEFINE(conntrack_new_full); > + > +struct conn_lookup_ctx { > + struct conn_key key; > + struct conn *conn; > + uint32_t hash; > + bool reply; > + bool related; > +}; > + > +static bool conn_key_extract(struct conntrack *, struct dp_packet *, > + struct conn_lookup_ctx *, uint16_t > zone); > +static uint32_t conn_key_hash(const struct conn_key *, uint32_t > basis); > +static void conn_key_reverse(struct conn_key *); > +static void conn_key_lookup(struct conntrack_bucket *ctb, > + struct conn_lookup_ctx *ctx, > + long long now); > +static bool valid_new(struct dp_packet *pkt, struct conn_key *); > +static struct conn *new_conn(struct dp_packet *pkt, struct conn_key > *, > + long long now); > +static void delete_conn(struct conn *); > +static enum ct_update_res conn_update(struct conn *, struct > dp_packet*, > + bool reply, long long now); > +static bool conn_expired(struct conn *, long long now); > +static void set_mark(struct dp_packet *, struct conn *, > + uint32_t val, 
uint32_t mask); > +static void set_label(struct dp_packet *, struct conn *, > + const struct ovs_key_ct_labels *val, > + const struct ovs_key_ct_labels *mask); > + > +static struct ct_l4_proto *l4_protos[] = { > + [IPPROTO_TCP] = &ct_proto_tcp, > + [IPPROTO_UDP] = &ct_proto_other, > + [IPPROTO_ICMP] = &ct_proto_other, > + [IPPROTO_ICMPV6] = &ct_proto_other, > +}; > + > +long long ct_timeout_val[] = { > +#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL, > + CT_TIMEOUTS > +#undef CT_TIMEOUT > +}; > + > +/* If the total number of connections goes above this value, no new > connections > + * are accepted */ > +#define DEFAULT_N_CONN_LIMIT 3000000 > + > +/* Initializes the connection tracker 'ct'. The caller is > responbile for > + * calling 'conntrack_destroy()', when the instance is not needed > anymore */ > +void > +conntrack_init(struct conntrack *ct) > +{ > + unsigned i; > + > + for (i = 0; i < CONNTRACK_BUCKETS; i++) { > + struct conntrack_bucket *ctb = &ct->buckets[i]; > + > + ct_lock_init(&ctb->lock); > + ct_lock_lock(&ctb->lock); > + hmap_init(&ctb->connections); > + ct_lock_unlock(&ctb->lock); > + } > + ct->hash_basis = random_uint32(); > + atomic_count_init(&ct->n_conn, 0); > + atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT); > +} > + > +/* Destroys the connection tracker 'ct' and frees all the allocated > memory. */ > +void > +conntrack_destroy(struct conntrack *ct) > +{ > + unsigned i; > + > + for (i = 0; i < CONNTRACK_BUCKETS; i++) { > + struct conntrack_bucket *ctb = &ct->buckets[i]; > + struct conn *conn; > + > + ct_lock_lock(&ctb->lock); > + HMAP_FOR_EACH_POP(conn, node, &ctb->connections) { > + atomic_count_dec(&ct->n_conn); > + delete_conn(conn); > + } > + hmap_destroy(&ctb->connections); > + ct_lock_unlock(&ctb->lock); > + ct_lock_destroy(&ctb->lock); > + } > +} > + > +static unsigned hash_to_bucket(uint32_t hash) > +{ > + /* Extracts the most significant bits in hash. 
The least > significant bits > + * are already used internally by the hmap implementation. */ > + BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && > CONNTRACK_BUCKETS_SHIFT >= 1); > + > + return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % > CONNTRACK_BUCKETS; > +} > + > +static void > +write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone, > + uint32_t mark, ovs_u128 label) > +{ > + pkt->md.ct_state = state | CS_TRACKED; > + pkt->md.ct_zone = zone; > + pkt->md.ct_mark = mark; > + pkt->md.ct_label = label; > +} > + > +static struct conn * > +conn_not_found(struct conntrack *ct, struct dp_packet *pkt, > + struct conn_lookup_ctx *ctx, uint16_t *state, bool > commit, > + long long now) > +{ > + unsigned bucket = hash_to_bucket(ctx->hash); > + struct conn *nc = NULL; > + > + if (!valid_new(pkt, &ctx->key)) { > + *state |= CS_INVALID; > + return nc; > + } > + > + *state |= CS_NEW; > + > + if (commit) { > + unsigned int n_conn_limit; > + > + atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit); > + > + if (atomic_count_get(&ct->n_conn) >= n_conn_limit) { > + COVERAGE_INC(conntrack_new_full); > + return nc; > + } > + > + nc = new_conn(pkt, &ctx->key, now); > + > + memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key); > + > + conn_key_reverse(&nc->rev_key); > + hmap_insert(&ct->buckets[bucket].connections, &nc->node, > ctx->hash); > + atomic_count_inc(&ct->n_conn); > + } > + > + return nc; > +} > + > +static struct conn * > +process_one(struct conntrack *ct, struct dp_packet *pkt, > + struct conn_lookup_ctx *ctx, uint16_t zone, > + bool commit, long long now) > +{ > + unsigned bucket = hash_to_bucket(ctx->hash); > + struct conn *conn = ctx->conn; > + uint16_t state = 0; > + > + if (conn) { > + if (ctx->related) { > + state |= CS_RELATED; > + if (ctx->reply) { > + state |= CS_REPLY_DIR; > + } > + } else { > + enum ct_update_res res; > + > + res = conn_update(conn, pkt, ctx->reply, now); > + > + switch (res) { > + case CT_UPDATE_VALID: > + state |= CS_ESTABLISHED; > + 
if (ctx->reply) { > + state |= CS_REPLY_DIR; > + } > + break; > + case CT_UPDATE_INVALID: > + state |= CS_INVALID; > + break; > + case CT_UPDATE_NEW: > + hmap_remove(&ct->buckets[bucket].connections, &conn- > >node); > + atomic_count_dec(&ct->n_conn); > + delete_conn(conn); > + conn = conn_not_found(ct, pkt, ctx, &state, commit, > now); > + break; > + } [Antonio F] Sorry to repeat, but I'd prefer to add the 'default' case here. I mean something like: default: state |= CS_INVALID; break; I know that if we add new items to enum ct_update_res we can get a warning from the compiler, but I wouldn't rely on that. > + } > + } else { > + conn = conn_not_found(ct, pkt, ctx, &state, commit, now); > + } > + > + write_ct_md(pkt, state, zone, conn ? conn->mark : 0, > + conn ? conn->label : OVS_U128_ZERO); > + > + return conn; > +} > + > +/* Sends a group of 'cnt' packets ('pkts') through the connection > tracker > + * 'ct'. If 'commit' is true, the packets are allowed to create new > entries > + * in the connection tables. 'setmark', if not NULL, should point > to a two > + * elements array containing a value and a mask to set the > connection mark. 
> + * 'setlabel' behaves similarly for the connection label.*/ > +int > +conntrack_execute(struct conntrack *ct, struct dp_packet **pkts, > size_t cnt, > + bool commit, uint16_t zone, const uint32_t > *setmark, > + const struct ovs_key_ct_labels *setlabel, > + const char *helper) > +{ > +#if !defined(__CHECKER__) && !defined(_WIN32) > + const size_t KEY_ARRAY_SIZE = cnt; > +#else > + enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST }; > +#endif > + struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE]; > + int8_t bucket_list[CONNTRACK_BUCKETS]; > + struct { > + unsigned bucket; > + unsigned long maps; > + } arr[KEY_ARRAY_SIZE]; > + long long now = time_msec(); > + size_t i = 0; > + uint8_t arrcnt = 0; > + > + BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= > NETDEV_MAX_BURST); > + > + if (helper) { > + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, > 5); > + > + VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", > helper); > + /* Continue without the helper */ > + } > + > + memset(bucket_list, INT8_C(-1), sizeof bucket_list); > + for (i = 0; i < cnt; i++) { > + unsigned bucket; > + > + if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) { > + write_ct_md(pkts[i], CS_INVALID, zone, 0, > OVS_U128_ZERO); > + continue; > + } > + > + bucket = hash_to_bucket(ctxs[i].hash); > + if (bucket_list[bucket] == INT8_C(-1)) { > + bucket_list[bucket] = arrcnt; > + > + arr[arrcnt].maps = 0; > + ULLONG_SET1(arr[arrcnt].maps, i); > + arr[arrcnt++].bucket = bucket; > + } else { > + ULLONG_SET1(arr[bucket_list[bucket]].maps, i); > + arr[bucket_list[bucket]].maps |= 1UL << i; > + } > + } > + > + for (i = 0; i < arrcnt; i++) { > + struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket]; > + size_t j; > + > + ct_lock_lock(&ctb->lock); > + > + ULLONG_FOR_EACH_1(j, arr[i].maps) { > + struct conn *conn; > + > + conn_key_lookup(ctb, &ctxs[j], now); > + > + conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, > now); > + > + if (conn && setmark) { > + set_mark(pkts[j], conn, setmark[0], 
setmark[1]); > + } > + > + if (conn && setlabel) { > + set_label(pkts[j], conn, &setlabel[0], > &setlabel[1]); > + } > + } > + ct_lock_unlock(&ctb->lock); > + } > + > + return 0; > +} > + > +static void > +set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, > uint32_t mask) > +{ > + pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask)); > + conn->mark = pkt->md.ct_mark; > +} > + > +static void > +set_label(struct dp_packet *pkt, struct conn *conn, > + const struct ovs_key_ct_labels *val, > + const struct ovs_key_ct_labels *mask) > +{ > + ovs_u128 v, m; > + > + memcpy(&v, val, sizeof v); > + memcpy(&m, mask, sizeof m); > + > + pkt->md.ct_label.u64.lo = v.u64.lo > + | (pkt->md.ct_label.u64.lo & > ~(m.u64.lo)); > + pkt->md.ct_label.u64.hi = v.u64.hi > + | (pkt->md.ct_label.u64.hi & > ~(m.u64.hi)); > + conn->label = pkt->md.ct_label; > +} > + > +/* Key extraction */ > + > +/* The function stores a pointer to the first byte after the header > in > + * '*new_data', if 'new_data' is not NULL. If it is NULL, the > caller is > + * not interested in the header's tail, meaning that the header has > + * already been parsed (e.g. by flow_extract): we take this as a > hint to > + * save a few checks. If 'validate_checksum' is true, the function > returns > + * false if the IPv4 checksum is invalid. 
*/ > +static inline bool > +extract_l3_ipv4(struct conn_key *key, const void *data, size_t size, > + const char **new_data, bool validate_checksum) > +{ > + const struct ip_header *ip = data; > + size_t ip_len; > + > + if (new_data) { > + if (OVS_UNLIKELY(size < IP_HEADER_LEN)) { > + return false; > + } > + } > + > + ip_len = IP_IHL(ip->ip_ihl_ver) * 4; > + > + if (new_data) { > + if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) { > + return false; > + } > + if (OVS_UNLIKELY(size < ip_len)) { > + return false; > + } > + > + *new_data = (char *) data + ip_len; > + } > + > + if (IP_IS_FRAGMENT(ip->ip_frag_off)) { > + return false; > + } > + > + if (validate_checksum && csum(data, ip_len) != 0) { > + return false; > + } > + > + key->src.addr.ipv4 = ip->ip_src; > + key->dst.addr.ipv4 = ip->ip_dst; > + key->nw_proto = ip->ip_proto; > + > + return true; > +} > + > +/* The function stores a pointer to the first byte after the header > in > + * '*new_data', if 'new_data' is not NULL. If it is NULL, the > caller is > + * not interested in the header's tail, meaning that the header has > + * already been parsed (e.g. by flow_extract): we take this as a > hint to > + * save a few checks. 
*/ > +static inline bool > +extract_l3_ipv6(struct conn_key *key, const void *data, size_t size, > + const char **new_data) > +{ > + const struct ovs_16aligned_ip6_hdr *ip6 = data; > + uint8_t nw_proto = ip6->ip6_nxt; > + uint8_t nw_frag = 0; > + > + if (new_data) { > + if (OVS_UNLIKELY(size < sizeof *ip6)) { > + return false; > + } > + } > + > + data = ip6 + 1; > + size -= sizeof *ip6; > + > + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) { > + return false; > + } > + > + if (new_data) { > + *new_data = data; > + } > + > + if (nw_frag) { > + return false; > + } > + > + key->src.addr.ipv6 = ip6->ip6_src; > + key->dst.addr.ipv6 = ip6->ip6_dst; > + key->nw_proto = nw_proto; > + > + return true; > +} > + > +static inline bool > +checksum_valid(const struct conn_key *key, const void *data, size_t > size, > + const void *l3) > +{ > + uint32_t csum = 0; > + > + if (key->dl_type == htons(ETH_TYPE_IP)) { > + csum = packet_csum_pseudoheader(l3); > + } else if (key->dl_type == htons(ETH_TYPE_IPV6)) { > + csum = packet_csum_pseudoheader6(l3); > + } else { > + return false; > + } > + > + csum = csum_continue(csum, data, size); > + > + return csum_finish(csum) == 0; > +} > + > +static inline bool > +check_l4_tcp(const struct conn_key *key, const void *data, size_t > size, > + const void *l3) > +{ > + const struct tcp_header *tcp = data; > + size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4; > + > + if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) { > + return false; > + } > + > + return checksum_valid(key, data, size, l3); > +} > + > +static inline bool > +check_l4_udp(const struct conn_key *key, const void *data, size_t > size, > + const void *l3) > +{ > + const struct udp_header *udp = data; > + size_t udp_len = ntohs(udp->udp_len); > + > + if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) { > + return false; > + } > + > + /* Validation must be skipped if checksum is 0 on IPv4 packets > */ > + return (udp->udp_csum == 0 && key->dl_type == > 
htons(ETH_TYPE_IP)) > + || checksum_valid(key, data, size, l3); > +} > + > +static inline bool > +check_l4_icmp(const void *data, size_t size) > +{ > + return csum(data, size) == 0; > +} > + > +static inline bool > +check_l4_icmp6(const struct conn_key *key, const void *data, size_t > size, > + const void *l3) > +{ > + return checksum_valid(key, data, size, l3); > +} > + > +static inline bool > +extract_l4_tcp(struct conn_key *key, const void *data, size_t size) > +{ > + const struct tcp_header *tcp = data; > + > + if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) { > + return false; > + } > + > + key->src.port = tcp->tcp_src; > + key->dst.port = tcp->tcp_dst; > + > + /* Port 0 is invalid */ > + return key->src.port && key->dst.port; > +} > + > +static inline bool > +extract_l4_udp(struct conn_key *key, const void *data, size_t size) > +{ > + const struct udp_header *udp = data; > + > + if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) { > + return false; > + } > + > + key->src.port = udp->udp_src; > + key->dst.port = udp->udp_dst; > + > + /* Port 0 is invalid */ > + return key->src.port && key->dst.port; > +} > + > +static inline bool extract_l4(struct conn_key *key, const void > *data, > + size_t size, bool *related, const void > *l3); > + > +/* If 'related' is not NULL and the function is processing an ICMP > + * error packet, extract the l3 and l4 fields from the nested header > + * instead and set *related to true. 
If 'related' is NULL we're > + * already processing a nested header and no such recursion is > + * possible */ > +static inline int > +extract_l4_icmp(struct conn_key *key, const void *data, size_t size, > + bool *related) > +{ > + const struct icmp_header *icmp = data; > + > + if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) { > + return false; > + } > + > + switch (icmp->icmp_type) { > + case ICMP4_ECHO_REQUEST: > + case ICMP4_ECHO_REPLY: > + case ICMP4_TIMESTAMP: > + case ICMP4_TIMESTAMPREPLY: > + case ICMP4_INFOREQUEST: > + case ICMP4_INFOREPLY: > + /* Separate ICMP connection: identified using id */ > + key->src.port = key->dst.port = icmp->icmp_fields.echo.id; > + break; > + case ICMP4_DST_UNREACH: > + case ICMP4_TIME_EXCEEDED: > + case ICMP4_PARAM_PROB: > + case ICMP4_SOURCEQUENCH: > + case ICMP4_REDIRECT: { > + /* ICMP packet part of another connection. We should > + * extract the key from embedded packet header */ > + struct conn_key inner_key; > + const char *l3 = (const char *) (icmp + 1); > + const char *tail = (const char *) data + size; > + const char *l4; > + bool ok; > + > + if (!related) { > + return false; > + } > + *related = true; > + > + memset(&inner_key, 0, sizeof inner_key); > + inner_key.dl_type = htons(ETH_TYPE_IP); > + ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false); > + if (!ok) { > + return false; > + } > + > + /* pf doesn't do this, but it seems a good idea */ > + if (inner_key.src.addr.ipv4_aligned != key- > >dst.addr.ipv4_aligned > + || inner_key.dst.addr.ipv4_aligned != key- > >src.addr.ipv4_aligned) { > + return false; > + } > + > + key->src = inner_key.src; > + key->dst = inner_key.dst; > + key->nw_proto = inner_key.nw_proto; > + > + ok = extract_l4(key, l4, tail - l4, NULL, l3); > + if (ok) { > + conn_key_reverse(key); > + } > + return ok; > + } > + default: > + return false; > + } > + > + return true; > +} > + > +/* If 'related' is not NULL and the function is processing an ICMP > + * error packet, extract the l3 and l4 
fields from the nested header > + * instead and set *related to true. If 'related' is NULL we're > + * already processing a nested header and no such recursion is > + * possible */ > +static inline bool > +extract_l4_icmp6(struct conn_key *key, const void *data, size_t > size, > + bool *related) > +{ > + const struct icmp6_header *icmp6 = data; > + > + /* All the messages that we support need at least 4 bytes after > + * the header */ > + if (size < sizeof *icmp6 + 4) { > + return false; > + } > + > + switch (icmp6->icmp6_type) { > + case ICMP6_ECHO_REQUEST: > + case ICMP6_ECHO_REPLY: > + /* Separate ICMP connection: identified using id */ > + key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1); > + break; > + case ICMP6_DST_UNREACH: > + case ICMP6_PACKET_TOO_BIG: > + case ICMP6_TIME_EXCEEDED: > + case ICMP6_PARAM_PROB: { > + /* ICMP packet part of another connection. We should > + * extract the key from embedded packet header */ > + struct conn_key inner_key; > + const char *l3 = (const char *) icmp6 + 8; > + const char *tail = (const char *) data + size; > + const char *l4 = NULL; > + bool ok; > + > + if (!related) { > + return false; > + } > + *related = true; > + > + memset(&inner_key, 0, sizeof inner_key); > + inner_key.dl_type = htons(ETH_TYPE_IPV6); > + ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4); > + if (!ok) { > + return false; > + } > + > + /* pf doesn't do this, but it seems a good idea */ > + if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned, > + &key->dst.addr.ipv6_aligned) > + || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned, > + &key->src.addr.ipv6_aligned)) { > + return false; > + } > + > + key->src = inner_key.src; > + key->dst = inner_key.dst; > + key->nw_proto = inner_key.nw_proto; > + > + ok = extract_l4(key, l4, tail - l4, NULL, l3); > + if (ok) { > + conn_key_reverse(key); > + } > + return ok; > + } > + default: > + return false; > + } > + > + return true; > +} > + > +/* Extract l4 fields into 'key', which must already 
contain valid l3 > + * members. > + * > + * If 'related' is not NULL and an ICMP error packet is being > + * processed, the function will extract the key from the packet > nested > + * in the ICMP paylod and set '*related' to true. > + * > + * If 'related' is NULL, it means that we're already parsing a > header nested > + * in an ICMP error. In this case, we skip checksum and length > validation. */ > +static inline bool > +extract_l4(struct conn_key *key, const void *data, size_t size, bool > *related, > + const void *l3) > +{ > + if (key->nw_proto == IPPROTO_TCP) { > + return (!related || check_l4_tcp(key, data, size, l3)) > + && extract_l4_tcp(key, data, size); > + } else if (key->nw_proto == IPPROTO_UDP) { > + return (!related || check_l4_udp(key, data, size, l3)) > + && extract_l4_udp(key, data, size); > + } else if (key->dl_type == htons(ETH_TYPE_IP) > + && key->nw_proto == IPPROTO_ICMP) { > + return (!related || check_l4_icmp(data, size)) > + && extract_l4_icmp(key, data, size, related); > + } else if (key->dl_type == htons(ETH_TYPE_IPV6) > + && key->nw_proto == IPPROTO_ICMPV6) { > + return (!related || check_l4_icmp6(key, data, size, l3)) > + && extract_l4_icmp6(key, data, size, related); > + } else { > + return false; > + } > +} > + > +static bool > +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, > + struct conn_lookup_ctx *ctx, uint16_t zone) > +{ > + const struct eth_header *l2 = dp_packet_l2(pkt); > + const struct ip_header *l3 = dp_packet_l3(pkt); > + const char *l4 = dp_packet_l4(pkt); > + const char *tail = dp_packet_tail(pkt); > + bool ok; > + > + memset(ctx, 0, sizeof *ctx); > + > + if (!l2 || !l3 || !l4) { > + return false; > + } > + > + ctx->key.zone = zone; > + > + /* XXX In this function we parse the packet (again, it has > already > + * gone through miniflow_extract()) for two reasons: > + * > + * 1) To extract the l3 addresses and l4 ports. > + * We already have the l3 and l4 headers' pointers. 
> Extracting > + * the l3 addresses and the l4 ports is really cheap, since > they > + * can be found at fixed locations. > + * 2) To extract the l3 and l4 types. > + * Extracting the l3 and l4 types (especially the l3[1]) on > the > + * other hand is quite expensive, because they're not at a > + * fixed location. > + * > + * Here's a way to avoid (2) with the help of the datapath. > + * The datapath doesn't keep the packet's extracted flow[2], so > + * using that is not an option. We could use the packet's > matching > + * megaflow for l3 type (it's always unwildcarded), and for l4 > type > + * (we have to unwildcard it first). This means either: > + * > + * a) dpif-netdev passes the matching megaflow to > dp_execute_cb(), which > + * is used to extract the l3 type. Unfortunately, > dp_execute_cb() is > + * used also in dpif_netdev_execute(), which doesn't have a > matching > + * megaflow. > + * > + * b) We define an alternative OVS_ACTION_ATTR_CT, used only by > the > + * userspace datapath, which includes l3 (and l4) type. The > + * alternative action could be generated by ofproto-dpif > specifically > + * for the userspace datapath. Having a different interface > for > + * userspace and kernel doesn't seem very clean, though. > + * > + * --- > + * [1] A simple benchmark (running only the connection tracker > + * over and over on the same packets) shows that if the > + * l3 type is already provided we are 15% faster (running > the > + * connection tracker over a couple of DPDK devices with a > + * stream of UDP 64-bytes packets shows that we are 4% > faster). > + * > + * [2] The reasons for this are that keeping the flow increases > + * (slightly) the cache footprint and increases computation > + * time as we move the packet around. Most importantly, the > flow > + * should be updated by the actions and this can be slow, as > + * we use a sparse representation (miniflow). 
> + * > + */ > + ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2); > + if (ctx->key.dl_type == htons(ETH_TYPE_IP)) { > + ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, > NULL, true); > + } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) { > + ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, > NULL); > + } else { > + ok = false; > + } > + > + if (ok) { > + if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) > { > + ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis); > + return true; > + } > + } > + > + return false; > +} > + > +/* Symmetric */ > +static uint32_t > +conn_key_hash(const struct conn_key *key, uint32_t basis) > +{ > + uint32_t hsrc, hdst, hash; > + int i; > + > + hsrc = hdst = basis; > + > + /* Hash the source and destination tuple */ > + for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) { > + hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]); > + hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]); > + } > + > + /* Even if source and destination are swapped the hash will be > the same. */ > + hash = hsrc ^ hdst; > + > + /* Hash the rest of the key(L3 and L4 types and zone). 
*/ > + hash = hash_words((uint32_t *) &key->dst + 1, > + (uint32_t *) (key + 1) - (uint32_t *) (&key- > >dst + 1), > + hash); > + > + return hash; > +} > + > +static void > +conn_key_reverse(struct conn_key *key) > +{ > + struct ct_endpoint tmp; > + tmp = key->src; > + key->src = key->dst; > + key->dst = tmp; > +} > + > +static void > +conn_key_lookup(struct conntrack_bucket *ctb, > + struct conn_lookup_ctx *ctx, > + long long now) > +{ > + uint32_t hash = ctx->hash; > + struct conn *conn; > + > + ctx->conn = NULL; > + > + HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) { > + if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key)) > + && !conn_expired(conn, now)) { > + ctx->conn = conn; > + ctx->reply = false; > + break; > + } > + if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn- > >rev_key)) > + && !conn_expired(conn, now)) { > + ctx->conn = conn; > + ctx->reply = true; > + break; > + } > + } > +} > + > +static enum ct_update_res > +conn_update(struct conn *conn, struct dp_packet *pkt, bool reply, > + long long now) > +{ > + return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, > reply, now); > +} > + > +static bool > +conn_expired(struct conn *conn, long long now) > +{ > + return now >= conn->expiration; > +} > + > +static bool > +valid_new(struct dp_packet *pkt, struct conn_key *key) > +{ > + return l4_protos[key->nw_proto]->valid_new(pkt); > +} > + > +static struct conn * > +new_conn(struct dp_packet *pkt, struct conn_key *key, long long now) > +{ > + struct conn *newconn; > + > + newconn = l4_protos[key->nw_proto]->new_conn(pkt, now); > + > + if (newconn) { > + newconn->key = *key; > + } > + > + return newconn; > +} > + > +static void > +delete_conn(struct conn *conn) > +{ > + free(conn); > +} > diff --git a/lib/conntrack.h b/lib/conntrack.h > new file mode 100644 > index 0000000..d905bd2 > --- /dev/null > +++ b/lib/conntrack.h > @@ -0,0 +1,151 @@ > +/* > + * Copyright (c) 2015, 2016 Nicira, Inc. 
> + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, > software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions > and > + * limitations under the License. > + */ > + > +#ifndef CONNTRACK_H > +#define CONNTRACK_H 1 > + > +#include <stdbool.h> > + > +#include "hmap.h" > +#include "odp-netlink.h" > +#include "openvswitch/thread.h" > +#include "openvswitch/types.h" > +#include "ovs-atomic.h" > + > + > +struct dp_packet; > + > +/* Userspace connection tracker > + * ============================ > + * > + * This is a connection tracking module that keeps all the state in > userspace. > + * > + * Usage > + * ===== > + * > + * struct conntract ct; > + * > + * Initialization: > + * > + * conntrack_init(&ct); > + * > + * It is necessary to periodically issue a call to > + * > + * conntrack_run(&ct); > + * > + * to allow the module to clean up expired connections. > + * > + * To send a group of packets through the connection tracker: > + * > + * conntrack_execute(&ct, pkts, n_pkts, ...); > + * > + * Thread-safety > + * ============= > + * > + * conntrack_execute() can be called by multiple threads > simultaneoulsy. 
> + */ > + > +struct conntrack; > + > +void conntrack_init(struct conntrack *); > +void conntrack_run(struct conntrack *); > +void conntrack_destroy(struct conntrack *); > + > +int conntrack_execute(struct conntrack *, struct dp_packet **, > size_t, > + bool commit, uint16_t zone, const uint32_t > *setmark, > + const struct ovs_key_ct_labels *setlabel, > + const char *helper); > + > +/* 'struct ct_lock' is a wrapper for an adaptive mutex. It's useful > to try > + * different types of locks (e.g. spinlocks) */ > + > +struct OVS_LOCKABLE ct_lock { > + struct ovs_mutex lock; > +}; > + > +static inline void ct_lock_init(struct ct_lock *lock) > +{ > + ovs_mutex_init_adaptive(&lock->lock); > +} > + > +static inline void ct_lock_lock(struct ct_lock *lock) > + OVS_ACQUIRES(lock) > + OVS_NO_THREAD_SAFETY_ANALYSIS > +{ > + ovs_mutex_lock(&lock->lock); > +} > + > +static inline void ct_lock_unlock(struct ct_lock *lock) > + OVS_RELEASES(lock) > + OVS_NO_THREAD_SAFETY_ANALYSIS > +{ > + ovs_mutex_unlock(&lock->lock); > +} > + > +static inline void ct_lock_destroy(struct ct_lock *lock) > +{ > + ovs_mutex_destroy(&lock->lock); > +} > + > +/* Timeouts: all the possible timeout states passed to > update_expiration() > + * are listed here. The name will be prefix by CT_TM_ and the value > is in > + * milliseconds */ > +#define CT_TIMEOUTS \ > + CT_TIMEOUT(TCP_FIRST_PACKET, 30 * 1000) \ > + CT_TIMEOUT(TCP_OPENING, 30 * 1000) \ > + CT_TIMEOUT(TCP_ESTABLISHED, 24 * 60 * 60 * 1000) \ > + CT_TIMEOUT(TCP_CLOSING, 15 * 60 * 1000) \ > + CT_TIMEOUT(TCP_FIN_WAIT, 45 * 1000) \ > + CT_TIMEOUT(TCP_CLOSED, 30 * 1000) \ > + CT_TIMEOUT(OTHER_FIRST, 60 * 1000) \ > + CT_TIMEOUT(OTHER_MULTIPLE, 60 * 1000) \ > + CT_TIMEOUT(OTHER_BIDIR, 30 * 1000) \ > + > +enum ct_timeout { > +#define CT_TIMEOUT(NAME, VALUE) CT_TM_##NAME, > + CT_TIMEOUTS > +#undef CT_TIMEOUT > + N_CT_TM > +}; > + > +/* Locking: > + * > + * The connections are kept in different buckets, which are > completely > + * independent. 
The connection bucket is determined by the hash of > its key. > + * */ > +struct conntrack_bucket { > + struct ct_lock lock; > + struct hmap connections OVS_GUARDED; > +}; > + > +#define CONNTRACK_BUCKETS_SHIFT 8 > +#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT) > + > +struct conntrack { > + /* Independent buckets containing the connections */ > + struct conntrack_bucket buckets[CONNTRACK_BUCKETS]; > + > + /* Salt for hashing a connection key. */ > + uint32_t hash_basis; > + > + /* Number of connections currently in the connection tracker. */ > + atomic_count n_conn; > + /* Connections limit. When this limit is reached, no new > connection > + * will be accepted. */ > + atomic_uint n_conn_limit; > +}; > + > +#endif /* conntrack.h */ > diff --git a/lib/util.h b/lib/util.h > index 7be4a30..ad31e74 100644 > --- a/lib/util.h > +++ b/lib/util.h > @@ -69,6 +69,15 @@ ovs_prefetch_range(const void *start, size_t size) > #define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) > #endif > > +/* Comparisons for ints with modular arithmetic */ > +#define INT_MOD_LT(a,b) ((int) ((a)-(b)) < 0) > +#define INT_MOD_LEQ(a,b) ((int) ((a)-(b)) <= 0) > +#define INT_MOD_GT(a,b) ((int) ((a)-(b)) > 0) > +#define INT_MOD_GEQ(a,b) ((int) ((a)-(b)) >= 0) > + > +#define INT_MOD_MIN(a, b) ((INT_MOD_LT(a, b)) ? (a) : (b)) > +#define INT_MOD_MAX(a, b) ((INT_MOD_GT(a, b)) ? (a) : (b)) > + > #define OVS_NOT_REACHED() abort() > > /* Use "%"PRIuSIZE to format size_t with printf(). */ > -- > 2.1.4 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > http://openvswitch.org/mailman/listinfo/dev _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev