Author: glebius
Date: Mon Feb 17 11:50:56 2014
New Revision: 262027
URL: http://svnweb.freebsd.org/changeset/base/262027

Log:
  o Remove at compile time the HASH_ALL code, that was never
    tested and is unfinished. However, I've tested my version,
    it works okay. As before it is unfinished: timeout aren't
    driven by TCP session state. To enable the HASH_ALL mode,
    one needs in kernel config:
  
        options FLOWTABLE_HASH_ALL
  
  o Reduce the alignment on flentry to 64 bytes. Without
    the FLOWTABLE_HASH_ALL option, twice less memory would
    be consumed by flows.
  o API to ip_output()/ip6_output() got even more thin: 1 liner.
  o Remove unused unions. Simply use fle->f_key[].
  o Merge all IPv4 code into flowtable_lookup_ipv4(), and do same
    flowtable_lookup_ipv6(). Stop copying data to on stack
    sockaddr structures, simply use key[] on stack.
  o Move code from flowtable_lookup_common() that actually works
    on insertion into flowtable_insert().
  
  Sponsored by: Netflix
  Sponsored by: Nginx, Inc.

Modified:
  head/sys/conf/options
  head/sys/net/flowtable.c
  head/sys/net/flowtable.h
  head/sys/netinet/ip_output.c
  head/sys/netinet6/ip6_output.c

Modified: head/sys/conf/options
==============================================================================
--- head/sys/conf/options       Mon Feb 17 11:44:58 2014        (r262026)
+++ head/sys/conf/options       Mon Feb 17 11:50:56 2014        (r262027)
@@ -440,6 +440,7 @@ TCP_SIGNATURE               opt_inet.h
 VLAN_ARRAY             opt_vlan.h
 XBONEHACK
 FLOWTABLE              opt_route.h
+FLOWTABLE_HASH_ALL     opt_route.h
 
 #
 # SCTP

Modified: head/sys/net/flowtable.c
==============================================================================
--- head/sys/net/flowtable.c    Mon Feb 17 11:44:58 2014        (r262026)
+++ head/sys/net/flowtable.c    Mon Feb 17 11:50:56 2014        (r262027)
@@ -73,91 +73,53 @@ __FBSDID("$FreeBSD$");
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
+#ifdef FLOWTABLE_HASH_ALL
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <netinet/sctp.h>
+#endif
 
 #include <ddb/ddb.h>
 
-#ifdef INET
-struct ipv4_tuple {
-       uint16_t        ip_sport;       /* source port */
-       uint16_t        ip_dport;       /* destination port */
-       in_addr_t       ip_saddr;       /* source address */
-       in_addr_t       ip_daddr;       /* destination address */
-};
-
-union ipv4_flow {
-       struct ipv4_tuple ipf_ipt;
-       uint32_t        ipf_key[3];
-};
+#ifdef FLOWTABLE_HASH_ALL
+#define        KEY_PORTS       (sizeof(uint16_t) * 2)
+#define        KEY_ADDRS       2
+#else
+#define        KEY_PORTS       0
+#define        KEY_ADDRS       1
 #endif
 
-#ifdef INET6
-struct ipv6_tuple {
-       uint16_t        ip_sport;       /* source port */
-       uint16_t        ip_dport;       /* destination port */
-       struct in6_addr ip_saddr;       /* source address */
-       struct in6_addr ip_daddr;       /* destination address */
-};
-
-union ipv6_flow {
-       struct ipv6_tuple ipf_ipt;
-       uint32_t        ipf_key[9];
-};
+#ifdef INET6
+#define        KEY_ADDR_LEN    sizeof(struct in6_addr)
+#else
+#define        KEY_ADDR_LEN    sizeof(struct in_addr)
 #endif
 
+#define        KEYLEN  ((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / 
sizeof(uint32_t))
+
 struct flentry {
-       uint32_t                f_fhash;        /* hash flowing forward */
-       uint16_t                f_flags;        /* flow flags */
-       uint8_t                 f_pad;
-       uint8_t                 f_proto;        /* protocol */
-       uint32_t                f_fibnum;       /* fib index */
+       uint32_t                f_hash;         /* hash flowing forward */
+       uint32_t                f_key[KEYLEN];  /* address(es and ports) */
        uint32_t                f_uptime;       /* uptime at last access */
+       uint16_t                f_fibnum;       /* fib index */
+#ifdef FLOWTABLE_HASH_ALL
+       uint8_t                 f_proto;        /* protocol */
+       uint8_t                 f_flags;        /* stale? */
+#define FL_STALE               1
+#endif
        SLIST_ENTRY(flentry)    f_next;         /* pointer to collision entry */
        struct rtentry          *f_rt;          /* rtentry for flow */
        struct llentry          *f_lle;         /* llentry for flow */
-       union {
-#ifdef INET
-               union ipv4_flow v4;
-#endif
-#ifdef INET6
-               union ipv6_flow v6;
-#endif
-       } f_flow;
-#define        f_flow4 f_flow.v4
-#define        f_flow6 f_flow.v6
 };
-#define        KEYLEN(flags)   ((((flags) & FL_IPV6) ? 9 : 3) * 4)
-
-/* Make sure f_flow begins with key. */
-#ifdef INET
-CTASSERT(offsetof(struct flentry, f_flow) ==
-    offsetof(struct flentry, f_flow4.ipf_key));
-#endif
-#ifdef INET6
-CTASSERT(offsetof(struct flentry, f_flow) ==
-    offsetof(struct flentry, f_flow6.ipf_key));
-#endif
+#undef KEYLEN
 
 SLIST_HEAD(flist, flentry);
 /* Make sure we can use pcpu_zone_ptr for struct flist. */
 CTASSERT(sizeof(struct flist) == sizeof(void *));
 
-#define        SECS_PER_HOUR           3600
-#define        SECS_PER_DAY            (24*SECS_PER_HOUR)
-
-#define        SYN_IDLE                300
-#define        UDP_IDLE                300
-#define        FIN_WAIT_IDLE           600
-#define        TCP_IDLE                SECS_PER_DAY
-
 struct flowtable {
        counter_u64_t   *ft_stat;
        int             ft_size;
-       uint32_t        ft_flags;
-       uint32_t        ft_max_depth;
-
        /*
         * ft_table is a malloc(9)ed array of pointers.  Pointers point to
         * memory from UMA_ZONE_PCPU zone.
@@ -167,12 +129,6 @@ struct flowtable {
        struct flist    **ft_table;
        bitstr_t        **ft_masks;
        bitstr_t        *ft_tmpmask;
-
-       uint32_t        ft_udp_idle;
-       uint32_t        ft_fin_wait_idle;
-       uint32_t        ft_syn_idle;
-       uint32_t        ft_tcp_idle;
-       boolean_t       ft_full;
 };
 
 #define        FLOWSTAT_ADD(ft, name, v)       \
@@ -186,7 +142,6 @@ static struct cv    flowclean_f_cv;
 static struct cv       flowclean_c_cv;
 static struct mtx      flowclean_lock;
 static uint32_t                flowclean_cycles;
-static uint32_t                flowclean_freq;
 
 /*
  * TODO:
@@ -213,16 +168,7 @@ static VNET_DEFINE(struct flowtable, ip6
 static uma_zone_t flow_zone;
 
 static VNET_DEFINE(int, flowtable_enable) = 1;
-static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
-static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
-static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
-static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
-
 #define        V_flowtable_enable              VNET(flowtable_enable)
-#define        V_flowtable_syn_expire          VNET(flowtable_syn_expire)
-#define        V_flowtable_udp_expire          VNET(flowtable_udp_expire)
-#define        V_flowtable_fin_wait_expire     VNET(flowtable_fin_wait_expire)
-#define        V_flowtable_tcp_expire          VNET(flowtable_tcp_expire)
 
 static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
     "flowtable");
@@ -231,197 +177,96 @@ SYSCTL_VNET_INT(_net_flowtable, OID_AUTO
 SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
     &flow_zone, "Maximum number of flows allowed");
 
-/*
- * XXX This does not end up updating timeouts at runtime
- * and only reflects the value for the last table added :-/
- */
-SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
-    &VNET_NAME(flowtable_syn_expire), 0,
-    "seconds after which to remove syn allocated flow.");
-SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
-    &VNET_NAME(flowtable_udp_expire), 0,
-    "seconds after which to remove flow allocated to UDP.");
-SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
-    &VNET_NAME(flowtable_fin_wait_expire), 0,
-    "seconds after which to remove a flow in FIN_WAIT.");
-SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
-    &VNET_NAME(flowtable_tcp_expire), 0,
-    "seconds after which to remove flow allocated to a TCP connection.");
-
-#define FL_STALE       (1<<8)
-
 static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");
 
-static struct flentry *flowtable_lookup_common(struct flowtable *,
-    struct sockaddr_storage *, struct sockaddr_storage *, struct mbuf *, int);
-
-static __inline int
-proto_to_flags(uint8_t proto)
-{
-       int flag;
-
-       switch (proto) {
-       case IPPROTO_TCP:
-               flag = FL_TCP;
-               break;
-       case IPPROTO_SCTP:
-               flag = FL_SCTP;
-               break;
-       case IPPROTO_UDP:
-               flag = FL_UDP;
-               break;
-       default:
-               flag = 0;
-               break;
-       }
-
-       return (flag);
-}
-
-static __inline int
-flags_to_proto(int flags)
-{
-       int proto, protoflags;
-
-       protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
-       switch (protoflags) {
-       case FL_TCP:
-               proto = IPPROTO_TCP;
-               break;
-       case FL_SCTP:
-               proto = IPPROTO_SCTP;
-               break;
-       case FL_UDP:
-               proto = IPPROTO_UDP;
-               break;
-       default:
-               proto = 0;
-               break;
-       }
-       return (proto);
-}
+static struct flentry *
+flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);
 
 #ifdef INET
-static int
-ipv4_mbuf_demarshal(struct mbuf *m, struct sockaddr_in *ssin,
-    struct sockaddr_in *dsin, uint16_t *flags)
+static struct flentry *
+flowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
 {
+       struct flentry *fle;
+       struct sockaddr_in *sin;
        struct ip *ip;
-       uint8_t proto;
+       uint32_t fibnum;
+#ifdef FLOWTABLE_HASH_ALL
+       uint32_t key[3];
        int iphlen;
-       struct tcphdr *th;
-       struct udphdr *uh;
-       struct sctphdr *sh;
        uint16_t sport, dport;
+       uint8_t proto;
+#endif
 
-       proto = sport = dport = 0;
        ip = mtod(m, struct ip *);
-       dsin->sin_family = AF_INET;
-       dsin->sin_len = sizeof(*dsin);
-       dsin->sin_addr = ip->ip_dst;
-       ssin->sin_family = AF_INET;
-       ssin->sin_len = sizeof(*ssin);
-       ssin->sin_addr = ip->ip_src;
 
-       proto = ip->ip_p;
-       if ((*flags & FL_HASH_ALL) == 0)
-               goto skipports;
+       if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
+           (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
+           (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
+               return (NULL);
+
+       fibnum = M_GETFIB(m);
 
-       iphlen = ip->ip_hl << 2; /* XXX options? */
+#ifdef FLOWTABLE_HASH_ALL
+       iphlen = ip->ip_hl << 2;
+       proto = ip->ip_p;
 
        switch (proto) {
-       case IPPROTO_TCP:
-               th = (struct tcphdr *)((caddr_t)ip + iphlen);
+       case IPPROTO_TCP: {
+               struct tcphdr *th;
+
+               th = (struct tcphdr *)((char *)ip + iphlen);
                sport = th->th_sport;
                dport = th->th_dport;
-               if ((*flags & FL_HASH_ALL) &&
-                   (th->th_flags & (TH_RST|TH_FIN)))
-                       *flags |= FL_STALE;
+               if (th->th_flags & (TH_RST|TH_FIN))
+                       fibnum |= (FL_STALE << 24);
                break;
-       case IPPROTO_UDP:
-               uh = (struct udphdr *)((caddr_t)ip + iphlen);
+       }
+       case IPPROTO_UDP: {
+               struct udphdr *uh;
+
+               uh = (struct udphdr *)((char *)ip + iphlen);
                sport = uh->uh_sport;
                dport = uh->uh_dport;
                break;
-       case IPPROTO_SCTP:
-               sh = (struct sctphdr *)((caddr_t)ip + iphlen);
+       }
+       case IPPROTO_SCTP: {
+               struct sctphdr *sh;
+
+               sh = (struct sctphdr *)((char *)ip + iphlen);
                sport = sh->src_port;
                dport = sh->dest_port;
+               /* XXXGL: handle stale? */
                break;
+       }
        default:
-               return (ENOTSUP);
-               /* no port - hence not a protocol we care about */
+               sport = dport = 0;
                break;
-
        }
 
-skipports:
-       *flags |= proto_to_flags(proto);
-       ssin->sin_port = sport;
-       dsin->sin_port = dport;
-       return (0);
-}
-
-static uint32_t
-ipv4_flow_lookup_hash(
-       struct sockaddr_in *ssin, struct sockaddr_in *dsin,
-           uint32_t *key, uint16_t flags)
-{
-       uint16_t sport, dport;
-       uint8_t proto;
-       int offset = 0;
+       key[0] = ip->ip_dst.s_addr;
+       key[1] = ip->ip_src.s_addr;
+       key[2] = (dport << 16) | sport;
+       fibnum |= proto << 16;
 
-       proto = flags_to_proto(flags);
-       sport = dport = key[2] = key[1] = key[0] = 0;
-       if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
-               key[1] = ssin->sin_addr.s_addr;
-               sport = ssin->sin_port;
-       }
-       if (dsin != NULL) {
-               key[2] = dsin->sin_addr.s_addr;
-               dport = dsin->sin_port;
-       }
-       if (flags & FL_HASH_ALL) {
-               ((uint16_t *)key)[0] = sport;
-               ((uint16_t *)key)[1] = dport;
-       } else
-               offset = flow_hashjitter + proto;
+       fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
+           fibnum);
 
-       return (jenkins_hash32(key, 3, offset));
-}
+#else  /* !FLOWTABLE_HASH_ALL */
 
-static struct flentry *
-flowtable_lookup_ipv4(struct mbuf *m)
-{
-       struct sockaddr_storage ssa, dsa;
-       uint16_t flags;
-       struct sockaddr_in *dsin, *ssin;
-
-       dsin = (struct sockaddr_in *)&dsa;
-       ssin = (struct sockaddr_in *)&ssa;
-       bzero(dsin, sizeof(*dsin));
-       bzero(ssin, sizeof(*ssin));
-       flags = V_ip4_ft.ft_flags;
-       if (ipv4_mbuf_demarshal(m, ssin, dsin, &flags) != 0)
-               return (NULL);
+       fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
+           sizeof(struct in_addr), fibnum);
 
-       return (flowtable_lookup_common(&V_ip4_ft, &ssa, &dsa, m, flags));
-}
+#endif /* FLOWTABLE_HASH_ALL */
 
-void
-flow_to_route(struct flentry *fle, struct route *ro)
-{
-       uint32_t *hashkey = NULL;
-       struct sockaddr_in *sin;
+       if (fle == NULL)
+               return (NULL);
 
        sin = (struct sockaddr_in *)&ro->ro_dst;
        sin->sin_family = AF_INET;
        sin->sin_len = sizeof(*sin);
-       hashkey = fle->f_flow4.ipf_key;
-       sin->sin_addr.s_addr = hashkey[2];
-       ro->ro_rt = fle->f_rt;
-       ro->ro_lle = fle->f_lle;
-       ro->ro_flags |= RT_NORTREF;
+       sin->sin_addr = ip->ip_dst;
+
+       return (fle);
 }
 #endif /* INET */
 
@@ -435,9 +280,8 @@ flow_to_route(struct flentry *fle, struc
 #define PULLUP_TO(_len, p, T)                                          \
 do {                                                                   \
        int x = (_len) + sizeof(T);                                     \
-       if ((m)->m_len < x) {                                           \
-               goto receive_failed;                                    \
-       }                                                               \
+       if ((m)->m_len < x)                                             \
+               return (NULL);                                          \
        p = (mtod(m, char *) + (_len));                                 \
 } while (0)
 
@@ -445,26 +289,35 @@ do {                                                      
                \
 #define        SCTP(p)         ((struct sctphdr *)(p))
 #define        UDP(p)          ((struct udphdr *)(p))
 
-static int
-ipv6_mbuf_demarshal(struct mbuf *m, struct sockaddr_in6 *ssin6,
-    struct sockaddr_in6 *dsin6, uint16_t *flags)
+static struct flentry *
+flowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
 {
+       struct flentry *fle;
+       struct sockaddr_in6 *sin6;
        struct ip6_hdr *ip6;
-       uint8_t proto;
+       uint32_t fibnum;
+#ifdef FLOWTABLE_HASH_ALL
+       uint32_t key[9];
+       void *ulp;
        int hlen;
-       uint16_t src_port, dst_port;
+       uint16_t sport, dport;
        u_short offset;
-       void *ulp;
+       uint8_t proto;
+#else
+       uint32_t key[4];
+#endif
 
-       offset = hlen = src_port = dst_port = 0;
-       ulp = NULL;
        ip6 = mtod(m, struct ip6_hdr *);
-       hlen = sizeof(struct ip6_hdr);
-       proto = ip6->ip6_nxt;
+       if (in6_localaddr(&ip6->ip6_dst))
+               return (NULL);
 
-       if ((*flags & FL_HASH_ALL) == 0)
-               goto skipports;
+       fibnum = M_GETFIB(m);
 
+#ifdef FLOWTABLE_HASH_ALL
+       hlen = sizeof(struct ip6_hdr);
+       proto = ip6->ip6_nxt;
+       offset = sport = dport = 0;
+       ulp = NULL;
        while (ulp == NULL) {
                switch (proto) {
                case IPPROTO_ICMPV6:
@@ -477,21 +330,21 @@ ipv6_mbuf_demarshal(struct mbuf *m, stru
                        break;
                case IPPROTO_TCP:
                        PULLUP_TO(hlen, ulp, struct tcphdr);
-                       dst_port = TCP(ulp)->th_dport;
-                       src_port = TCP(ulp)->th_sport;
-                       if ((*flags & FL_HASH_ALL) &&
-                           (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
-                               *flags |= FL_STALE;
+                       dport = TCP(ulp)->th_dport;
+                       sport = TCP(ulp)->th_sport;
+                       if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
+                               fibnum |= (FL_STALE << 24);
                        break;
                case IPPROTO_SCTP:
                        PULLUP_TO(hlen, ulp, struct sctphdr);
-                       src_port = SCTP(ulp)->src_port;
-                       dst_port = SCTP(ulp)->dest_port;
+                       dport = SCTP(ulp)->src_port;
+                       sport = SCTP(ulp)->dest_port;
+                       /* XXXGL: handle stale? */
                        break;
                case IPPROTO_UDP:
                        PULLUP_TO(hlen, ulp, struct udphdr);
-                       dst_port = UDP(ulp)->uh_dport;
-                       src_port = UDP(ulp)->uh_sport;
+                       dport = UDP(ulp)->uh_dport;
+                       sport = UDP(ulp)->uh_sport;
                        break;
                case IPPROTO_HOPOPTS:   /* RFC 2460 */
                        PULLUP_TO(hlen, ulp, struct ip6_hbh);
@@ -531,102 +384,28 @@ ipv6_mbuf_demarshal(struct mbuf *m, stru
                }
        }
 
-       if (src_port == 0) {
-       receive_failed:
-               return (ENOTSUP);
-       }
-
-skipports:
-       dsin6->sin6_family = AF_INET6;
-       dsin6->sin6_len = sizeof(*dsin6);
-       dsin6->sin6_port = dst_port;
-       memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
-
-       ssin6->sin6_family = AF_INET6;
-       ssin6->sin6_len = sizeof(*ssin6);
-       ssin6->sin6_port = src_port;
-       memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
-       *flags |= proto_to_flags(proto);
-
-       return (0);
-}
-
-#define zero_key(key)          \
-do {                           \
-       key[0] = 0;             \
-       key[1] = 0;             \
-       key[2] = 0;             \
-       key[3] = 0;             \
-       key[4] = 0;             \
-       key[5] = 0;             \
-       key[6] = 0;             \
-       key[7] = 0;             \
-       key[8] = 0;             \
-} while (0)
-
-static uint32_t
-ipv6_flow_lookup_hash(
-       struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
-           uint32_t *key, uint16_t flags)
-{
-       uint16_t sport, dport;
-       uint8_t proto;
-       int offset = 0;
-
-       proto = flags_to_proto(flags);
-       zero_key(key);
-       sport = dport = 0;
-       if (dsin6 != NULL) {
-               memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
-               dport = dsin6->sin6_port;
-       }
-       if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
-               memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
-               sport = ssin6->sin6_port;
-       }
-       if (flags & FL_HASH_ALL) {
-               ((uint16_t *)key)[0] = sport;
-               ((uint16_t *)key)[1] = dport;
-       } else
-               offset = flow_hashjitter + proto;
-
-       return (jenkins_hash32(key, 9, offset));
-}
-
-static struct flentry *
-flowtable_lookup_ipv6(struct mbuf *m)
-{
-       struct sockaddr_storage ssa, dsa;
-       struct sockaddr_in6 *dsin6, *ssin6;
-       uint16_t flags;
-
-       dsin6 = (struct sockaddr_in6 *)&dsa;
-       ssin6 = (struct sockaddr_in6 *)&ssa;
-       bzero(dsin6, sizeof(*dsin6));
-       bzero(ssin6, sizeof(*ssin6));
-       flags = V_ip6_ft.ft_flags;
+       bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
+       bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
+       key[8] = (dport << 16) | sport;
+       fibnum |= proto << 16;
+
+       fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
+           fibnum);
+#else  /* !FLOWTABLE_HASH_ALL */
+       bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
+       fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
+           fibnum);
+#endif /* FLOWTABLE_HASH_ALL */
 
-       if (ipv6_mbuf_demarshal(m, ssin6, dsin6, &flags) != 0)
+       if (fle == NULL)
                return (NULL);
 
-       return (flowtable_lookup_common(&V_ip6_ft, &ssa, &dsa, m, flags));
-}
-
-void
-flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
-{
-       uint32_t *hashkey = NULL;
-       struct sockaddr_in6 *sin6;
-
        sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
-
        sin6->sin6_family = AF_INET6;
        sin6->sin6_len = sizeof(*sin6);
-       hashkey = fle->f_flow6.ipf_key;
-       memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
-       ro->ro_rt = fle->f_rt;
-       ro->ro_lle = fle->f_lle;
-       ro->ro_flags |= RT_NORTREF;
+       bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));
+
+       return (fle);
 }
 #endif /* INET6 */
 
@@ -654,75 +433,57 @@ flowtable_list(struct flowtable *ft, uin
 }
 
 static int
-flow_stale(struct flowtable *ft, struct flentry *fle)
+flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
 {
-       time_t idle_time;
 
-       if ((fle->f_fhash == 0)
-           || ((fle->f_rt->rt_flags & RTF_HOST) &&
-               ((fle->f_rt->rt_flags & (RTF_UP))
-                   != (RTF_UP)))
-           || (fle->f_rt->rt_ifp == NULL)
-           || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
+       if (((fle->f_rt->rt_flags & RTF_HOST) &&
+           ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) ||
+           (fle->f_rt->rt_ifp == NULL) ||
+           !RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
+           (fle->f_lle->la_flags & LLE_VALID) == 0)
                return (1);
 
-       idle_time = time_uptime - fle->f_uptime;
+       if (time_uptime - fle->f_uptime > maxidle)
+               return (1);
 
-       if ((fle->f_flags & FL_STALE) ||
-           ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
-               && (idle_time > ft->ft_udp_idle)) ||
-           ((fle->f_flags & TH_FIN)
-               && (idle_time > ft->ft_fin_wait_idle)) ||
-           ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
-               && (idle_time > ft->ft_syn_idle)) ||
-           ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
-               && (idle_time > ft->ft_tcp_idle)) ||
-           ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
-               (fle->f_rt->rt_ifp == NULL)))
+#ifdef FLOWTABLE_HASH_ALL
+       if (fle->f_flags & FL_STALE)
                return (1);
+#endif
 
        return (0);
 }
 
 static int
-flow_full(struct flowtable *ft)
+flow_full(void)
 {
-       boolean_t full;
        int count, max;
 
-       full = ft->ft_full;
        count = uma_zone_get_cur(flow_zone);
        max = uma_zone_get_max(flow_zone);
 
-       if (full && (count < (max - (max >> 3))))
-               ft->ft_full = FALSE;
-       else if (!full && (count > (max - (max >> 5))))
-               ft->ft_full = TRUE;
-
-       if (full && !ft->ft_full) {
-               flowclean_freq = 4*hz;
-               if ((ft->ft_flags & FL_HASH_ALL) == 0)
-                       ft->ft_udp_idle = ft->ft_fin_wait_idle =
-                           ft->ft_syn_idle = ft->ft_tcp_idle = 5;
-               cv_broadcast(&flowclean_c_cv);
-       } else if (!full && ft->ft_full) {
-               flowclean_freq = 20*hz;
-               if ((ft->ft_flags & FL_HASH_ALL) == 0)
-                       ft->ft_udp_idle = ft->ft_fin_wait_idle =
-                           ft->ft_syn_idle = ft->ft_tcp_idle = 30;
-       }
-
-       return (ft->ft_full);
+       return (count > (max - (max >> 3)));
 }
 
 static int
-flow_matches(struct flentry *fle, uint32_t hash, uint32_t *key, uint8_t
-   proto, uint32_t fibnum)
+flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
 {
+#ifdef FLOWTABLE_HASH_ALL
+       uint8_t proto;
+
+       proto = (fibnum >> 16) & 0xff;
+       fibnum &= 0xffff;
+#endif
+
+       CRITICAL_ASSERT(curthread);
 
-       if (fle->f_fhash == hash &&
-           bcmp(&fle->f_flow, key, KEYLEN(fle->f_flags)) == 0 &&
-           proto == fle->f_proto && fibnum == fle->f_fibnum &&
+       /* Microoptimization for IPv4: don't use bcmp(). */
+       if (((keylen == sizeof(uint32_t) && (fle->f_key[0] != key[0])) ||
+           (bcmp(fle->f_key, key, keylen) == 0)) &&
+           fibnum == fle->f_fibnum &&
+#ifdef FLOWTABLE_HASH_ALL
+           proto == fle->f_proto &&
+#endif
            (fle->f_rt->rt_flags & RTF_UP) &&
            fle->f_rt->rt_ifp != NULL &&
            (fle->f_lle->la_flags & LLE_VALID))
@@ -733,27 +494,131 @@ flow_matches(struct flentry *fle, uint32
 
 static struct flentry *
 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
-    uint32_t fibnum, struct route *ro, uint16_t flags)
+    int keylen, uint32_t fibnum0)
 {
+#ifdef INET6
+        struct route_in6 sro6;
+#endif
+#ifdef INET
+        struct route sro;
+#endif
+       struct route *ro = NULL;
+       struct rtentry *rt;
+       struct lltable *lt = NULL;
+       struct llentry *lle;
+       struct sockaddr_storage *l3addr;
+       struct ifnet *ifp;
        struct flist *flist;
        struct flentry *fle, *iter;
        bitstr_t *mask;
-       int depth;
+       uint16_t fibnum = fibnum0;
+#ifdef FLOWTABLE_HASH_ALL
        uint8_t proto;
 
+       proto = (fibnum0 >> 16) & 0xff;
+       fibnum = fibnum0 & 0xffff;
+#endif
+
+       /*
+        * This bit of code ends up locking the
+        * same route 3 times (just like ip_output + ether_output)
+        * - at lookup
+        * - in rt_check when called by arpresolve
+        * - dropping the refcount for the rtentry
+        *
+        * This could be consolidated to one if we wrote a variant
+        * of arpresolve with an rt_check variant that expected to
+        * receive the route locked
+        */
+#ifdef INET
+       if (ft == &V_ip4_ft) {
+               struct sockaddr_in *sin;
+
+               ro = &sro;
+               bzero(&sro.ro_dst, sizeof(sro.ro_dst));
+
+               sin = (struct sockaddr_in *)&sro.ro_dst;
+               sin->sin_family = AF_INET;
+               sin->sin_len = sizeof(*sin);
+               sin->sin_addr.s_addr = key[0];
+       }
+#endif
+#ifdef INET6
+       if (ft == &V_ip6_ft) {
+               struct sockaddr_in6 *sin6;
+
+               ro = (struct route *)&sro6;
+               sin6 = &sro6.ro_dst;
+
+               bzero(sin6, sizeof(*sin6));
+               sin6->sin6_family = AF_INET6;
+               sin6->sin6_len = sizeof(*sin6);
+               bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
+       }
+#endif
+
+       ro->ro_rt = NULL;
+#ifdef RADIX_MPATH
+       rtalloc_mpath_fib(ro, hash, fibnum);
+#else
+       rtalloc_ign_fib(ro, 0, fibnum);
+#endif
+       if (ro->ro_rt == NULL)
+               return (NULL);
+
+       rt = ro->ro_rt;
+       ifp = rt->rt_ifp;
+
+       if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
+               RTFREE(rt);
+               return (NULL);
+       }
+
+#ifdef INET
+       if (ft == &V_ip4_ft)
+               lt = LLTABLE(ifp);
+#endif
+#ifdef INET6
+       if (ft == &V_ip6_ft)
+               lt = LLTABLE6(ifp);
+#endif
+
+       if (rt->rt_flags & RTF_GATEWAY)
+               l3addr = (struct sockaddr_storage *)rt->rt_gateway;
+       else
+               l3addr = (struct sockaddr_storage *)&ro->ro_dst;
+       lle = llentry_alloc(ifp, lt, l3addr);
+
+       if (lle == NULL) {
+               RTFREE(rt);
+               return (NULL);
+       }
+
+       /* Don't insert the entry if the ARP hasn't yet finished resolving. */
+       if ((lle->la_flags & LLE_VALID) == 0) {
+               RTFREE(rt);
+               LLE_FREE(lle);
+               FLOWSTAT_INC(ft, ft_fail_lle_invalid);
+               return (NULL);
+       }
+
        fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
-       if (fle == NULL)
+       if (fle == NULL) {
+               RTFREE(rt);
+               LLE_FREE(lle);
                return (NULL);
+       }
 
-       proto = flags_to_proto(flags);
-       bcopy(key, &fle->f_flow, KEYLEN(flags));
-       fle->f_flags |= (flags & FL_IPV6);
-       fle->f_proto = proto;
-       fle->f_rt = ro->ro_rt;
-       fle->f_lle = ro->ro_lle;
-       fle->f_fhash = hash;
+       fle->f_hash = hash;
+       bcopy(key, &fle->f_key, keylen);
+       fle->f_rt = rt;
+       fle->f_lle = lle;
        fle->f_fibnum = fibnum;
        fle->f_uptime = time_uptime;
+#ifdef FLOWTABLE_HASH_ALL
+       fle->f_proto = proto;
+       fle->f_flags = fibnum0 >> 24;
+#endif
 
        critical_enter();
        mask = flowtable_mask(ft);
@@ -765,13 +630,13 @@ flowtable_insert(struct flowtable *ft, u
                goto skip;
        }
 
-       depth = 0;
        /*
         * find end of list and make sure that we were not
         * preempted by another thread handling this flow
         */
        SLIST_FOREACH(iter, flist, f_next) {
-               if (flow_matches(iter, hash, key, proto, fibnum)) {
+               KASSERT(iter->f_hash == hash, ("%s: wrong hash", __func__));
+               if (flow_matches(iter, key, keylen, fibnum)) {
                        /*
                         * We probably migrated to an other CPU after
                         * lookup in flowtable_lookup_common() failed.
@@ -779,18 +644,16 @@ flowtable_insert(struct flowtable *ft, u
                         * entry.
                         */
                        iter->f_uptime = time_uptime;
-                       iter->f_flags |= flags;
+#ifdef FLOWTABLE_HASH_ALL
+                       iter->f_flags |= fibnum >> 24;
+#endif
                        critical_exit();
                        FLOWSTAT_INC(ft, ft_collisions);
                        uma_zfree(flow_zone, fle);
                        return (iter);
                }
-               depth++;
        }
 
-       if (depth > ft->ft_max_depth)
-               ft->ft_max_depth = depth;
-
        SLIST_INSERT_HEAD(flist, fle, f_next);
 skip:
        critical_exit();
@@ -799,215 +662,75 @@ skip:
        return (fle);
 }
 
-struct flentry *
-flowtable_lookup(sa_family_t sa, struct mbuf *m)
+int
+flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
 {
+       struct flentry *fle;
+
+       if (V_flowtable_enable == 0)
+               return (ENXIO);
 
        switch (sa) {
 #ifdef INET
        case AF_INET:
-               return (flowtable_lookup_ipv4(m));
+               fle = flowtable_lookup_ipv4(m, ro);
+               break;
 #endif
 #ifdef INET6
        case AF_INET6:
-               return (flowtable_lookup_ipv6(m));
+               fle = flowtable_lookup_ipv6(m, ro);
+               break;
 #endif
        default:
                panic("%s: sa %d", __func__, sa);
        }
-}
 
-static struct flentry *
-flowtable_lookup_common(struct flowtable *ft, struct sockaddr_storage *ssa,
-    struct sockaddr_storage *dsa, struct mbuf *m, int flags)
-{
-       struct route_in6 sro6;
-       struct route sro, *ro;
-       struct flist *flist;
-       struct flentry *fle;
-       struct rtentry *rt;
-       struct llentry *lle;
-       struct sockaddr_storage *l3addr;
-       struct ifnet *ifp;
-       uint32_t key[9], hash, fibnum;
-       uint8_t proto;
-
-       if (V_flowtable_enable == 0)
-               return (NULL);
-
-       sro.ro_rt = sro6.ro_rt = NULL;
-       sro.ro_lle = sro6.ro_lle = NULL;
-       flags |= ft->ft_flags;
-       proto = flags_to_proto(flags);
-       fibnum = M_GETFIB(m);
-
-       switch (ssa->ss_family) {
-#ifdef INET
-       case AF_INET: {
-               struct sockaddr_in *ssin, *dsin;
-
-               KASSERT(dsa->ss_family == AF_INET,
-                   ("%s: dsa family %d\n", __func__, dsa->ss_family));
-
-               ro = &sro;
-               memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
-               /*
-                * The harvested source and destination addresses
-                * may contain port information if the packet is
-                * from a transport protocol (e.g. TCP/UDP). The
-                * port field must be cleared before performing
-                * a route lookup.
-                */
-               ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
-               dsin = (struct sockaddr_in *)dsa;
-               ssin = (struct sockaddr_in *)ssa;
-               if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
-                   (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == 
IN_LOOPBACKNET ||
-                   (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == 
IN_LOOPBACKNET)
-                       return (NULL);
+       if (fle == NULL)
+               return (EHOSTUNREACH);
 
-               hash = ipv4_flow_lookup_hash(ssin, dsin, key, flags);
-               break;
+       if (!(m->m_flags & M_FLOWID)) {
+               m->m_flags |= M_FLOWID;
+               m->m_pkthdr.flowid = fle->f_hash;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to