Author: glebius
Date: Fri Feb  7 15:18:23 2014
New Revision: 261601
URL: http://svnweb.freebsd.org/changeset/base/261601

Log:
  o Revamp API between flowtable and netinet, netinet6.
    - ip_output() and ip6_output() simply call flowtable_lookup(),
      passing mbuf and address family. That's the only code under
      #ifdef FLOWTABLE in the protocols code now.
  o Revamp statistics gathering and export.
    - Remove hand made pcpu stats, and utilize counter(9).
    - Snapshot of statistics is available via 'netstat -rs'.
    - All sysctls are moved into net.flowtable namespace, since
      spreading them over net.inet isn't correct.
  o Properly separate at compile time INET and INET6 parts.
  o General cleanup.
    - Remove chain of multiple flowtables. We simply have one for
      IPv4 and one for IPv6.
    - Flowtables are allocated in flowtable.c, symbols are static.
    - With proper argument to SYSINIT() we no longer need flowtable_ready.
    - Hash salt doesn't need to be per-VNET.
    - Removed rudimentary debugging, which is quite useless in the dtrace era.
  
  The runtime behavior of flowtable shouldn't be changed by this commit.
  
  Sponsored by: Netflix
  Sponsored by: Nginx, Inc.

Added:
  head/usr.bin/netstat/flowtable.c   (contents, props changed)
Modified:
  head/sys/net/flowtable.c
  head/sys/net/flowtable.h
  head/sys/net/route.c
  head/sys/netinet/ip_input.c
  head/sys/netinet/ip_output.c
  head/sys/netinet6/in6_proto.c
  head/sys/netinet6/ip6_input.c
  head/sys/netinet6/ip6_output.c
  head/usr.bin/netstat/Makefile
  head/usr.bin/netstat/main.c
  head/usr.bin/netstat/netstat.h

Modified: head/sys/net/flowtable.c
==============================================================================
--- head/sys/net/flowtable.c    Fri Feb  7 15:10:24 2014        (r261600)
+++ head/sys/net/flowtable.c    Fri Feb  7 15:18:23 2014        (r261601)
@@ -146,23 +146,13 @@ union flentryp {
        struct flentry          **pcpu[MAXCPU];
 };
 
-struct flowtable_stats {
-       uint64_t        ft_collisions;
-       uint64_t        ft_allocated;
-       uint64_t        ft_misses;
-       uint64_t        ft_max_depth;
-       uint64_t        ft_free_checks;
-       uint64_t        ft_frees;
-       uint64_t        ft_hits;
-       uint64_t        ft_lookups;
-} __aligned(CACHE_LINE_SIZE);
-
 struct flowtable {
-       struct  flowtable_stats ft_stats[MAXCPU];
+       counter_u64_t   *ft_stat;
+       uma_zone_t      ft_zone;
        int             ft_size;
        int             ft_lock_count;
        uint32_t        ft_flags;
-       char            *ft_name;
+       uint32_t        ft_max_depth;
        fl_lock_t       *ft_lock;
        fl_lock_t       *ft_unlock;
        fl_rtalloc_t    *ft_rtalloc;
@@ -173,9 +163,7 @@ struct flowtable {
        union flentryp  ft_table;
        bitstr_t        *ft_masks[MAXCPU];
        bitstr_t        *ft_tmpmask;
-       struct flowtable *ft_next;
 
-       uint32_t        ft_count __aligned(CACHE_LINE_SIZE);
        uint32_t        ft_udp_idle __aligned(CACHE_LINE_SIZE);
        uint32_t        ft_fin_wait_idle;
        uint32_t        ft_syn_idle;
@@ -183,17 +171,12 @@ struct flowtable {
        boolean_t       ft_full;
 } __aligned(CACHE_LINE_SIZE);
 
-static struct proc *flowcleanerproc;
-static VNET_DEFINE(struct flowtable *, flow_list_head);
-static VNET_DEFINE(uint32_t, flow_hashjitter);
-static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
-static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
-
-#define        V_flow_list_head        VNET(flow_list_head)
-#define        V_flow_hashjitter       VNET(flow_hashjitter)
-#define        V_flow_ipv4_zone        VNET(flow_ipv4_zone)
-#define        V_flow_ipv6_zone        VNET(flow_ipv6_zone)
+#define        FLOWSTAT_ADD(ft, name, v)       \
+       counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
+#define        FLOWSTAT_INC(ft, name)  FLOWSTAT_ADD(ft, name, 1)
 
+static struct proc *flowcleanerproc;
+static uint32_t flow_hashjitter;
 
 static struct cv       flowclean_f_cv;
 static struct cv       flowclean_c_cv;
@@ -201,24 +184,8 @@ static struct mtx  flowclean_lock;
 static uint32_t                flowclean_cycles;
 static uint32_t                flowclean_freq;
 
-#ifdef FLOWTABLE_DEBUG
-#define FLDPRINTF(ft, flags, fmt, ...)                 \
-do {                                           \
-       if ((ft)->ft_flags & (flags))           \
-               printf((fmt), __VA_ARGS__);     \
-} while (0);                                   \
-
-#else
-#define FLDPRINTF(ft, flags, fmt, ...)
-
-#endif
-
-
 /*
  * TODO:
- * - Make flowtable stats per-cpu, aggregated at sysctl call time,
- *   to avoid extra cache evictions caused by incrementing a shared
- *   counter
  * - add sysctls to resize && flush flow tables
  * - Add per flowtable sysctls for statistics and configuring timeouts
  * - add saturation counter to rtentry to support per-packet load-balancing
@@ -230,148 +197,51 @@ do {                                             \
  * - support explicit connection state (currently only ad-hoc for DSR)
  * - idetach() cleanup for options VIMAGE builds.
  */
-VNET_DEFINE(int, flowtable_enable) = 1;
-static VNET_DEFINE(int, flowtable_debug);
+#ifdef INET
+static VNET_DEFINE(struct flowtable, ip4_ft);
+#define V_ip4_ft       VNET(ip4_ft)
+static uma_zone_t      flow_ipv4_zone;
+#endif
+#ifdef INET6
+static VNET_DEFINE(struct flowtable, ip6_ft);
+#define        V_ip6_ft        VNET(ip6_ft)
+static uma_zone_t      flow_ipv6_zone;
+#endif
+
+static VNET_DEFINE(int, flowtable_enable) = 1;
 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
-static VNET_DEFINE(int, flowtable_nmbflows);
-static VNET_DEFINE(int, flowtable_ready) = 0;
 
 #define        V_flowtable_enable              VNET(flowtable_enable)
-#define        V_flowtable_debug               VNET(flowtable_debug)
 #define        V_flowtable_syn_expire          VNET(flowtable_syn_expire)
 #define        V_flowtable_udp_expire          VNET(flowtable_udp_expire)
 #define        V_flowtable_fin_wait_expire     VNET(flowtable_fin_wait_expire)
 #define        V_flowtable_tcp_expire          VNET(flowtable_tcp_expire)
-#define        V_flowtable_nmbflows            VNET(flowtable_nmbflows)
-#define        V_flowtable_ready               VNET(flowtable_ready)
 
-static SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
+static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
     "flowtable");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
-    &VNET_NAME(flowtable_debug), 0, "print debug info.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
+SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
     &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
 
 /*
  * XXX This does not end up updating timeouts at runtime
  * and only reflects the value for the last table added :-/
  */
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
+SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
     &VNET_NAME(flowtable_syn_expire), 0,
     "seconds after which to remove syn allocated flow.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
+SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
     &VNET_NAME(flowtable_udp_expire), 0,
     "seconds after which to remove flow allocated to UDP.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
+SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
     &VNET_NAME(flowtable_fin_wait_expire), 0,
     "seconds after which to remove a flow in FIN_WAIT.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
+SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
     &VNET_NAME(flowtable_tcp_expire), 0,
     "seconds after which to remove flow allocated to a TCP connection.");
 
-
-/*
- * Maximum number of flows that can be allocated of a given type.
- *
- * The table is allocated at boot time (for the pure caching case
- * there is no reason why this could not be changed at runtime)
- * and thus (currently) needs to be set with a tunable.
- */
-static int
-sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
-{
-       int error, newnmbflows;
-
-       newnmbflows = V_flowtable_nmbflows;
-       error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
-       if (error == 0 && req->newptr) {
-               if (newnmbflows > V_flowtable_nmbflows) {
-                       V_flowtable_nmbflows = newnmbflows;
-                       uma_zone_set_max(V_flow_ipv4_zone,
-                           V_flowtable_nmbflows);
-                       uma_zone_set_max(V_flow_ipv6_zone,
-                           V_flowtable_nmbflows);
-               } else
-                       error = EINVAL;
-       }
-       return (error);
-}
-SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
-    CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
-    "Maximum number of flows allowed");
-
-
-
-#define FS_PRINT(sb, field)    sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
-
-static void
-fs_print(struct sbuf *sb, struct flowtable_stats *fs)
-{
-
-       FS_PRINT(sb, collisions);
-       FS_PRINT(sb, allocated);
-       FS_PRINT(sb, misses);
-       FS_PRINT(sb, max_depth);
-       FS_PRINT(sb, free_checks);
-       FS_PRINT(sb, frees);
-       FS_PRINT(sb, hits);
-       FS_PRINT(sb, lookups);
-}
-
-static void
-flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
-{
-       int i;
-       struct flowtable_stats fs, *pfs;
-
-       if (ft->ft_flags & FL_PCPU) {
-               bzero(&fs, sizeof(fs));
-               pfs = &fs;
-               CPU_FOREACH(i) {
-                       pfs->ft_collisions  += ft->ft_stats[i].ft_collisions;
-                       pfs->ft_allocated   += ft->ft_stats[i].ft_allocated;
-                       pfs->ft_misses      += ft->ft_stats[i].ft_misses;
-                       pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
-                       pfs->ft_frees       += ft->ft_stats[i].ft_frees;
-                       pfs->ft_hits        += ft->ft_stats[i].ft_hits;
-                       pfs->ft_lookups     += ft->ft_stats[i].ft_lookups;
-                       if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
-                               pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
-               }
-       } else {
-               pfs = &ft->ft_stats[0];
-       }
-       fs_print(sb, pfs);
-}
-
-static int
-sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
-{
-       struct flowtable *ft;
-       struct sbuf *sb;
-       int error;
-
-       sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
-
-       ft = V_flow_list_head;
-       while (ft != NULL) {
-               sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
-               flowtable_show_stats(sb, ft);
-               ft = ft->ft_next;
-       }
-       sbuf_finish(sb);
-       error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
-       sbuf_delete(sb);
-
-       return (error);
-}
-SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
-    NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
-
-
 #ifndef RADIX_MPATH
 static void
 rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
@@ -419,12 +289,8 @@ flowtable_pcpu_unlock(struct flowtable *
 #define FL_STALE       (1<<8)
 #define FL_OVERWRITE   (1<<10)
 
-void
-flow_invalidate(struct flentry *fle)
-{
-
-       fle->f_flags |= FL_STALE;
-}
+static struct flentry *flowtable_lookup_common(struct flowtable *,
+    struct sockaddr_storage *, struct sockaddr_storage *, struct mbuf *, int);
 
 static __inline int
 proto_to_flags(uint8_t proto)
@@ -495,8 +361,8 @@ ipv4_flow_print_tuple(int flags, int pro
 #endif
 
 static int
-ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
-    struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
+ipv4_mbuf_demarshal(struct mbuf *m, struct sockaddr_in *ssin,
+    struct sockaddr_in *dsin, uint16_t *flags)
 {
        struct ip *ip;
        uint8_t proto;
@@ -516,11 +382,8 @@ ipv4_mbuf_demarshal(struct flowtable *ft
        ssin->sin_addr = ip->ip_src;    
 
        proto = ip->ip_p;
-       if ((*flags & FL_HASH_ALL) == 0) {
-               FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
-                   *flags);
+       if ((*flags & FL_HASH_ALL) == 0)
                goto skipports;
-       }
 
        iphlen = ip->ip_hl << 2; /* XXX options? */
 
@@ -544,7 +407,6 @@ ipv4_mbuf_demarshal(struct flowtable *ft
                dport = sh->dest_port;
                break;
        default:
-               FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
                return (ENOTSUP);
                /* no port - hence not a protocol we care about */
                break;
@@ -559,7 +421,7 @@ skipports:
 }
 
 static uint32_t
-ipv4_flow_lookup_hash_internal(
+ipv4_flow_lookup_hash(
        struct sockaddr_in *ssin, struct sockaddr_in *dsin,
            uint32_t *key, uint16_t flags)
 {
@@ -567,8 +429,6 @@ ipv4_flow_lookup_hash_internal(
        uint8_t proto;
        int offset = 0;
 
-       if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
-               return (0);
        proto = flags_to_proto(flags);
        sport = dport = key[2] = key[1] = key[0] = 0;
        if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
@@ -583,13 +443,13 @@ ipv4_flow_lookup_hash_internal(
                ((uint16_t *)key)[0] = sport;
                ((uint16_t *)key)[1] = dport;
        } else
-               offset = V_flow_hashjitter + proto;
+               offset = flow_hashjitter + proto;
 
        return (jenkins_hash32(key, 3, offset));
 }
 
 static struct flentry *
-flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
+flowtable_lookup_ipv4(struct mbuf *m)
 {
        struct sockaddr_storage ssa, dsa;
        uint16_t flags;
@@ -599,11 +459,11 @@ flowtable_lookup_mbuf4(struct flowtable 
        ssin = (struct sockaddr_in *)&ssa;
        bzero(dsin, sizeof(*dsin));
        bzero(ssin, sizeof(*ssin));
-       flags = ft->ft_flags;
-       if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
+       flags = V_ip4_ft.ft_flags;
+       if (ipv4_mbuf_demarshal(m, ssin, dsin, &flags) != 0)
                return (NULL);
 
-       return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
+       return (flowtable_lookup_common(&V_ip4_ft, &ssa, &dsa, m, flags));
 }
 
 void
@@ -644,8 +504,8 @@ do {                                                                        \
 #define        UDP(p)          ((struct udphdr *)(p))
 
 static int
-ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
-    struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
+ipv6_mbuf_demarshal(struct mbuf *m, struct sockaddr_in6 *ssin6,
+    struct sockaddr_in6 *dsin6, uint16_t *flags)
 {
        struct ip6_hdr *ip6;
        uint8_t proto;
@@ -763,7 +623,7 @@ do {                                \
 } while (0)
        
 static uint32_t
-ipv6_flow_lookup_hash_internal(
+ipv6_flow_lookup_hash(
        struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
            uint32_t *key, uint16_t flags)
 {
@@ -771,9 +631,6 @@ ipv6_flow_lookup_hash_internal(
        uint8_t proto;
        int offset = 0;
 
-       if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
-               return (0);
-
        proto = flags_to_proto(flags);
        zero_key(key);
        sport = dport = 0;
@@ -789,13 +646,13 @@ ipv6_flow_lookup_hash_internal(
                ((uint16_t *)key)[0] = sport;
                ((uint16_t *)key)[1] = dport;
        } else
-               offset = V_flow_hashjitter + proto;
+               offset = flow_hashjitter + proto;
 
        return (jenkins_hash32(key, 9, offset));
 }
 
 static struct flentry *
-flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
+flowtable_lookup_ipv6(struct mbuf *m)
 {
        struct sockaddr_storage ssa, dsa;
        struct sockaddr_in6 *dsin6, *ssin6;     
@@ -805,12 +662,12 @@ flowtable_lookup_mbuf6(struct flowtable 
        ssin6 = (struct sockaddr_in6 *)&ssa;
        bzero(dsin6, sizeof(*dsin6));
        bzero(ssin6, sizeof(*ssin6));
-       flags = ft->ft_flags;
+       flags = V_ip6_ft.ft_flags;
        
-       if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
+       if (ipv6_mbuf_demarshal(m, ssin6, dsin6, &flags) != 0)
                return (NULL);
 
-       return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
+       return (flowtable_lookup_common(&V_ip6_ft, &ssa, &dsa, m, flags));
 }
 
 void
@@ -910,43 +767,19 @@ flowtable_set_hashkey(struct flentry *fl
                hashkey[i] = key[i];
 }
 
-static struct flentry *
-flow_alloc(struct flowtable *ft)
-{
-       struct flentry *newfle;
-       uma_zone_t zone;
-
-       newfle = NULL;
-       zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
-
-       newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
-       if (newfle != NULL)
-               atomic_add_int(&ft->ft_count, 1);
-       return (newfle);
-}
-
-static void
-flow_free(struct flentry *fle, struct flowtable *ft)
-{
-       uma_zone_t zone;
-
-       zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
-       atomic_add_int(&ft->ft_count, -1);
-       uma_zfree(zone, fle);
-}
-
 static int
 flow_full(struct flowtable *ft)
 {
        boolean_t full;
-       uint32_t count;
+       int count, max;
        
        full = ft->ft_full;
-       count = ft->ft_count;
+       count = uma_zone_get_cur(ft->ft_zone);
+       max = uma_zone_get_max(ft->ft_zone);
 
-       if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
+       if (full && (count < (max - (max >> 3))))
                ft->ft_full = FALSE;
-       else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
+       else if (!full && (count > (max - (max >> 5))))
                ft->ft_full = TRUE;
        
        if (full && !ft->ft_full) {
@@ -970,12 +803,11 @@ flowtable_insert(struct flowtable *ft, u
     uint32_t fibnum, struct route *ro, uint16_t flags)
 {
        struct flentry *fle, *fletail, *newfle, **flep;
-       struct flowtable_stats *fs = &ft->ft_stats[curcpu];
        int depth;
        bitstr_t *mask;
        uint8_t proto;
 
-       newfle = flow_alloc(ft);
+       newfle = uma_zalloc(ft->ft_zone, M_NOWAIT | M_ZERO);
        if (newfle == NULL)
                return (ENOMEM);
 
@@ -994,7 +826,7 @@ flowtable_insert(struct flowtable *ft, u
        }
        
        depth = 0;
-       fs->ft_collisions++;
+       FLOWSTAT_INC(ft, ft_collisions);
        /*
         * find end of list and make sure that we were not
         * preempted by another thread handling this flow
@@ -1006,7 +838,7 @@ flowtable_insert(struct flowtable *ft, u
                         * or we lost a race to insert
                         */
                        FL_ENTRY_UNLOCK(ft, hash);
-                       flow_free(newfle, ft);
+                       uma_zfree(ft->ft_zone, newfle);
                        
                        if (flags & FL_OVERWRITE)
                                goto skip;
@@ -1022,8 +854,8 @@ flowtable_insert(struct flowtable *ft, u
                fle = fle->f_next;
        }
 
-       if (depth > fs->ft_max_depth)
-               fs->ft_max_depth = depth;
+       if (depth > ft->ft_max_depth)
+               ft->ft_max_depth = depth;
        fletail->f_next = newfle;
        fle = newfle;
 skip:
@@ -1039,35 +871,6 @@ skip:
        return (0);
 }
 
-int
-kern_flowtable_insert(struct flowtable *ft,
-    struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
-    struct route *ro, uint32_t fibnum, int flags)
-{
-       uint32_t key[9], hash;
-
-       flags = (ft->ft_flags | flags | FL_OVERWRITE);
-       hash = 0;
-
-#ifdef INET
-       if (ssa->ss_family == AF_INET)
-               hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
-                   (struct sockaddr_in *)dsa, key, flags);
-#endif
-#ifdef INET6
-       if (ssa->ss_family == AF_INET6)
-               hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
-                   (struct sockaddr_in6 *)dsa, key, flags);
-#endif 
-       if (ro->ro_rt == NULL || ro->ro_lle == NULL)
-               return (EINVAL);
-
-       FLDPRINTF(ft, FL_DEBUG,
-           "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
-           key[0], key[1], key[2], hash, fibnum, flags);
-       return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
-}
-
 static int
 flowtable_key_equal(struct flentry *fle, uint32_t *key)
 {
@@ -1090,49 +893,54 @@ flowtable_key_equal(struct flentry *fle,
 }
 
 struct flentry *
-flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
+flowtable_lookup(sa_family_t sa, struct mbuf *m)
 {
-       struct flentry *fle = NULL;
 
+       switch (sa) {
 #ifdef INET
-       if (af == AF_INET)
-               fle = flowtable_lookup_mbuf4(ft, m);
+       case AF_INET:
+               return (flowtable_lookup_ipv4(m));
 #endif
 #ifdef INET6
-       if (af == AF_INET6)
-               fle = flowtable_lookup_mbuf6(ft, m);
-#endif 
-       if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
-               m->m_flags |= M_FLOWID;
-               m->m_pkthdr.flowid = fle->f_fhash;
+       case AF_INET6:
+               return (flowtable_lookup_ipv6(m));
+#endif
+       default:
+               panic("%s: sa %d", __func__, sa);
        }
-       return (fle);
 }
-       
-struct flentry *
-flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
-    struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
+
+static struct flentry *
+flowtable_lookup_common(struct flowtable *ft, struct sockaddr_storage *ssa,
+    struct sockaddr_storage *dsa, struct mbuf *m, int flags)
 {
-       uint32_t key[9], hash;
+       struct route_in6 sro6;
+       struct route sro, *ro;
        struct flentry *fle;
-       struct flowtable_stats *fs = &ft->ft_stats[curcpu];
-       uint8_t proto = 0;
-       int error = 0;
        struct rtentry *rt;
        struct llentry *lle;
-       struct route sro, *ro;
-       struct route_in6 sro6;
+       struct sockaddr_storage *l3addr;
+       struct ifnet *ifp;
+       uint32_t key[9], hash, fibnum;
+       uint8_t proto;
+
+       if (V_flowtable_enable == 0)
+               return (NULL);
 
        sro.ro_rt = sro6.ro_rt = NULL;
        sro.ro_lle = sro6.ro_lle = NULL;
-       ro = NULL;
-       hash = 0;
        flags |= ft->ft_flags;
        proto = flags_to_proto(flags);
+       fibnum = M_GETFIB(m);
+
+       switch (ssa->ss_family) {
 #ifdef INET
-       if (ssa->ss_family == AF_INET) {
+       case AF_INET: {
                struct sockaddr_in *ssin, *dsin;
 
+               KASSERT(dsa->ss_family == AF_INET,
+                   ("%s: dsa family %d\n", __func__, dsa->ss_family));
+
                ro = &sro;
                memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
                /*
@@ -1150,13 +958,17 @@ flowtable_lookup(struct flowtable *ft, s
                    (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
                        return (NULL);
 
-               hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
+               hash = ipv4_flow_lookup_hash(ssin, dsin, key, flags);
+               break;
        }
 #endif
 #ifdef INET6
-       if (ssa->ss_family == AF_INET6) {
+       case AF_INET6: {
                struct sockaddr_in6 *ssin6, *dsin6;
 
+               KASSERT(dsa->ss_family == AF_INET6,
+                   ("%s: dsa family %d\n", __func__, dsa->ss_family));
+
                ro = (struct route *)&sro6;
                memcpy(&sro6.ro_dst, dsa,
                    sizeof(struct sockaddr_in6));
@@ -1165,19 +977,24 @@ flowtable_lookup(struct flowtable *ft, s
                ssin6 = (struct sockaddr_in6 *)ssa;
 
                flags |= FL_IPV6;
-               hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
+               hash = ipv6_flow_lookup_hash(ssin6, dsin6, key, flags);
+               break;
        }
 #endif
+       default:
+               panic("%s: ssa family %d", __func__, ssa->ss_family);
+       }
+
        /*
         * Ports are zero and this isn't a transmit cache
         * - thus not a protocol for which we need to keep
         * state
         * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
         */
-       if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
+       if (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL))
                return (NULL);
 
-       fs->ft_lookups++;
+       FLOWSTAT_INC(ft, ft_lookups);
        FL_ENTRY_LOCK(ft, hash);
        if ((fle = FL_ENTRY(ft, hash)) == NULL) {
                FL_ENTRY_UNLOCK(ft, hash);
@@ -1195,11 +1012,11 @@ keycheck:       
            && (rt->rt_flags & RTF_UP)
            && (rt->rt_ifp != NULL)
            && (lle->la_flags & LLE_VALID)) {
-               fs->ft_hits++;
+               FLOWSTAT_INC(ft, ft_hits);
                fle->f_uptime = time_uptime;
                fle->f_flags |= flags;
                FL_ENTRY_UNLOCK(ft, hash);
-               return (fle);
+               goto success;
        } else if (fle->f_next != NULL) {
                fle = fle->f_next;
                goto keycheck;
@@ -1209,7 +1026,7 @@ uncached:
        if (flags & FL_NOAUTO || flow_full(ft))
                return (NULL);
 
-       fs->ft_misses++;
+       FLOWSTAT_INC(ft, ft_misses);
        /*
         * This bit of code ends up locking the
         * same route 3 times (just like ip_output + ether_output)
@@ -1222,73 +1039,66 @@ uncached:
         * receive the route locked
         */
 
-#ifdef INVARIANTS
-       if ((ro->ro_dst.sa_family != AF_INET) &&
-           (ro->ro_dst.sa_family != AF_INET6))
-               panic("sa_family == %d\n", ro->ro_dst.sa_family);
-#endif
-
        ft->ft_rtalloc(ro, hash, fibnum);
        if (ro->ro_rt == NULL)
-               error = ENETUNREACH;
-       else {
-               struct llentry *lle = NULL;
-               struct sockaddr_storage *l3addr;
-               struct rtentry *rt = ro->ro_rt;
-               struct ifnet *ifp = rt->rt_ifp;
+               return (NULL);
 
-               if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
-                       RTFREE(rt);
-                       ro->ro_rt = NULL;
-                       return (NULL);
-               }
-#ifdef INET6
-               if (ssa->ss_family == AF_INET6) {
-                       struct sockaddr_in6 *dsin6;
+       rt = ro->ro_rt;
+       ifp = rt->rt_ifp;
 
-                       dsin6 = (struct sockaddr_in6 *)dsa;                     
-                       if (in6_localaddr(&dsin6->sin6_addr)) {
-                               RTFREE(rt);
-                               ro->ro_rt = NULL;
-                               return (NULL);                          
-                       }
+       if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
+               RTFREE(rt);
+               return (NULL);
+       }
 
-                       if (rt->rt_flags & RTF_GATEWAY)
-                               l3addr = (struct sockaddr_storage *)rt->rt_gateway;
-                       
-                       else
-                               l3addr = (struct sockaddr_storage *)&ro->ro_dst;
-                       lle = llentry_alloc(ifp, LLTABLE6(ifp), l3addr);
-               }
-#endif 
+       switch (ssa->ss_family) {
 #ifdef INET
-               if (ssa->ss_family == AF_INET) {
-                       if (rt->rt_flags & RTF_GATEWAY)
-                               l3addr = (struct sockaddr_storage *)rt->rt_gateway;
-                       else
-                               l3addr = (struct sockaddr_storage *)&ro->ro_dst;
-                       lle = llentry_alloc(ifp, LLTABLE(ifp), l3addr); 
-               }
-                       
+       case AF_INET:
+               if (rt->rt_flags & RTF_GATEWAY)
+                       l3addr = (struct sockaddr_storage *)rt->rt_gateway;
+               else
+                       l3addr = (struct sockaddr_storage *)&ro->ro_dst;
+               lle = llentry_alloc(ifp, LLTABLE(ifp), l3addr); 
+               break;
 #endif
-               ro->ro_lle = lle;
+#ifdef INET6
+       case AF_INET6: {
+               struct sockaddr_in6 *dsin6;
 
-               if (lle == NULL) {
+               dsin6 = (struct sockaddr_in6 *)dsa;                     
+               if (in6_localaddr(&dsin6->sin6_addr)) {
                        RTFREE(rt);
-                       ro->ro_rt = NULL;
-                       return (NULL);
+                       return (NULL);                          
                }
-               error = flowtable_insert(ft, hash, key, fibnum, ro, flags);
 
-               if (error) {
-                       RTFREE(rt);
-                       LLE_FREE(lle);
-                       ro->ro_rt = NULL;
-                       ro->ro_lle = NULL;
-               }
+               if (rt->rt_flags & RTF_GATEWAY)
+                       l3addr = (struct sockaddr_storage *)rt->rt_gateway;
+               else
+                       l3addr = (struct sockaddr_storage *)&ro->ro_dst;
+               lle = llentry_alloc(ifp, LLTABLE6(ifp), l3addr);
+               break;
+       }
+#endif 
+       }
+
+       if (lle == NULL) {
+               RTFREE(rt);
+               return (NULL);
+       }
+       ro->ro_lle = lle;
+
+       if (flowtable_insert(ft, hash, key, fibnum, ro, flags) != 0) {
+               RTFREE(rt);
+               LLE_FREE(lle);
+               return (NULL);
        }
 
-       return ((error) ? NULL : fle);
+success:
+       if (fle != NULL && (m->m_flags & M_FLOWID) == 0) {
+               m->m_flags |= M_FLOWID;
+               m->m_pkthdr.flowid = fle->f_fhash;
+       }
+       return (fle);
 }
 
 /*
@@ -1296,37 +1106,24 @@ uncached:
  */
 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
        
-struct flowtable *
-flowtable_alloc(char *name, int nentry, int flags)
+static void
+flowtable_alloc(struct flowtable *ft)
 {
-       struct flowtable *ft, *fttail;
-       int i;
-
-       if (V_flow_hashjitter == 0)
-               V_flow_hashjitter = arc4random();
 
-       KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
-
-       ft = malloc(sizeof(struct flowtable),
-           M_RTABLE, M_WAITOK | M_ZERO);
-
-       ft->ft_name = name;
-       ft->ft_flags = flags;
-       ft->ft_size = nentry;
 #ifdef RADIX_MPATH
        ft->ft_rtalloc = rtalloc_mpath_fib;
 #else
        ft->ft_rtalloc = rtalloc_ign_wrapper;
 #endif
-       if (flags & FL_PCPU) {
+       if (ft->ft_flags & FL_PCPU) {
                ft->ft_lock = flowtable_pcpu_lock;
                ft->ft_unlock = flowtable_pcpu_unlock;
 
-               for (i = 0; i <= mp_maxid; i++) {
+               for (int i = 0; i <= mp_maxid; i++) {
                        ft->ft_table.pcpu[i] =
-                           malloc(nentry*sizeof(struct flentry *),
+                           malloc(ft->ft_size * sizeof(struct flentry *),
                                M_RTABLE, M_WAITOK | M_ZERO);
-                       ft->ft_masks[i] = bit_alloc(nentry);
+                       ft->ft_masks[i] = bit_alloc(ft->ft_size);
                }
        } else {
                ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
@@ -1335,23 +1132,24 @@ flowtable_alloc(char *name, int nentry, 
                ft->ft_lock = flowtable_global_lock;
                ft->ft_unlock = flowtable_global_unlock;
                ft->ft_table.global =
-                           malloc(nentry*sizeof(struct flentry *),
+                           malloc(ft->ft_size * sizeof(struct flentry *),
                                M_RTABLE, M_WAITOK | M_ZERO);
                ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
                                M_RTABLE, M_WAITOK | M_ZERO);
-               for (i = 0; i < ft->ft_lock_count; i++)
-                       mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
+               for (int i = 0; i < ft->ft_lock_count; i++)
+                       mtx_init(&ft->ft_locks[i], "flow", NULL,
+                           MTX_DEF | MTX_DUPOK);
 
-               ft->ft_masks[0] = bit_alloc(nentry);
+               ft->ft_masks[0] = bit_alloc(ft->ft_size);
        }
-       ft->ft_tmpmask = bit_alloc(nentry);
+       ft->ft_tmpmask = bit_alloc(ft->ft_size);
 
        /*
         * In the local transmit case the table truly is
         * just a cache - so everything is eligible for
         * replacement after 5s of non-use
         */
-       if (flags & FL_HASH_ALL) {
+       if (ft->ft_flags & FL_HASH_ALL) {
                ft->ft_udp_idle = V_flowtable_udp_expire;
                ft->ft_syn_idle = V_flowtable_syn_expire;
                ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
@@ -1361,20 +1159,6 @@ flowtable_alloc(char *name, int nentry, 
                    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
                
        }
-
-       /*
-        * hook in to the cleaner list
-        */
-       if (V_flow_list_head == NULL)
-               V_flow_list_head = ft;
-       else {
-               fttail = V_flow_list_head;
-               while (fttail->ft_next != NULL)
-                       fttail = fttail->ft_next;
-               fttail->ft_next = ft;
-       }
-
-       return (ft);
 }
 
 /*
@@ -1395,17 +1179,16 @@ fle_free(struct flentry *fle, struct flo
                RTFREE(rt);
        if (lle != NULL)
                LLE_FREE(lle);
-       flow_free(fle, ft);
+       uma_zfree(ft->ft_zone, fle);
 }
 
 static void
 flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
 {
-       int curbit = 0, count, tmpsize;
+       int curbit = 0, tmpsize;
        struct flentry *fle,  **flehead, *fleprev;
        struct flentry *flefreehead, *flefreetail, *fletmp;
        bitstr_t *mask, *tmpmask;
-       struct flowtable_stats *fs = &ft->ft_stats[curcpu];
 
        flefreehead = flefreetail = NULL;
        mask = flowtable_mask(ft);
@@ -1429,7 +1212,7 @@ flowtable_free_stale(struct flowtable *f
                flehead = flowtable_entry(ft, curbit);
                fle = fleprev = *flehead;
 
-               fs->ft_free_checks++;
+               FLOWSTAT_INC(ft, ft_free_checks);
 #ifdef DIAGNOSTIC
                if (fle == NULL && curbit > 0) {
                        log(LOG_ALERT,
@@ -1484,22 +1267,34 @@ flowtable_free_stale(struct flowtable *f
                tmpsize -= (curbit / 8) * 8;
                bit_ffs(tmpmask, tmpsize, &curbit);
        }
-       count = 0;
        while ((fle = flefreehead) != NULL) {
                flefreehead = fle->f_next;
-               count++;
-               fs->ft_frees++;
+               FLOWSTAT_INC(ft, ft_frees);
                fle_free(fle, ft);
        }
-       if (V_flowtable_debug && count)
-               log(LOG_DEBUG, "freed %d flow entries\n", count);
 }
 
 void
-flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
+flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
 {
+       struct flowtable *ft;
        int i;
 
+       switch (sa) {
+#ifdef INET
+       case AF_INET:
+               ft = &V_ip4_ft;
+               break;
+#endif
+#ifdef INET6
+       case AF_INET6:
+               ft = &V_ip6_ft;
+               break;
+#endif
+       default:
+               panic("%s: sa %d", __func__, sa);
+       }
+
        if (ft->ft_flags & FL_PCPU) {
                CPU_FOREACH(i) {
                        if (smp_started == 1) {
@@ -1522,34 +1317,29 @@ flowtable_route_flush(struct flowtable *
 }
 
 static void
-flowtable_clean_vnet(void)
+flowtable_clean_vnet(struct flowtable *ft)
 {
-       struct flowtable *ft;
-       int i;
-
-       ft = V_flow_list_head;
-       while (ft != NULL) {
-               if (ft->ft_flags & FL_PCPU) {
-                       CPU_FOREACH(i) {
-                               if (smp_started == 1) {
-                                       thread_lock(curthread);
-                                       sched_bind(curthread, i);
-                                       thread_unlock(curthread);
-                               }
 
-                               flowtable_free_stale(ft, NULL);
+       if (ft->ft_flags & FL_PCPU) {
+               int i;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to