>Number:         164901
>Category:       kern
>Synopsis:       [regression] [patch] [lagg] igb/lagg poor traffic distribution
>Confidential:   no
>Severity:       serious
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Wed Feb 08 09:00:20 UTC 2012
>Closed-Date:
>Last-Modified:
>Originator:     Eugene Grosbein
>Release:        FreeBSD 8.2-STABLE i386
>Organization:
RDTC JSC
>Environment:
System: FreeBSD eg.sd.rdtc.ru 8.2-STABLE FreeBSD 8.2-STABLE #36: Fri Dec 23 
15:04:05 NOVT 2011 r...@eg.sd.rdtc.ru:/usr/local/obj/usr/local/src/sys/EG i386

>Description:

        Suppose we have a router (BRAS) using two lagg(4) interfaces in LACP
        mode.

        Two-port lagg0 has IP address and its ports carry untagged IPoE frames.
        lagg1 has no IP address and has two ports (82576-based igb0 and igb1)
        that carry 1000 dot-q vlans with PPPoE frames only.

        In RELENG_7, lagg(4) evenly distributes traffic going from lagg1 to
        lagg0. Since 8.0-RELEASE, all of this traffic goes out through only
        one of lagg0's ports.

        82576-based NICs and igb(4) support Microsoft Receive-Side Scaling 
(RSS),
        see 
http://download.intel.com/design/network/datashts/82576_Datasheet.pdf
        
        RSS specifies that the queue number for non-IP frames (PPPoE/GRE/etc.)
        is not computed with a hash. So all these frames get the same (zero)
        queue number, and igb(4) assigns tag M_FLOWID=0 to their mbufs.

        Since 8.0-RELEASE, lagg(4) skips its own hash computation for mbufs
        that have the M_FLOWID tag attached. Hence, it directs all such traffic
        to its first port only in this setup.

>How-To-Repeat:
        
        See above.

>Fix:

        The following patch fixes the regression by introducing new sysctls
        that disable usage of M_FLOWID per lagg interface:

net.link.lagg.0.use_flowid
net.link.lagg.1.use_flowid

        The default value is 1, which corresponds to the current behaviour of
        lagg(4). To fix our issue, we set net.link.lagg.0.use_flowid=0,
        which restores pre-8 behaviour for lagg0 only, so that it ignores the
        misleading M_FLOWID assigned to mbufs by lagg1's ports.

--- sys/net/if_lagg.h.orig      2010-12-27 12:59:59.000000000 +0600
+++ sys/net/if_lagg.h   2012-01-23 16:34:15.000000000 +0700
@@ -21,6 +21,8 @@
 #ifndef _NET_LAGG_H
 #define _NET_LAGG_H
 
+#include <sys/sysctl.h>
+
 /*
  * Global definitions
  */
@@ -202,6 +204,8 @@ struct lagg_softc {
        eventhandler_tag vlan_attach;
        eventhandler_tag vlan_detach;
 #endif
+       struct sysctl_ctx_list          ctx;            /* sysctl variables */
+       int                             use_flowid;     /* use M_FLOWID */
 };
 
 struct lagg_port {
--- sys/net/if_lagg.c.orig      2011-08-08 19:16:42.000000000 +0700
+++ sys/net/if_lagg.c   2012-01-23 16:33:04.000000000 +0700
@@ -257,6 +257,8 @@ lagg_clone_create(struct if_clone *ifc, 
        struct ifnet *ifp;
        int i, error = 0;
        static const u_char eaddr[6];   /* 00:00:00:00:00:00 */
+       struct sysctl_oid *oid;
+       char num[14];                   /* sufficient for 32 bits */
 
        sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
        ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
@@ -265,6 +267,15 @@ lagg_clone_create(struct if_clone *ifc, 
                return (ENOSPC);
        }
 
+       sysctl_ctx_init(&sc->ctx);
+       snprintf(num, sizeof(num), "%u", unit);
+       sc->use_flowid = 1;
+       oid = SYSCTL_ADD_NODE(&sc->ctx, &SYSCTL_NODE_CHILDREN(_net_link, lagg),
+               OID_AUTO, num, CTLFLAG_RD, NULL, "");
+       SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
+               "use_flowid", CTLTYPE_INT|CTLFLAG_RW, &sc->use_flowid, sc->use_flowid,
+               "Use flow id for load sharing");
+
        sc->sc_proto = LAGG_PROTO_NONE;
        for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) {
                if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) {
@@ -344,6 +355,7 @@ lagg_clone_destroy(struct ifnet *ifp)
 
        LAGG_WUNLOCK(sc);
 
+       sysctl_ctx_free(&sc->ctx);
        ifmedia_removeall(&sc->sc_media);
        ether_ifdetach(ifp);
        if_free_type(ifp, IFT_ETHER);
@@ -1668,7 +1680,7 @@ lagg_lb_start(struct lagg_softc *sc, str
        struct lagg_port *lp = NULL;
        uint32_t p = 0;
 
-       if (m->m_flags & M_FLOWID)
+       if (sc->use_flowid && (m->m_flags & M_FLOWID))
                p = m->m_pkthdr.flowid;
        else
                p = lagg_hashmbuf(m, lb->lb_key);
--- sys/net/ieee8023ad_lacp.c.orig      2009-08-03 16:13:06.000000000 +0800
+++ sys/net/ieee8023ad_lacp.c   2012-01-23 13:44:00.000000000 +0700
@@ -812,7 +812,7 @@ lacp_select_tx_port(struct lagg_softc *s
                return (NULL);
        }
 
-       if (m->m_flags & M_FLOWID)
+       if (sc->use_flowid && (m->m_flags & M_FLOWID))
                hash = m->m_pkthdr.flowid;
        else
                hash = lagg_hashmbuf(m, lsc->lsc_hashkey);
>Release-Note:
>Audit-Trail:
>Unformatted:
_______________________________________________
freebsd-bugs@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-bugs
To unsubscribe, send any mail to "freebsd-bugs-unsubscr...@freebsd.org"

Reply via email to