On Fri, Apr 14, 2023 at 10:26:14AM +0800, Kevin Lo wrote:
> On Thu, Apr 13, 2023 at 01:30:36PM -0500, Brian Conway wrote:
> > Reviving this thread, apologies for discontinuity in mail readers: 
> > https://marc.info/?t=165642193500008
> > 
> > After rebasing on 7.3, my results have mirrored Hrvoje's testing at
> > the end of that thread. No issues with throughput, unusual latency,
> > or reliability. `vmstat -i` shows some level of balancing between
> > the queues. I've been testing on as many em(4) systems as I have
> > access to, some manually, some in a packet forwarder/firewall
> > scenarios:
> 
> Last time I tested (about a year ago) on I211, rx locked up if I tried
> something like iperf3 or tcpbench.  Don't know if you have a similar
> problem.

I rebased the rest to current and tested it with tcpbench between the
following interfaces:

em0 at pci7 dev 0 function 0 "Intel 82580" rev 0x01, msix, 4 queues, address 90:e2:ba:df:d5:2c
em0 at pci5 dev 0 function 0 "Intel I350" rev 0x01, msix, 8 queues, address 00:25:90:eb:b3:c2

After about a second the connection got stuck.  As far as I can see,
the problem is on the sending side.

ot45# tcpbench 192.168.99.3
  elapsed_ms          bytes         mbps   bwidth
        1012       14574120      115.210  100.00%
Conn:   1 Mbps:      115.210 Peak Mbps:      115.210 Avg Mbps:      115.210
        2022              0        0.000    -nan%
...

ot46# tcpbench -s
  elapsed_ms          bytes         mbps   bwidth
        1017       14313480      112.594  100.00%
Conn:   1 Mbps:      112.594 Peak Mbps:      112.594 Avg Mbps:      112.594
        2027              0        0.000    -nan%
...

ot45# netstat  -nf inet -p tcp
Active Internet connections
Proto   Recv-Q Send-Q  Local Address          Foreign Address        TCP-State
tcp          0 260640  192.168.99.1.18530     192.168.99.3.12345     CLOSING

When I retried it, it sometimes worked, but most of the time it did not.

kstat tells me that transmit queues 1 to 3 are stuck in the oactive
state and only queue 0 works:

em0:0:txq:0
         packets: 4042648 packets
           bytes: 5310138322 bytes
          qdrops: 9 packets
          errors: 0 packets
            qlen: 0 packets
         maxqlen: 511 packets
         oactive: false
em0:0:txq:1
         packets: 9812 packets
           bytes: 14846716 bytes
          qdrops: 0 packets
          errors: 0 packets
            qlen: 184 packets
         maxqlen: 511 packets
         oactive: true
em0:0:txq:2
         packets: 690362 packets
           bytes: 60011484 bytes
          qdrops: 0 packets
          errors: 0 packets
            qlen: 185 packets
         maxqlen: 511 packets
         oactive: true
em0:0:txq:3
         packets: 443181 packets
           bytes: 43829886 bytes
          qdrops: 0 packets
          errors: 0 packets
            qlen: 198 packets
         maxqlen: 511 packets
         oactive: true

This is the diff, rebased on -current, that I tested:

Index: dev/pci/files.pci
===================================================================
RCS file: /cvs/src/sys/dev/pci/files.pci,v
retrieving revision 1.361
diff -u -p -r1.361 files.pci
--- dev/pci/files.pci   23 Apr 2023 00:20:26 -0000      1.361
+++ dev/pci/files.pci   25 Apr 2023 11:25:47 -0000
@@ -334,7 +334,7 @@ attach      fxp at pci with fxp_pci
 file   dev/pci/if_fxp_pci.c            fxp_pci
 
 # Intel Pro/1000
-device em: ether, ifnet, ifmedia
+device em: ether, ifnet, ifmedia, intrmap, stoeplitz
 attach em at pci
 file   dev/pci/if_em.c                 em
 file   dev/pci/if_em_hw.c              em
Index: dev/pci/if_em.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_em.c,v
retrieving revision 1.365
diff -u -p -r1.365 if_em.c
--- dev/pci/if_em.c     9 Feb 2023 21:21:27 -0000       1.365
+++ dev/pci/if_em.c     25 Apr 2023 11:25:47 -0000
@@ -247,6 +247,7 @@ int  em_intr(void *);
 int  em_allocate_legacy(struct em_softc *);
 void em_start(struct ifqueue *);
 int  em_ioctl(struct ifnet *, u_long, caddr_t);
+int  em_rxrinfo(struct em_softc *, struct if_rxrinfo *);
 void em_watchdog(struct ifnet *);
 void em_init(void *);
 void em_stop(void *, int);
@@ -309,8 +310,10 @@ int  em_setup_queues_msix(struct em_soft
 int  em_queue_intr_msix(void *);
 int  em_link_intr_msix(void *);
 void em_enable_queue_intr_msix(struct em_queue *);
+void em_setup_rss(struct em_softc *);
 #else
 #define em_allocate_msix(_sc)  (-1)
+#define em_setup_rss(_sc)      0
 #endif
 
 #if NKSTAT > 0
@@ -333,7 +336,6 @@ struct cfdriver em_cd = {
 };
 
 static int em_smart_pwr_down = FALSE;
-int em_enable_msix = 0;
 
 /*********************************************************************
  *  Device identification routine
@@ -629,12 +631,12 @@ err_pci:
 void
 em_start(struct ifqueue *ifq)
 {
+       struct em_queue *que = ifq->ifq_softc;
        struct ifnet *ifp = ifq->ifq_if;
        struct em_softc *sc = ifp->if_softc;
        u_int head, free, used;
        struct mbuf *m;
        int post = 0;
-       struct em_queue *que = sc->queues; /* Use only first queue. */
 
        if (!sc->link_active) {
                ifq_purge(ifq);
@@ -769,8 +771,7 @@ em_ioctl(struct ifnet *ifp, u_long comma
                break;
 
        case SIOCGIFRXR:
-               error = if_rxr_ioctl((struct if_rxrinfo *)ifr->ifr_data,
-                   NULL, EM_MCLBYTES, &sc->queues->rx.sc_rx_ring);
+               error = em_rxrinfo(sc, (struct if_rxrinfo *)ifr->ifr_data);
                break;
 
        case SIOCGIFSFFPAGE:
@@ -801,6 +802,32 @@ em_ioctl(struct ifnet *ifp, u_long comma
        return (error);
 }
 
+int
+em_rxrinfo(struct em_softc *sc, struct if_rxrinfo *ifri)
+{
+       struct if_rxring_info *ifr;
+       struct em_queue *que;
+       int i;
+       int error;
+
+       ifr = mallocarray(sc->num_queues, sizeof(*ifr), M_TEMP,
+           M_WAITOK | M_ZERO | M_CANFAIL);
+       if (ifr == NULL)
+               return (ENOMEM);
+
+       i = 0;
+       FOREACH_QUEUE(sc, que) {
+               ifr[i].ifr_size = EM_MCLBYTES;
+               ifr[i].ifr_info = que->rx.sc_rx_ring;
+               i++;
+       }
+
+       error = if_rxr_info_ioctl(ifri, sc->num_queues, ifr);
+       free(ifr, M_TEMP, sc->num_queues * sizeof(*ifr));
+
+       return (error);
+}
+
 /*********************************************************************
  *  Watchdog entry point
  *
@@ -812,21 +839,22 @@ void
 em_watchdog(struct ifnet *ifp)
 {
        struct em_softc *sc = ifp->if_softc;
-       struct em_queue *que = sc->queues; /* Use only first queue. */
-
+       struct em_queue *que;
 
-       /* If we are in this routine because of pause frames, then
-        * don't reset the hardware.
-        */
-       if (E1000_READ_REG(&sc->hw, STATUS) & E1000_STATUS_TXOFF) {
-               ifp->if_timer = EM_TX_TIMEOUT;
-               return;
+       FOREACH_QUEUE(sc, que) {
+               /* If we are in this routine because of pause frames, then
+                * don't reset the hardware.
+                */
+               if (E1000_READ_REG(&sc->hw, STATUS) & E1000_STATUS_TXOFF) {
+                       ifp->if_timer = EM_TX_TIMEOUT;
+                       return;
+               }
+               printf("%s: watchdog queue %d: head %u tail %u TDH %u TDT %u\n",
+                   DEVNAME(sc), que->me,
+                   que->tx.sc_tx_desc_head, que->tx.sc_tx_desc_tail,
+                   E1000_READ_REG(&sc->hw, TDH(que->me)),
+                   E1000_READ_REG(&sc->hw, TDT(que->me)));
        }
-       printf("%s: watchdog: head %u tail %u TDH %u TDT %u\n",
-           DEVNAME(sc),
-           que->tx.sc_tx_desc_head, que->tx.sc_tx_desc_tail,
-           E1000_READ_REG(&sc->hw, TDH(que->me)),
-           E1000_READ_REG(&sc->hw, TDT(que->me)));
 
        em_init(sc);
 
@@ -1669,7 +1697,6 @@ em_allocate_pci_resources(struct em_soft
 {
        int             val, rid;
        struct pci_attach_args *pa = &sc->osdep.em_pa;
-       struct em_queue        *que = NULL;
 
        val = pci_conf_read(pa->pa_pc, pa->pa_tag, EM_MMBA);
        if (PCI_MAPREG_TYPE(val) != PCI_MAPREG_TYPE_MEM) {
@@ -1742,18 +1769,6 @@ em_allocate_pci_resources(struct em_soft
        sc->osdep.dev = (struct device *)sc;
        sc->hw.back = &sc->osdep;
 
-       /* Only one queue for the moment. */
-       que = malloc(sizeof(struct em_queue), M_DEVBUF, M_NOWAIT | M_ZERO);
-       if (que == NULL) {
-               printf(": unable to allocate queue memory\n");
-               return (ENOMEM);
-       }
-       que->me = 0;
-       que->sc = sc;
-       timeout_set(&que->rx_refill, em_rxrefill, que);
-
-       sc->queues = que;
-       sc->num_queues = 1;
        sc->msix = 0;
        sc->legacy_irq = 0;
        if (em_allocate_msix(sc) && em_allocate_legacy(sc))
@@ -1826,11 +1841,7 @@ em_free_pci_resources(struct em_softc *s
        sc->legacy_irq = 0;
        sc->msix_linkvec = 0;
        sc->msix_queuesmask = 0;
-       if (sc->queues)
-               free(sc->queues, M_DEVBUF,
-                   sc->num_queues * sizeof(struct em_queue));
        sc->num_queues = 0;
-       sc->queues = NULL;
 }
 
 /*********************************************************************
@@ -1949,8 +1960,10 @@ void
 em_setup_interface(struct em_softc *sc)
 {
        struct ifnet   *ifp;
+       struct em_queue *que;
        uint64_t fiber_type = IFM_1000_SX;
-
+       int i;
+       
        INIT_DEBUGOUT("em_setup_interface: begin");
 
        ifp = &sc->sc_ac.ac_if;
@@ -2012,6 +2025,22 @@ em_setup_interface(struct em_softc *sc)
 
        if_attach(ifp);
        ether_ifattach(ifp);
+
+       if_attach_iqueues(ifp, sc->num_queues);
+       if_attach_queues(ifp, sc->num_queues);
+
+       i = 0;
+       FOREACH_QUEUE(sc, que) {
+               que->me = i;
+               que->sc = sc;
+
+               ifp->if_iqs[i]->ifiq_softc = que;
+               ifp->if_ifqs[i]->ifq_softc = que;
+
+               timeout_set(&que->rx_refill, em_rxrefill, que);
+               i++;
+       }
+
        em_enable_intr(sc);
 }
 
@@ -2820,6 +2849,9 @@ em_initialize_receive_unit(struct em_sof
        if (sc->hw.mac_type == em_82573)
                E1000_WRITE_REG(&sc->hw, RDTR, 0x20);
 
+       if (sc->num_queues > 1)
+               em_setup_rss(sc);
+
        FOREACH_QUEUE(sc, que) {
                if (sc->num_queues > 1) {
                        /*
@@ -3487,6 +3519,12 @@ em_allocate_legacy(struct em_softc *sc)
                }
                sc->legacy_irq = 1;
        }
+       sc->num_queues = 1;
+       sc->queues = malloc(sizeof(struct em_queue), M_DEVBUF, M_NOWAIT | 
M_ZERO);
+       if (sc->queues == NULL) {
+               printf(": couldn't allocate queues\n");
+               return (ENOMEM);
+       }
 
        intrstr = pci_intr_string(pc, ih);
        sc->sc_intrhand = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE,
@@ -3869,44 +3907,66 @@ em_allocate_msix(struct em_softc *sc)
        const char              *intrstr = NULL;
        struct pci_attach_args  *pa = &sc->osdep.em_pa;
        pci_chipset_tag_t        pc = pa->pa_pc;
-       struct em_queue         *que = sc->queues; /* Use only first queue. */
+       struct em_queue         *que;
+       int                      nmsix;
        int                      vec;
-
-       if (!em_enable_msix)
-               return (ENODEV);
+       int                      max_queues;
 
        switch (sc->hw.mac_type) {
        case em_82576:
        case em_82580:
        case em_i350:
+               max_queues = 8;
+               break;
        case em_i210:
+               if (sc->hw.device_id == PCI_PRODUCT_INTEL_I211_COPPER)
+                       max_queues = 2;
+               else
+                       max_queues = 4;
                break;
        default:
                return (ENODEV);
        }
 
+       /* if we only have one vector, just use msi */
+       nmsix = pci_intr_msix_count(pa);
+       if (nmsix < 2)
+               return (ENODEV);
+
        vec = 0;
        if (pci_intr_map_msix(pa, vec, &ih))
                return (ENODEV);
        sc->msix = 1;
 
-       que->me = vec;
-       que->eims = 1 << vec;
-       snprintf(que->name, sizeof(que->name), "%s:%d", DEVNAME(sc), vec);
+       nmsix--;
+       sc->intrmap = intrmap_create(&sc->sc_dev, nmsix, max_queues, 
INTRMAP_POWEROF2);
+       sc->num_queues = intrmap_count(sc->intrmap);
+       KASSERT(sc->num_queues > 0);
+       KASSERT(powerof2(sc->num_queues));
 
-       intrstr = pci_intr_string(pc, ih);
-       que->tag = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE,
-           em_queue_intr_msix, que, que->name);
-       if (que->tag == NULL) {
-               printf(": couldn't establish interrupt");
-               if (intrstr != NULL)
-                       printf(" at %s", intrstr);
-               printf("\n");
-               return (ENXIO);
+       sc->queues = mallocarray(sizeof(*que), sc->num_queues, M_DEVBUF, 
M_NOWAIT | M_ZERO);
+       if (sc->queues == NULL)
+               return (ENOMEM);
+
+       FOREACH_QUEUE(sc, que) {
+               que->eims = 1 << vec;
+               snprintf(que->name, sizeof(que->name), "%s:%d", DEVNAME(sc), 
vec);
+
+               if (pci_intr_map_msix(pa, vec, &ih)) {
+                       printf(": unable to map msi-x vector %d", vec);
+                       return (ENXIO);
+               }
+
+               que->tag = pci_intr_establish_cpu(pc, ih, IPL_NET | IPL_MPSAFE,
+                   intrmap_cpu(sc->intrmap, vec), em_queue_intr_msix, que, 
que->name);
+               if (que->tag == NULL) {
+                       printf(": couldn't establish queue interrupt %d\n", 
vec);
+                       return (ENXIO);
+               }
+               vec++;
        }
 
        /* Setup linkvector, use last queue vector + 1 */
-       vec++;
        sc->msix_linkvec = vec;
        if (pci_intr_map_msix(pa, sc->msix_linkvec, &ih)) {
                printf(": couldn't map link vector\n");
@@ -4096,6 +4156,40 @@ void
 em_enable_queue_intr_msix(struct em_queue *que)
 {
        E1000_WRITE_REG(&que->sc->hw, EIMS, que->eims);
+}
+
+void
+em_setup_rss(struct em_softc *sc)
+{
+       uint32_t rss_key[10];
+       uint32_t mrqc;
+       uint32_t reta;
+       int i;
+       int queue_id;
+
+       /* set redirection table to round robin across queues */
+       reta = 0;
+       for (i = 0; i < 128; i++) {
+               queue_id = i % sc->num_queues;
+               reta = reta >> 8;
+               reta = reta | (((uint32_t) queue_id) << 24);
+               if ((i & 3) == 3) {
+                       E1000_WRITE_REG(&sc->hw, RETA(i >> 2), reta);
+                       reta = 0;
+               }
+       }
+
+       stoeplitz_to_key(rss_key, sizeof(rss_key));
+       for (i = 0; i < nitems(rss_key); i++)
+               E1000_WRITE_REG_ARRAY(&sc->hw, RSSRK(0), i, rss_key[i]);
+
+       mrqc = E1000_MRQC_ENABLE_RSS_8Q;
+       mrqc |= E1000_MRQC_RSS_FIELD_IPV4 | E1000_MRQC_RSS_FIELD_IPV4_TCP
+           | E1000_MRQC_RSS_FIELD_IPV6 | E1000_MRQC_RSS_FIELD_IPV6_TCP
+           | E1000_MRQC_RSS_FIELD_IPV4_UDP | E1000_MRQC_RSS_FIELD_IPV6_UDP
+           | E1000_MRQC_RSS_FIELD_IPV6_UDP_EX | 
E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
+       mrqc |= 0 << 3;         /* default queue */
+       E1000_WRITE_REG(&sc->hw, MRQC, mrqc);
 }
 #endif /* !SMALL_KERNEL */
 
Index: dev/pci/if_em.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_em.h,v
retrieving revision 1.80
diff -u -p -r1.80 if_em.h
--- dev/pci/if_em.h     9 Jan 2022 05:42:50 -0000       1.80
+++ dev/pci/if_em.h     25 Apr 2023 11:25:47 -0000
@@ -52,9 +52,11 @@ POSSIBILITY OF SUCH DAMAGE.
 #include <sys/timeout.h>
 #include <sys/atomic.h>
 #include <sys/kstat.h>
+#include <sys/intrmap.h>
 
 #include <net/if.h>
 #include <net/if_media.h>
+#include <net/toeplitz.h>
 
 #include <netinet/in.h>
 #include <netinet/ip.h>
@@ -449,6 +451,7 @@ struct em_softc {
        uint32_t                 msix_queuesmask;
        int                      num_queues;
        struct em_queue         *queues;
+       struct intrmap          *intrmap;
 
        struct kstat            *kstat;
        struct mutex             kstat_mtx;

Reply via email to