On Fri, Apr 14, 2023 at 10:26:14AM +0800, Kevin Lo wrote: > On Thu, Apr 13, 2023 at 01:30:36PM -0500, Brian Conway wrote: > > Reviving this thread, apologies for discontinuity in mail readers: > > https://marc.info/?t=165642193500008 > > > > After rebasing on 7.3, my results have mirrored Hrvoje's testing at > > the end of that thread. No issues with throughput, unusual latency, > > or reliability. `vmstat -i` shows some level of balancing between > > the queues. I've been testing on as many em(4) systems as I have > > access to, some manually, some in packet forwarder/firewall > > scenarios: > > Last time I tested (about a year ago) on I211, rx locked up if I tried > something like iperf3 or tcpbench. Don't know if you have a similar problem.
I rebased the rest to current and tested it with tcpbench between the following interfaces: em0 at pci7 dev 0 function 0 "Intel 82580" rev 0x01, msix, 4 queues, address 90:e2:ba:df:d5:2c em0 at pci5 dev 0 function 0 "Intel I350" rev 0x01, msix, 8 queues, address 00:25:90:eb:b3:c2 After a second the connection got stuck. As far as I can see, the sending side has a problem. ot45# tcpbench 192.168.99.3 elapsed_ms bytes mbps bwidth 1012 14574120 115.210 100.00% Conn: 1 Mbps: 115.210 Peak Mbps: 115.210 Avg Mbps: 115.210 2022 0 0.000 -nan% ... ot46# tcpbench -s elapsed_ms bytes mbps bwidth 1017 14313480 112.594 100.00% Conn: 1 Mbps: 112.594 Peak Mbps: 112.594 Avg Mbps: 112.594 2027 0 0.000 -nan% ... ot45# netstat -nf inet -p tcp Active Internet connections Proto Recv-Q Send-Q Local Address Foreign Address TCP-State tcp 0 260640 192.168.99.1.18530 192.168.99.3.12345 CLOSING When I retried it, it sometimes worked, but most of the time it did not. kstat tells me that transmit queues 1 to 3 are oactive and only queue 0 works: em0:0:txq:0 packets: 4042648 packets bytes: 5310138322 bytes qdrops: 9 packets errors: 0 packets qlen: 0 packets maxqlen: 511 packets oactive: false em0:0:txq:1 packets: 9812 packets bytes: 14846716 bytes qdrops: 0 packets errors: 0 packets qlen: 184 packets maxqlen: 511 packets oactive: true em0:0:txq:2 packets: 690362 packets bytes: 60011484 bytes qdrops: 0 packets errors: 0 packets qlen: 185 packets maxqlen: 511 packets oactive: true em0:0:txq:3 packets: 443181 packets bytes: 43829886 bytes qdrops: 0 packets errors: 0 packets qlen: 198 packets maxqlen: 511 packets oactive: true This is the rebased diff against current that I tested: Index: dev/pci/files.pci =================================================================== RCS file: /cvs/src/sys/dev/pci/files.pci,v retrieving revision 1.361 diff -u -p -r1.361 files.pci --- dev/pci/files.pci 23 Apr 2023 00:20:26 -0000 1.361 +++ dev/pci/files.pci 25 Apr 2023 11:25:47 -0000 @@ -334,7 +334,7 @@ attach fxp at pci with fxp_pci file 
dev/pci/if_fxp_pci.c fxp_pci # Intel Pro/1000 -device em: ether, ifnet, ifmedia +device em: ether, ifnet, ifmedia, intrmap, stoeplitz attach em at pci file dev/pci/if_em.c em file dev/pci/if_em_hw.c em Index: dev/pci/if_em.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_em.c,v retrieving revision 1.365 diff -u -p -r1.365 if_em.c --- dev/pci/if_em.c 9 Feb 2023 21:21:27 -0000 1.365 +++ dev/pci/if_em.c 25 Apr 2023 11:25:47 -0000 @@ -247,6 +247,7 @@ int em_intr(void *); int em_allocate_legacy(struct em_softc *); void em_start(struct ifqueue *); int em_ioctl(struct ifnet *, u_long, caddr_t); +int em_rxrinfo(struct em_softc *, struct if_rxrinfo *); void em_watchdog(struct ifnet *); void em_init(void *); void em_stop(void *, int); @@ -309,8 +310,10 @@ int em_setup_queues_msix(struct em_soft int em_queue_intr_msix(void *); int em_link_intr_msix(void *); void em_enable_queue_intr_msix(struct em_queue *); +void em_setup_rss(struct em_softc *); #else #define em_allocate_msix(_sc) (-1) +#define em_setup_rss(_sc) 0 #endif #if NKSTAT > 0 @@ -333,7 +336,6 @@ struct cfdriver em_cd = { }; static int em_smart_pwr_down = FALSE; -int em_enable_msix = 0; /********************************************************************* * Device identification routine @@ -629,12 +631,12 @@ err_pci: void em_start(struct ifqueue *ifq) { + struct em_queue *que = ifq->ifq_softc; struct ifnet *ifp = ifq->ifq_if; struct em_softc *sc = ifp->if_softc; u_int head, free, used; struct mbuf *m; int post = 0; - struct em_queue *que = sc->queues; /* Use only first queue. 
*/ if (!sc->link_active) { ifq_purge(ifq); @@ -769,8 +771,7 @@ em_ioctl(struct ifnet *ifp, u_long comma break; case SIOCGIFRXR: - error = if_rxr_ioctl((struct if_rxrinfo *)ifr->ifr_data, - NULL, EM_MCLBYTES, &sc->queues->rx.sc_rx_ring); + error = em_rxrinfo(sc, (struct if_rxrinfo *)ifr->ifr_data); break; case SIOCGIFSFFPAGE: @@ -801,6 +802,32 @@ em_ioctl(struct ifnet *ifp, u_long comma return (error); } +int +em_rxrinfo(struct em_softc *sc, struct if_rxrinfo *ifri) +{ + struct if_rxring_info *ifr; + struct em_queue *que; + int i; + int error; + + ifr = mallocarray(sc->num_queues, sizeof(*ifr), M_TEMP, + M_WAITOK | M_ZERO | M_CANFAIL); + if (ifr == NULL) + return (ENOMEM); + + i = 0; + FOREACH_QUEUE(sc, que) { + ifr[i].ifr_size = EM_MCLBYTES; + ifr[i].ifr_info = que->rx.sc_rx_ring; + i++; + } + + error = if_rxr_info_ioctl(ifri, sc->num_queues, ifr); + free(ifr, M_TEMP, sc->num_queues * sizeof(*ifr)); + + return (error); +} + /********************************************************************* * Watchdog entry point * @@ -812,21 +839,22 @@ void em_watchdog(struct ifnet *ifp) { struct em_softc *sc = ifp->if_softc; - struct em_queue *que = sc->queues; /* Use only first queue. */ - + struct em_queue *que; - /* If we are in this routine because of pause frames, then - * don't reset the hardware. - */ - if (E1000_READ_REG(&sc->hw, STATUS) & E1000_STATUS_TXOFF) { - ifp->if_timer = EM_TX_TIMEOUT; - return; + FOREACH_QUEUE(sc, que) { + /* If we are in this routine because of pause frames, then + * don't reset the hardware. 
+ */ + if (E1000_READ_REG(&sc->hw, STATUS) & E1000_STATUS_TXOFF) { + ifp->if_timer = EM_TX_TIMEOUT; + return; + } + printf("%s: watchdog queue %d: head %u tail %u TDH %u TDT %u\n", + DEVNAME(sc), que->me, + que->tx.sc_tx_desc_head, que->tx.sc_tx_desc_tail, + E1000_READ_REG(&sc->hw, TDH(que->me)), + E1000_READ_REG(&sc->hw, TDT(que->me))); } - printf("%s: watchdog: head %u tail %u TDH %u TDT %u\n", - DEVNAME(sc), - que->tx.sc_tx_desc_head, que->tx.sc_tx_desc_tail, - E1000_READ_REG(&sc->hw, TDH(que->me)), - E1000_READ_REG(&sc->hw, TDT(que->me))); em_init(sc); @@ -1669,7 +1697,6 @@ em_allocate_pci_resources(struct em_soft { int val, rid; struct pci_attach_args *pa = &sc->osdep.em_pa; - struct em_queue *que = NULL; val = pci_conf_read(pa->pa_pc, pa->pa_tag, EM_MMBA); if (PCI_MAPREG_TYPE(val) != PCI_MAPREG_TYPE_MEM) { @@ -1742,18 +1769,6 @@ em_allocate_pci_resources(struct em_soft sc->osdep.dev = (struct device *)sc; sc->hw.back = &sc->osdep; - /* Only one queue for the moment. */ - que = malloc(sizeof(struct em_queue), M_DEVBUF, M_NOWAIT | M_ZERO); - if (que == NULL) { - printf(": unable to allocate queue memory\n"); - return (ENOMEM); - } - que->me = 0; - que->sc = sc; - timeout_set(&que->rx_refill, em_rxrefill, que); - - sc->queues = que; - sc->num_queues = 1; sc->msix = 0; sc->legacy_irq = 0; if (em_allocate_msix(sc) && em_allocate_legacy(sc)) @@ -1826,11 +1841,7 @@ em_free_pci_resources(struct em_softc *s sc->legacy_irq = 0; sc->msix_linkvec = 0; sc->msix_queuesmask = 0; - if (sc->queues) - free(sc->queues, M_DEVBUF, - sc->num_queues * sizeof(struct em_queue)); sc->num_queues = 0; - sc->queues = NULL; } /********************************************************************* @@ -1949,8 +1960,10 @@ void em_setup_interface(struct em_softc *sc) { struct ifnet *ifp; + struct em_queue *que; uint64_t fiber_type = IFM_1000_SX; - + int i; + INIT_DEBUGOUT("em_setup_interface: begin"); ifp = &sc->sc_ac.ac_if; @@ -2012,6 +2025,22 @@ em_setup_interface(struct em_softc *sc) 
if_attach(ifp); ether_ifattach(ifp); + + if_attach_iqueues(ifp, sc->num_queues); + if_attach_queues(ifp, sc->num_queues); + + i = 0; + FOREACH_QUEUE(sc, que) { + que->me = i; + que->sc = sc; + + ifp->if_iqs[i]->ifiq_softc = que; + ifp->if_ifqs[i]->ifq_softc = que; + + timeout_set(&que->rx_refill, em_rxrefill, que); + i++; + } + em_enable_intr(sc); } @@ -2820,6 +2849,9 @@ em_initialize_receive_unit(struct em_sof if (sc->hw.mac_type == em_82573) E1000_WRITE_REG(&sc->hw, RDTR, 0x20); + if (sc->num_queues > 1) + em_setup_rss(sc); + FOREACH_QUEUE(sc, que) { if (sc->num_queues > 1) { /* @@ -3487,6 +3519,12 @@ em_allocate_legacy(struct em_softc *sc) } sc->legacy_irq = 1; } + sc->num_queues = 1; + sc->queues = malloc(sizeof(struct em_queue), M_DEVBUF, M_NOWAIT | M_ZERO); + if (sc->queues == NULL) { + printf(": couldn't allocate queues\n"); + return (ENOMEM); + } intrstr = pci_intr_string(pc, ih); sc->sc_intrhand = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE, @@ -3869,44 +3907,66 @@ em_allocate_msix(struct em_softc *sc) const char *intrstr = NULL; struct pci_attach_args *pa = &sc->osdep.em_pa; pci_chipset_tag_t pc = pa->pa_pc; - struct em_queue *que = sc->queues; /* Use only first queue. 
*/ + struct em_queue *que; + int nmsix; int vec; - - if (!em_enable_msix) - return (ENODEV); + int max_queues; switch (sc->hw.mac_type) { case em_82576: case em_82580: case em_i350: + max_queues = 8; + break; case em_i210: + if (sc->hw.device_id == PCI_PRODUCT_INTEL_I211_COPPER) + max_queues = 2; + else + max_queues = 4; break; default: return (ENODEV); } + /* if we only have one vector, just use msi */ + nmsix = pci_intr_msix_count(pa); + if (nmsix < 2) + return (ENODEV); + vec = 0; if (pci_intr_map_msix(pa, vec, &ih)) return (ENODEV); sc->msix = 1; - que->me = vec; - que->eims = 1 << vec; - snprintf(que->name, sizeof(que->name), "%s:%d", DEVNAME(sc), vec); + nmsix--; + sc->intrmap = intrmap_create(&sc->sc_dev, nmsix, max_queues, INTRMAP_POWEROF2); + sc->num_queues = intrmap_count(sc->intrmap); + KASSERT(sc->num_queues > 0); + KASSERT(powerof2(sc->num_queues)); - intrstr = pci_intr_string(pc, ih); - que->tag = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE, - em_queue_intr_msix, que, que->name); - if (que->tag == NULL) { - printf(": couldn't establish interrupt"); - if (intrstr != NULL) - printf(" at %s", intrstr); - printf("\n"); - return (ENXIO); + sc->queues = mallocarray(sizeof(*que), sc->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO); + if (sc->queues == NULL) + return (ENOMEM); + + FOREACH_QUEUE(sc, que) { + que->eims = 1 << vec; + snprintf(que->name, sizeof(que->name), "%s:%d", DEVNAME(sc), vec); + + if (pci_intr_map_msix(pa, vec, &ih)) { + printf(": unable to map msi-x vector %d", vec); + return (ENXIO); + } + + que->tag = pci_intr_establish_cpu(pc, ih, IPL_NET | IPL_MPSAFE, + intrmap_cpu(sc->intrmap, vec), em_queue_intr_msix, que, que->name); + if (que->tag == NULL) { + printf(": couldn't establish queue interrupt %d\n", vec); + return (ENXIO); + } + vec++; } /* Setup linkvector, use last queue vector + 1 */ - vec++; sc->msix_linkvec = vec; if (pci_intr_map_msix(pa, sc->msix_linkvec, &ih)) { printf(": couldn't map link vector\n"); @@ -4096,6 +4156,40 @@ void 
em_enable_queue_intr_msix(struct em_queue *que) { E1000_WRITE_REG(&que->sc->hw, EIMS, que->eims); +} + +void +em_setup_rss(struct em_softc *sc) +{ + uint32_t rss_key[10]; + uint32_t mrqc; + uint32_t reta; + int i; + int queue_id; + + /* set redirection table to round robin across queues */ + reta = 0; + for (i = 0; i < 128; i++) { + queue_id = i % sc->num_queues; + reta = reta >> 8; + reta = reta | (((uint32_t) queue_id) << 24); + if ((i & 3) == 3) { + E1000_WRITE_REG(&sc->hw, RETA(i >> 2), reta); + reta = 0; + } + } + + stoeplitz_to_key(rss_key, sizeof(rss_key)); + for (i = 0; i < nitems(rss_key); i++) + E1000_WRITE_REG_ARRAY(&sc->hw, RSSRK(0), i, rss_key[i]); + + mrqc = E1000_MRQC_ENABLE_RSS_8Q; + mrqc |= E1000_MRQC_RSS_FIELD_IPV4 | E1000_MRQC_RSS_FIELD_IPV4_TCP + | E1000_MRQC_RSS_FIELD_IPV6 | E1000_MRQC_RSS_FIELD_IPV6_TCP + | E1000_MRQC_RSS_FIELD_IPV4_UDP | E1000_MRQC_RSS_FIELD_IPV6_UDP + | E1000_MRQC_RSS_FIELD_IPV6_UDP_EX | E1000_MRQC_RSS_FIELD_IPV6_TCP_EX; + mrqc |= 0 << 3; /* default queue */ + E1000_WRITE_REG(&sc->hw, MRQC, mrqc); } #endif /* !SMALL_KERNEL */ Index: dev/pci/if_em.h =================================================================== RCS file: /cvs/src/sys/dev/pci/if_em.h,v retrieving revision 1.80 diff -u -p -r1.80 if_em.h --- dev/pci/if_em.h 9 Jan 2022 05:42:50 -0000 1.80 +++ dev/pci/if_em.h 25 Apr 2023 11:25:47 -0000 @@ -52,9 +52,11 @@ POSSIBILITY OF SUCH DAMAGE. #include <sys/timeout.h> #include <sys/atomic.h> #include <sys/kstat.h> +#include <sys/intrmap.h> #include <net/if.h> #include <net/if_media.h> +#include <net/toeplitz.h> #include <netinet/in.h> #include <netinet/ip.h> @@ -449,6 +451,7 @@ struct em_softc { uint32_t msix_queuesmask; int num_queues; struct em_queue *queues; + struct intrmap *intrmap; struct kstat *kstat; struct mutex kstat_mtx;