The branch main has been updated by kbowling:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=3e501ef896671cb190e8c40c6258b8f27d136f07

commit 3e501ef896671cb190e8c40c6258b8f27d136f07
Author:     Kevin Bowling <kbowl...@freebsd.org>
AuthorDate: 2024-09-22 09:26:05 +0000
Commit:     Kevin Bowling <kbowl...@freebsd.org>
CommitDate: 2024-10-11 05:36:43 +0000

    e1000: Re-add AIM
    
    We originally left this out because iflib modulates interrupts and
    accomplishes some level of batching versus the custom queues in the
    older driver. Upon more detailed study of the Linux driver, which has a
    newer implementation, it finally became clear to me that this is actually
    a holdoff timer and not an interrupt limit, even though it is
    conventionally (statically) programmed and displayed as an interrupt
    rate. The data sheets also make this somewhat clear.
    
    Thus, AIM accomplishes two beneficial things for a wide variety of
    workloads[1]:
    
    1. At low throughput/packet rates, it will significantly lower latency
    (by counter-intuitively "increasing" the interrupt rate; this is better
    thought of as decreasing the holdoff timer, because you will modulate
    down before coming anywhere near these interrupt rates).
    2. At bulk data rates, it is tuned to achieve a lower interrupt rate
    (by increasing the holdoff timer) than the current static 8000/s. This
    decreases processing overhead and yields more headroom for other work
    such as packet filters or userland.
    
    For a single NIC this might be worth a few sys% on common CPUs, but may
    be meaningful when multiplied such as if_lagg, if_bridge and forwarding
    setups.
    
    The AIM algorithm was re-introduced from the older igb or out of tree
    driver, and then modernized with permission to use Intel code from other
    drivers.
    
    I have retroactively added it to lem(4) and em(4) where the same concept
    applies, albeit to a single ITR register.
    
    [1]: http://iommu.com/datasheets/ethernet/controllers-nics/intel/e1000/gbe-controllers-interrupt-moderation-appl-note.pdf
    
    Tested by:      cc (https://wiki.freebsd.org/chengcui/testD46768)
    MFC after:      1 week
    Relnotes:       yes
    Sponsored by:   Rubicon Communications, LLC ("Netgate")
    Sponsored by:   BBOX.io
    Differential Revision:  https://reviews.freebsd.org/D46768
---
 sys/dev/e1000/em_txrx.c  |  10 ++
 sys/dev/e1000/if_em.c    | 269 ++++++++++++++++++++++++++++++++++++++++++++++-
 sys/dev/e1000/if_em.h    |  28 +++--
 sys/dev/e1000/igb_txrx.c |   4 +
 4 files changed, 299 insertions(+), 12 deletions(-)

diff --git a/sys/dev/e1000/em_txrx.c b/sys/dev/e1000/em_txrx.c
index eec198df7466..6e8fff07cd82 100644
--- a/sys/dev/e1000/em_txrx.c
+++ b/sys/dev/e1000/em_txrx.c
@@ -455,6 +455,10 @@ em_isc_txd_encap(void *arg, if_pkt_info_t pi)
            "tx_buffers[%d]->eop = %d ipi_new_pidx=%d\n", first, pidx_last, i);
        pi->ipi_new_pidx = i;
 
+       /* Sent data accounting for AIM */
+       txr->tx_bytes += pi->ipi_len;
+       ++txr->tx_packets;
+
        return (0);
 }
 
@@ -669,6 +673,7 @@ lem_isc_rxd_pkt_get(void *arg, if_rxd_info_t ri)
 
                len = le16toh(rxd->length);
                ri->iri_len += len;
+               rxr->rx_bytes += ri->iri_len;
 
                eop = (status & E1000_RXD_STAT_EOP) != 0;
 
@@ -690,6 +695,8 @@ lem_isc_rxd_pkt_get(void *arg, if_rxd_info_t ri)
                i++;
        } while (!eop);
 
+       rxr->rx_packets++;
+
        if (scctx->isc_capenable & IFCAP_RXCSUM)
                em_receive_checksum(status, errors, ri);
 
@@ -732,6 +739,7 @@ em_isc_rxd_pkt_get(void *arg, if_rxd_info_t ri)
 
                len = le16toh(rxd->wb.upper.length);
                ri->iri_len += len;
+               rxr->rx_bytes += ri->iri_len;
 
                eop = (staterr & E1000_RXD_STAT_EOP) != 0;
 
@@ -752,6 +760,8 @@ em_isc_rxd_pkt_get(void *arg, if_rxd_info_t ri)
                i++;
        } while (!eop);
 
+       rxr->rx_packets++;
+
        if (scctx->isc_capenable & IFCAP_RXCSUM)
                em_receive_checksum(staterr, staterr >> 24, ri);
 
diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c
index b9c6131e6741..22578a3d8655 100644
--- a/sys/dev/e1000/if_em.c
+++ b/sys/dev/e1000/if_em.c
@@ -1,8 +1,9 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
+ * Copyright (c) 2001-2024, Intel Corporation
  * Copyright (c) 2016 Nicole Graziano <nic...@nextbsd.org>
- * All rights reserved.
+ * Copyright (c) 2024 Kevin Bowling <kbowl...@freebsd.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -329,10 +330,12 @@ static int        
em_sysctl_debug_info(SYSCTL_HANDLER_ARGS);
 static int     em_get_rs(SYSCTL_HANDLER_ARGS);
 static void    em_print_debug_info(struct e1000_softc *);
 static int     em_is_valid_ether_addr(u8 *);
+static void    em_newitr(struct e1000_softc *, struct em_rx_queue *,
+    struct tx_ring *, struct rx_ring *);
 static bool    em_automask_tso(if_ctx_t);
 static int     em_sysctl_int_delay(SYSCTL_HANDLER_ARGS);
 static void    em_add_int_delay_sysctl(struct e1000_softc *, const char *,
-                   const char *, struct em_int_delay_info *, int, int);
+    const char *, struct em_int_delay_info *, int, int);
 /* Management and WOL Support */
 static void    em_init_manageability(struct e1000_softc *);
 static void    em_release_manageability(struct e1000_softc *);
@@ -545,10 +548,19 @@ static int eee_setting = 1;
 SYSCTL_INT(_hw_em, OID_AUTO, eee_setting, CTLFLAG_RDTUN, &eee_setting, 0,
     "Enable Energy Efficient Ethernet");
 
+/*
+ * AIM: Adaptive Interrupt Moderation
+ * which means that the interrupt rate is varied over time based on the
+ * traffic for that interrupt vector
+ */
+static int em_enable_aim = 1;
+SYSCTL_INT(_hw_em, OID_AUTO, enable_aim, CTLFLAG_RWTUN, &em_enable_aim,
+    0, "Enable adaptive interrupt moderation (1=normal, 2=lowlatency)");
+
 /*
 ** Tuneable Interrupt rate
 */
-static int em_max_interrupt_rate = EM_INTS_PER_SEC;
+static int em_max_interrupt_rate = EM_INTS_DEFAULT;
 SYSCTL_INT(_hw_em, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN,
     &em_max_interrupt_rate, 0, "Maximum interrupts per second");
 
@@ -832,6 +844,11 @@ em_if_attach_pre(if_ctx_t ctx)
            CTLTYPE_INT | CTLFLAG_RW, sc, 0,
            em_sysctl_nvm_info, "I", "NVM Information");
 
+       sc->enable_aim = em_enable_aim;
+       SYSCTL_ADD_INT(ctx_list, child, OID_AUTO, "enable_aim",
+           CTLFLAG_RW, &sc->enable_aim, 0,
+               "Interrupt Moderation (1=normal, 2=lowlatency)");
+
        SYSCTL_ADD_PROC(ctx_list, child, OID_AUTO, "fw_version",
            CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
            em_sysctl_print_fw_version, "A",
@@ -1437,6 +1454,159 @@ em_if_init(if_ctx_t ctx)
        }
 }
 
+/*
+ * Latency targets used by em_newitr().  The selected target is kept in
+ * rx_ring.rx_nextlatency between calls and is translated to an interrupt
+ * rate by the "ITR state machine" switch in em_newitr().
+ */
+enum itr_latency_target {
+       itr_latency_disabled = 0,       /* AIM off / bootstrap: em_max_interrupt_rate */
+       itr_latency_lowest = 1,         /* EM_INTS_70K */
+       itr_latency_low = 2,            /* EM_INTS_20K */
+       itr_latency_bulk = 3            /* EM_INTS_4K */
+};
+/*********************************************************************
+ *
+ *  Helper to calculate next (E)ITR value for AIM
+ *
+ *  Picks a latency target from the byte and packet counters of the
+ *  given tx and rx rings, converts the target to an interrupt rate
+ *  and writes the queue's (E)ITR register when the encoded value
+ *  differs from que->itr_setting.  The counters are only read here;
+ *  resetting them is left to the caller.
+ *
+ *********************************************************************/
+static void
+em_newitr(struct e1000_softc *sc, struct em_rx_queue *que,
+    struct tx_ring *txr, struct rx_ring *rxr)
+{
+       struct e1000_hw *hw = &sc->hw;
+       u32 newitr;
+       /*
+        * Zero-initialized: the rx-side max() calls below read these even
+        * when the tx ring contributed nothing in this interval.
+        */
+       u32 bytes = 0;
+       u32 bytes_packets = 0;
+       u32 packets = 0;
+       u8 nextlatency;
+
+       /* Idle, do nothing */
+       if ((txr->tx_bytes == 0) && (rxr->rx_bytes == 0))
+               return;
+
+       newitr = 0;
+
+       if (sc->enable_aim) {
+               nextlatency = rxr->rx_nextlatency;
+
+               /* Use half default (4K) ITR if sub-gig */
+               if (sc->link_speed != 1000) {
+                       newitr = EM_INTS_4K;
+                       goto em_set_next_itr;
+               }
+               /* Want at least enough packet buffer for two frames to AIM */
+               if (sc->shared->isc_max_frame_size * 2 > (sc->pba << 10)) {
+                       newitr = em_max_interrupt_rate;
+                       sc->enable_aim = 0;
+                       goto em_set_next_itr;
+               }
+
+               /* Get the largest values from the associated tx and rx ring */
+               if (txr->tx_bytes && txr->tx_packets) {
+                       bytes = txr->tx_bytes;
+                       bytes_packets = txr->tx_bytes/txr->tx_packets;
+                       packets = txr->tx_packets;
+               }
+               if (rxr->rx_bytes && rxr->rx_packets) {
+                       bytes = max(bytes, rxr->rx_bytes);
+                       bytes_packets = max(bytes_packets,
+                           rxr->rx_bytes/rxr->rx_packets);
+                       packets = max(packets, rxr->rx_packets);
+               }
+
+               /* Latency state machine */
+               switch (nextlatency) {
+               case itr_latency_disabled: /* Bootstrapping */
+                       nextlatency = itr_latency_low;
+                       break;
+               case itr_latency_lowest: /* 70k ints/s */
+                       /* TSO and jumbo frames */
+                       if (bytes_packets > 8000)
+                               nextlatency = itr_latency_bulk;
+                       else if ((packets < 5) && (bytes > 512))
+                               nextlatency = itr_latency_low;
+                       break;
+               case itr_latency_low: /* 20k ints/s */
+                       if (bytes > 10000) {
+                               /* Handle TSO */
+                               if (bytes_packets > 8000)
+                                       nextlatency = itr_latency_bulk;
+                               else if ((packets < 10) ||
+                                   (bytes_packets > 1200))
+                                       nextlatency = itr_latency_bulk;
+                               else if (packets > 35)
+                                       nextlatency = itr_latency_lowest;
+                       } else if (bytes_packets > 2000) {
+                               nextlatency = itr_latency_bulk;
+                       } else if (packets < 3 && bytes < 512) {
+                               nextlatency = itr_latency_lowest;
+                       }
+                       break;
+               case itr_latency_bulk: /* 4k ints/s */
+                       if (bytes > 25000) {
+                               if (packets > 35)
+                                       nextlatency = itr_latency_low;
+                       } else if (bytes < 1500)
+                               nextlatency = itr_latency_low;
+                       break;
+               default:
+                       /* Report the unexpected state before replacing it */
+                       device_printf(sc->dev,
+                           "Unexpected newitr transition %d\n", nextlatency);
+                       nextlatency = itr_latency_low;
+                       break;
+               }
+
+               /* Trim itr_latency_lowest for default AIM setting */
+               if (sc->enable_aim == 1 && nextlatency == itr_latency_lowest)
+                       nextlatency = itr_latency_low;
+
+               /* Request new latency */
+               rxr->rx_nextlatency = nextlatency;
+       } else {
+               /* We may have toggled to AIM disabled */
+               nextlatency = itr_latency_disabled;
+               rxr->rx_nextlatency = nextlatency;
+       }
+
+       /* ITR state machine */
+       switch (nextlatency) {
+       case itr_latency_lowest:
+               newitr = EM_INTS_70K;
+               break;
+       case itr_latency_low:
+               newitr = EM_INTS_20K;
+               break;
+       case itr_latency_bulk:
+               newitr = EM_INTS_4K;
+               break;
+       case itr_latency_disabled:
+       default:
+               newitr = em_max_interrupt_rate;
+               break;
+       }
+
+em_set_next_itr:
+       /*
+        * Both conversion macros divide by the rate, so a rate of 0 (which
+        * the max_interrupt_rate tunable can supply) is encoded as interval
+        * 0 instead of being passed to them.
+        * NOTE(review): interval 0 is expected to turn moderation off;
+        * confirm against the controller datasheet.
+        */
+       if (hw->mac.type >= igb_mac_min) {
+               newitr = (newitr != 0) ? IGB_INTS_TO_EITR(newitr) : 0;
+
+               if (hw->mac.type == e1000_82575)
+                       newitr |= newitr << 16;
+               else
+                       newitr |= E1000_EITR_CNT_IGNR;
+
+               /* Only touch the register when the setting changes */
+               if (newitr != que->itr_setting) {
+                       que->itr_setting = newitr;
+                       E1000_WRITE_REG(hw, E1000_EITR(que->msix),
+                           que->itr_setting);
+               }
+       } else {
+               newitr = (newitr != 0) ? EM_INTS_TO_ITR(newitr) : 0;
+
+               /* Only touch the register when the setting changes */
+               if (newitr != que->itr_setting) {
+                       que->itr_setting = newitr;
+                       if (hw->mac.type == e1000_82574 && que->msix) {
+                               E1000_WRITE_REG(hw,
+                                   E1000_EITR_82574(que->msix),
+                                   que->itr_setting);
+                       } else {
+                               E1000_WRITE_REG(hw, E1000_ITR,
+                                   que->itr_setting);
+                       }
+               }
+       }
+}
+
 /*********************************************************************
  *
  *  Fast Legacy/MSI Combined Interrupt Service routine
@@ -1446,10 +1616,14 @@ int
 em_intr(void *arg)
 {
        struct e1000_softc *sc = arg;
+       struct e1000_hw *hw = &sc->hw;
+       struct em_rx_queue *que = &sc->rx_queues[0];
+       struct tx_ring *txr = &sc->tx_queues[0].txr;
+       struct rx_ring *rxr = &que->rxr;
        if_ctx_t ctx = sc->ctx;
        u32 reg_icr;
 
-       reg_icr = E1000_READ_REG(&sc->hw, E1000_ICR);
+       reg_icr = E1000_READ_REG(hw, E1000_ICR);
 
        /* Hot eject? */
        if (reg_icr == 0xffffffff)
@@ -1463,7 +1637,7 @@ em_intr(void *arg)
         * Starting with the 82571 chip, bit 31 should be used to
         * determine whether the interrupt belongs to us.
         */
-       if (sc->hw.mac.type >= e1000_82571 &&
+       if (hw->mac.type >= e1000_82571 &&
            (reg_icr & E1000_ICR_INT_ASSERTED) == 0)
                return FILTER_STRAY;
 
@@ -1482,6 +1656,15 @@ em_intr(void *arg)
        if (reg_icr & E1000_ICR_RXO)
                sc->rx_overruns++;
 
+       if (hw->mac.type >= e1000_82540)
+               em_newitr(sc, que, txr, rxr);
+
+       /* Reset state */
+       txr->tx_bytes = 0;
+       txr->tx_packets = 0;
+       rxr->rx_bytes = 0;
+       rxr->rx_packets = 0;
+
        return (FILTER_SCHEDULE_THREAD);
 }
 
@@ -1534,9 +1717,20 @@ static int
 em_msix_que(void *arg)
 {
        struct em_rx_queue *que = arg;
+       struct e1000_softc *sc = que->sc;
+       struct tx_ring *txr = &sc->tx_queues[que->msix].txr;
+       struct rx_ring *rxr = &que->rxr;
 
        ++que->irqs;
 
+       em_newitr(sc, que, txr, rxr);
+
+       /* Reset state */
+       txr->tx_bytes = 0;
+       txr->tx_packets = 0;
+       rxr->rx_bytes = 0;
+       rxr->rx_packets = 0;
+
        return (FILTER_SCHEDULE_THREAD);
 }
 
@@ -2882,6 +3076,9 @@ em_reset(if_ctx_t ctx)
        if (hw->mac.type >= igb_mac_min)
                igb_init_dmac(sc, pba);
 
+       /* Save the final PBA off if it needs to be used elsewhere i.e. AIM */
+       sc->pba = pba;
+
        E1000_WRITE_REG(hw, E1000_VET, ETHERTYPE_VLAN);
        e1000_get_phy_info(hw);
        e1000_check_for_link(hw);
@@ -3741,6 +3938,7 @@ em_if_intr_enable(if_ctx_t ctx)
                E1000_WRITE_REG(hw, EM_EIAC, sc->ims);
                ims_mask |= sc->ims;
        }
+
        E1000_WRITE_REG(hw, E1000_IMS, ims_mask);
        E1000_WRITE_FLUSH(hw);
 }
@@ -4410,6 +4608,57 @@ em_sysctl_reg_handler(SYSCTL_HANDLER_ARGS)
        return (sysctl_handle_int(oidp, &val, 0, req));
 }
 
+/*
+ * Per queue holdoff interrupt rate handler
+ *
+ * oid_arg2 selects the queue type stored in oid_arg1: non-zero for a
+ * struct em_tx_queue, zero for a struct em_rx_queue.  The queue's
+ * (E)ITR register is read, converted to a rate with the same macros
+ * used to program it, and handed to sysctl_handle_int().
+ */
+static int
+em_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS)
+{
+       struct em_rx_queue *rxq;
+       struct em_tx_queue *txq;
+       struct e1000_hw *hw;
+       u32 itr_reg, interval, ints_per_sec;
+       bool is_tx = oidp->oid_arg2;
+
+       if (!is_tx) {
+               rxq = oidp->oid_arg1;
+               hw = &rxq->sc->hw;
+               if (hw->mac.type >= igb_mac_min)
+                       itr_reg = E1000_READ_REG(hw, E1000_EITR(rxq->msix));
+               else if (hw->mac.type == e1000_82574 && rxq->msix)
+                       itr_reg = E1000_READ_REG(hw,
+                           E1000_EITR_82574(rxq->msix));
+               else
+                       itr_reg = E1000_READ_REG(hw, E1000_ITR);
+       } else {
+               /*
+                * NOTE(review): this branch tests txq->msix but indexes the
+                * register by txq->me, unlike the rx branch; confirm that
+                * is intended.
+                */
+               txq = oidp->oid_arg1;
+               hw = &txq->sc->hw;
+               if (hw->mac.type >= igb_mac_min)
+                       itr_reg = E1000_READ_REG(hw, E1000_EITR(txq->me));
+               else if (hw->mac.type == e1000_82574 && txq->msix)
+                       itr_reg = E1000_READ_REG(hw,
+                           E1000_EITR_82574(txq->me));
+               else
+                       itr_reg = E1000_READ_REG(hw, E1000_ITR);
+       }
+
+       /* A zero register/interval is reported as a rate of 0 */
+       ints_per_sec = 0;
+       if (hw->mac.type >= igb_mac_min) {
+               interval = (itr_reg & IGB_QVECTOR_MASK);
+               if (interval != 0)
+                       ints_per_sec = IGB_INTS_TO_EITR(interval);
+       } else if (itr_reg != 0) {
+               ints_per_sec = EM_INTS_TO_ITR(itr_reg);
+       }
+
+       /* Read-only node: whatever sysctl_handle_int() reports is returned */
+       return (sysctl_handle_int(oidp, &ints_per_sec, 0, req));
+}
+
 /*
  * Add sysctl variables, one per statistic, to the system.
  */
@@ -4466,6 +4715,11 @@ em_add_hw_stats(struct e1000_softc *sc)
                    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX Queue Name");
                queue_list = SYSCTL_CHILDREN(queue_node);
 
+               SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate",
+                   CTLTYPE_UINT | CTLFLAG_RD, tx_que,
+                   true, em_sysctl_interrupt_rate_handler,
+                   "IU", "Interrupt Rate");
+
                SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head",
                    CTLTYPE_UINT | CTLFLAG_RD, sc,
                    E1000_TDH(txr->me), em_sysctl_reg_handler, "IU",
@@ -4486,6 +4740,11 @@ em_add_hw_stats(struct e1000_softc *sc)
                    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX Queue Name");
                queue_list = SYSCTL_CHILDREN(queue_node);
 
+               SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate",
+                   CTLTYPE_UINT | CTLFLAG_RD, rx_que,
+                   false, em_sysctl_interrupt_rate_handler,
+                   "IU", "Interrupt Rate");
+
                SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head",
                    CTLTYPE_UINT | CTLFLAG_RD, sc,
                    E1000_RDH(rxr->me), em_sysctl_reg_handler, "IU",
diff --git a/sys/dev/e1000/if_em.h b/sys/dev/e1000/if_em.h
index 7219dc57c333..52bfed0f9a42 100644
--- a/sys/dev/e1000/if_em.h
+++ b/sys/dev/e1000/if_em.h
@@ -1,8 +1,9 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
+ * Copyright (c) 2001-2024, Intel Corporation
  * Copyright (c) 2016 Nicole Graziano <nic...@nextbsd.org>
- * All rights reserved.
+ * Copyright (c) 2024 Kevin Bowling <kbowl...@freebsd.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -243,16 +244,19 @@
 /* Support AutoMediaDetect for Marvell M88 PHY in i354 */
 #define IGB_MEDIA_RESET                (1 << 0)
 
-/* Define the starting Interrupt rate per Queue */
-#define EM_INTS_PER_SEC                8000
+/* Define the interrupt rates and ITR helpers */
+#define EM_INTS_4K             4000
+#define EM_INTS_20K            20000
+#define EM_INTS_70K            70000
+#define EM_INTS_DEFAULT                8000
 #define EM_INTS_MULTIPLIER     256
 #define EM_ITR_DIVIDEND                1000000000
 #define EM_INTS_TO_ITR(i)      (EM_ITR_DIVIDEND/(i * EM_INTS_MULTIPLIER))
-#define IGB_ITR_DIVIDEND       1000000
-#define IGB_ITR_SHIFT          2
+#define IGB_EITR_DIVIDEND      1000000
+#define IGB_EITR_SHIFT         2
 #define IGB_QVECTOR_MASK       0x7FFC
-#define IGB_INTS_TO_EITR(i)    (((IGB_ITR_DIVIDEND/i) & IGB_QVECTOR_MASK) << \
-                                   IGB_ITR_SHIFT)
+#define IGB_INTS_TO_EITR(i)    (((IGB_EITR_DIVIDEND/i) & IGB_QVECTOR_MASK) << \
+                                   IGB_EITR_SHIFT)
 
 #define IGB_LINK_ITR           2000
 #define I210_LINK_DELAY                1000
@@ -390,7 +394,11 @@ struct tx_ring {
        /* Interrupt resources */
        void                    *tag;
        struct resource         *res;
+
+       /* Soft stats */
        unsigned long           tx_irq;
+       unsigned long           tx_packets;
+       unsigned long           tx_bytes;
 
        /* Saved csum offloading context information */
        int                     csum_flags;
@@ -426,6 +434,9 @@ struct rx_ring {
        unsigned long           rx_discarded;
        unsigned long           rx_packets;
        unsigned long           rx_bytes;
+
+       /* Next requested ITR latency */
+       u8                      rx_nextlatency;
 };
 
 struct em_tx_queue {
@@ -441,6 +452,7 @@ struct em_rx_queue {
        u32                     me;
        u32                     msix;
        u32                     eims;
+       u32                     itr_setting;
        struct rx_ring          rxr;
        u64                     irqs;
        struct if_irq           que_irq;
@@ -489,6 +501,7 @@ struct e1000_softc {
 
        u32                     rx_mbuf_sz;
 
+       int                     enable_aim;
        /* Management and WOL features */
        u32                     wol;
        bool                    has_manage;
@@ -512,6 +525,7 @@ struct e1000_softc {
        u16                     link_duplex;
        u32                     smartspeed;
        u32                     dmac;
+       u32                     pba;
        int                     link_mask;
        int                     tso_automasked;
 
diff --git a/sys/dev/e1000/igb_txrx.c b/sys/dev/e1000/igb_txrx.c
index 2819150acba6..82cbb37ce4cd 100644
--- a/sys/dev/e1000/igb_txrx.c
+++ b/sys/dev/e1000/igb_txrx.c
@@ -292,6 +292,10 @@ igb_isc_txd_encap(void *arg, if_pkt_info_t pi)
        txd->read.cmd_type_len |= htole32(E1000_TXD_CMD_EOP | txd_flags);
        pi->ipi_new_pidx = i;
 
+       /* Sent data accounting for AIM */
+       txr->tx_bytes += pi->ipi_len;
+       ++txr->tx_packets;
+
        return (0);
 }
 

Reply via email to