The branch main has been updated by kbowling: URL: https://cgit.FreeBSD.org/src/commit/?id=3e501ef896671cb190e8c40c6258b8f27d136f07
commit 3e501ef896671cb190e8c40c6258b8f27d136f07 Author: Kevin Bowling <kbowl...@freebsd.org> AuthorDate: 2024-09-22 09:26:05 +0000 Commit: Kevin Bowling <kbowl...@freebsd.org> CommitDate: 2024-10-11 05:36:43 +0000 e1000: Re-add AIM We originally left this out because iflib modulates interrupts and accomplishes some level of batching versus the custom queues in the older driver. Upon more detailed study of the Linux driver which has a newer implementation, it finally became clear to me this is actually a holdoff timer and not an interrupt limit as it is conventionally (statically) programmed and displayed as an interrupt rate. The data sheets also make this somewhat clear. Thus, AIM accomplishes two beneficial things for a wide variety of workloads[1]: 1. At low throughput/packet rates, it will significantly lower latency (by counter-intuitively "increasing" the interrupt rate.. better thought of as decreasing the holdoff timer because you will modulate down before coming anywhere near these interrupt rates). 2. At bulk data rates, it is tuned to achieve a lower interrupt rate (by increasing the holdoff timer) than the current static 8000/s. This decreases processing overhead and yields more headroom for other work such as packet filters or userland. For a single NIC this might be worth a few sys% on common CPUs, but may be meaningful when multiplied such as if_lagg, if_bridge and forwarding setups. The AIM algorithm was re-introduced from the older igb or out of tree driver, and then modernized with permission to use Intel code from other drivers. I have retroactively added it to lem(4) and em(4) where the same concept applies, albeit to a single ITR register. [1]: http://iommu.com/datasheets/ethernet/controllers-nics/intel/e1000/gbe-controllers-interrupt-moderation-appl-note.pdf Tested by: cc (https://wiki.freebsd.org/chengcui/testD46768) MFC after: 1 week Relnotes: yes Sponsored by: Rubicon Communications, LLC ("Netgate") Sponsored by: BBOX.io Differential Revision: https://reviews.freebsd.org/D46768 --- sys/dev/e1000/em_txrx.c | 10 ++ sys/dev/e1000/if_em.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++- sys/dev/e1000/if_em.h | 28 +++-- sys/dev/e1000/igb_txrx.c | 4 + 4 files changed, 299 insertions(+), 12 deletions(-) diff --git a/sys/dev/e1000/em_txrx.c b/sys/dev/e1000/em_txrx.c index eec198df7466..6e8fff07cd82 100644 --- a/sys/dev/e1000/em_txrx.c +++ b/sys/dev/e1000/em_txrx.c @@ -455,6 +455,10 @@ em_isc_txd_encap(void *arg, if_pkt_info_t pi) "tx_buffers[%d]->eop = %d ipi_new_pidx=%d\n", first, pidx_last, i); pi->ipi_new_pidx = i; + /* Sent data accounting for AIM */ + txr->tx_bytes += pi->ipi_len; + ++txr->tx_packets; + return (0); } @@ -669,6 +673,7 @@ lem_isc_rxd_pkt_get(void *arg, if_rxd_info_t ri) len = le16toh(rxd->length); ri->iri_len += len; + rxr->rx_bytes += ri->iri_len; eop = (status & E1000_RXD_STAT_EOP) != 0; @@ -690,6 +695,8 @@ lem_isc_rxd_pkt_get(void *arg, if_rxd_info_t ri) i++; } while (!eop); + rxr->rx_packets++; + if (scctx->isc_capenable & IFCAP_RXCSUM) em_receive_checksum(status, errors, ri); @@ -732,6 +739,7 @@ em_isc_rxd_pkt_get(void *arg, if_rxd_info_t ri) len = le16toh(rxd->wb.upper.length); ri->iri_len += len; + rxr->rx_bytes += ri->iri_len; eop = (staterr & E1000_RXD_STAT_EOP) != 0; @@ -752,6 +760,8 @@ em_isc_rxd_pkt_get(void *arg, if_rxd_info_t ri) i++; } while (!eop); + rxr->rx_packets++; + if (scctx->isc_capenable & IFCAP_RXCSUM) em_receive_checksum(staterr, staterr >> 24, ri); diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index b9c6131e6741..22578a3d8655 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -1,8 +1,9 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * + * Copyright (c) 2001-2024, Intel Corporation * Copyright (c) 2016 Nicole Graziano <nic...@nextbsd.org> - * All rights reserved. + * Copyright (c) 2024 Kevin Bowling <kbowl...@freebsd.org> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -329,10 +330,12 @@ static int em_sysctl_debug_info(SYSCTL_HANDLER_ARGS); static int em_get_rs(SYSCTL_HANDLER_ARGS); static void em_print_debug_info(struct e1000_softc *); static int em_is_valid_ether_addr(u8 *); +static void em_newitr(struct e1000_softc *, struct em_rx_queue *, + struct tx_ring *, struct rx_ring *); static bool em_automask_tso(if_ctx_t); static int em_sysctl_int_delay(SYSCTL_HANDLER_ARGS); static void em_add_int_delay_sysctl(struct e1000_softc *, const char *, - const char *, struct em_int_delay_info *, int, int); + const char *, struct em_int_delay_info *, int, int); /* Management and WOL Support */ static void em_init_manageability(struct e1000_softc *); static void em_release_manageability(struct e1000_softc *); @@ -545,10 +548,19 @@ static int eee_setting = 1; SYSCTL_INT(_hw_em, OID_AUTO, eee_setting, CTLFLAG_RDTUN, &eee_setting, 0, "Enable Energy Efficient Ethernet"); +/* + * AIM: Adaptive Interrupt Moderation + * which means that the interrupt rate is varied over time based on the + * traffic for that interrupt vector + */ +static int em_enable_aim = 1; +SYSCTL_INT(_hw_em, OID_AUTO, enable_aim, CTLFLAG_RWTUN, &em_enable_aim, + 0, "Enable adaptive interrupt moderation (1=normal, 2=lowlatency)"); + /* ** Tuneable Interrupt rate */ -static int em_max_interrupt_rate = EM_INTS_PER_SEC; +static int em_max_interrupt_rate = EM_INTS_DEFAULT; SYSCTL_INT(_hw_em, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN, &em_max_interrupt_rate, 0, "Maximum interrupts per second"); @@ -832,6 +844,11 @@ em_if_attach_pre(if_ctx_t ctx) CTLTYPE_INT | CTLFLAG_RW, sc, 0, em_sysctl_nvm_info, "I", "NVM Information"); + sc->enable_aim = em_enable_aim; + SYSCTL_ADD_INT(ctx_list, child, OID_AUTO, "enable_aim", + CTLFLAG_RW, &sc->enable_aim, 0, + "Interrupt Moderation (1=normal, 2=lowlatency)"); + SYSCTL_ADD_PROC(ctx_list, child, OID_AUTO, "fw_version", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, em_sysctl_print_fw_version, "A", @@ -1437,6 +1454,159 @@ em_if_init(if_ctx_t ctx) } } +enum itr_latency_target { + itr_latency_disabled = 0, + itr_latency_lowest = 1, + itr_latency_low = 2, + itr_latency_bulk = 3 +}; +/********************************************************************* + * + * Helper to calculate next (E)ITR value for AIM + * + *********************************************************************/ +static void +em_newitr(struct e1000_softc *sc, struct em_rx_queue *que, + struct tx_ring *txr, struct rx_ring *rxr) +{ + struct e1000_hw *hw = &sc->hw; + u32 newitr; + u32 bytes; + u32 bytes_packets; + u32 packets; + u8 nextlatency; + + /* Idle, do nothing */ + if ((txr->tx_bytes == 0) && (rxr->rx_bytes == 0)) + return; + + newitr = 0; + + if (sc->enable_aim) { + nextlatency = rxr->rx_nextlatency; + + /* Use half default (4K) ITR if sub-gig */ + if (sc->link_speed != 1000) { + newitr = EM_INTS_4K; + goto em_set_next_itr; + } + /* Want at least enough packet buffer for two frames to AIM */ + if (sc->shared->isc_max_frame_size * 2 > (sc->pba << 10)) { + newitr = em_max_interrupt_rate; + sc->enable_aim = 0; + goto em_set_next_itr; + } + + /* Get the largest values from the associated tx and rx ring */ + if (txr->tx_bytes && txr->tx_packets) { + bytes = txr->tx_bytes; + bytes_packets = txr->tx_bytes/txr->tx_packets; + packets = txr->tx_packets; + } + if (rxr->rx_bytes && rxr->rx_packets) { + bytes = max(bytes, rxr->rx_bytes); + bytes_packets = max(bytes_packets, rxr->rx_bytes/rxr->rx_packets); + packets = max(packets, rxr->rx_packets); + } + + /* Latency state machine */ + switch (nextlatency) { + case itr_latency_disabled: /* Bootstrapping */ + nextlatency = itr_latency_low; + break; + case itr_latency_lowest: /* 70k ints/s */ + /* TSO and jumbo frames */ + if (bytes_packets > 8000) + nextlatency = itr_latency_bulk; + else if ((packets < 5) && (bytes > 512)) + nextlatency = itr_latency_low; + break; + case itr_latency_low: /* 20k ints/s */ + if (bytes > 10000) { + /* Handle TSO */ + if (bytes_packets > 8000) + nextlatency = itr_latency_bulk; + else if ((packets < 10) || (bytes_packets > 1200)) + nextlatency = itr_latency_bulk; + else if (packets > 35) + nextlatency = itr_latency_lowest; + } else if (bytes_packets > 2000) { + nextlatency = itr_latency_bulk; + } else if (packets < 3 && bytes < 512) { + nextlatency = itr_latency_lowest; + } + break; + case itr_latency_bulk: /* 4k ints/s */ + if (bytes > 25000) { + if (packets > 35) + nextlatency = itr_latency_low; + } else if (bytes < 1500) + nextlatency = itr_latency_low; + break; + default: + nextlatency = itr_latency_low; + device_printf(sc->dev, "Unexpected newitr transition %d\n", + nextlatency); + break; + } + + /* Trim itr_latency_lowest for default AIM setting */ + if (sc->enable_aim == 1 && nextlatency == itr_latency_lowest) + nextlatency = itr_latency_low; + + /* Request new latency */ + rxr->rx_nextlatency = nextlatency; + } else { + /* We may have toggled to AIM disabled */ + nextlatency = itr_latency_disabled; + rxr->rx_nextlatency = nextlatency; + } + + /* ITR state machine */ + switch(nextlatency) { + case itr_latency_lowest: + newitr = EM_INTS_70K; + break; + case itr_latency_low: + newitr = EM_INTS_20K; + break; + case itr_latency_bulk: + newitr = EM_INTS_4K; + break; + case itr_latency_disabled: + default: + newitr = em_max_interrupt_rate; + break; + } + +em_set_next_itr: + if (hw->mac.type >= igb_mac_min) { + newitr = IGB_INTS_TO_EITR(newitr); + + if (hw->mac.type == e1000_82575) + newitr |= newitr << 16; + else + newitr |= E1000_EITR_CNT_IGNR; + + if (newitr != que->itr_setting) { + que->itr_setting = newitr; + E1000_WRITE_REG(hw, E1000_EITR(que->msix), que->itr_setting); + } + } else { + newitr = EM_INTS_TO_ITR(newitr); + + if (newitr != que->itr_setting) { + que->itr_setting = newitr; + if (hw->mac.type == e1000_82574 && que->msix) { + E1000_WRITE_REG(hw, + E1000_EITR_82574(que->msix), que->itr_setting); + } else { + E1000_WRITE_REG(hw, E1000_ITR, que->itr_setting); + } + } + } +} + /********************************************************************* * * Fast Legacy/MSI Combined Interrupt Service routine @@ -1446,10 +1616,14 @@ int em_intr(void *arg) { struct e1000_softc *sc = arg; + struct e1000_hw *hw = &sc->hw; + struct em_rx_queue *que = &sc->rx_queues[0]; + struct tx_ring *txr = &sc->tx_queues[0].txr; + struct rx_ring *rxr = &que->rxr; if_ctx_t ctx = sc->ctx; u32 reg_icr; - reg_icr = E1000_READ_REG(&sc->hw, E1000_ICR); + reg_icr = E1000_READ_REG(hw, E1000_ICR); /* Hot eject? */ if (reg_icr == 0xffffffff) @@ -1463,7 +1637,7 @@ em_intr(void *arg) * Starting with the 82571 chip, bit 31 should be used to * determine whether the interrupt belongs to us. */ - if (sc->hw.mac.type >= e1000_82571 && + if (hw->mac.type >= e1000_82571 && (reg_icr & E1000_ICR_INT_ASSERTED) == 0) return FILTER_STRAY; @@ -1482,6 +1656,15 @@ em_intr(void *arg) if (reg_icr & E1000_ICR_RXO) sc->rx_overruns++; + if (hw->mac.type >= e1000_82540) + em_newitr(sc, que, txr, rxr); + + /* Reset state */ + txr->tx_bytes = 0; + txr->tx_packets = 0; + rxr->rx_bytes = 0; + rxr->rx_packets = 0; + return (FILTER_SCHEDULE_THREAD); } @@ -1534,9 +1717,20 @@ static int em_msix_que(void *arg) { struct em_rx_queue *que = arg; + struct e1000_softc *sc = que->sc; + struct tx_ring *txr = &sc->tx_queues[que->msix].txr; + struct rx_ring *rxr = &que->rxr; ++que->irqs; + em_newitr(sc, que, txr, rxr); + + /* Reset state */ + txr->tx_bytes = 0; + txr->tx_packets = 0; + rxr->rx_bytes = 0; + rxr->rx_packets = 0; + return (FILTER_SCHEDULE_THREAD); } @@ -2882,6 +3076,9 @@ em_reset(if_ctx_t ctx) if (hw->mac.type >= igb_mac_min) igb_init_dmac(sc, pba); + /* Save the final PBA off if it needs to be used elsewhere i.e. AIM */ + sc->pba = pba; + E1000_WRITE_REG(hw, E1000_VET, ETHERTYPE_VLAN); e1000_get_phy_info(hw); e1000_check_for_link(hw); @@ -3741,6 +3938,7 @@ em_if_intr_enable(if_ctx_t ctx) E1000_WRITE_REG(hw, EM_EIAC, sc->ims); ims_mask |= sc->ims; } + E1000_WRITE_REG(hw, E1000_IMS, ims_mask); E1000_WRITE_FLUSH(hw); } @@ -4410,6 +4608,57 @@ em_sysctl_reg_handler(SYSCTL_HANDLER_ARGS) return (sysctl_handle_int(oidp, &val, 0, req)); } +/* Per queue holdoff interrupt rate handler */ +static int +em_sysctl_interrupt_rate_handler(SYSCTL_HANDLER_ARGS) +{ + struct em_rx_queue *rque; + struct em_tx_queue *tque; + struct e1000_hw *hw; + int error; + u32 reg, usec, rate; + + bool tx = oidp->oid_arg2; + + if (tx) { + tque = oidp->oid_arg1; + hw = &tque->sc->hw; + if (hw->mac.type >= igb_mac_min) + reg = E1000_READ_REG(hw, E1000_EITR(tque->me)); + else if (hw->mac.type == e1000_82574 && tque->msix) + reg = E1000_READ_REG(hw, E1000_EITR_82574(tque->me)); + else + reg = E1000_READ_REG(hw, E1000_ITR); + } else { + rque = oidp->oid_arg1; + hw = &rque->sc->hw; + if (hw->mac.type >= igb_mac_min) + reg = E1000_READ_REG(hw, E1000_EITR(rque->msix)); + else if (hw->mac.type == e1000_82574 && rque->msix) + reg = E1000_READ_REG(hw, E1000_EITR_82574(rque->msix)); + else + reg = E1000_READ_REG(hw, E1000_ITR); + } + + if (hw->mac.type < igb_mac_min) { + if (reg > 0) + rate = EM_INTS_TO_ITR(reg); + else + rate = 0; + } else { + usec = (reg & IGB_QVECTOR_MASK); + if (usec > 0) + rate = IGB_INTS_TO_EITR(usec); + else + rate = 0; + } + + error = sysctl_handle_int(oidp, &rate, 0, req); + if (error || !req->newptr) + return error; + return 0; +} + /* * Add sysctl variables, one per statistic, to the system. */ @@ -4466,6 +4715,11 @@ em_add_hw_stats(struct e1000_softc *sc) CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); + SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate", + CTLTYPE_UINT | CTLFLAG_RD, tx_que, + true, em_sysctl_interrupt_rate_handler, + "IU", "Interrupt Rate"); + SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head", CTLTYPE_UINT | CTLFLAG_RD, sc, E1000_TDH(txr->me), em_sysctl_reg_handler, "IU", @@ -4486,6 +4740,11 @@ em_add_hw_stats(struct e1000_softc *sc) CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX Queue Name"); queue_list = SYSCTL_CHILDREN(queue_node); + SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate", + CTLTYPE_UINT | CTLFLAG_RD, rx_que, + false, em_sysctl_interrupt_rate_handler, + "IU", "Interrupt Rate"); + SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "rxd_head", CTLTYPE_UINT | CTLFLAG_RD, sc, E1000_RDH(rxr->me), em_sysctl_reg_handler, "IU", diff --git a/sys/dev/e1000/if_em.h b/sys/dev/e1000/if_em.h index 7219dc57c333..52bfed0f9a42 100644 --- a/sys/dev/e1000/if_em.h +++ b/sys/dev/e1000/if_em.h @@ -1,8 +1,9 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * + * Copyright (c) 2001-2024, Intel Corporation * Copyright (c) 2016 Nicole Graziano <nic...@nextbsd.org> - * All rights reserved. + * Copyright (c) 2024 Kevin Bowling <kbowl...@freebsd.org> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -243,16 +244,19 @@ /* Support AutoMediaDetect for Marvell M88 PHY in i354 */ #define IGB_MEDIA_RESET (1 << 0) -/* Define the starting Interrupt rate per Queue */ -#define EM_INTS_PER_SEC 8000 +/* Define the interrupt rates and ITR helpers */ +#define EM_INTS_4K 4000 +#define EM_INTS_20K 20000 +#define EM_INTS_70K 70000 +#define EM_INTS_DEFAULT 8000 #define EM_INTS_MULTIPLIER 256 #define EM_ITR_DIVIDEND 1000000000 #define EM_INTS_TO_ITR(i) (EM_ITR_DIVIDEND/(i * EM_INTS_MULTIPLIER)) -#define IGB_ITR_DIVIDEND 1000000 -#define IGB_ITR_SHIFT 2 +#define IGB_EITR_DIVIDEND 1000000 +#define IGB_EITR_SHIFT 2 #define IGB_QVECTOR_MASK 0x7FFC -#define IGB_INTS_TO_EITR(i) (((IGB_ITR_DIVIDEND/i) & IGB_QVECTOR_MASK) << \ - IGB_ITR_SHIFT) +#define IGB_INTS_TO_EITR(i) (((IGB_EITR_DIVIDEND/i) & IGB_QVECTOR_MASK) << \ + IGB_EITR_SHIFT) #define IGB_LINK_ITR 2000 #define I210_LINK_DELAY 1000 @@ -390,7 +394,11 @@ struct tx_ring { /* Interrupt resources */ void *tag; struct resource *res; + + /* Soft stats */ unsigned long tx_irq; + unsigned long tx_packets; + unsigned long tx_bytes; /* Saved csum offloading context information */ int csum_flags; @@ -426,6 +434,9 @@ struct rx_ring { unsigned long rx_discarded; unsigned long rx_packets; unsigned long rx_bytes; + + /* Next requested ITR latency */ + u8 rx_nextlatency; }; struct em_tx_queue { @@ -441,6 +452,7 @@ struct em_rx_queue { u32 me; u32 msix; u32 eims; + u32 itr_setting; struct rx_ring rxr; u64 irqs; struct if_irq que_irq; @@ -489,6 +501,7 @@ struct e1000_softc { u32 rx_mbuf_sz; + int enable_aim; /* Management and WOL features */ u32 wol; bool has_manage; @@ -512,6 +525,7 @@ struct e1000_softc { u16 link_duplex; u32 smartspeed; u32 dmac; + u32 pba; int link_mask; int tso_automasked; diff --git a/sys/dev/e1000/igb_txrx.c b/sys/dev/e1000/igb_txrx.c index 2819150acba6..82cbb37ce4cd 100644 --- a/sys/dev/e1000/igb_txrx.c +++ b/sys/dev/e1000/igb_txrx.c @@ -292,6 +292,10 @@ igb_isc_txd_encap(void *arg, if_pkt_info_t pi) txd->read.cmd_type_len |= htole32(E1000_TXD_CMD_EOP | txd_flags); pi->ipi_new_pidx = i; + /* Sent data accounting for AIM */ + txr->tx_bytes += pi->ipi_len; + ++txr->tx_packets; + return (0); }