On 04/12/15(Fri) 12:47, Mark Kettenis wrote:
> Here is a new diff to make ix(4) mpsafe. Should now longer get stuck
> in the OACTIVE state. Tests more than welcome.
Like for em(4) it is the right time to put this in, ok mpi@
> Index: if_ix.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
> retrieving revision 1.129
> diff -u -p -r1.129 if_ix.c
> --- if_ix.c 25 Nov 2015 03:09:59 -0000 1.129
> +++ if_ix.c 3 Dec 2015 16:50:26 -0000
> @@ -210,8 +210,6 @@ ixgbe_attach(struct device *parent, stru
> sc->osdep.os_sc = sc;
> sc->osdep.os_pa = *pa;
>
> - mtx_init(&sc->rx_mtx, IPL_NET);
> -
> /* Set up the timer callout */
> timeout_set(&sc->timer, ixgbe_local_timer, sc);
> timeout_set(&sc->rx_refill, ixgbe_rxrefill, sc);
> @@ -385,6 +383,16 @@ ixgbe_start(struct ifnet * ifp)
> if (ixgbe_encap(txr, m_head)) {
> ifq_deq_rollback(&ifp->if_snd, m_head);
> ifq_set_oactive(&ifp->if_snd);
> + /*
> + * Make sure there are still packets on the
> + * ring. The interrupt handler may have
> + * cleaned up the ring before we were able to
> + * set the IF_OACTIVE flag.
> + */
> + if (txr->tx_avail == sc->num_tx_desc) {
> + ifq_clr_oactive(&ifp->if_snd);
> + continue;
> + }
> break;
> }
>
> @@ -527,10 +535,7 @@ ixgbe_watchdog(struct ifnet * ifp)
>
> /*
> * The timer is set to 5 every time ixgbe_start() queues a packet.
> - * Then ixgbe_txeof() keeps resetting to 5 as long as it cleans at
> - * least one descriptor.
> - * Finally, anytime all descriptors are clean the timer is
> - * set to 0.
> + * Anytime all descriptors are clean the timer is set to 0.
> */
> for (i = 0; i < sc->num_queues; i++, txr++) {
> if (txr->watchdog_timer == 0 || --txr->watchdog_timer)
> @@ -862,7 +867,7 @@ ixgbe_intr(void *arg)
> struct tx_ring *txr = sc->tx_rings;
> struct ixgbe_hw *hw = &sc->hw;
> uint32_t reg_eicr;
> - int i, refill = 0, was_active = 0;
> + int i, refill = 0;
>
> reg_eicr = IXGBE_READ_REG(&sc->hw, IXGBE_EICR);
> if (reg_eicr == 0) {
> @@ -872,11 +877,8 @@ ixgbe_intr(void *arg)
>
> if (ISSET(ifp->if_flags, IFF_RUNNING)) {
> ixgbe_rxeof(que);
> - refill = 1;
> -
> - if (ifq_is_oactive(&ifp->if_snd))
> - was_active = 1;
> ixgbe_txeof(txr);
> + refill = 1;
> }
>
> if (refill) {
> @@ -892,6 +894,8 @@ ixgbe_intr(void *arg)
> if (reg_eicr & IXGBE_EICR_LSC) {
> KERNEL_LOCK();
> ixgbe_update_link_status(sc);
> + if (!IFQ_IS_EMPTY(&ifp->if_snd))
> + ixgbe_start(ifp);
> KERNEL_UNLOCK();
> }
>
> @@ -913,12 +917,6 @@ ixgbe_intr(void *arg)
> }
> }
>
> - if (was_active) {
> - KERNEL_LOCK();
> - ixgbe_start(ifp);
> - KERNEL_UNLOCK();
> - }
> -
> /* Check for fan failure */
> if ((hw->device_id == IXGBE_DEV_ID_82598AT) &&
> (reg_eicr & IXGBE_EICR_GPI_SDP1)) {
> @@ -1061,17 +1059,9 @@ ixgbe_encap(struct tx_ring *txr, struct
> cmd_type_len |= IXGBE_ADVTXD_DCMD_VLE;
> #endif
>
> - /*
> - * Force a cleanup if number of TX descriptors
> - * available is below the threshold. If it fails
> - * to get above, then abort transmit.
> - */
> - if (txr->tx_avail <= IXGBE_TX_CLEANUP_THRESHOLD) {
> - ixgbe_txeof(txr);
> - /* Make sure things have improved */
> - if (txr->tx_avail <= IXGBE_TX_OP_THRESHOLD)
> - return (ENOBUFS);
> - }
> + /* Check that we have least the minimal number of TX descriptors. */
> + if (txr->tx_avail <= IXGBE_TX_OP_THRESHOLD)
> + return (ENOBUFS);
>
> /*
> * Important to capture the first descriptor
> @@ -1135,8 +1125,6 @@ ixgbe_encap(struct tx_ring *txr, struct
>
> txd->read.cmd_type_len |=
> htole32(IXGBE_TXD_CMD_EOP | IXGBE_TXD_CMD_RS);
> - txr->tx_avail -= map->dm_nsegs;
> - txr->next_avail_desc = i;
>
> txbuf->m_head = m_head;
> /*
> @@ -1154,6 +1142,11 @@ ixgbe_encap(struct tx_ring *txr, struct
> txbuf = &txr->tx_buffers[first];
> txbuf->eop_index = last;
>
> + membar_producer();
> +
> + atomic_sub_int(&txr->tx_avail, map->dm_nsegs);
> + txr->next_avail_desc = i;
> +
> ++txr->tx_packets;
> return (0);
>
> @@ -1322,6 +1315,10 @@ ixgbe_stop(void *arg)
> /* reprogram the RAR[0] in case user changed it. */
> ixgbe_set_rar(&sc->hw, 0, sc->hw.mac.addr, 0, IXGBE_RAH_AV);
>
> + intr_barrier(sc->tag);
> +
> + KASSERT((ifp->if_flags & IFF_RUNNING) == 0);
> +
> /* Should we really clear all structures on stop? */
> ixgbe_free_transmit_structures(sc);
> ixgbe_free_receive_structures(sc);
> @@ -2202,11 +2199,13 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr,
> tx_buffer->m_head = NULL;
> tx_buffer->eop_index = -1;
>
> + membar_producer();
> +
> /* We've consumed the first desc, adjust counters */
> if (++ctxd == sc->num_tx_desc)
> ctxd = 0;
> txr->next_avail_desc = ctxd;
> - --txr->tx_avail;
> + atomic_dec_int(&txr->tx_avail);
>
> return (0);
> }
> @@ -2320,10 +2319,12 @@ ixgbe_tso_setup(struct tx_ring *txr, str
>
> TXD->seqnum_seed = htole32(0);
>
> + membar_producer();
> +
> if (++ctxd == sc->num_tx_desc)
> ctxd = 0;
>
> - txr->tx_avail--;
> + atomic_dec_int(&txr->tx_avail);
> txr->next_avail_desc = ctxd;
> *cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
> *olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8;
> @@ -2345,6 +2346,7 @@ ixgbe_txeof(struct tx_ring *txr)
> struct ix_softc *sc = txr->sc;
> struct ifnet *ifp = &sc->arpcom.ac_if;
> uint32_t first, last, done, processed;
> + uint32_t num_avail;
> struct ixgbe_tx_buf *tx_buffer;
> struct ixgbe_legacy_tx_desc *tx_desc, *eop_desc;
>
> @@ -2356,23 +2358,19 @@ ixgbe_txeof(struct tx_ring *txr)
> return FALSE;
> }
>
> - KERNEL_LOCK();
> + membar_consumer();
>
> processed = 0;
> first = txr->next_to_clean;
> /* was the txt queue cleaned up in the meantime */
> - if (txr->tx_buffers == NULL) {
> - KERNEL_UNLOCK();
> + if (txr->tx_buffers == NULL)
> return FALSE;
> - }
> tx_buffer = &txr->tx_buffers[first];
> /* For cleanup we just use legacy struct */
> tx_desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[first];
> last = tx_buffer->eop_index;
> - if (last == -1) {
> - KERNEL_UNLOCK();
> + if (last == -1)
> return FALSE;
> - }
> eop_desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[last];
>
> /*
> @@ -2394,7 +2392,6 @@ ixgbe_txeof(struct tx_ring *txr)
> tx_desc->upper.data = 0;
> tx_desc->lower.data = 0;
> tx_desc->buffer_addr = 0;
> - ++txr->tx_avail;
> ++processed;
>
> if (tx_buffer->m_head) {
> @@ -2436,31 +2433,24 @@ ixgbe_txeof(struct tx_ring *txr)
>
> txr->next_to_clean = first;
>
> + num_avail = atomic_add_int_nv(&txr->tx_avail, processed);
> +
> + /* All clean, turn off the timer */
> + if (num_avail == sc->num_tx_desc)
> + ifp->if_timer = 0;
> +
> /*
> * If we have enough room, clear IFF_OACTIVE to tell the stack that
> - * it is OK to send packets. If there are no pending descriptors,
> - * clear the timeout. Otherwise, if some descriptors have been freed,
> - * restart the timeout.
> + * it is OK to send packets.
> */
> - if (txr->tx_avail > IXGBE_TX_CLEANUP_THRESHOLD) {
> + if (ifq_is_oactive(&ifp->if_snd) &&
> + num_avail > IXGBE_TX_OP_THRESHOLD) {
> ifq_clr_oactive(&ifp->if_snd);
> -
> - /* If all are clean turn off the timer */
> - if (txr->tx_avail == sc->num_tx_desc) {
> - ifp->if_timer = 0;
> - txr->watchdog_timer = 0;
> - KERNEL_UNLOCK();
> - return FALSE;
> - }
> - /* Some were cleaned, so reset timer */
> - else if (processed) {
> - ifp->if_timer = IXGBE_TX_TIMEOUT;
> - txr->watchdog_timer = IXGBE_TX_TIMEOUT;
> - }
> + KERNEL_LOCK();
> + ixgbe_start(ifp);
> + KERNEL_UNLOCK();
> }
>
> - KERNEL_UNLOCK();
> -
> return TRUE;
> }
>
> @@ -2610,8 +2600,6 @@ ixgbe_rxfill(struct rx_ring *rxr)
> u_int slots;
> int i;
>
> - mtx_enter(&sc->rx_mtx);
> -
> i = rxr->last_desc_filled;
> for (slots = if_rxr_get(&rxr->rx_ring, sc->num_rx_desc);
> slots > 0; slots--) {
> @@ -2627,8 +2615,6 @@ ixgbe_rxfill(struct rx_ring *rxr)
>
> if_rxr_put(&rxr->rx_ring, slots);
>
> - mtx_leave(&sc->rx_mtx);
> -
> return (post);
> }
>
> @@ -2796,10 +2782,8 @@ ixgbe_free_receive_structures(struct ix_
> struct rx_ring *rxr;
> int i;
>
> - mtx_enter(&sc->rx_mtx);
> for (i = 0, rxr = sc->rx_rings; i < sc->num_queues; i++, rxr++)
> if_rxr_init(&rxr->rx_ring, 0, 0);
> - mtx_leave(&sc->rx_mtx);
>
> for (i = 0, rxr = sc->rx_rings; i < sc->num_queues; i++, rxr++)
> ixgbe_free_receive_buffers(rxr);
> @@ -2853,7 +2837,6 @@ ixgbe_rxeof(struct ix_queue *que)
> struct rx_ring *rxr = que->rxr;
> struct ifnet *ifp = &sc->arpcom.ac_if;
> struct mbuf_list ml = MBUF_LIST_INITIALIZER();
> - struct mbuf_list free_ml = MBUF_LIST_INITIALIZER();
> struct mbuf *mp, *sendmp;
> uint8_t eop = 0;
> uint16_t len, vtag;
> @@ -2866,7 +2849,6 @@ ixgbe_rxeof(struct ix_queue *que)
> if (!ISSET(ifp->if_flags, IFF_RUNNING))
> return FALSE;
>
> - mtx_enter(&sc->rx_mtx);
> i = rxr->next_to_check;
> while (if_rxr_inuse(&rxr->rx_ring) > 0) {
> bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
> @@ -2901,11 +2883,11 @@ ixgbe_rxeof(struct ix_queue *que)
> sc->dropped_pkts++;
>
> if (rxbuf->fmp) {
> - ml_enqueue(&free_ml, rxbuf->fmp);
> + m_freem(rxbuf->fmp);
> rxbuf->fmp = NULL;
> }
>
> - ml_enqueue(&free_ml, mp);
> + m_freem(mp);
> rxbuf->buf = NULL;
> goto next_desc;
> }
> @@ -2983,9 +2965,6 @@ next_desc:
> i = 0;
> }
> rxr->next_to_check = i;
> - mtx_leave(&sc->rx_mtx);
> -
> - ml_purge(&free_ml);
>
> if_input(ifp, &ml);
>
> Index: if_ix.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_ix.h,v
> retrieving revision 1.29
> diff -u -p -r1.29 if_ix.h
> --- if_ix.h 11 Sep 2015 13:02:28 -0000 1.29
> +++ if_ix.h 3 Dec 2015 16:50:26 -0000
> @@ -77,10 +77,9 @@
> #define IXGBE_TX_TIMEOUT 5 /* set to 5 seconds */
>
> /*
> - * This parameters control when the driver calls the routine to reclaim
> - * transmit descriptors.
> + * Thise parameter controls the minimum number of available transmit
> + * descriptors needed before we attempt transmission of a packet.
> */
> -#define IXGBE_TX_CLEANUP_THRESHOLD (sc->num_tx_desc / 16)
> #define IXGBE_TX_OP_THRESHOLD (sc->num_tx_desc / 32)
>
> #define IXGBE_MAX_FRAME_SIZE 9216
> @@ -170,7 +169,7 @@ struct tx_ring {
> union ixgbe_adv_tx_desc *tx_base;
> struct ixgbe_tx_buf *tx_buffers;
> struct ixgbe_dma_alloc txdma;
> - volatile uint16_t tx_avail;
> + volatile uint32_t tx_avail;
> uint32_t next_avail_desc;
> uint32_t next_to_clean;
> enum {
> @@ -277,7 +276,6 @@ struct ix_softc {
> * Receive rings:
> * Allocated at run time, an array of rings.
> */
> - struct mutex rx_mtx;
> struct rx_ring *rx_rings;
> uint64_t que_mask;
> int num_rx_desc;
> Index: ixgbe.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/ixgbe.h,v
> retrieving revision 1.20
> diff -u -p -r1.20 ixgbe.h
> --- ixgbe.h 24 Nov 2015 17:11:40 -0000 1.20
> +++ ixgbe.h 3 Dec 2015 16:50:26 -0000
> @@ -52,6 +52,7 @@
> #include <sys/timeout.h>
> #include <sys/pool.h>
> #include <sys/rwlock.h>
> +#include <sys/atomic.h>
>
> #include <net/if.h>
> #include <net/bpf.h>
>