The interrupt handling in ral(4) for RT2661 has a couple of problems,
which causes the interface to get stuck under heavy load with OACTIVE
set (the problems are likely especially severe on slow systems such as
my 600MHz VIA system); bouncing the interface down and back up fixes
things.  As I describe below, I think I've been able to fix it, and
I'd be happy to see the patch below reviewed and applied.

I've seen other reports that look similar to the problems I was
having; eg bug kernel/5958 starts out talking about RT2860 (which is
completely different code) but some of the "me too" replies are for
RT2561S, which I hope this patch fixes (I've cc'ed those reporters;
test reports welcome!).  I've not looked at the RT2860 code due to
lack of hardware, but if someone wants to send me a PCI card....

The first problem is that multiple TX completions may happen before
the interrupt handler gets to rt2661_tx_intr().  When this happens,
the TX interrupt handler only completes one entry in the TX ring,
which leads to the driver getting behind the hardware.  To fix this, I
extended the qid field in the TX descriptor to contain the index in
the TX ring as well as the queue ID, and then when an interrupt is
missed, free the earlier TX entries as well as the entry that the
interrupt is for.  (I did see this code trigger under load)

This exposes the second problem: there is a race that is inherent in
separating TX completion handling between TX DMA interrupts and TX
interrupts -- the driver may handle all the TX DMAs that finished when
it called rt2661_tx_dma_intr(), but by the time it gets to
rt2661_tx_intr(), another TX may have completed and the driver may end
up processing a TX completion for which it hasn't handled the TX DMA
completion.  This ends up leaking mbufs if a new send is enqueued
before the TX DMA interrupt has a chance to "catch up."  (This happens
in practice on my system as well)

It is probably possible to fix this and keep the split DMA/TX
handling, but that seems to require unneeded complexity.  Instead, we
can just ignore TX DMA interrupts and handle everything when the TX
actually completes.  This means we don't free the mbuf quite as soon,
but since we can't reuse the slot in the TX ring anyway, I don't see
this as a problem in practice.

With this patch applied, the ral interface on my access point is able
to continue operating under load that would cause the interface to get
stuck with the stock driver fairly quickly.
---
 rt2661.c    |  118 ++++++++++++++++++++++++----------------------------------
 rt2661reg.h |    3 +-
 rt2661var.h |    1 -
 3 files changed, 51 insertions(+), 71 deletions(-)

diff --git a/rt2661.c b/rt2661.c
index f838969..9a9cc53 100644
--- a/rt2661.c
+++ b/rt2661.c
@@ -97,9 +97,8 @@ void          rt2661_newassoc(struct ieee80211com *, struct 
ieee80211_node *,
 int            rt2661_newstate(struct ieee80211com *, enum ieee80211_state,
                    int);
 uint16_t       rt2661_eeprom_read(struct rt2661_softc *, uint8_t);
+void           rt2661_free_tx_desc(struct rt2661_softc *, struct 
rt2661_tx_ring *);
 void           rt2661_tx_intr(struct rt2661_softc *);
-void           rt2661_tx_dma_intr(struct rt2661_softc *,
-                   struct rt2661_tx_ring *);
 void           rt2661_rx_intr(struct rt2661_softc *);
 #ifndef IEEE80211_STA_ONLY
 void           rt2661_mcu_beacon_expire(struct rt2661_softc *);
@@ -115,7 +114,7 @@ uint16_t    rt2661_txtime(int, int, uint32_t);
 uint8_t                rt2661_plcp_signal(int);
 void           rt2661_setup_tx_desc(struct rt2661_softc *,
                    struct rt2661_tx_desc *, uint32_t, uint16_t, int, int,
-                   const bus_dma_segment_t *, int, int);
+                   const bus_dma_segment_t *, int, int, int);
 int            rt2661_tx_mgt(struct rt2661_softc *, struct mbuf *,
                    struct ieee80211_node *);
 int            rt2661_tx_data(struct rt2661_softc *, struct mbuf *,
@@ -376,7 +375,7 @@ rt2661_alloc_tx_ring(struct rt2661_softc *sc, struct 
rt2661_tx_ring *ring,
 
        ring->count = count;
        ring->queued = 0;
-       ring->cur = ring->next = ring->stat = 0;
+       ring->cur = ring->stat = 0;
 
        error = bus_dmamap_create(sc->sc_dmat, count * RT2661_TX_DESC_SIZE, 1,
            count * RT2661_TX_DESC_SIZE, 0, BUS_DMA_NOWAIT, &ring->map);
@@ -470,7 +469,7 @@ rt2661_reset_tx_ring(struct rt2661_softc *sc, struct 
rt2661_tx_ring *ring)
            BUS_DMASYNC_PREWRITE);
 
        ring->queued = 0;
-       ring->cur = ring->next = ring->stat = 0;
+       ring->cur = ring->stat = 0;
 }
 
 void
@@ -881,6 +880,36 @@ rt2661_eeprom_read(struct rt2661_softc *sc, uint8_t addr)
 }
 
 void
+rt2661_free_tx_desc(struct rt2661_softc *sc, struct rt2661_tx_ring *txq)
+{
+       struct rt2661_tx_desc *desc = &txq->desc[txq->stat];
+       struct rt2661_tx_data *data = &txq->data[txq->stat];
+       struct ieee80211com *ic = &sc->sc_ic;
+
+       bus_dmamap_sync(sc->sc_dmat, data->map, 0,
+                       data->map->dm_mapsize, BUS_DMASYNC_POSTWRITE);
+       bus_dmamap_unload(sc->sc_dmat, data->map);
+       m_freem(data->m);
+       data->m = NULL;
+
+       /* descriptor is no longer valid */
+       desc->flags &= ~htole32(RT2661_TX_VALID);
+
+       bus_dmamap_sync(sc->sc_dmat, txq->map,
+                       txq->stat * RT2661_TX_DESC_SIZE, RT2661_TX_DESC_SIZE,
+                       BUS_DMASYNC_PREWRITE);
+
+       if (data->ni) {
+               ieee80211_release_node(ic, data->ni);
+               data->ni = NULL;
+       }
+
+       txq->queued--;
+       if (++txq->stat >= txq->count)  /* faster than % count */
+               txq->stat = 0;
+}
+
+void
 rt2661_tx_intr(struct rt2661_softc *sc)
 {
        struct ieee80211com *ic = &sc->sc_ic;
@@ -888,7 +917,7 @@ rt2661_tx_intr(struct rt2661_softc *sc)
        struct rt2661_tx_ring *txq;
        struct rt2661_tx_data *data;
        struct rt2661_node *rn;
-       int qid, retrycnt;
+       int qid, ind, retrycnt;
 
        for (;;) {
                const uint32_t val = RAL_READ(sc, RT2661_STA_CSR4);
@@ -898,6 +927,13 @@ rt2661_tx_intr(struct rt2661_softc *sc)
                /* retrieve the queue in which this frame was sent */
                qid = RT2661_TX_QID(val);
                txq = (qid <= 3) ? &sc->txq[qid] : &sc->mgtq;
+               ind = RT2661_TX_INDEX(val);
+
+               if (txq->stat != ind)
+                       DPRINTFN(10, ("missed TX interrupt, catching up "
+                                "stat %d to index %d\n", txq->stat, ind, qid));
+               while (txq->stat != ind)
+                       rt2661_free_tx_desc(sc, txq);
 
                /* retrieve rate control algorithm context */
                data = &txq->data[txq->stat];
@@ -934,14 +970,9 @@ rt2661_tx_intr(struct rt2661_softc *sc)
                        ifp->if_oerrors++;
                }
 
-               ieee80211_release_node(ic, data->ni);
-               data->ni = NULL;
-
                DPRINTFN(15, ("tx done q=%d idx=%u\n", qid, txq->stat));
 
-               txq->queued--;
-               if (++txq->stat >= txq->count)  /* faster than % count */
-                       txq->stat = 0;
+               rt2661_free_tx_desc(sc, txq);
        }
 
        sc->sc_tx_timer = 0;
@@ -950,42 +981,6 @@ rt2661_tx_intr(struct rt2661_softc *sc)
 }
 
 void
-rt2661_tx_dma_intr(struct rt2661_softc *sc, struct rt2661_tx_ring *txq)
-{
-       for (;;) {
-               struct rt2661_tx_desc *desc = &txq->desc[txq->next];
-               struct rt2661_tx_data *data = &txq->data[txq->next];
-
-               bus_dmamap_sync(sc->sc_dmat, txq->map,
-                   txq->next * RT2661_TX_DESC_SIZE, RT2661_TX_DESC_SIZE,
-                   BUS_DMASYNC_POSTREAD);
-
-               if ((letoh32(desc->flags) & RT2661_TX_BUSY) ||
-                   !(letoh32(desc->flags) & RT2661_TX_VALID))
-                       break;
-
-               bus_dmamap_sync(sc->sc_dmat, data->map, 0,
-                   data->map->dm_mapsize, BUS_DMASYNC_POSTWRITE);
-               bus_dmamap_unload(sc->sc_dmat, data->map);
-               m_freem(data->m);
-               data->m = NULL;
-               /* node reference is released in rt2661_tx_intr() */
-
-               /* descriptor is no longer valid */
-               desc->flags &= ~htole32(RT2661_TX_VALID);
-
-               bus_dmamap_sync(sc->sc_dmat, txq->map,
-                   txq->next * RT2661_TX_DESC_SIZE, RT2661_TX_DESC_SIZE,
-                   BUS_DMASYNC_PREWRITE);
-
-               DPRINTFN(15, ("tx dma done q=%p idx=%u\n", txq, txq->next));
-
-               if (++txq->next >= txq->count)  /* faster than % count */
-                       txq->next = 0;
-       }
-}
-
-void
 rt2661_rx_intr(struct rt2661_softc *sc)
 {
        struct ieee80211com *ic = &sc->sc_ic;
@@ -1212,24 +1207,9 @@ rt2661_intr(void *arg)
        if (!(ifp->if_flags & IFF_RUNNING))
                return 0;
 
-       if (r1 & RT2661_MGT_DONE)
-               rt2661_tx_dma_intr(sc, &sc->mgtq);
-
        if (r1 & RT2661_RX_DONE)
                rt2661_rx_intr(sc);
 
-       if (r1 & RT2661_TX0_DMA_DONE)
-               rt2661_tx_dma_intr(sc, &sc->txq[0]);
-
-       if (r1 & RT2661_TX1_DMA_DONE)
-               rt2661_tx_dma_intr(sc, &sc->txq[1]);
-
-       if (r1 & RT2661_TX2_DMA_DONE)
-               rt2661_tx_dma_intr(sc, &sc->txq[2]);
-
-       if (r1 & RT2661_TX3_DMA_DONE)
-               rt2661_tx_dma_intr(sc, &sc->txq[3]);
-
        if (r1 & RT2661_TX_DONE)
                rt2661_tx_intr(sc);
 
@@ -1377,7 +1357,7 @@ rt2661_plcp_signal(int rate)
 void
 rt2661_setup_tx_desc(struct rt2661_softc *sc, struct rt2661_tx_desc *desc,
     uint32_t flags, uint16_t xflags, int len, int rate,
-    const bus_dma_segment_t *segs, int nsegs, int ac)
+    const bus_dma_segment_t *segs, int nsegs, int ac, int ind)
 {
        struct ieee80211com *ic = &sc->sc_ic;
        uint16_t plcp_length;
@@ -1401,7 +1381,7 @@ rt2661_setup_tx_desc(struct rt2661_softc *sc, struct 
rt2661_tx_desc *desc,
         * private data only. It will be made available by the NIC in STA_CSR4
         * on Tx interrupts.
         */
-       desc->qid = ac;
+       desc->qid = (ac <= 3 ? ac : 7) | (ind << 3);
 
        /* setup PLCP fields */
        desc->plcp_signal  = rt2661_plcp_signal(rate);
@@ -1505,7 +1485,7 @@ rt2661_tx_mgt(struct rt2661_softc *sc, struct mbuf *m0,
 
        rt2661_setup_tx_desc(sc, desc, flags, 0 /* XXX HWSEQ */,
            m0->m_pkthdr.len, rate, data->map->dm_segs, data->map->dm_nsegs,
-           RT2661_QID_MGT);
+           RT2661_QID_MGT, sc->mgtq.cur);
 
        bus_dmamap_sync(sc->sc_dmat, data->map, 0, data->map->dm_mapsize,
            BUS_DMASYNC_PREWRITE);
@@ -1633,7 +1613,7 @@ rt2661_tx_data(struct rt2661_softc *sc, struct mbuf *m0,
                rt2661_setup_tx_desc(sc, desc,
                    (needrts ? RT2661_TX_NEED_ACK : 0) | RT2661_TX_MORE_FRAG,
                    0, mprot->m_pkthdr.len, protrate, data->map->dm_segs,
-                   data->map->dm_nsegs, ac);
+                   data->map->dm_nsegs, ac, txq->cur);
 
                bus_dmamap_sync(sc->sc_dmat, data->map, 0,
                    data->map->dm_mapsize, BUS_DMASYNC_PREWRITE);
@@ -1723,7 +1703,7 @@ rt2661_tx_data(struct rt2661_softc *sc, struct mbuf *m0,
        }
 
        rt2661_setup_tx_desc(sc, desc, flags, 0, m0->m_pkthdr.len, rate,
-           data->map->dm_segs, data->map->dm_nsegs, ac);
+           data->map->dm_segs, data->map->dm_nsegs, ac, txq->cur);
 
        bus_dmamap_sync(sc->sc_dmat, data->map, 0, data->map->dm_mapsize,
            BUS_DMASYNC_PREWRITE);
@@ -2797,7 +2777,7 @@ rt2661_prepare_beacon(struct rt2661_softc *sc)
        rate = IEEE80211_IS_CHAN_5GHZ(ni->ni_chan) ? 12 : 2;
 
        rt2661_setup_tx_desc(sc, &desc, RT2661_TX_TIMESTAMP, RT2661_TX_HWSEQ,
-           m0->m_pkthdr.len, rate, NULL, 0, RT2661_QID_MGT);
+           m0->m_pkthdr.len, rate, NULL, 0, RT2661_QID_MGT, 0);
 
        /* copy the first 24 bytes of Tx descriptor into NIC memory */
        RAL_WRITE_REGION_1(sc, RT2661_HW_BEACON_BASE0, (uint8_t *)&desc, 24);
diff --git a/rt2661reg.h b/rt2661reg.h
index 22d7723..ec9be76 100644
--- a/rt2661reg.h
+++ b/rt2661reg.h
@@ -189,7 +189,8 @@
 #define RT2661_TX_STAT_VALID   (1 << 0)
 #define RT2661_TX_RESULT(v)    (((v) >> 1) & 0x7)
 #define RT2661_TX_RETRYCNT(v)  (((v) >> 4) & 0xf)
-#define RT2661_TX_QID(v)       (((v) >> 8) & 0xf)
+#define RT2661_TX_QID(v)       (((v) >> 8) & 0x7)
+#define RT2661_TX_INDEX(v)     (((v) >>11) & 0x1f)
 #define RT2661_TX_SUCCESS      0
 #define RT2661_TX_RETRY_FAIL   6
 
diff --git a/rt2661var.h b/rt2661var.h
index 7793ade..fbe85bb 100644
--- a/rt2661var.h
+++ b/rt2661var.h
@@ -62,7 +62,6 @@ struct rt2661_tx_ring {
        int                     count;
        int                     queued;
        int                     cur;
-       int                     next;
        int                     stat;
 };
 
-- 
1.6.3.3

Reply via email to