The interrupt handling in ral(4) for RT2661 has a couple of problems, which causes the interface to get stuck under heavy load with OACTIVE set (the problems are likely especially severe on slow systems such as my 600MHz VIA system); bouncing the interface down and back up fixes things. As I describe below, I think I've been able to fix it, and I'd be happy to see the patch below reviewed and applied.
I've seen other reports that look similar to the problems I was having; eg bug kernel/5958 starts out talking about RT2860 (which is completely different code) but some of the "me too" replies are for RT2561S, which I hope this patch fixes (I've cc'ed those reporters; test reports welcome!). I've not looked at the RT2860 code due to lack of hardware, but if someone wants to send me a PCI card.... The first problem is that multiple TX completions may happen before the interrupt handler gets to rt2661_tx_intr(). When this happens, the TX interrupt handler only completes one entry in the TX ring, which leads to the driver getting behind the hardware. To fix this, I extended the qid field in the TX descriptor to contain the index in the TX ring as well as the queue ID, and then when an interrupt is missed, free the earlier TX entries as well as the entry that the interrupt is for. (I did see this code trigger under load) This exposes the second problem: there is a race that is inherent in separating TX completion handling between TX DMA interrupts and TX interrupts -- the driver may handle all the TX DMAs that finished when it called rt2661_tx_dma_intr(), but by the time it gets to rt2661_tx_intr(), another TX may have completed and the driver may end up processing a TX completion for which it hasn't handled the TX DMA completion. This ends up leaking mbufs if a new send is enqueued before the TX DMA interrupt has a chance to "catch up." (This happens in practice on my system as well) It is probably possible to fix this and keep the split DMA/TX handling, but that seems to require unneeded complexity. Instead, we can just ignore TX DMA interrupts and handle everything when the TX actually completes. This means we don't free the mbuf quite as soon, but since we can't reuse the slot in the TX ring anyway, I don't see this as a problem in practice. With this patch applied, the ral interface on my access point is able to continue operating under load that would cause the interface to get stuck with the stock driver fairly quickly. --- rt2661.c | 118 ++++++++++++++++++++++++---------------------------------- rt2661reg.h | 3 +- rt2661var.h | 1 - 3 files changed, 51 insertions(+), 71 deletions(-) diff --git a/rt2661.c b/rt2661.c index f838969..9a9cc53 100644 --- a/rt2661.c +++ b/rt2661.c @@ -97,9 +97,8 @@ void rt2661_newassoc(struct ieee80211com *, struct ieee80211_node *, int rt2661_newstate(struct ieee80211com *, enum ieee80211_state, int); uint16_t rt2661_eeprom_read(struct rt2661_softc *, uint8_t); +void rt2661_free_tx_desc(struct rt2661_softc *, struct rt2661_tx_ring *); void rt2661_tx_intr(struct rt2661_softc *); -void rt2661_tx_dma_intr(struct rt2661_softc *, - struct rt2661_tx_ring *); void rt2661_rx_intr(struct rt2661_softc *); #ifndef IEEE80211_STA_ONLY void rt2661_mcu_beacon_expire(struct rt2661_softc *); @@ -115,7 +114,7 @@ uint16_t rt2661_txtime(int, int, uint32_t); uint8_t rt2661_plcp_signal(int); void rt2661_setup_tx_desc(struct rt2661_softc *, struct rt2661_tx_desc *, uint32_t, uint16_t, int, int, - const bus_dma_segment_t *, int, int); + const bus_dma_segment_t *, int, int, int); int rt2661_tx_mgt(struct rt2661_softc *, struct mbuf *, struct ieee80211_node *); int rt2661_tx_data(struct rt2661_softc *, struct mbuf *, @@ -376,7 +375,7 @@ rt2661_alloc_tx_ring(struct rt2661_softc *sc, struct rt2661_tx_ring *ring, ring->count = count; ring->queued = 0; - ring->cur = ring->next = ring->stat = 0; + ring->cur = ring->stat = 0; error = bus_dmamap_create(sc->sc_dmat, count * RT2661_TX_DESC_SIZE, 1, count * RT2661_TX_DESC_SIZE, 0, BUS_DMA_NOWAIT, &ring->map); @@ -470,7 +469,7 @@ rt2661_reset_tx_ring(struct rt2661_softc *sc, struct rt2661_tx_ring *ring) BUS_DMASYNC_PREWRITE); ring->queued = 0; - ring->cur = ring->next = ring->stat = 0; + ring->cur = ring->stat = 0; } void @@ -881,6 +880,36 @@ rt2661_eeprom_read(struct rt2661_softc *sc, uint8_t addr) } void +rt2661_free_tx_desc(struct rt2661_softc *sc, struct rt2661_tx_ring *txq) +{ + struct rt2661_tx_desc *desc = &txq->desc[txq->stat]; + struct rt2661_tx_data *data = &txq->data[txq->stat]; + struct ieee80211com *ic = &sc->sc_ic; + + bus_dmamap_sync(sc->sc_dmat, data->map, 0, + data->map->dm_mapsize, BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(sc->sc_dmat, data->map); + m_freem(data->m); + data->m = NULL; + + /* descriptor is no longer valid */ + desc->flags &= ~htole32(RT2661_TX_VALID); + + bus_dmamap_sync(sc->sc_dmat, txq->map, + txq->stat * RT2661_TX_DESC_SIZE, RT2661_TX_DESC_SIZE, + BUS_DMASYNC_PREWRITE); + + if (data->ni) { + ieee80211_release_node(ic, data->ni); + data->ni = NULL; + } + + txq->queued--; + if (++txq->stat >= txq->count) /* faster than % count */ + txq->stat = 0; +} + +void rt2661_tx_intr(struct rt2661_softc *sc) { struct ieee80211com *ic = &sc->sc_ic; @@ -888,7 +917,7 @@ rt2661_tx_intr(struct rt2661_softc *sc) struct rt2661_tx_ring *txq; struct rt2661_tx_data *data; struct rt2661_node *rn; - int qid, retrycnt; + int qid, ind, retrycnt; for (;;) { const uint32_t val = RAL_READ(sc, RT2661_STA_CSR4); @@ -898,6 +927,13 @@ rt2661_tx_intr(struct rt2661_softc *sc) /* retrieve the queue in which this frame was sent */ qid = RT2661_TX_QID(val); txq = (qid <= 3) ? &sc->txq[qid] : &sc->mgtq; + ind = RT2661_TX_INDEX(val); + + if (txq->stat != ind) + DPRINTFN(10, ("missed TX interrupt, catching up " + "stat %d to index %d\n", txq->stat, ind, qid)); + while (txq->stat != ind) + rt2661_free_tx_desc(sc, txq); /* retrieve rate control algorithm context */ data = &txq->data[txq->stat]; @@ -934,14 +970,9 @@ rt2661_tx_intr(struct rt2661_softc *sc) ifp->if_oerrors++; } - ieee80211_release_node(ic, data->ni); - data->ni = NULL; - DPRINTFN(15, ("tx done q=%d idx=%u\n", qid, txq->stat)); - txq->queued--; - if (++txq->stat >= txq->count) /* faster than % count */ - txq->stat = 0; + rt2661_free_tx_desc(sc, txq); } sc->sc_tx_timer = 0; @@ -950,42 +981,6 @@ rt2661_tx_intr(struct rt2661_softc *sc) } void -rt2661_tx_dma_intr(struct rt2661_softc *sc, struct rt2661_tx_ring *txq) -{ - for (;;) { - struct rt2661_tx_desc *desc = &txq->desc[txq->next]; - struct rt2661_tx_data *data = &txq->data[txq->next]; - - bus_dmamap_sync(sc->sc_dmat, txq->map, - txq->next * RT2661_TX_DESC_SIZE, RT2661_TX_DESC_SIZE, - BUS_DMASYNC_POSTREAD); - - if ((letoh32(desc->flags) & RT2661_TX_BUSY) || - !(letoh32(desc->flags) & RT2661_TX_VALID)) - break; - - bus_dmamap_sync(sc->sc_dmat, data->map, 0, - data->map->dm_mapsize, BUS_DMASYNC_POSTWRITE); - bus_dmamap_unload(sc->sc_dmat, data->map); - m_freem(data->m); - data->m = NULL; - /* node reference is released in rt2661_tx_intr() */ - - /* descriptor is no longer valid */ - desc->flags &= ~htole32(RT2661_TX_VALID); - - bus_dmamap_sync(sc->sc_dmat, txq->map, - txq->next * RT2661_TX_DESC_SIZE, RT2661_TX_DESC_SIZE, - BUS_DMASYNC_PREWRITE); - - DPRINTFN(15, ("tx dma done q=%p idx=%u\n", txq, txq->next)); - - if (++txq->next >= txq->count) /* faster than % count */ - txq->next = 0; - } -} - -void rt2661_rx_intr(struct rt2661_softc *sc) { struct ieee80211com *ic = &sc->sc_ic; @@ -1212,24 +1207,9 @@ rt2661_intr(void *arg) if (!(ifp->if_flags & IFF_RUNNING)) return 0; - if (r1 & RT2661_MGT_DONE) - rt2661_tx_dma_intr(sc, &sc->mgtq); - if (r1 & RT2661_RX_DONE) rt2661_rx_intr(sc); - if (r1 & RT2661_TX0_DMA_DONE) - rt2661_tx_dma_intr(sc, &sc->txq[0]); - - if (r1 & RT2661_TX1_DMA_DONE) - rt2661_tx_dma_intr(sc, &sc->txq[1]); - - if (r1 & RT2661_TX2_DMA_DONE) - rt2661_tx_dma_intr(sc, &sc->txq[2]); - - if (r1 & RT2661_TX3_DMA_DONE) - rt2661_tx_dma_intr(sc, &sc->txq[3]); - if (r1 & RT2661_TX_DONE) rt2661_tx_intr(sc); @@ -1377,7 +1357,7 @@ rt2661_plcp_signal(int rate) void rt2661_setup_tx_desc(struct rt2661_softc *sc, struct rt2661_tx_desc *desc, uint32_t flags, uint16_t xflags, int len, int rate, - const bus_dma_segment_t *segs, int nsegs, int ac) + const bus_dma_segment_t *segs, int nsegs, int ac, int ind) { struct ieee80211com *ic = &sc->sc_ic; uint16_t plcp_length; @@ -1401,7 +1381,7 @@ rt2661_setup_tx_desc(struct rt2661_softc *sc, struct rt2661_tx_desc *desc, * private data only. It will be made available by the NIC in STA_CSR4 * on Tx interrupts. */ - desc->qid = ac; + desc->qid = (ac <= 3 ? ac : 7) | (ind << 3); /* setup PLCP fields */ desc->plcp_signal = rt2661_plcp_signal(rate); @@ -1505,7 +1485,7 @@ rt2661_tx_mgt(struct rt2661_softc *sc, struct mbuf *m0, rt2661_setup_tx_desc(sc, desc, flags, 0 /* XXX HWSEQ */, m0->m_pkthdr.len, rate, data->map->dm_segs, data->map->dm_nsegs, - RT2661_QID_MGT); + RT2661_QID_MGT, sc->mgtq.cur); bus_dmamap_sync(sc->sc_dmat, data->map, 0, data->map->dm_mapsize, BUS_DMASYNC_PREWRITE); @@ -1633,7 +1613,7 @@ rt2661_tx_data(struct rt2661_softc *sc, struct mbuf *m0, rt2661_setup_tx_desc(sc, desc, (needrts ? RT2661_TX_NEED_ACK : 0) | RT2661_TX_MORE_FRAG, 0, mprot->m_pkthdr.len, protrate, data->map->dm_segs, - data->map->dm_nsegs, ac); + data->map->dm_nsegs, ac, txq->cur); bus_dmamap_sync(sc->sc_dmat, data->map, 0, data->map->dm_mapsize, BUS_DMASYNC_PREWRITE); @@ -1723,7 +1703,7 @@ rt2661_tx_data(struct rt2661_softc *sc, struct mbuf *m0, } rt2661_setup_tx_desc(sc, desc, flags, 0, m0->m_pkthdr.len, rate, - data->map->dm_segs, data->map->dm_nsegs, ac); + data->map->dm_segs, data->map->dm_nsegs, ac, txq->cur); bus_dmamap_sync(sc->sc_dmat, data->map, 0, data->map->dm_mapsize, BUS_DMASYNC_PREWRITE); @@ -2797,7 +2777,7 @@ rt2661_prepare_beacon(struct rt2661_softc *sc) rate = IEEE80211_IS_CHAN_5GHZ(ni->ni_chan) ? 12 : 2; rt2661_setup_tx_desc(sc, &desc, RT2661_TX_TIMESTAMP, RT2661_TX_HWSEQ, - m0->m_pkthdr.len, rate, NULL, 0, RT2661_QID_MGT); + m0->m_pkthdr.len, rate, NULL, 0, RT2661_QID_MGT, 0); /* copy the first 24 bytes of Tx descriptor into NIC memory */ RAL_WRITE_REGION_1(sc, RT2661_HW_BEACON_BASE0, (uint8_t *)&desc, 24); diff --git a/rt2661reg.h b/rt2661reg.h index 22d7723..ec9be76 100644 --- a/rt2661reg.h +++ b/rt2661reg.h @@ -189,7 +189,8 @@ #define RT2661_TX_STAT_VALID (1 << 0) #define RT2661_TX_RESULT(v) (((v) >> 1) & 0x7) #define RT2661_TX_RETRYCNT(v) (((v) >> 4) & 0xf) -#define RT2661_TX_QID(v) (((v) >> 8) & 0xf) +#define RT2661_TX_QID(v) (((v) >> 8) & 0x7) +#define RT2661_TX_INDEX(v) (((v) >>11) & 0x1f) #define RT2661_TX_SUCCESS 0 #define RT2661_TX_RETRY_FAIL 6 diff --git a/rt2661var.h b/rt2661var.h index 7793ade..fbe85bb 100644 --- a/rt2661var.h +++ b/rt2661var.h @@ -62,7 +62,6 @@ struct rt2661_tx_ring { int count; int queued; int cur; - int next; int stat; }; -- 1.6.3.3