Author: luigi
Date: Thu Jan 26 09:55:16 2012
New Revision: 230572
URL: http://svn.freebsd.org/changeset/base/230572

Log:
  ixgbe changes:
  - remove experimental code for disabling CRC
  - use the correct constant for conversion between interrupt rate
    and EITR values (the previous values were off by a factor of 2)
  - make dev.ix.N.queueM.interrupt_rate a RW sysctl variable.
    Changing individual values affects the queue immediately,
    and propagates to all interfaces at the next reinit.
  - add dev.ix.N.queueM.irqs rdonly sysctl, to export the actual
    interrupt counts
  
  Netmap-related changes for ixgbe:
  - use the "new" format for TX descriptors in netmap mode.
  - pass interrupt mitigation delays to the user process doing poll()
    on a netmap file descriptor.
    On the RX side this means we will not check the ring more than once
    per interrupt. This gives the process a chance to sleep and process
    packets in larger batches, thus reducing CPU usage.
    On the TX side we take this even further: completed transmissions are
    reclaimed every half ring even if the NIC interrupts more often.
    This saves even more CPU without any additional tx delays.
  
  Generic Netmap-related changes:
  - align the netmap_kring to cache lines so that there is no false sharing
    (possibly useful for multiqueue NICs and MSIX interrupts, which are
    handled by different cores). It's a minor improvement but it does not
    cost anything.
  
  Reviewed by:  Jack Vogel
  Approved by:  Jack Vogel

Modified:
  head/sys/dev/ixgbe/ixgbe.c
  head/sys/dev/netmap/ixgbe_netmap.h
  head/sys/dev/netmap/netmap.c
  head/sys/dev/netmap/netmap_kern.h

Modified: head/sys/dev/ixgbe/ixgbe.c
==============================================================================
--- head/sys/dev/ixgbe/ixgbe.c  Thu Jan 26 09:45:14 2012        (r230571)
+++ head/sys/dev/ixgbe/ixgbe.c  Thu Jan 26 09:55:16 2012        (r230572)
@@ -232,7 +232,7 @@ MODULE_DEPEND(ixgbe, ether, 1, 1, 1);
 static int ixgbe_enable_aim = TRUE;
 TUNABLE_INT("hw.ixgbe.enable_aim", &ixgbe_enable_aim);
 
-static int ixgbe_max_interrupt_rate = (8000000 / IXGBE_LOW_LATENCY);
+static int ixgbe_max_interrupt_rate = (4000000 / IXGBE_LOW_LATENCY);
 TUNABLE_INT("hw.ixgbe.max_interrupt_rate", &ixgbe_max_interrupt_rate);
 
 /* How many packets rxeof tries to clean at a time */
@@ -3385,22 +3385,41 @@ ixgbe_txeof(struct tx_ring *txr)
 #ifdef DEV_NETMAP
        if (ifp->if_capenable & IFCAP_NETMAP) {
                struct netmap_adapter *na = NA(ifp);
+               struct netmap_kring *kring = &na->tx_rings[txr->me];
 
+               tx_desc = (struct ixgbe_legacy_tx_desc *)txr->tx_base;
+
+               bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
+                   BUS_DMASYNC_POSTREAD);
                /*
                 * In netmap mode, all the work is done in the context
                 * of the client thread. Interrupt handlers only wake up
                 * clients, which may be sleeping on individual rings
                 * or on a global resource for all rings.
+                * To implement tx interrupt mitigation, we wake up the client
+                * thread roughly every half ring, even if the NIC interrupts
+                * more frequently. This is implemented as follows:
+                * - ixgbe_txsync() sets kring->nr_kflags with the index of
+                *   the slot that should wake up the thread (nkr_num_slots
+                *   means the user thread should not be woken up);
+                * - the driver ignores tx interrupts unless netmap_mitigate=0
+                *   or the slot has the DD bit set.
+                *
                 * When the driver has separate locks, we need to
                 * release and re-acquire txlock to avoid deadlocks.
                 * XXX see if we can find a better way.
                 */
-               selwakeuppri(&na->tx_rings[txr->me].si, PI_NET);
-               IXGBE_TX_UNLOCK(txr);
-               IXGBE_CORE_LOCK(adapter);
-               selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET);
-               IXGBE_CORE_UNLOCK(adapter);
-               IXGBE_TX_LOCK(txr);
+               if (!netmap_mitigate ||
+                   (kring->nr_kflags < kring->nkr_num_slots &&
+                    tx_desc[kring->nr_kflags].upper.fields.status & IXGBE_TXD_STAT_DD)) {
+                       kring->nr_kflags = kring->nkr_num_slots;
+                       selwakeuppri(&na->tx_rings[txr->me].si, PI_NET);
+                       IXGBE_TX_UNLOCK(txr);
+                       IXGBE_CORE_LOCK(adapter);
+                       selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET);
+                       IXGBE_CORE_UNLOCK(adapter);
+                       IXGBE_TX_LOCK(txr);
+               }
                return FALSE;
        }
 #endif /* DEV_NETMAP */
@@ -3928,21 +3947,6 @@ skip_head:
                lro->ifp = adapter->ifp;
        }
 
-#ifdef DEV_NETMAP1     /* XXX experimental CRC strip */
-       {
-               struct  ixgbe_hw        *hw = &adapter->hw;
-               u32                     rdrxctl;
-
-               rdrxctl = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
-               rdrxctl &= ~IXGBE_RDRXCTL_RSCFRSTSIZE;
-               if (slot)
-                       rdrxctl &= ~IXGBE_RDRXCTL_CRCSTRIP;
-               else
-                       rdrxctl |= IXGBE_RDRXCTL_CRCSTRIP;
-               rdrxctl |= IXGBE_RDRXCTL_RSCACKC;
-               IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);
-       }
-#endif /* DEV_NETMAP1 */
        IXGBE_RX_UNLOCK(rxr);
        return (0);
 
@@ -4022,12 +4026,6 @@ ixgbe_initialize_receive_units(struct ad
                hlreg |= IXGBE_HLREG0_JUMBOEN;
        else
                hlreg &= ~IXGBE_HLREG0_JUMBOEN;
-#ifdef DEV_NETMAP1     /* XXX experimental CRCSTRIP */
-        if (ifp->if_capenable & IFCAP_NETMAP)
-               hlreg &= ~IXGBE_HLREG0_RXCRCSTRP;
-       else
-               hlreg |= IXGBE_HLREG0_RXCRCSTRP;
-#endif /* DEV_NETMAP1 */
        IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg);
 
         bufsz = (adapter->rx_mbuf_sz + BSIZEPKT_ROUNDUP) >> IXGBE_SRRCTL_BSIZEPKT_SHIFT;
@@ -4297,11 +4295,14 @@ ixgbe_rxeof(struct ix_queue *que, int co
 #ifdef DEV_NETMAP
        if (ifp->if_capenable & IFCAP_NETMAP) {
                /*
-                * Same as the txeof routine, only wakeup clients
-                * and make sure there are no deadlocks.
+                * Same as the txeof routine: only wakeup clients on intr.
+                * NKR_PENDINTR in nr_kflags is used to implement interrupt
+                * mitigation (ixgbe_rxsync() will not look for new packets
+                * unless NKR_PENDINTR is set).
                 */
                struct netmap_adapter *na = NA(ifp);
 
+               na->rx_rings[rxr->me].nr_kflags |= NKR_PENDINTR;
                selwakeuppri(&na->rx_rings[rxr->me].si, PI_NET);
                IXGBE_RX_UNLOCK(rxr);
                IXGBE_CORE_LOCK(adapter);
@@ -4830,7 +4831,7 @@ ixgbe_configure_ivars(struct adapter *ad
        u32 newitr;
 
        if (ixgbe_max_interrupt_rate > 0)
-               newitr = (8000000 / ixgbe_max_interrupt_rate) & 0x0FF8;
+               newitr = (4000000 / ixgbe_max_interrupt_rate) & 0x0FF8;
        else
                newitr = 0;
 
@@ -5193,12 +5194,21 @@ ixgbe_sysctl_interrupt_rate_handler(SYSC
        reg = IXGBE_READ_REG(&que->adapter->hw, IXGBE_EITR(que->msix));
        usec = ((reg & 0x0FF8) >> 3);
        if (usec > 0)
-               rate = 1000000 / usec;
+               rate = 500000 / usec;
        else
                rate = 0;
        error = sysctl_handle_int(oidp, &rate, 0, req);
        if (error || !req->newptr)
                return error;
+       reg &= ~0xfff; /* default, no limitation */
+       ixgbe_max_interrupt_rate = 0;
+       if (rate > 0 && rate < 500000) {
+               if (rate < 1000)
+                       rate = 1000;
+               ixgbe_max_interrupt_rate = rate;
+               reg |= ((4000000/rate) & 0xff8 );
+       }
+       IXGBE_WRITE_REG(&que->adapter->hw, IXGBE_EITR(que->msix), reg);
        return 0;
 }
 
@@ -5252,10 +5262,13 @@ ixgbe_add_hw_stats(struct adapter *adapt
                queue_list = SYSCTL_CHILDREN(queue_node);
 
                SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate",
-                               CTLTYPE_UINT | CTLFLAG_RD, &adapter->queues[i],
+                               CTLTYPE_UINT | CTLFLAG_RW, &adapter->queues[i],
                                sizeof(&adapter->queues[i]),
                                ixgbe_sysctl_interrupt_rate_handler, "IU",
                                "Interrupt Rate");
+               SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "irqs",
+                               CTLFLAG_RD, &(adapter->queues[i].irqs),
+                               "irqs on this queue");
                SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head", 
                                CTLTYPE_UINT | CTLFLAG_RD, txr, sizeof(txr),
                                ixgbe_sysctl_tdh_handler, "IU",

Modified: head/sys/dev/netmap/ixgbe_netmap.h
==============================================================================
--- head/sys/dev/netmap/ixgbe_netmap.h  Thu Jan 26 09:45:14 2012        (r230571)
+++ head/sys/dev/netmap/ixgbe_netmap.h  Thu Jan 26 09:55:16 2012        (r230572)
@@ -191,6 +191,10 @@ fail:
  * (this is also true for every use of ring in the kernel).
  *
  * ring->avail is never used, only checked for bogus values.
+ *
+ * do_lock is set iff the function is called from the ioctl handler.
+ * In this case, grab a lock around the body, and also reclaim transmitted
+ * buffers irrespective of interrupt mitigation.
  */
 static int
 ixgbe_netmap_txsync(void *a, u_int ring_nr, int do_lock)
@@ -292,10 +296,11 @@ ring_reset:
                         * need this.
                         */
                        curr->read.buffer_addr = htole64(paddr);
-                       curr->read.olinfo_status = 0;
+                       curr->read.olinfo_status = htole32(len << IXGBE_ADVTXD_PAYLEN_SHIFT);
                        curr->read.cmd_type_len =
                            htole32(txr->txd_cmd | len |
                                (IXGBE_ADVTXD_DTYP_DATA |
+                                   IXGBE_ADVTXD_DCMD_DEXT |
                                    IXGBE_ADVTXD_DCMD_IFCS |
                                    IXGBE_TXD_CMD_EOP | flags) );
                        /* If the buffer has changed, unload and reload map
@@ -328,15 +333,41 @@ ring_reset:
        }
 
        /*
-        * If no packets are sent, or there is no room in the tx ring,
-        * Check whether there are completed transmissions.
-        * Because this is expensive (we need a register etc.)
-        * we only do it if absolutely necessary, i.e. there is no room
-        * in the tx ring, or where were no completed transmissions
-        * (meaning that probably the caller really wanted to check
-        * for completed transmissions).
+        * Reclaim buffers for completed transmissions.
+        * Because this is expensive (we read a NIC register etc.)
+        * we only do it in specific cases (see below).
+        * In all cases kring->nr_kflags indicates which slot will be
+        * checked upon a tx interrupt (nkr_num_slots means none).
         */
-       if (n == 0 || kring->nr_hwavail < 1) {
+       if (do_lock) {
+               j = 1; /* forced reclaim, ignore interrupts */
+               kring->nr_kflags = kring->nkr_num_slots;
+       } else if (kring->nr_hwavail > 0) {
+               j = 0; /* buffers still available: no reclaim, ignore intr. */
+               kring->nr_kflags = kring->nkr_num_slots;
+       } else {
+               /*
+                * no buffers available, locate a slot for which we request
+                * ReportStatus (approximately half ring after next_to_clean)
+                * and record it in kring->nr_kflags.
+                * If the slot has DD set, do the reclaim looking at TDH,
+                * otherwise we go to sleep (in netmap_poll()) and will be
+                * woken up when slot nr_kflags will be ready.
+                */
+               struct ixgbe_legacy_tx_desc *txd = (struct ixgbe_legacy_tx_desc *)txr->tx_base;
+
+               j = txr->next_to_clean + kring->nkr_num_slots/2;
+               if (j >= kring->nkr_num_slots)
+                       j -= kring->nkr_num_slots;
+               // round to the closest with dd set
+               j= (j < kring->nkr_num_slots / 4 || j >= kring->nkr_num_slots*3/4) ?
+                       0 : report_frequency;
+               kring->nr_kflags = j; /* the slot to check */
+               j = txd[j].upper.fields.status & IXGBE_TXD_STAT_DD;
+       }
+       if (!j) {
+               netmap_skip_txsync++;
+       } else {
                int delta;
 
                /*
@@ -391,6 +422,8 @@ ring_reset:
  * We must subtract the newly consumed slots (cur - nr_hwcur)
  * from nr_hwavail, make the descriptors available for the next reads,
  * and set kring->nr_hwcur = ring->cur and ring->avail = kring->nr_hwavail.
+ *
+ * do_lock has a special meaning: please refer to txsync.
  */
 static int
 ixgbe_netmap_rxsync(void *a, u_int ring_nr, int do_lock)
@@ -401,6 +434,7 @@ ixgbe_netmap_rxsync(void *a, u_int ring_
        struct netmap_kring *kring = &na->rx_rings[ring_nr];
        struct netmap_ring *ring = kring->ring;
        int j, k, l, n, lim = kring->nkr_num_slots - 1;
+       int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR;
 
        k = ring->cur;  /* cache and check value, same as in txsync */
        n = k - kring->nr_hwcur;
@@ -437,6 +471,7 @@ ixgbe_netmap_rxsync(void *a, u_int ring_
        if (j > lim)
                j -= lim + 1;
 
+    if (force_update) {
        for (n = 0; ; n++) {
                union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l];
                uint32_t staterr = le32toh(curr->wb.upper.status_error);
@@ -453,6 +488,8 @@ ixgbe_netmap_rxsync(void *a, u_int ring_
                rxr->next_to_check = l;
                kring->nr_hwavail += n;
        }
+       kring->nr_kflags &= ~NKR_PENDINTR;
+    }
 
        /*
         * Skip past packets that userspace has already processed

Modified: head/sys/dev/netmap/netmap.c
==============================================================================
--- head/sys/dev/netmap/netmap.c        Thu Jan 26 09:45:14 2012        (r230571)
+++ head/sys/dev/netmap/netmap.c        Thu Jan 26 09:55:16 2012        (r230572)
@@ -146,6 +146,12 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, total_
     CTLFLAG_RD, &nm_buf_pool.total_buffers, 0, "total_buffers");
 SYSCTL_INT(_dev_netmap, OID_AUTO, free_buffers,
     CTLFLAG_RD, &nm_buf_pool.free, 0, "free_buffers");
+int netmap_mitigate = 1;
+SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
+int netmap_skip_txsync;
+SYSCTL_INT(_dev_netmap, OID_AUTO, skip_txsync, CTLFLAG_RW, &netmap_skip_txsync, 0, "");
+int netmap_skip_rxsync;
+SYSCTL_INT(_dev_netmap, OID_AUTO, skip_rxsync, CTLFLAG_RW, &netmap_skip_rxsync, 0, "");
 
 /*
  * Allocate n buffers from the ring, and fill the slot.

Modified: head/sys/dev/netmap/netmap_kern.h
==============================================================================
--- head/sys/dev/netmap/netmap_kern.h   Thu Jan 26 09:45:14 2012        (r230571)
+++ head/sys/dev/netmap/netmap_kern.h   Thu Jan 26 09:55:16 2012        (r230572)
@@ -65,13 +65,14 @@ struct netmap_kring {
        struct netmap_ring *ring;
        u_int nr_hwcur;
        int nr_hwavail;
-       u_int nr_kflags;
+       u_int nr_kflags;        /* private driver flags */
+#define NKR_PENDINTR   0x1     // Pending interrupt.
        u_int nkr_num_slots;
 
        int     nkr_hwofs;      /* offset between NIC and netmap ring */
        struct netmap_adapter *na;       // debugging
        struct selinfo si; /* poll/select wait queue */
-};
+} __attribute__((__aligned__(64)));
 
 /*
  * This struct is part of and extends the 'struct adapter' (or
@@ -171,6 +172,8 @@ struct netmap_slot *netmap_reset(struct 
        enum txrx tx, int n, u_int new_cur);
 int netmap_ring_reinit(struct netmap_kring *);
 
+extern int netmap_mitigate;
+extern int netmap_skip_txsync, netmap_skip_rxsync;
 extern u_int netmap_total_buffers;
 extern char *netmap_buffer_base;
 extern int netmap_verbose;     // XXX debugging
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscribe@freebsd.org"

Reply via email to