Author: sbruno
Date: Wed Jun  3 18:01:09 2015
New Revision: 283959
URL: https://svnweb.freebsd.org/changeset/base/283959

Log:
  Change EM_MULTIQUEUE to a real kernconf entry and enable support for
  up to 2 rx/tx queues for the 82574.
  
  Program the 82574 to enable 5 msix vectors, assign 1 to each rx queue,
  1 to each tx queue and 1 to the link handler.
  
  Inspired by DragonFlyBSD, enable some RSS logic for tx queue
  handling/processing.
  
  Move multiqueue handler functions so that they line up better in a diff
  review against if_igb.c.
  
  Always enqueue tx work to be done in em_mq_start.  If unable to acquire
  the TX lock, the work will be processed in the background later by the
  taskqueue.  Remove mbuf argument from em_mq_start_locked() as the work
  is always enqueued.  (stolen from igb)
  
  Setup TARC, TXDCTL and RXDCTL registers for better performance and stability
  in multiqueue and singlequeue implementations. Handle Intel errata 3 and
  generic multiqueue behavior with the initialization of TARC(0) and TARC(1).
  
  Bind interrupt threads to cpus in order.  (stolen from igb)
  
  Add 2 new DDB functions, one to display the queue(s) and their settings and
  one to reset the adapter.  Primarily used for debugging.
  
  In the multiqueue configuration, bump RXD and TXD ring size to max for the
  adapter (4096).  Setup an RDTR of 64 and an RADV of 128 in multiqueue
  configuration to cut down on the number of interrupts.  RADV was arbitrarily
  set to 2x RDTR and can be adjusted as needed.
  
  Clean up the display in top a bit to make it clearer where the taskqueue
  threads are running and what they should be doing.
  
  Ensure that both queues are processed by em_local_timer() by writing them both
  to the IMS register to generate soft interrupts.
  
  Ensure that a soft interrupt is generated when em_msix_link() is run so that
  any races between assertion of the link/status interrupt and a rx/tx interrupt
  are handled.
  
  Document existing tunables: hw.em.eee_setting, hw.em.msix,
  hw.em.smart_pwr_down, hw.em.sbp
  
  Document use of hw.em.num_queues and the new kernel option EM_MULTIQUEUE
  
  Thanks to Intel for their continued support of FreeBSD.
  
  Reviewed by:  erj jfv hiren gnn wblock
  Obtained from:        Intel Corporation
  MFC after:    2 weeks
  Relnotes:     Yes
  Sponsored by: Limelight Networks
  Differential Revision:        https://reviews.freebsd.org/D1994

Modified:
  head/share/man/man4/em.4
  head/sys/conf/NOTES
  head/sys/conf/options
  head/sys/dev/e1000/e1000_defines.h
  head/sys/dev/e1000/if_em.c
  head/sys/dev/e1000/if_em.h
  head/sys/dev/netmap/if_em_netmap.h
  head/sys/modules/em/Makefile

Modified: head/share/man/man4/em.4
==============================================================================
--- head/share/man/man4/em.4    Wed Jun  3 17:45:45 2015        (r283958)
+++ head/share/man/man4/em.4    Wed Jun  3 18:01:09 2015        (r283959)
@@ -45,6 +45,14 @@ kernel configuration file:
 .Cd "device em"
 .Ed
 .Pp
+Optional multiqueue support is available via the following kernel
+compile options:
+.Bd -ragged -offset indent
+.Cd "options EM_MULTIQUEUE"
+.Ed
+.Pp
+Note:  Activating EM_MULTIQUEUE support is not supported by Intel.
+.Pp
 Alternatively, to load the driver as a
 module at boot time, place the following line in
 .Xr loader.conf 5 :
@@ -197,6 +205,18 @@ Tunables can be set at the
 prompt before booting the kernel or stored in
 .Xr loader.conf 5 .
 .Bl -tag -width indent
+.It Va hw.em.eee_setting
+Disable or enable Energy Efficient Ethernet.
+Default 1 (disabled).
+.It Va hw.em.msix
+Enable or disable MSI-X style interrupts.
+Default 1 (enabled).
+.It Va hw.em.smart_pwr_down
+Enable or disable smart power down features on newer adapters.
+Default 0 (disabled).
+.It Va hw.em.sbp
+Show bad packets when in promiscuous mode.
+Default 0 (off).
 .It Va hw.em.rxd
 Number of receive descriptors allocated by the driver.
 The default value is 1024 for adapters newer than 82547,
@@ -228,6 +248,11 @@ If
 .Va hw.em.tx_int_delay
 is non-zero, this tunable limits the maximum delay in which a transmit
 interrupt is generated.
+.It Va hw.em.num_queues
+Number of hardware queues that will be configured on this adapter (maximum of 2)
+Defaults to 1.
+Only valid with kernel configuration
+.Cd "options EM_MULTIQUEUE".
 .El
 .Sh FILES
 .Bl -tag -width /dev/led/em*
@@ -287,3 +312,5 @@ You can enable it on an
 .Nm
 interface using
 .Xr ifconfig 8 .
+.Pp
+Activating EM_MULTIQUEUE support requires MSI-X features.

Modified: head/sys/conf/NOTES
==============================================================================
--- head/sys/conf/NOTES Wed Jun  3 17:45:45 2015        (r283958)
+++ head/sys/conf/NOTES Wed Jun  3 18:01:09 2015        (r283959)
@@ -2980,6 +2980,9 @@ options   RANDOM_DEBUG    # Debugging messag
 # Module to enable execution of application via emulators like QEMU
 options         IMAGACT_BINMISC
 
+# Intel em(4) driver
+options                EM_MULTIQUEUE # Activate multiqueue features/disable MSI-X
+
 # zlib I/O stream support
 # This enables support for compressed core dumps.
 options        GZIO

Modified: head/sys/conf/options
==============================================================================
--- head/sys/conf/options       Wed Jun  3 17:45:45 2015        (r283958)
+++ head/sys/conf/options       Wed Jun  3 18:01:09 2015        (r283959)
@@ -940,3 +940,6 @@ RCTL                opt_global.h
 RANDOM_YARROW  opt_random.h
 RANDOM_FORTUNA opt_random.h
 RANDOM_DEBUG   opt_random.h
+
+# Intel em(4) driver
+EM_MULTIQUEUE  opt_em.h

Modified: head/sys/dev/e1000/e1000_defines.h
==============================================================================
--- head/sys/dev/e1000/e1000_defines.h  Wed Jun  3 17:45:45 2015        (r283958)
+++ head/sys/dev/e1000/e1000_defines.h  Wed Jun  3 18:01:09 2015        (r283959)
@@ -158,10 +158,12 @@
        E1000_RXDEXT_STATERR_CXE |      \
        E1000_RXDEXT_STATERR_RXE)
 
+#define E1000_MRQC_RSS_ENABLE_2Q               0x00000001
 #define E1000_MRQC_RSS_FIELD_MASK              0xFFFF0000
 #define E1000_MRQC_RSS_FIELD_IPV4_TCP          0x00010000
 #define E1000_MRQC_RSS_FIELD_IPV4              0x00020000
 #define E1000_MRQC_RSS_FIELD_IPV6_TCP_EX       0x00040000
+#define E1000_MRQC_RSS_FIELD_IPV6_EX           0x00080000
 #define E1000_MRQC_RSS_FIELD_IPV6              0x00100000
 #define E1000_MRQC_RSS_FIELD_IPV6_TCP          0x00200000
 

Modified: head/sys/dev/e1000/if_em.c
==============================================================================
--- head/sys/dev/e1000/if_em.c  Wed Jun  3 17:45:45 2015        (r283958)
+++ head/sys/dev/e1000/if_em.c  Wed Jun  3 18:01:09 2015        (r283959)
@@ -32,6 +32,8 @@
 ******************************************************************************/
 /*$FreeBSD$*/
 
+#include "opt_em.h"
+#include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
@@ -41,6 +43,10 @@
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#ifdef DDB
+#include <sys/types.h>
+#include <ddb/ddb.h>
+#endif
 #if __FreeBSD_version >= 800000
 #include <sys/buf_ring.h>
 #endif
@@ -52,6 +58,7 @@
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/rman.h>
+#include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
@@ -208,7 +215,7 @@ static int  em_resume(device_t);
 #ifdef EM_MULTIQUEUE
 static int     em_mq_start(if_t, struct mbuf *);
 static int     em_mq_start_locked(if_t,
-                   struct tx_ring *, struct mbuf *);
+                   struct tx_ring *);
 static void    em_qflush(if_t);
 #else
 static void    em_start(if_t);
@@ -299,6 +306,10 @@ static void        em_handle_tx(void *context, 
 static void    em_handle_rx(void *context, int pending);
 static void    em_handle_link(void *context, int pending);
 
+#ifdef EM_MULTIQUEUE
+static void    em_enable_vectors_82574(struct adapter *);
+#endif
+
 static void    em_set_sysctl_value(struct adapter *, const char *,
                    const char *, int *, int);
 static int     em_set_flowcntl(SYSCTL_HANDLER_ARGS);
@@ -388,6 +399,19 @@ static int em_enable_msix = TRUE;
 SYSCTL_INT(_hw_em, OID_AUTO, enable_msix, CTLFLAG_RDTUN, &em_enable_msix, 0,
     "Enable MSI-X interrupts");
 
+#ifdef EM_MULTIQUEUE
+static int em_num_queues = 1;
+SYSCTL_INT(_hw_em, OID_AUTO, num_queues, CTLFLAG_RDTUN, &em_num_queues, 0,
+    "82574 only: Number of queues to configure, 0 indicates autoconfigure");
+#endif
+
+/*
+** Global variable to store last used CPU when binding queues
+** to CPUs in igb_allocate_msix.  Starts at CPU_FIRST and increments when a
+** queue is bound to a cpu.
+*/
+static int em_last_bind_cpu = -1;
+
 /* How many packets rxeof tries to clean at a time */
 static int em_rx_process_limit = 100;
 SYSCTL_INT(_hw_em, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN,
@@ -420,10 +444,10 @@ static int
 em_probe(device_t dev)
 {
        char            adapter_name[60];
-       u16             pci_vendor_id = 0;
-       u16             pci_device_id = 0;
-       u16             pci_subvendor_id = 0;
-       u16             pci_subdevice_id = 0;
+       uint16_t        pci_vendor_id = 0;
+       uint16_t        pci_device_id = 0;
+       uint16_t        pci_subvendor_id = 0;
+       uint16_t        pci_subdevice_id = 0;
        em_vendor_info_t *ent;
 
        INIT_DEBUGOUT("em_probe: begin");
@@ -550,6 +574,11 @@ em_attach(device_t dev)
                goto err_pci;
        }
 
+       /*
+        * Setup MSI/X or MSI if PCI Express
+        */
+       adapter->msix = em_setup_msix(adapter);
+
        e1000_get_bus_info(hw);
 
        /* Set up some sysctls for the tunable interrupt delays */
@@ -880,7 +909,7 @@ em_resume(device_t dev)
                        EM_TX_LOCK(txr);
 #ifdef EM_MULTIQUEUE
                        if (!drbr_empty(ifp, txr->br))
-                               em_mq_start_locked(ifp, txr, NULL);
+                               em_mq_start_locked(ifp, txr);
 #else
                        if (!if_sendq_empty(ifp))
                                em_start_locked(ifp, txr);
@@ -894,105 +923,7 @@ em_resume(device_t dev)
 }
 
 
-#ifdef EM_MULTIQUEUE
-/*********************************************************************
- *  Multiqueue Transmit routines 
- *
- *  em_mq_start is called by the stack to initiate a transmit.
- *  however, if busy the driver can queue the request rather
- *  than do an immediate send. It is this that is an advantage
- *  in this driver, rather than also having multiple tx queues.
- **********************************************************************/
-static int
-em_mq_start_locked(if_t ifp, struct tx_ring *txr, struct mbuf *m)
-{
-       struct adapter  *adapter = txr->adapter;
-        struct mbuf     *next;
-        int             err = 0, enq = 0;
-
-       if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
-           IFF_DRV_RUNNING || adapter->link_active == 0) {
-               if (m != NULL)
-                       err = drbr_enqueue(ifp, txr->br, m);
-               return (err);
-       }
-
-       enq = 0;
-       if (m != NULL) {
-               err = drbr_enqueue(ifp, txr->br, m);
-               if (err)
-                       return (err);
-       } 
-
-       /* Process the queue */
-       while ((next = drbr_peek(ifp, txr->br)) != NULL) {
-               if ((err = em_xmit(txr, &next)) != 0) {
-                       if (next == NULL)
-                               drbr_advance(ifp, txr->br);
-                       else 
-                               drbr_putback(ifp, txr->br, next);
-                       break;
-               }
-               drbr_advance(ifp, txr->br);
-               enq++;
-               if_inc_counter(ifp, IFCOUNTER_OBYTES, next->m_pkthdr.len);
-               if (next->m_flags & M_MCAST)
-                       if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
-               if_etherbpfmtap(ifp, next);
-               if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
-                        break;
-       }
-
-       /* Mark the queue as having work */
-       if ((enq > 0) && (txr->busy == EM_TX_IDLE))
-               txr->busy = EM_TX_BUSY;
-
-       if (txr->tx_avail < EM_MAX_SCATTER)
-               em_txeof(txr);
-       if (txr->tx_avail < EM_MAX_SCATTER)
-               if_setdrvflagbits(ifp, IFF_DRV_OACTIVE,0);
-       return (err);
-}
-
-/*
-** Multiqueue capable stack interface
-*/
-static int
-em_mq_start(if_t ifp, struct mbuf *m)
-{
-       struct adapter  *adapter = if_getsoftc(ifp);
-       struct tx_ring  *txr = adapter->tx_rings;
-       int             error;
-
-       if (EM_TX_TRYLOCK(txr)) {
-               error = em_mq_start_locked(ifp, txr, m);
-               EM_TX_UNLOCK(txr);
-       } else 
-               error = drbr_enqueue(ifp, txr->br, m);
-
-       return (error);
-}
-
-/*
-** Flush all ring buffers
-*/
-static void
-em_qflush(if_t ifp)
-{
-       struct adapter  *adapter = if_getsoftc(ifp);
-       struct tx_ring  *txr = adapter->tx_rings;
-       struct mbuf     *m;
-
-       for (int i = 0; i < adapter->num_queues; i++, txr++) {
-               EM_TX_LOCK(txr);
-               while ((m = buf_ring_dequeue_sc(txr->br)) != NULL)
-                       m_freem(m);
-               EM_TX_UNLOCK(txr);
-       }
-       if_qflush(ifp);
-}
-#else  /* !EM_MULTIQUEUE */
-
+#ifndef EM_MULTIQUEUE
 static void
 em_start_locked(if_t ifp, struct tx_ring *txr)
 {
@@ -1035,7 +966,8 @@ em_start_locked(if_t ifp, struct tx_ring
                        txr->busy = EM_TX_BUSY;
 
                /* Send a copy of the frame to the BPF listener */
-               if_etherbpfmtap(ifp, m_head);
+               ETHER_BPF_MTAP(ifp, m_head);
+
        }
 
        return;
@@ -1054,6 +986,115 @@ em_start(if_t ifp)
        }
        return;
 }
+#else /* EM_MULTIQUEUE */
+/*********************************************************************
+ *  Multiqueue Transmit routines 
+ *
+ *  em_mq_start is called by the stack to initiate a transmit.
+ *  however, if busy the driver can queue the request rather
+ *  than do an immediate send. It is this that is an advantage
+ *  in this driver, rather than also having multiple tx queues.
+ **********************************************************************/
+/*
+** Multiqueue capable stack interface
+*/
+static int
+em_mq_start(if_t ifp, struct mbuf *m)
+{
+       struct adapter  *adapter = if_getsoftc(ifp);
+       struct tx_ring  *txr = adapter->tx_rings;
+       unsigned int    i, error;
+
+       if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
+               i = m->m_pkthdr.flowid % adapter->num_queues;
+       else
+               i = curcpu % adapter->num_queues;
+
+       txr = &adapter->tx_rings[i];
+
+       error = drbr_enqueue(ifp, txr->br, m);
+       if (error)
+               return (error);
+
+       if (EM_TX_TRYLOCK(txr)) {
+               em_mq_start_locked(ifp, txr);
+               EM_TX_UNLOCK(txr);
+       } else 
+               taskqueue_enqueue(txr->tq, &txr->tx_task);
+
+       return (0);
+}
+
+static int
+em_mq_start_locked(if_t ifp, struct tx_ring *txr)
+{
+       struct adapter  *adapter = txr->adapter;
+        struct mbuf     *next;
+        int             err = 0, enq = 0;
+
+       EM_TX_LOCK_ASSERT(txr);
+
+       if (((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) ||
+           adapter->link_active == 0) {
+               return (ENETDOWN);
+       }
+
+       /* Process the queue */
+       while ((next = drbr_peek(ifp, txr->br)) != NULL) {
+               if ((err = em_xmit(txr, &next)) != 0) {
+                       if (next == NULL) {
+                               /* It was freed, move forward */
+                               drbr_advance(ifp, txr->br);
+                       } else {
+                               /* 
+                                * Still have one left, it may not be
+                                * the same since the transmit function
+                                * may have changed it.
+                                */
+                               drbr_putback(ifp, txr->br, next);
+                       }
+                       break;
+               }
+               drbr_advance(ifp, txr->br);
+               enq++;
+               if_inc_counter(ifp, IFCOUNTER_OBYTES, next->m_pkthdr.len);
+               if (next->m_flags & M_MCAST)
+                       if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
+               ETHER_BPF_MTAP(ifp, next);
+               if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
+                        break;
+       }
+
+       /* Mark the queue as having work */
+       if ((enq > 0) && (txr->busy == EM_TX_IDLE))
+               txr->busy = EM_TX_BUSY;
+
+       if (txr->tx_avail < EM_MAX_SCATTER)
+               em_txeof(txr);
+       if (txr->tx_avail < EM_MAX_SCATTER) {
+               if_setdrvflagbits(ifp, IFF_DRV_OACTIVE,0);
+       }
+       return (err);
+}
+
+/*
+** Flush all ring buffers
+*/
+static void
+em_qflush(if_t ifp)
+{
+       struct adapter  *adapter = if_getsoftc(ifp);
+       struct tx_ring  *txr = adapter->tx_rings;
+       struct mbuf     *m;
+
+       for (int i = 0; i < adapter->num_queues; i++, txr++) {
+               EM_TX_LOCK(txr);
+               while ((m = buf_ring_dequeue_sc(txr->br)) != NULL)
+                       m_freem(m);
+               EM_TX_UNLOCK(txr);
+       }
+       if_qflush(ifp);
+}
 #endif /* EM_MULTIQUEUE */
 
 /*********************************************************************
@@ -1449,7 +1490,7 @@ em_poll(if_t ifp, enum poll_cmd cmd, int
        em_txeof(txr);
 #ifdef EM_MULTIQUEUE
        if (!drbr_empty(ifp, txr->br))
-               em_mq_start_locked(ifp, txr, NULL);
+               em_mq_start_locked(ifp, txr);
 #else
        if (!if_sendq_empty(ifp))
                em_start_locked(ifp, txr);
@@ -1516,14 +1557,14 @@ em_handle_que(void *context, int pending
        struct tx_ring  *txr = adapter->tx_rings;
        struct rx_ring  *rxr = adapter->rx_rings;
 
-
        if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
                bool more = em_rxeof(rxr, adapter->rx_process_limit, NULL);
+
                EM_TX_LOCK(txr);
                em_txeof(txr);
 #ifdef EM_MULTIQUEUE
                if (!drbr_empty(ifp, txr->br))
-                       em_mq_start_locked(ifp, txr, NULL);
+                       em_mq_start_locked(ifp, txr);
 #else
                if (!if_sendq_empty(ifp))
                        em_start_locked(ifp, txr);
@@ -1557,11 +1598,12 @@ em_msix_tx(void *arg)
        em_txeof(txr);
 #ifdef EM_MULTIQUEUE
        if (!drbr_empty(ifp, txr->br))
-               em_mq_start_locked(ifp, txr, NULL);
+               em_mq_start_locked(ifp, txr);
 #else
        if (!if_sendq_empty(ifp))
                em_start_locked(ifp, txr);
 #endif
+
        /* Reenable this interrupt */
        E1000_WRITE_REG(&adapter->hw, E1000_IMS, txr->ims);
        EM_TX_UNLOCK(txr);
@@ -1587,9 +1629,10 @@ em_msix_rx(void *arg)
        more = em_rxeof(rxr, adapter->rx_process_limit, NULL);
        if (more)
                taskqueue_enqueue(rxr->tq, &rxr->rx_task);
-       else
+       else {
                /* Reenable this interrupt */
                E1000_WRITE_REG(&adapter->hw, E1000_IMS, rxr->ims);
+       }
        return;
 }
 
@@ -1616,6 +1659,16 @@ em_msix_link(void *arg)
        } else
                E1000_WRITE_REG(&adapter->hw, E1000_IMS,
                    EM_MSIX_LINK | E1000_IMS_LSC);
+       /*
+       ** Because we must read the ICR for this interrupt
+       ** it may clear other causes using autoclear, for
+       ** this reason we simply create a soft interrupt
+       ** for all these vectors.
+       */
+       if (reg_icr) {
+               E1000_WRITE_REG(&adapter->hw,
+                       E1000_ICS, adapter->ims);
+       }
        return;
 }
 
@@ -1629,9 +1682,10 @@ em_handle_rx(void *context, int pending)
        more = em_rxeof(rxr, adapter->rx_process_limit, NULL);
        if (more)
                taskqueue_enqueue(rxr->tq, &rxr->rx_task);
-       else
+       else {
                /* Reenable this interrupt */
                E1000_WRITE_REG(&adapter->hw, E1000_IMS, rxr->ims);
+       }
 }
 
 static void
@@ -1645,7 +1699,7 @@ em_handle_tx(void *context, int pending)
        em_txeof(txr);
 #ifdef EM_MULTIQUEUE
        if (!drbr_empty(ifp, txr->br))
-               em_mq_start_locked(ifp, txr, NULL);
+               em_mq_start_locked(ifp, txr);
 #else
        if (!if_sendq_empty(ifp))
                em_start_locked(ifp, txr);
@@ -1675,7 +1729,7 @@ em_handle_link(void *context, int pendin
                        EM_TX_LOCK(txr);
 #ifdef EM_MULTIQUEUE
                        if (!drbr_empty(ifp, txr->br))
-                               em_mq_start_locked(ifp, txr, NULL);
+                               em_mq_start_locked(ifp, txr);
 #else
                        if (if_sendq_empty(ifp))
                                em_start_locked(ifp, txr);
@@ -2219,7 +2273,7 @@ em_local_timer(void *arg)
        if_t ifp = adapter->ifp;
        struct tx_ring  *txr = adapter->tx_rings;
        struct rx_ring  *rxr = adapter->rx_rings;
-       u32             trigger;
+       u32             trigger = 0;
 
        EM_CORE_LOCK_ASSERT(adapter);
 
@@ -2232,9 +2286,11 @@ em_local_timer(void *arg)
                e1000_rar_set(&adapter->hw, adapter->hw.mac.addr, 0);
 
        /* Mask to use in the irq trigger */
-       if (adapter->msix_mem)
-               trigger = rxr->ims;
-       else
+       if (adapter->msix_mem) {
+               for (int i = 0; i < adapter->num_queues; i++, rxr++)
+                       trigger |= rxr->ims;
+               rxr = adapter->rx_rings;
+       } else
                trigger = E1000_ICS_RXDMT0;
 
        /*
@@ -2243,7 +2299,6 @@ em_local_timer(void *arg)
        ** and the HUNG state will be static if set.
        */
        for (int i = 0; i < adapter->num_queues; i++, txr++) {
-               /* Last cycle a queue was declared hung */
                if (txr->busy == EM_TX_HUNG)
                        goto hung;
                if (txr->busy >= EM_TX_MAXTRIES)
@@ -2261,14 +2316,9 @@ em_local_timer(void *arg)
        return;
 hung:
        /* Looks like we're hung */
-       device_printf(adapter->dev, "Watchdog timeout -- resetting\n");
-       device_printf(adapter->dev,
-           "Queue(%d) tdh = %d, hw tdt = %d\n", txr->me,
-           E1000_READ_REG(&adapter->hw, E1000_TDH(txr->me)),
-           E1000_READ_REG(&adapter->hw, E1000_TDT(txr->me)));
-       device_printf(adapter->dev,"TX(%d) desc avail = %d,"
-           "Next TX to Clean = %d\n",
-           txr->me, txr->tx_avail, txr->next_to_clean);
+       device_printf(adapter->dev, "Watchdog timeout Queue[%d]-- resetting\n",
+                       txr->me);
+       em_print_debug_info(adapter);
        if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
        adapter->watchdog_events++;
        em_init_locked(adapter);
@@ -2320,7 +2370,7 @@ em_update_link_status(struct adapter *ad
                    (hw->mac.type == e1000_82572))) {
                        int tarc0;
                        tarc0 = E1000_READ_REG(hw, E1000_TARC(0));
-                       tarc0 &= ~SPEED_MODE_BIT;
+                       tarc0 &= ~TARC_SPEED_MODE_BIT;
                        E1000_WRITE_REG(hw, E1000_TARC(0), tarc0);
                }
                if (bootverbose)
@@ -2436,14 +2486,6 @@ em_allocate_pci_resources(struct adapter
            rman_get_bushandle(adapter->memory);
        adapter->hw.hw_addr = (u8 *)&adapter->osdep.mem_bus_space_handle;
 
-       /* Default to a single queue */
-       adapter->num_queues = 1;
-
-       /*
-        * Setup MSI/X or MSI if PCI Express
-        */
-       adapter->msix = em_setup_msix(adapter);
-
        adapter->hw.back = &adapter->osdep;
 
        return (0);
@@ -2518,13 +2560,14 @@ em_allocate_msix(struct adapter *adapter
        struct          tx_ring *txr = adapter->tx_rings;
        struct          rx_ring *rxr = adapter->rx_rings;
        int             error, rid, vector = 0;
+       int             cpu_id = 0;
 
 
        /* Make sure all interrupts are disabled */
        E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff);
 
        /* First set up ring resources */
-       for (int i = 0; i < adapter->num_queues; i++, txr++, rxr++) {
+       for (int i = 0; i < adapter->num_queues; i++, rxr++, vector++) {
 
                /* RX ring */
                rid = vector + 1;
@@ -2544,14 +2587,20 @@ em_allocate_msix(struct adapter *adapter
                        return (error);
                }
 #if __FreeBSD_version >= 800504
-               bus_describe_intr(dev, rxr->res, rxr->tag, "rx %d", i);
+               bus_describe_intr(dev, rxr->res, rxr->tag, "rx%d", i);
 #endif
-               rxr->msix = vector++; /* NOTE increment vector for TX */
+               rxr->msix = vector;
+
+               if (em_last_bind_cpu < 0)
+                       em_last_bind_cpu = CPU_FIRST();
+               cpu_id = em_last_bind_cpu;
+               bus_bind_intr(dev, rxr->res, cpu_id);
+
                TASK_INIT(&rxr->rx_task, 0, em_handle_rx, rxr);
                rxr->tq = taskqueue_create_fast("em_rxq", M_NOWAIT,
                    taskqueue_thread_enqueue, &rxr->tq);
-               taskqueue_start_threads(&rxr->tq, 1, PI_NET, "%s rxq",
-                   device_get_nameunit(adapter->dev));
+               taskqueue_start_threads(&rxr->tq, 1, PI_NET, "%s rxq (cpuid %d)",
+                   device_get_nameunit(adapter->dev), cpu_id);
                /*
                ** Set the bit to enable interrupt
                ** in E1000_IMS -- bits 20 and 21
@@ -2559,8 +2608,13 @@ em_allocate_msix(struct adapter *adapter
                ** NOTHING to do with the MSIX vector
                */
                rxr->ims = 1 << (20 + i);
+               adapter->ims |= rxr->ims;
                adapter->ivars |= (8 | rxr->msix) << (i * 4);
 
+               em_last_bind_cpu = CPU_NEXT(em_last_bind_cpu);
+       }
+
+       for (int i = 0; i < adapter->num_queues; i++, txr++, vector++) {
                /* TX ring */
                rid = vector + 1;
                txr->res = bus_alloc_resource_any(dev,
@@ -2578,14 +2632,20 @@ em_allocate_msix(struct adapter *adapter
                        return (error);
                }
 #if __FreeBSD_version >= 800504
-               bus_describe_intr(dev, txr->res, txr->tag, "tx %d", i);
+               bus_describe_intr(dev, txr->res, txr->tag, "tx%d", i);
 #endif
-               txr->msix = vector++; /* Increment vector for next pass */
+               txr->msix = vector;
+
+                if (em_last_bind_cpu < 0)
+                        em_last_bind_cpu = CPU_FIRST();
+                cpu_id = em_last_bind_cpu;
+                bus_bind_intr(dev, txr->res, cpu_id);
+
                TASK_INIT(&txr->tx_task, 0, em_handle_tx, txr);
                txr->tq = taskqueue_create_fast("em_txq", M_NOWAIT,
                    taskqueue_thread_enqueue, &txr->tq);
-               taskqueue_start_threads(&txr->tq, 1, PI_NET, "%s txq",
-                   device_get_nameunit(adapter->dev));
+               taskqueue_start_threads(&txr->tq, 1, PI_NET, "%s txq (cpuid %d)",
+                   device_get_nameunit(adapter->dev), cpu_id);
                /*
                ** Set the bit to enable interrupt
                ** in E1000_IMS -- bits 22 and 23
@@ -2593,13 +2653,16 @@ em_allocate_msix(struct adapter *adapter
                ** NOTHING to do with the MSIX vector
                */
                txr->ims = 1 << (22 + i);
+               adapter->ims |= txr->ims;
                adapter->ivars |= (8 | txr->msix) << (8 + (i * 4));
+
+               em_last_bind_cpu = CPU_NEXT(em_last_bind_cpu);
        }
 
        /* Link interrupt */
-       ++rid;
+       rid = vector + 1;
        adapter->res = bus_alloc_resource_any(dev,
-           SYS_RES_IRQ, &rid, RF_ACTIVE);
+           SYS_RES_IRQ, &rid, RF_SHAREABLE | RF_ACTIVE);
        if (!adapter->res) {
                device_printf(dev,"Unable to allocate "
                    "bus resource: Link interrupt [%d]\n", rid);
@@ -2615,7 +2678,7 @@ em_allocate_msix(struct adapter *adapter
                return (error);
        }
 #if __FreeBSD_version >= 800504
-               bus_describe_intr(dev, adapter->res, adapter->tag, "link");
+       bus_describe_intr(dev, adapter->res, adapter->tag, "link");
 #endif
        adapter->linkvec = vector;
        adapter->ivars |=  (8 | vector) << 16;
@@ -2639,9 +2702,8 @@ em_free_pci_resources(struct adapter *ad
        */
        for (int i = 0; i < adapter->num_queues; i++) {
                txr = &adapter->tx_rings[i];
-               rxr = &adapter->rx_rings[i];
                /* an early abort? */
-               if ((txr == NULL) || (rxr == NULL))
+               if (txr == NULL)
                        break;
                rid = txr->msix +1;
                if (txr->tag != NULL) {
@@ -2651,6 +2713,11 @@ em_free_pci_resources(struct adapter *ad
                if (txr->res != NULL)
                        bus_release_resource(dev, SYS_RES_IRQ,
                            rid, txr->res);
+
+               rxr = &adapter->rx_rings[i];
+               /* an early abort? */
+               if (rxr == NULL)
+                       break;
                rid = rxr->msix +1;
                if (rxr->tag != NULL) {
                        bus_teardown_intr(dev, rxr->res, rxr->tag);
@@ -2700,14 +2767,19 @@ em_setup_msix(struct adapter *adapter)
        device_t dev = adapter->dev;
        int val;
 
+       /* Nearly always going to use one queue */
+       adapter->num_queues = 1;
+
        /*
-       ** Setup MSI/X for Hartwell: tests have shown
-       ** use of two queues to be unstable, and to
-       ** provide no great gain anyway, so we simply
-       ** seperate the interrupts and use a single queue.
+       ** Try using MSI-X for Hartwell adapters
        */
        if ((adapter->hw.mac.type == e1000_82574) &&
            (em_enable_msix == TRUE)) {
+#ifdef EM_MULTIQUEUE
+               adapter->num_queues = (em_num_queues == 1) ? 1 : 2;
+               if (adapter->num_queues > 1)
+                       em_enable_vectors_82574(adapter);
+#endif
                /* Map the MSIX BAR */
                int rid = PCIR_BAR(EM_MSIX_BAR);
                adapter->msix_mem = bus_alloc_resource_any(dev,
@@ -2719,16 +2791,34 @@ em_setup_msix(struct adapter *adapter)
                        goto msi;
                        }
                val = pci_msix_count(dev); 
-               /* We only need/want 3 vectors */
-               if (val >= 3)
-                       val = 3;
-               else {
-                               device_printf(adapter->dev,
-                           "MSIX: insufficient vectors, using MSI\n");
-                       goto msi;
+
+#ifdef EM_MULTIQUEUE
+               /* We need 5 vectors in the multiqueue case */
+               if (adapter->num_queues > 1 ) {
+                       if (val >= 5)
+                               val = 5;
+                       else {
+                               adapter->num_queues = 1;
+                               device_printf(adapter->dev,
+                                   "Insufficient MSIX vectors for >1 queue, "
+                                   "using single queue...\n");
+                               goto msix_one;
+                       }
+               } else {
+msix_one:
+#endif
+                       if (val >= 3)
+                               val = 3;
+                       else {
+                               device_printf(adapter->dev,
+                               "Insufficient MSIX vectors, using MSI\n");
+                               goto msi;
+                       }
+#ifdef EM_MULTIQUEUE
                }
+#endif
 
-               if ((pci_alloc_msix(dev, &val) == 0) && (val == 3)) {
+               if ((pci_alloc_msix(dev, &val) == 0)) {
                        device_printf(adapter->dev,
                            "Using MSIX interrupts "
                            "with %d vectors\n", val);
@@ -2749,7 +2839,7 @@ msi:
        }
                val = 1;
                if (pci_alloc_msi(dev, &val) == 0) {
-                       device_printf(adapter->dev,"Using an MSI interrupt\n");
+                       device_printf(adapter->dev, "Using an MSI interrupt\n");
                return (val);
        } 
        /* Should only happen due to manual configuration */
@@ -3394,7 +3484,7 @@ em_initialize_transmit_unit(struct adapt
 {
        struct tx_ring  *txr = adapter->tx_rings;
        struct e1000_hw *hw = &adapter->hw;
-       u32     tctl, tarc, tipg = 0;
+       u32     tctl, txdctl = 0, tarc, tipg = 0;
 
         INIT_DEBUGOUT("em_initialize_transmit_unit: begin");
 
@@ -3416,6 +3506,15 @@ em_initialize_transmit_unit(struct adapt
                    E1000_READ_REG(&adapter->hw, E1000_TDLEN(i)));
 
                txr->busy = EM_TX_IDLE;
+               txdctl = 0; /* clear txdctl */
+                txdctl |= 0x1f; /* PTHRESH */
+                txdctl |= 1 << 8; /* HTHRESH */
+                txdctl |= 1 << 16;/* WTHRESH */
+               txdctl |= 1 << 22; /* Reserved bit 22 must always be 1 */
+               txdctl |= E1000_TXDCTL_GRAN;
+                txdctl |= 1 << 25; /* LWTHRESH */
+
+                E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
        }
 
        /* Set the default values for the Tx Inter Packet Gap timer */
@@ -3446,15 +3545,25 @@ em_initialize_transmit_unit(struct adapt
        if ((adapter->hw.mac.type == e1000_82571) ||
            (adapter->hw.mac.type == e1000_82572)) {
                tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(0));
-               tarc |= SPEED_MODE_BIT;
+               tarc |= TARC_SPEED_MODE_BIT;
                E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc);
        } else if (adapter->hw.mac.type == e1000_80003es2lan) {
+               /* errata: program both queues to unweighted RR */
                tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(0));
                tarc |= 1;
                E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc);
                tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(1));
                tarc |= 1;
                E1000_WRITE_REG(&adapter->hw, E1000_TARC(1), tarc);
+       } else if (adapter->hw.mac.type == e1000_82574) {
+               tarc = E1000_READ_REG(&adapter->hw, E1000_TARC(0));
+               tarc |= TARC_ERRATA_BIT;
+               if ( adapter->num_queues > 1) {
+                       tarc |= (TARC_COMPENSATION_MODE | TARC_MQ_FIX);
+                       E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc);
+                       E1000_WRITE_REG(&adapter->hw, E1000_TARC(1), tarc);
+               } else
+                       E1000_WRITE_REG(&adapter->hw, E1000_TARC(0), tarc);
        }
 
        adapter->txd_cmd = E1000_TXD_CMD_IFCS;
@@ -3885,8 +3994,9 @@ em_txeof(struct tx_ring *txr)
         * TX lock which, with a single queue, guarantees 
         * sanity.
          */
-        if (txr->tx_avail >= EM_MAX_SCATTER)
+        if (txr->tx_avail >= EM_MAX_SCATTER) {
                if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
+       }
 
        /* Disable hang detection if all clean */
        if (txr->tx_avail == adapter->num_tx_desc)
@@ -4258,6 +4368,9 @@ em_initialize_receive_unit(struct adapte
 
        E1000_WRITE_REG(&adapter->hw, E1000_RADV,
            adapter->rx_abs_int_delay.value);
+
+       E1000_WRITE_REG(&adapter->hw, E1000_RDTR,
+           adapter->rx_int_delay.value);
        /*
         * Set the interrupt throttling rate. Value is calculated
         * as DEFAULT_ITR = 1/(MAX_INTS_PER_SEC * 256ns)
@@ -4269,20 +4382,65 @@ em_initialize_receive_unit(struct adapte
        ** using the EITR register (82574 only)
        */
        if (hw->mac.type == e1000_82574) {
+               u32 rfctl;
                for (int i = 0; i < 4; i++)
                        E1000_WRITE_REG(hw, E1000_EITR_82574(i),
                            DEFAULT_ITR);
                /* Disable accelerated acknowledge */
-               E1000_WRITE_REG(hw, E1000_RFCTL, E1000_RFCTL_ACK_DIS);
+               rfctl = E1000_READ_REG(hw, E1000_RFCTL);
+               rfctl |= E1000_RFCTL_ACK_DIS;
+               E1000_WRITE_REG(hw, E1000_RFCTL, rfctl);
        }
 
        rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
-       if (if_getcapenable(ifp) & IFCAP_RXCSUM)
+       if (if_getcapenable(ifp) & IFCAP_RXCSUM) {
+#ifdef EM_MULTIQUEUE
+               rxcsum |= E1000_RXCSUM_TUOFL |
+                         E1000_RXCSUM_IPOFL |
+                         E1000_RXCSUM_PCSD;
+#else
                rxcsum |= E1000_RXCSUM_TUOFL;
-       else
+#endif
+       } else
                rxcsum &= ~E1000_RXCSUM_TUOFL;
+
        E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
 
+#ifdef EM_MULTIQUEUE
+       if (adapter->num_queues > 1) {
+               uint32_t rss_key[10];
+               uint32_t reta;
+               int i;
+
+               /*
+               * Configure RSS key
+               */
+               arc4rand(rss_key, sizeof(rss_key), 0);
+               for (i = 0; i < 10; ++i)
+                       E1000_WRITE_REG_ARRAY(hw,E1000_RSSRK(0), i, rss_key[i]);
+
+               /*
+               * Configure RSS redirect table in following fashion:
+               * (hash & ring_cnt_mask) == rdr_table[(hash & rdr_table_mask)]
+               */
+               reta = 0;
+               for (i = 0; i < 4; ++i) {
+                       uint32_t q;
+                       q = (i % adapter->num_queues) << 7;
+                       reta |= q << (8 * i);
+               }
+               for (i = 0; i < 32; ++i)
+                       E1000_WRITE_REG(hw, E1000_RETA(i), reta);
+
+               E1000_WRITE_REG(hw, E1000_MRQC, E1000_MRQC_RSS_ENABLE_2Q | 
+                               E1000_MRQC_RSS_FIELD_IPV4_TCP |
+                               E1000_MRQC_RSS_FIELD_IPV4 |
+                               E1000_MRQC_RSS_FIELD_IPV6_TCP_EX |
+                               E1000_MRQC_RSS_FIELD_IPV6_EX |
+                               E1000_MRQC_RSS_FIELD_IPV6 |
+                               E1000_MRQC_RSS_FIELD_IPV6_TCP);
+       }
+#endif
        /*
        ** XXX TEMPORARY WORKAROUND: on some systems with 82573
        ** long latencies are observed, like Lenovo X60. This
@@ -4317,13 +4475,30 @@ em_initialize_receive_unit(struct adapte
                E1000_WRITE_REG(hw, E1000_RDT(i), rdt);
        }
 
-       /* Set PTHRESH for improved jumbo performance */
+       /*
+        * Set PTHRESH for improved jumbo performance
+        * According to 10.2.5.11 of Intel 82574 Datasheet,
+        * RXDCTL(1) is written whenever RXDCTL(0) is written.
+        * Only write to RXDCTL(1) if there is a need for different
+        * settings.
+        */
        if (((adapter->hw.mac.type == e1000_ich9lan) ||
            (adapter->hw.mac.type == e1000_pch2lan) ||
            (adapter->hw.mac.type == e1000_ich10lan)) &&
            (if_getmtu(ifp) > ETHERMTU)) {
                u32 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(0));
                E1000_WRITE_REG(hw, E1000_RXDCTL(0), rxdctl | 3);
+       } else if ((adapter->hw.mac.type == e1000_82574) &&
+                 (if_getmtu(ifp) > ETHERMTU)) {
+               for (int i = 0; i < adapter->num_queues; i++) {
+                       u32 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
+
+                       rxdctl |= 0x20; /* PTHRESH */
+                       rxdctl |= 4 << 8; /* HTHRESH */
+                       rxdctl |= 4 << 16;/* WTHRESH */
+                       rxdctl |= 1 << 24; /* Switch to granularity */
+                       E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
+               }
        }
                
        if (adapter->hw.mac.type >= e1000_pch2lan) {
@@ -4390,6 +4565,11 @@ em_rxeof(struct rx_ring *rxr, int count,
 
        EM_RX_LOCK(rxr);
 
+       /* Sync the ring */
+       bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
+           BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+
+
 #ifdef DEV_NETMAP
        if (netmap_rx_irq(ifp, rxr->me, &processed)) {
                EM_RX_UNLOCK(rxr);
@@ -4402,9 +4582,6 @@ em_rxeof(struct rx_ring *rxr, int count,
                if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
                        break;
 
-               bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to