Hi Vladimir and Jack,

I have ported Vladimir's 1.16 revision of their driver to the -CURRENT
code as of today, but I don't have a box that is suitable for testing
right now: I just moved, and the server I use for FreeBSD coding is
located several thousand miles away :-)

I hope that this will be useful for adoption into the official em(4)
driver, and thanks to Vladimir and Yandex for their work on this.

Cheers,
-- 
Xin LI <[EMAIL PROTECTED]>      http://www.delphij.net/
FreeBSD - The Power to Serve!
Index: e1000_defines.h
===================================================================
RCS file: /home/ncvs/src/sys/dev/em/e1000_defines.h,v
retrieving revision 1.3
diff -u -p -r1.3 e1000_defines.h
--- e1000_defines.h     16 May 2007 00:14:23 -0000      1.3
+++ e1000_defines.h     3 Oct 2007 21:36:07 -0000
@@ -746,7 +746,6 @@
  */
 #define IMS_ENABLE_MASK ( \
     E1000_IMS_RXT0   |    \
-    E1000_IMS_TXDW   |    \
     E1000_IMS_RXDMT0 |    \
     E1000_IMS_RXSEQ  |    \
     E1000_IMS_LSC)
Index: if_em.c
===================================================================
RCS file: /home/ncvs/src/sys/dev/em/if_em.c,v
retrieving revision 1.184
diff -u -p -r1.184 if_em.c
--- if_em.c     10 Sep 2007 21:50:40 -0000      1.184
+++ if_em.c     3 Oct 2007 21:41:12 -0000
@@ -240,14 +240,16 @@ static void       em_initialize_transmit_unit(
 static int     em_setup_receive_structures(struct adapter *);
 static void    em_initialize_receive_unit(struct adapter *);
 static void    em_enable_intr(struct adapter *);
+static void    em_enable_intr_rx(struct adapter *);
 static void    em_disable_intr(struct adapter *);
+static void    em_disable_intr_rx(struct adapter *);
 static void    em_free_transmit_structures(struct adapter *);
 static void    em_free_receive_structures(struct adapter *);
 static void    em_update_stats_counters(struct adapter *);
 static void    em_txeof(struct adapter *);
 static int     em_allocate_receive_structures(struct adapter *);
 static int     em_allocate_transmit_structures(struct adapter *);
-static int     em_rxeof(struct adapter *, int);
+static int     em_rxeof(struct adapter *, int, int);
 #ifndef __NO_STRICT_ALIGNMENT
 static int     em_fixup_rx(struct adapter *);
 #endif
@@ -292,14 +294,19 @@ static void     em_get_hw_control(struct
 static void     em_release_hw_control(struct adapter *);
 static void     em_enable_wakeup(device_t);
 
+
+/*
+ * Fast interrupt handler and legacy ithread/polling modes are
+ * mutually exclusive.
+ */
 #ifdef DEVICE_POLLING
 static poll_handler_t em_poll;
 static void    em_intr(void *);
 #else
+static void    em_add_int_rx_kthread_priority(struct adapter *, const char *,
+               const char *, int *, int);
 static int     em_intr_fast(void *);
-static void    em_add_rx_process_limit(struct adapter *, const char *,
-                   const char *, int *, int);
-static void    em_handle_rxtx(void *context, int pending);
+static void    em_kthread_rx(void *arg);
 static void    em_handle_link(void *context, int pending);
 #endif
 
@@ -351,9 +358,8 @@ TUNABLE_INT("hw.em.rxd", &em_rxd);
 TUNABLE_INT("hw.em.txd", &em_txd);
 TUNABLE_INT("hw.em.smart_pwr_down", &em_smart_pwr_down);
 #ifndef DEVICE_POLLING
-/* How many packets rxeof tries to clean at a time */
-static int em_rx_process_limit = 100;
-TUNABLE_INT("hw.em.rx_process_limit", &em_rx_process_limit);
+static int em_rx_kthread_priority = PRI_MAX_KERN;
+TUNABLE_INT("hw.em.rx_kthread_priority", &em_rx_kthread_priority);
 #endif
 /* Global used in WOL setup with multiport cards */
 static int global_quad_port_a = 0;
@@ -370,7 +376,7 @@ static int global_quad_port_a = 0;
 static int
 em_probe(device_t dev)
 {
-       char            adapter_name[60];
+       char            adapter_name[1024];     /* XXX why? */
        uint16_t        pci_vendor_id = 0;
        uint16_t        pci_device_id = 0;
        uint16_t        pci_subvendor_id = 0;
@@ -431,7 +437,8 @@ em_attach(device_t dev)
 
        adapter = device_get_softc(dev);
        adapter->dev = adapter->osdep.dev = dev;
-       EM_LOCK_INIT(adapter, device_get_nameunit(dev));
+       EM_RXLOCK_INIT(adapter, device_get_nameunit(dev));
+       EM_TXLOCK_INIT(adapter, device_get_nameunit(dev));
 
        /* SYSCTL stuff */
        SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
@@ -444,8 +451,8 @@ em_attach(device_t dev)
            OID_AUTO, "stats", CTLTYPE_INT|CTLFLAG_RW, adapter, 0,
            em_sysctl_stats, "I", "Statistics");
 
-       callout_init_mtx(&adapter->timer, &adapter->mtx, 0);
-       callout_init_mtx(&adapter->tx_fifo_timer, &adapter->mtx, 0);
+       callout_init_mtx(&adapter->timer, &adapter->txmtx, 0);
+       callout_init_mtx(&adapter->tx_fifo_timer, &adapter->txmtx, 0);
 
        /* Determine hardware and mac info */
        em_identify_hardware(adapter);
@@ -506,10 +513,10 @@ em_attach(device_t dev)
        }
 
 #ifndef DEVICE_POLLING
-       /* Sysctls for limiting the amount of work done in the taskqueue */
-       em_add_rx_process_limit(adapter, "rx_processing_limit",
-           "max number of rx packets to process", &adapter->rx_process_limit,
-           em_rx_process_limit);
+       /* Sysctls for set the RX kthreads' priority */
+       em_add_int_rx_kthread_priority(adapter, "rx_kthread_priority",
+           "priority of RX handler kthread", &adapter->rx_kthread_priority,
+           em_rx_kthread_priority);
 #endif
 
        /*
@@ -517,25 +524,14 @@ em_attach(device_t dev)
         * must not exceed hardware maximum, and must be multiple
         * of E1000_DBA_ALIGN.
         */
-       if (((em_txd * sizeof(struct e1000_tx_desc)) % EM_DBA_ALIGN) != 0 ||
-           (adapter->hw.mac.type >= e1000_82544 && em_txd > EM_MAX_TXD) ||
-           (adapter->hw.mac.type < e1000_82544 && em_txd > EM_MAX_TXD_82543) ||
-           (em_txd < EM_MIN_TXD)) {
-               device_printf(dev, "Using %d TX descriptors instead of %d!\n",
-                   EM_DEFAULT_TXD, em_txd);
-               adapter->num_tx_desc = EM_DEFAULT_TXD;
-       } else
-               adapter->num_tx_desc = em_txd;
-       if (((em_rxd * sizeof(struct e1000_rx_desc)) % EM_DBA_ALIGN) != 0 ||
-           (adapter->hw.mac.type >= e1000_82544 && em_rxd > EM_MAX_RXD) ||
-           (adapter->hw.mac.type < e1000_82544 && em_rxd > EM_MAX_RXD_82543) ||
-           (em_rxd < EM_MIN_RXD)) {
-               device_printf(dev, "Using %d RX descriptors instead of %d!\n",
-                   EM_DEFAULT_RXD, em_rxd);
-               adapter->num_rx_desc = EM_DEFAULT_RXD;
-       } else
-               adapter->num_rx_desc = em_rxd;
-
+       if (adapter->hw.mac.type >= e1000_82544) {
+               adapter->num_tx_desc = EM_MAX_TXD;
+               adapter->num_rx_desc = EM_MAX_RXD;
+       } else {
+               adapter->num_tx_desc = EM_MAX_TXD_82543;
+               adapter->num_rx_desc = EM_MAX_RXD_82543;
+       }
+       
        adapter->hw.mac.autoneg = DO_AUTO_NEG;
        adapter->hw.phy.wait_for_link = FALSE;
        adapter->hw.phy.autoneg_advertised = AUTONEG_ADV_DEFAULT;
@@ -736,7 +732,9 @@ err_tx_desc:
 err_pci:
        em_free_intr(adapter);
        em_free_pci_resources(adapter);
-       EM_LOCK_DESTROY(adapter);
+       /* XXX */
+       EM_TXLOCK_DESTROY(adapter);
+       EM_RXLOCK_DESTROY(adapter);
 
        return (error);
 }
@@ -766,7 +764,8 @@ em_detach(device_t dev)
 
        em_disable_intr(adapter);
        em_free_intr(adapter);
-       EM_LOCK(adapter);
+       EM_RXLOCK(adapter);
+       EM_TXLOCK(adapter);
        adapter->in_detach = 1;
        em_stop(adapter);
        e1000_phy_hw_reset(&adapter->hw);
@@ -785,7 +784,8 @@ em_detach(device_t dev)
                em_enable_wakeup(dev);
        }
 
-       EM_UNLOCK(adapter);
+       EM_TXUNLOCK(adapter);
+       EM_RXUNLOCK(adapter);
        ether_ifdetach(adapter->ifp);
 
        callout_drain(&adapter->timer);
@@ -811,7 +811,8 @@ em_detach(device_t dev)
                adapter->rx_desc_base = NULL;
        }
 
-       EM_LOCK_DESTROY(adapter);
+       EM_TXLOCK_DESTROY(adapter);
+       EM_RXLOCK_DESTROY(adapter);
 
        return (0);
 }
@@ -836,7 +837,8 @@ em_suspend(device_t dev)
 {
        struct adapter *adapter = device_get_softc(dev);
 
-       EM_LOCK(adapter);
+       EM_RXLOCK(adapter);
+       EM_TXLOCK(adapter);
        em_stop(adapter);
 
         em_release_manageability(adapter);
@@ -853,7 +855,8 @@ em_suspend(device_t dev)
                 em_enable_wakeup(dev);
         }
 
-       EM_UNLOCK(adapter);
+       EM_TXUNLOCK(adapter);
+       EM_RXUNLOCK(adapter);
 
        return bus_generic_suspend(dev);
 }
@@ -864,7 +867,8 @@ em_resume(device_t dev)
        struct adapter *adapter = device_get_softc(dev);
        struct ifnet *ifp = adapter->ifp;
 
-       EM_LOCK(adapter);
+       EM_RXLOCK(adapter);
+       EM_TXLOCK(adapter);
        em_init_locked(adapter);
        em_init_manageability(adapter);
 
@@ -872,7 +876,8 @@ em_resume(device_t dev)
            (ifp->if_drv_flags & IFF_DRV_RUNNING))
                em_start_locked(ifp);
 
-       EM_UNLOCK(adapter);
+       EM_TXUNLOCK(adapter);
+       EM_RXUNLOCK(adapter);
 
        return bus_generic_resume(dev);
 }
@@ -894,7 +899,7 @@ em_start_locked(struct ifnet *ifp)
        struct adapter  *adapter = ifp->if_softc;
        struct mbuf     *m_head;
 
-       EM_LOCK_ASSERT(adapter);
+       EM_TXLOCK_ASSERT(adapter);
 
        if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
            IFF_DRV_RUNNING)
@@ -906,7 +911,7 @@ em_start_locked(struct ifnet *ifp)
 
                IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
                if (m_head == NULL)
-                       break;
+                       continue;
                /*
                 *  Encapsulation can modify our pointer, and or make it
                 *  NULL on failure.  In that event, we can't requeue.
@@ -926,7 +931,12 @@ em_start_locked(struct ifnet *ifp)
                ETHER_BPF_MTAP(ifp, m_head);
 
                /* Set timeout in case hardware has problems transmitting. */
-               adapter->watchdog_timer = EM_TX_TIMEOUT;
+               adapter->tx_counter ++;
+       }
+
+       if (adapter->num_tx_desc - adapter->num_tx_desc_avail > 32) {
+               /* it's time to clean a little bit */
+               em_txeof (adapter);
        }
 }
 
@@ -935,10 +945,10 @@ em_start(struct ifnet *ifp)
 {
        struct adapter *adapter = ifp->if_softc;
 
-       EM_LOCK(adapter);
+       EM_TXLOCK(adapter);
        if (ifp->if_drv_flags & IFF_DRV_RUNNING)
                em_start_locked(ifp);
-       EM_UNLOCK(adapter);
+       EM_TXUNLOCK(adapter);
 }
 
 /*********************************************************************
@@ -973,9 +983,11 @@ em_ioctl(struct ifnet *ifp, u_long comma
                         */
                        ifp->if_flags |= IFF_UP;
                        if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
-                               EM_LOCK(adapter);
+                               EM_RXLOCK(adapter);
+                               EM_TXLOCK(adapter);
                                em_init_locked(adapter);
-                               EM_UNLOCK(adapter);
+                               EM_TXUNLOCK(adapter);
+                               EM_RXUNLOCK(adapter);
                        }
                        arp_ifinit(ifp, ifa);
                } else
@@ -988,7 +1000,8 @@ em_ioctl(struct ifnet *ifp, u_long comma
 
                IOCTL_DEBUGOUT("ioctl rcv'd: SIOCSIFMTU (Set Interface MTU)");
 
-               EM_LOCK(adapter);
+               EM_RXLOCK(adapter);
+               EM_TXLOCK(adapter);
                switch (adapter->hw.mac.type) {
                case e1000_82573:
                        /*
@@ -1019,7 +1032,8 @@ em_ioctl(struct ifnet *ifp, u_long comma
                }
                if (ifr->ifr_mtu > max_frame_size - ETHER_HDR_LEN -
                    ETHER_CRC_LEN) {
-                       EM_UNLOCK(adapter);
+                       EM_TXUNLOCK(adapter);
+                       EM_RXUNLOCK(adapter);
                        error = EINVAL;
                        break;
                }
@@ -1028,13 +1042,15 @@ em_ioctl(struct ifnet *ifp, u_long comma
                adapter->hw.mac.max_frame_size =
                ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN;
                em_init_locked(adapter);
-               EM_UNLOCK(adapter);
+               EM_TXUNLOCK(adapter);
+               EM_RXUNLOCK(adapter);
                break;
            }
        case SIOCSIFFLAGS:
                IOCTL_DEBUGOUT("ioctl rcv'd:\
                    SIOCSIFFLAGS (Set Interface Flags)");
-               EM_LOCK(adapter);
+               EM_RXLOCK(adapter);
+               EM_TXLOCK(adapter);
                if (ifp->if_flags & IFF_UP) {
                        if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) {
                                if ((ifp->if_flags ^ adapter->if_flags) &
@@ -1048,13 +1064,15 @@ em_ioctl(struct ifnet *ifp, u_long comma
                        if (ifp->if_drv_flags & IFF_DRV_RUNNING)
                                em_stop(adapter);
                adapter->if_flags = ifp->if_flags;
-               EM_UNLOCK(adapter);
+               EM_TXUNLOCK(adapter);
+               EM_RXUNLOCK(adapter);
                break;
        case SIOCADDMULTI:
        case SIOCDELMULTI:
                IOCTL_DEBUGOUT("ioctl rcv'd: SIOC(ADD|DEL)MULTI");
                if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
-                       EM_LOCK(adapter);
+                       EM_RXLOCK(adapter);
+                       EM_TXLOCK(adapter);
                        em_disable_intr(adapter);
                        em_set_multi(adapter);
                        if (adapter->hw.mac.type == e1000_82542 && 
@@ -1065,19 +1083,23 @@ em_ioctl(struct ifnet *ifp, u_long comma
                        if (!(ifp->if_capenable & IFCAP_POLLING))
 #endif
                                em_enable_intr(adapter);
-                       EM_UNLOCK(adapter);
+                       EM_TXUNLOCK(adapter);
+                       EM_RXUNLOCK(adapter);
                }
                break;
        case SIOCSIFMEDIA:
                /* Check SOL/IDER usage */
-               EM_LOCK(adapter);
+               EM_RXLOCK(adapter);
+               EM_TXLOCK(adapter);
                if (e1000_check_reset_block(&adapter->hw)) {
-                       EM_UNLOCK(adapter);
+                       EM_TXUNLOCK(adapter);
+                       EM_RXUNLOCK(adapter);
                        device_printf(adapter->dev, "Media change is"
                            " blocked due to SOL/IDER session.\n");
                        break;
                }
-               EM_UNLOCK(adapter);
+               EM_TXUNLOCK(adapter);
+               EM_RXUNLOCK(adapter);
        case SIOCGIFMEDIA:
                IOCTL_DEBUGOUT("ioctl rcv'd: \
                    SIOCxIFMEDIA (Get/Set Interface Media)");
@@ -1096,17 +1118,21 @@ em_ioctl(struct ifnet *ifp, u_long comma
                                error = ether_poll_register(em_poll, ifp);
                                if (error)
                                        return (error);
-                               EM_LOCK(adapter);
+                               EM_RXLOCK(adapter);
+                               EM_TXLOCK(adapter);
                                em_disable_intr(adapter);
                                ifp->if_capenable |= IFCAP_POLLING;
-                               EM_UNLOCK(adapter);
+                               EM_TXUNLOCK(adapter);
+                               EM_RXUNLOCK(adapter);
                        } else {
                                error = ether_poll_deregister(ifp);
                                /* Enable interrupt even in error case */
-                               EM_LOCK(adapter);
+                               EM_RXLOCK(adapter);
+                               EM_TXLOCK(adapter);
                                em_enable_intr(adapter);
                                ifp->if_capenable &= ~IFCAP_POLLING;
-                               EM_UNLOCK(adapter);
+                               EM_TXUNLOCK(adapter);
+                               EM_RXUNLOCK(adapter);
                        }
                }
 #endif
@@ -1149,29 +1175,49 @@ static void
 em_watchdog(struct adapter *adapter)
 {
 
-       EM_LOCK_ASSERT(adapter);
+       EM_TXLOCK_ASSERT(adapter);
 
-       /*
-       ** The timer is set to 5 every time start queues a packet.
-       ** Then txeof keeps resetting to 5 as long as it cleans at
-       ** least one descriptor.
-       ** Finally, anytime all descriptors are clean the timer is
-       ** set to 0.
-       */
-       if (adapter->watchdog_timer == 0 || --adapter->watchdog_timer)
-               return;
+       if (E1000_READ_REG(&adapter->hw, E1000_TDH) ==
+           E1000_READ_REG(&adapter->hw, E1000_TDT)) {
+               /* TX queue is clean. Nothing to wait */
+               adapter->tx_counter_watchdog_mark = 0;
+       }
 
        /* If we are in this routine because of pause frames, then
         * don't reset the hardware.
         */
        if (E1000_READ_REG(&adapter->hw, E1000_STATUS) &
            E1000_STATUS_TXOFF) {
-               adapter->watchdog_timer = EM_TX_TIMEOUT;
+               /* XOFF received */
+               adapter->tx_counter_watchdog_mark = 0;
+               return;
+       }
+
+       if (!adapter->tx_counter_watchdog_mark) {
+               /* watchdog isn't started yet, let's do it */
+               adapter->tx_counter_watchdog_mark = adapter->tx_counter;
+               adapter->tx_tdh_watchdog_mark = E1000_READ_REG(&adapter->hw, 
E1000_TDH);
+               return;
+       }
+
+       if (adapter->tx_counter - adapter->tx_counter_watchdog_mark >= 
adapter->num_tx_desc) {
+               /* TX ring has been wrapped, clean watchdog condition */
+               adapter->tx_counter_watchdog_mark = 0;
                return;
        }
 
-       if (e1000_check_for_link(&adapter->hw) == 0)
+       if (adapter->tx_tdh_watchdog_mark != E1000_READ_REG(&adapter->hw, 
E1000_TDH)) {
+               /* Something were sent */
+               adapter->tx_counter_watchdog_mark = 0;
+               return;
+       }
+
+       if (e1000_check_for_link(&adapter->hw) == 0) {
                device_printf(adapter->dev, "watchdog timeout -- resetting\n");
+               em_print_hw_stats(adapter);
+               em_print_debug_info(adapter);
+       }
+
        adapter->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
        adapter->watchdog_events++;
 
@@ -1198,7 +1244,8 @@ em_init_locked(struct adapter *adapter)
 
        INIT_DEBUGOUT("em_init: begin");
 
-       EM_LOCK_ASSERT(adapter);
+       EM_RXLOCK_ASSERT(adapter);
+       EM_TXLOCK_ASSERT(adapter);
 
        em_stop(adapter);
 
@@ -1337,9 +1384,11 @@ em_init(void *arg)
 {
        struct adapter *adapter = arg;
 
-       EM_LOCK(adapter);
+       EM_RXLOCK(adapter);
+       EM_TXLOCK(adapter);
        em_init_locked(adapter);
-       EM_UNLOCK(adapter);
+       EM_TXUNLOCK(adapter);
+       EM_RXUNLOCK(adapter);
 }
 
 
@@ -1355,9 +1404,11 @@ em_poll(struct ifnet *ifp, enum poll_cmd
        struct adapter *adapter = ifp->if_softc;
        uint32_t reg_icr;
 
-       EM_LOCK(adapter);
+       EM_RXLOCK(adapter);
+       EM_TXLOCK(adapter);
        if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
-               EM_UNLOCK(adapter);
+               EM_TXUNLOCK(adapter);
+               EM_RXUNLOCK(adapter);
                return;
        }
 
@@ -1372,12 +1423,13 @@ em_poll(struct ifnet *ifp, enum poll_cmd
                            em_local_timer, adapter);
                }
        }
-       em_rxeof(adapter, count);
+       em_rxeof(adapter, count, 0);
        em_txeof(adapter);
 
        if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
                em_start_locked(ifp);
-       EM_UNLOCK(adapter);
+       EM_TXUNLOCK(adapter);
+       EM_RXUNLOCK(adapter);
 }
 
 /*********************************************************************
@@ -1393,11 +1445,11 @@ em_intr(void *arg)
        struct ifnet    *ifp;
        uint32_t        reg_icr;
 
-       EM_LOCK(adapter);
+       /* XXX EM_LOCK(adapter); */
        ifp = adapter->ifp;
 
        if (ifp->if_capenable & IFCAP_POLLING) {
-               EM_UNLOCK(adapter);
+               /* EM_UNLOCK(adapter); */
                return;
        }
 
@@ -1419,29 +1471,35 @@ em_intr(void *arg)
                if (reg_icr == 0xffffffff)
                        break;
 
-               if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
-                       em_rxeof(adapter, -1);
-                       em_txeof(adapter);
-               }
-
                /* Link status change */
                if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
+                       EM_RXLOCK(adapter);
+                       EM_TXLOCK(adapter);
                        callout_stop(&adapter->timer);
                        adapter->hw.mac.get_link_status = 1;
                        e1000_check_for_link(&adapter->hw);
                        em_update_link_status(adapter);
                        callout_reset(&adapter->timer, hz,
                            em_local_timer, adapter);
+                       EM_TXUNLOCK(adapter);
+                       EM_RXUNLOCK(adapter);
+               }
+               if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+                       if (reg_icr & 
(E1000_ICR_RXDMT0|E1000_ICR_RXO|E1000_ICR_RXT0)) {
+                               EM_RXLOCK(adapter);
+                               em_rxeof(adapter, -1,0);
+                               EM_RXUNLOCK(adapter);
+                       }
+                       if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
+                               EM_TXLOCK(adapter);
+                               em_start_locked(ifp);
+                               EM_TXUNLOCK(adapter);
+                       }
                }
 
                if (reg_icr & E1000_ICR_RXO)
                        adapter->rx_overruns++;
        }
-
-       if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
-           !IFQ_DRV_IS_EMPTY(&ifp->if_snd))
-               em_start_locked(ifp);
-       EM_UNLOCK(adapter);
 }
 
 #else /* if not DEVICE_POLLING, then fast interrupt routines only */
@@ -1454,9 +1512,11 @@ em_handle_link(void *context, int pendin
 
        ifp = adapter->ifp;
 
-       EM_LOCK(adapter);
+       EM_RXLOCK(adapter);
+       EM_TXLOCK(adapter);
        if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
-               EM_UNLOCK(adapter);
+               EM_TXUNLOCK(adapter);
+               EM_RXUNLOCK(adapter);
                return;
        }
 
@@ -1465,33 +1525,37 @@ em_handle_link(void *context, int pendin
        e1000_check_for_link(&adapter->hw);
        em_update_link_status(adapter);
        callout_reset(&adapter->timer, hz, em_local_timer, adapter);
-       EM_UNLOCK(adapter);
+
+       wakeup (&adapter->rxmtx);
+       wakeup (&adapter->txmtx);
+       EM_TXUNLOCK(adapter);
+       EM_RXUNLOCK(adapter);
 }
 
 static void
-em_handle_rxtx(void *context, int pending)
+em_kthread_rx(void *arg)
 {
-       struct adapter  *adapter = context;
-       struct ifnet    *ifp;
+       struct adapter  *adapter = arg;
+       struct ifnet    *ifp = adapter->ifp;
+       int myKthreadNo = 0;
 
-       ifp = adapter->ifp;
+       EM_RXLOCK(adapter);
+       myKthreadNo = adapter -> rxKthreadNo ++;
+       adapter -> rxIpBeingProcessed[myKthreadNo] = 0;
+       adapter -> waitedBy[myKthreadNo] = 0;
+       EM_RXUNLOCK(adapter);
 
-       /*
-        * TODO:
-        * It should be possible to run the tx clean loop without the lock.
-        */
-       if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
-               if (em_rxeof(adapter, adapter->rx_process_limit) != 0)
-                       taskqueue_enqueue(adapter->tq, &adapter->rxtx_task);
-               EM_LOCK(adapter);
-               em_txeof(adapter);
-
-               if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
-                       em_start_locked(ifp);
-               EM_UNLOCK(adapter);
+       while (!adapter->rx_shutdown_flag) {
+               tsleep(&adapter->rxmtx, adapter->rx_kthread_priority, "em_rx", 
hz);
+               if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+                       EM_RXLOCK(adapter);
+                       em_rxeof(adapter,-1, myKthreadNo);
+                       EM_RXUNLOCK(adapter);
+               }
+               em_enable_intr_rx(adapter);
        }
 
-       em_enable_intr(adapter);
+       kthread_exit(0);
 }
 
 /*********************************************************************
@@ -1526,13 +1590,17 @@ em_intr_fast(void *arg)
            (reg_icr & E1000_ICR_INT_ASSERTED) == 0)
                return (FILTER_STRAY);
 
-       /*
-        * Mask interrupts until the taskqueue is finished running.  This is
-        * cheap, just assume that it is needed.  This also works around the
-        * MSI message reordering errata on certain systems.
-        */
-       em_disable_intr(adapter);
-       taskqueue_enqueue(adapter->tq, &adapter->rxtx_task);
+       if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+               if (reg_icr & (E1000_ICR_RXDMT0|E1000_ICR_RXO|E1000_ICR_RXT0)) {
+                       /*
+                        * Mask interrupts until the taskqueue is finished 
running.  This is
+                        * cheap, just assume that it is needed.  This also 
works around the
+                        * MSI message reordering errata on certain systems.
+                        */
+                       em_disable_intr_rx (adapter);
+                       wakeup (&adapter->rxmtx);
+               }
+       }
 
        /* Link status change */
        if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC))
@@ -1560,7 +1628,8 @@ em_media_status(struct ifnet *ifp, struc
 
        INIT_DEBUGOUT("em_media_status: begin");
 
-       EM_LOCK(adapter);
+       EM_RXLOCK(adapter);
+       EM_TXLOCK(adapter);
        e1000_check_for_link(&adapter->hw);
        em_update_link_status(adapter);
 
@@ -1568,7 +1637,8 @@ em_media_status(struct ifnet *ifp, struc
        ifmr->ifm_active = IFM_ETHER;
 
        if (!adapter->link_active) {
-               EM_UNLOCK(adapter);
+               EM_TXUNLOCK(adapter);
+               EM_RXUNLOCK(adapter);
                return;
        }
 
@@ -1596,7 +1666,8 @@ em_media_status(struct ifnet *ifp, struc
                else
                        ifmr->ifm_active |= IFM_HDX;
        }
-       EM_UNLOCK(adapter);
+       EM_TXUNLOCK(adapter);
+       EM_RXUNLOCK(adapter);
 }
 
 /*********************************************************************
@@ -1618,7 +1689,8 @@ em_media_change(struct ifnet *ifp)
        if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
                return (EINVAL);
 
-       EM_LOCK(adapter);
+       EM_RXLOCK(adapter);
+       EM_TXLOCK(adapter);
        switch (IFM_SUBTYPE(ifm->ifm_media)) {
        case IFM_AUTO:
                adapter->hw.mac.autoneg = DO_AUTO_NEG;
@@ -1656,7 +1728,8 @@ em_media_change(struct ifnet *ifp)
        adapter->hw.phy.reset_disable = FALSE;
 
        em_init_locked(adapter);
-       EM_UNLOCK(adapter);
+       EM_TXUNLOCK(adapter);
+       EM_RXUNLOCK(adapter);
 
        return (0);
 }
@@ -2130,7 +2203,8 @@ em_82547_move_tail(void *arg)
        uint16_t length = 0;
        boolean_t eop = 0;
 
-       EM_LOCK_ASSERT(adapter);
+       EM_RXLOCK_ASSERT(adapter);
+       EM_TXLOCK_ASSERT(adapter);
 
        hw_tdt = E1000_READ_REG(&adapter->hw, E1000_TDT);
        sw_tdt = adapter->next_avail_tx_desc;
@@ -2337,7 +2411,8 @@ em_local_timer(void *arg)
        struct adapter  *adapter = arg;
        struct ifnet    *ifp = adapter->ifp;
 
-       EM_LOCK_ASSERT(adapter);
+       EM_RXLOCK(adapter);
+       EM_TXLOCK(adapter);
 
        e1000_check_for_link(&adapter->hw);
        em_update_link_status(adapter);
@@ -2359,6 +2434,9 @@ em_local_timer(void *arg)
        em_watchdog(adapter);
 
        callout_reset(&adapter->timer, hz, em_local_timer, adapter);
+
+       EM_TXUNLOCK(adapter);
+       EM_RXUNLOCK(adapter);
 }
 
 static void
@@ -2419,7 +2497,8 @@ em_stop(void *arg)
        struct adapter  *adapter = arg;
        struct ifnet    *ifp = adapter->ifp;
 
-       EM_LOCK_ASSERT(adapter);
+       EM_RXLOCK_ASSERT(adapter);
+       EM_TXLOCK_ASSERT(adapter);
 
        INIT_DEBUGOUT("em_stop: begin");
 
@@ -2606,19 +2685,22 @@ em_allocate_intr(struct adapter *adapter
         * Try allocating a fast interrupt and the associated deferred
         * processing contexts.
         */
-       TASK_INIT(&adapter->rxtx_task, 0, em_handle_rxtx, adapter);
-       TASK_INIT(&adapter->link_task, 0, em_handle_link, adapter);
-       adapter->tq = taskqueue_create_fast("em_taskq", M_NOWAIT,
-           taskqueue_thread_enqueue, &adapter->tq);
-       taskqueue_start_threads(&adapter->tq, 1, PI_NET, "%s taskq",
-           device_get_nameunit(adapter->dev));
+       TASK_INIT(&adapter->link_task, INTR_TYPE_NET | INTR_MPSAFE, 
em_handle_link, adapter);
+
+       adapter->rx_shutdown_flag=FALSE;
+       adapter->rxKthreadNo=0;
+       adapter->reorder_cnt=0;
+       for (int i = 0; i < RX_KTHREADS_NUM; i++) {
+               adapter->rx_kthreads_handles[i] = NULL;
+               kthread_create (em_kthread_rx, adapter, 
adapter->rx_kthreads_handles + i, 
+                       INTR_TYPE_NET | INTR_FAST | INTR_MPSAFE, 0, 
"%s_rx_kthread_%d",device_get_nameunit(dev),i);
+       }
+
        if ((error = bus_setup_intr(dev, adapter->res_interrupt,
-           INTR_TYPE_NET, em_intr_fast, NULL, adapter,
+           INTR_TYPE_NET | INTR_FAST | INTR_MPSAFE, em_intr_fast, NULL, 
adapter,
            &adapter->int_handler_tag)) != 0) {
                device_printf(dev, "Failed to register fast interrupt "
                            "handler: %d\n", error);
-               taskqueue_free(adapter->tq);
-               adapter->tq = NULL;
                return (error);
        }
 #endif 
@@ -2637,11 +2719,12 @@ em_free_intr(struct adapter *adapter)
                        adapter->int_handler_tag);
                adapter->int_handler_tag = NULL;
        }
-       if (adapter->tq != NULL) {
-               taskqueue_drain(adapter->tq, &adapter->rxtx_task);
-               taskqueue_drain(taskqueue_fast, &adapter->link_task);
-               taskqueue_free(adapter->tq);
-               adapter->tq = NULL;
+       taskqueue_drain(taskqueue_fast, &adapter->link_task);
+
+       adapter->rx_shutdown_flag=TRUE;
+       for (int i = 0; i < RX_KTHREADS_NUM; i++) {
+               if (adapter->rx_kthreads_handles[i])
+                       tsleep(adapter->rx_kthreads_handles[i], 0, "RXSTOP", 
3*hz);
        }
 }
 
@@ -3138,7 +3221,7 @@ em_initialize_transmit_unit(struct adapt
        E1000_WRITE_REG(&adapter->hw, E1000_TIDV, adapter->tx_int_delay.value);
        if(adapter->hw.mac.type >= e1000_82540)
                E1000_WRITE_REG(&adapter->hw, E1000_TADV,
-                   adapter->tx_abs_int_delay.value);
+                   EM_USECS_TO_TICKS(adapter->tx_abs_int_delay.value));
 
        if ((adapter->hw.mac.type == e1000_82571) ||
            (adapter->hw.mac.type == e1000_82572)) {
@@ -3364,6 +3447,10 @@ em_transmit_checksum_setup(struct adapte
 
        adapter->num_tx_desc_avail--;
        adapter->next_avail_tx_desc = curr_txd;
+
+       adapter->tx_counter=0;
+       adapter->tx_counter_watchdog_mark=0;
+       adapter->tx_tdh_watchdog_mark=0;
 }
 
 /**********************************************************************
@@ -3736,7 +3823,7 @@ em_txeof(struct adapter *adapter)
         struct e1000_tx_desc   *tx_desc, *eop_desc;
        struct ifnet   *ifp = adapter->ifp;
 
-       EM_LOCK_ASSERT(adapter);
+       EM_TXLOCK_ASSERT(adapter);
 
         if (adapter->num_tx_desc_avail == adapter->num_tx_desc)
                 return;
@@ -3809,15 +3896,8 @@ em_txeof(struct adapter *adapter)
          * If there are no pending descriptors, clear the timeout. Otherwise,
          * if some descriptors have been freed, restart the timeout.
          */
-        if (num_avail > EM_TX_CLEANUP_THRESHOLD) {                
+        if (num_avail > EM_TX_CLEANUP_THRESHOLD)
                 ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
-               /* All clean, turn off the timer */
-                if (num_avail == adapter->num_tx_desc)
-                       adapter->watchdog_timer = 0;
-               /* Some cleaned, reset the timer */
-                else if (num_avail != adapter->num_tx_desc_avail)
-                       adapter->watchdog_timer = EM_TX_TIMEOUT;
-        }
         adapter->num_tx_desc_avail = num_avail;
         return;
 }
@@ -4144,7 +4224,7 @@ em_free_receive_structures(struct adapte
  *
  *********************************************************************/
 static int
-em_rxeof(struct adapter *adapter, int count)
+em_rxeof(struct adapter *adapter, int count, int myKthreadNo)
 {
        struct ifnet    *ifp;
        struct mbuf     *mp;
@@ -4298,15 +4378,57 @@ discard:
                if (++i == adapter->num_rx_desc)
                        i = 0;
                if (m != NULL) {
+                       struct ip *ip = mtod(m, struct ip *);
+
                        adapter->next_rx_desc_to_check = i;
-#ifdef DEVICE_POLLING
-                       EM_UNLOCK(adapter);
-                       (*ifp->if_input)(ifp, m);
-                       EM_LOCK(adapter);
-#else
-                       /* Already running unlocked */
+
+                       /*
+                        * Trick to avoid reorder:
+                        *
+                        * Don't allow change order of tcp packets
+                        * in same session.  In order to make this
+                        * easier, we will not allow to process packets
+                        * from one same source with more than one CPU.
+                        */
+                       int hlen = ip->ip_hl << 2;
+                       if (hlen >= sizeof(struct ip)) { /* minimum header length */
+                               adapter -> rxIpBeingProcessed[myKthreadNo]=ip->ip_src.s_addr;
+
+                               if (ip->ip_src.s_addr)
+                                       for (int k=0; k < RX_KTHREADS_NUM; k++) {
+                                               if ((adapter->rxIpBeingProcessed[k] == ip->ip_src.s_addr)
+                                                    && !adapter->waitedBy[k]) {
+                                                       /*
+                                                        * Packet from the same IP is being processed
+                                                        * by another thread, wait until that was done.
+                                                        */
+                                                       adapter->reorder_cnt++; 
+                                                       adapter->waitedBy[k] = myKthreadNo;
+                                                       msleep(adapter->rxIpBeingProcessed+k,
+                                                               &adapter->rxmtx,
+                                                               adapter->rx_kthread_priority,
+                                                               "RORDER", -1);
+                                               }
+                               }
+                       } else 
+                               ip = NULL;
+
+                       EM_RXUNLOCK(adapter);
+
                        (*ifp->if_input)(ifp, m);
-#endif
+
+                       EM_RXLOCK(adapter);
+
+                       adapter->rxIpBeingProcessed[myKthreadNo]=0;
+
+                       if (adapter->waitedBy[myKthreadNo]) {
+                               /*
+                                * Wakeup threads blocking on our packet process
+                                * procedure due to the reorder prevention check
+                                */
+                               wakeup(adapter->rxIpBeingProcessed+myKthreadNo);
+                               adapter->waitedBy[myKthreadNo] = 0;
+                       }
                        i = adapter->next_rx_desc_to_check;
                }
                current_desc = &adapter->rx_desc_base[i];
@@ -4438,6 +4560,18 @@ em_disable_intr(struct adapter *adapter)
        E1000_WRITE_REG(&adapter->hw, E1000_IMC, 0xffffffff);
 }
 
+static void
+em_enable_intr_rx(struct adapter *adapter)
+{
+       E1000_WRITE_REG(&adapter->hw, E1000_IMS, E1000_IMS_RXT0 | E1000_IMS_RXDMT0 | E1000_IMS_RXO);
+}
+
+static void
+em_disable_intr_rx(struct adapter *adapter)
+{
+       E1000_WRITE_REG(&adapter->hw, E1000_IMC, E1000_IMS_RXT0 | E1000_IMS_RXDMT0 | E1000_IMS_RXO);
+}
+
 /*
  * Bit of a misnomer, what this really means is
  * to enable OS management of the system... aka
@@ -4878,6 +5012,8 @@ em_print_debug_info(struct adapter *adap
            adapter->dropped_pkts);
        device_printf(dev, "Driver tx dma failure in encap = %ld\n",
                adapter->no_tx_dma_setup);
+       device_printf(dev, "Packets pended due to reorder = %ld\n",
+               adapter->reorder_cnt);
 }
 
 static void
@@ -4996,7 +5132,8 @@ em_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
 
        adapter = info->adapter;
        
-       EM_LOCK(adapter);
+       EM_RXLOCK(adapter);
+       EM_TXLOCK(adapter);
        regval = E1000_READ_OFFSET(&adapter->hw, info->offset);
        regval = (regval & ~0xffff) | (ticks & 0xffff);
        /* Handle a few special cases. */
@@ -5014,7 +5151,8 @@ em_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
                break;
        }
        E1000_WRITE_OFFSET(&adapter->hw, info->offset, regval);
-       EM_UNLOCK(adapter);
+       EM_TXUNLOCK(adapter);
+       EM_RXUNLOCK(adapter);
        return (0);
 }
 
@@ -5034,7 +5172,7 @@ em_add_int_delay_sysctl(struct adapter *
 
 #ifndef DEVICE_POLLING
 static void
-em_add_rx_process_limit(struct adapter *adapter, const char *name,
+em_add_int_rx_kthread_priority(struct adapter *adapter, const char *name,
        const char *description, int *limit, int value)
 {
        *limit = value;
Index: if_em.h
===================================================================
RCS file: /home/ncvs/src/sys/dev/em/if_em.h,v
retrieving revision 1.62
diff -u -p -r1.62 if_em.h
--- if_em.h     10 Sep 2007 21:50:40 -0000      1.62
+++ if_em.h     3 Oct 2007 21:35:44 -0000
@@ -82,7 +82,7 @@ POSSIBILITY OF SUCH DAMAGE.
  *   system is reporting dropped transmits, this value may be set too high
  *   causing the driver to run out of available transmit descriptors.
  */
-#define EM_TIDV                         64
+#define EM_TIDV                         65535
 
 /*
  * EM_TADV - Transmit Absolute Interrupt Delay Value
@@ -96,7 +96,7 @@ POSSIBILITY OF SUCH DAMAGE.
  *   along with EM_TIDV, may improve traffic throughput in specific
  *   network conditions.
  */
-#define EM_TADV                         64
+#define EM_TADV                         65535
 
 /*
  * EM_RDTR - Receive Interrupt Delay Timer (Packet Timer)
@@ -130,12 +130,12 @@ POSSIBILITY OF SUCH DAMAGE.
  *   along with EM_RDTR, may improve traffic throughput in specific network
  *   conditions.
  */
-#define EM_RADV                         64
+#define EM_RADV                         977
 
 /*
  * This parameter controls the duration of transmit watchdog timer.
  */
-#define EM_TX_TIMEOUT                   5    /* set to 5 seconds */
+#define EM_TX_TIMEOUT                   2    /* set to 2 seconds */
 
 /*
  * This parameter controls when the driver calls the routine to reclaim
@@ -270,15 +270,31 @@ struct adapter {
        struct ifmedia  media;
        struct callout  timer;
        struct callout  tx_fifo_timer;
-       int             watchdog_timer;
+
+       unsigned        tx_counter;
+       unsigned        tx_counter_watchdog_mark;
+       unsigned        tx_tdh_watchdog_mark;
+
        int             io_rid;
        int             msi;
        int             if_flags;
-       struct mtx      mtx;
        int             em_insert_vlan_header;
+       
+       /* RX/TX locks */
+       struct mtx      rxmtx;
+       struct mtx      txmtx;
+
        struct task     link_task;
-       struct task     rxtx_task;
-       struct taskqueue *tq;           /* private task queue */
+
+#define        RX_KTHREADS_NUM 2
+       struct proc     *rx_kthreads_handles[RX_KTHREADS_NUM];
+       int             rx_shutdown_flag;
+
+       in_addr_t       rxIpBeingProcessed[RX_KTHREADS_NUM];
+       int             waitedBy[RX_KTHREADS_NUM];
+       int             rxKthreadNo;
+       unsigned long   reorder_cnt;
+
        /* Management and WOL features */
        int             wol;
        int             has_manage;
@@ -333,7 +349,7 @@ struct adapter {
        uint32_t                next_rx_desc_to_check;
        uint32_t                rx_buffer_len;
        uint16_t                num_rx_desc;
-       int                     rx_process_limit;
+       int                     rx_kthread_priority;
        struct em_buffer        *rx_buffer_area;
        bus_dma_tag_t           rxtag;
        bus_dmamap_t            rx_sparemap;
@@ -413,11 +429,20 @@ typedef struct _DESCRIPTOR_PAIR
        uint32_t   elements;
 } DESC_ARRAY, *PDESC_ARRAY;
 
-#define        EM_LOCK_INIT(_sc, _name) \
-       mtx_init(&(_sc)->mtx, _name, MTX_NETWORK_LOCK, MTX_DEF)
-#define        EM_LOCK_DESTROY(_sc)    mtx_destroy(&(_sc)->mtx)
-#define        EM_LOCK(_sc)            mtx_lock(&(_sc)->mtx)
-#define        EM_UNLOCK(_sc)          mtx_unlock(&(_sc)->mtx)
-#define        EM_LOCK_ASSERT(_sc)     mtx_assert(&(_sc)->mtx, MA_OWNED)
+#define        EM_RXLOCK_INIT(_sc, _name) \
+       mtx_init(&(_sc)->rxmtx, _name, MTX_NETWORK_LOCK, MTX_DEF)
+#define        EM_RXLOCK_DESTROY(_sc)  mtx_destroy(&(_sc)->rxmtx)
+#define        EM_RXLOCK(_sc)          mtx_lock(&(_sc)->rxmtx)
+#define        EM_RXTRYLOCK(_sc)       mtx_trylock(&(_sc)->rxmtx)
+#define        EM_RXUNLOCK(_sc)        mtx_unlock(&(_sc)->rxmtx)
+#define        EM_RXLOCK_ASSERT(_sc)   mtx_assert(&(_sc)->rxmtx, MA_OWNED)
+
+#define        EM_TXLOCK_INIT(_sc, _name) \
+       mtx_init(&(_sc)->txmtx, _name, MTX_NETWORK_LOCK, MTX_DEF)
+#define        EM_TXLOCK_DESTROY(_sc)  mtx_destroy(&(_sc)->txmtx)
+#define        EM_TXLOCK(_sc)          mtx_lock(&(_sc)->txmtx)
+#define        EM_TXTRYLOCK(_sc)       mtx_trylock(&(_sc)->txmtx)
+#define        EM_TXUNLOCK(_sc)        mtx_unlock(&(_sc)->txmtx)
+#define        EM_TXLOCK_ASSERT(_sc)   mtx_assert(&(_sc)->txmtx, MA_OWNED)
 
 #endif /* _EM_H_DEFINED_ */

Attachment: signature.asc
Description: OpenPGP digital signature

Reply via email to