David S. Miller wrote:
From: Eric Dumazet <[EMAIL PROTECTED]>
Date: Wed, 24 Aug 2005 01:10:44 +0200


Looking at tg3_tx() more closely, I am not convinced it really needs
to lock tp->tx_lock during the loop.  tp->tx_cons (sw_idx) is written
in this function only, and could be made an atomic_t.

The tx_lock would be needed for the final

if (netif_queue_stopped(tp->dev) &&
    (TX_BUFFS_AVAIL(tp) > TG3_TX_WAKEUP_THRESH))
        netif_wake_queue(tp->dev);


If you're going to take the tx_lock anyway, there's no need to make
tx_cons atomic in any way.  Just update it while holding that lock
at the end.

dev->poll() executes atomically, and that helps a lot here.

Please toy around with some of these ideas and let us know
what works well in your testing.

Thanks.



Hi David

I played with these ideas and have very good results with the following patch.

# tc -s -d qdisc show dev eth0 ; sleep 10 ; tc -s -d qdisc show dev eth0
qdisc pfifo_fast 0: bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
Sent 440173135511 bytes 3211378293 pkt (dropped 240817, overlimits 0 requeues 27028035)
 backlog 0b 0p requeues 27028035
qdisc pfifo_fast 0: bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
Sent 440216655667 bytes 3211730812 pkt (dropped 240904, overlimits 0 requeues 27031668)
 backlog 0b 0p requeues 27031668

(That is 27031668 - 27028035 = 3633 requeues over the 10 second interval, so about 360 requeues per second, much better than before (12000 / second))

oprofile results give:
0.6257 %    qdisc_restart  (instead of 2.6452 % before)

This patch has three parts:

- [TG3] : the tx_lock spinlock is taken in tg3_tx() only when really needed.

TX completion (tg3_tx) is also moved after RX completion: better to first take care of incoming frames, then worry about re-enabling transmit. A condensed sketch of the resulting locking follows.
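
In short, tg3_tx() now looks like this (a simplified sketch of the result, with the unmap/free details reduced to a comment; the real change is in the patch below):

	while (sw_idx != hw_idx) {
		/* unmap and free the completed skb for this slot.
		 * tx_cons is written by this function only, and
		 * dev->poll() executes atomically, so the loop can
		 * run without tx_lock. */
		sw_idx = NEXT_TX(sw_idx);
	}

	/* only the final index update and the wakeup test, which
	 * race with the xmit path, still need tx_lock */
	spin_lock(&tp->tx_lock);
	tp->tx_cons = sw_idx;
	if (netif_queue_stopped(tp->dev) &&
	    (TX_BUFFS_AVAIL(tp) > TG3_TX_WAKEUP_THRESH))
		netif_wake_queue(tp->dev);
	spin_unlock(&tp->tx_lock);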

- [NET] : add a spin_lock_prefetch(&dev->queue_lock) in dev_queue_xmit(), since the lock is going to be taken shortly afterwards; prefetching its cache line early hides part of the acquisition cost (see the sketch just below).
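
Sketched, the intent is simply this (nothing else changes; the snippet only shows where the prefetch sits relative to the later lock acquisition):

	#include <linux/prefetch.h>

	/* begin pulling queue_lock's cache line towards this CPU while
	 * the checksum/fragment handling is still running, so the
	 * later spin_lock() finds the line (mostly) local */
	spin_lock_prefetch(&dev->queue_lock);

	/* ... local_bh_disable(), qdisc lookup, etc. ... */

	spin_lock(&dev->queue_lock);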

- [NET] : try to reorder some hot fields of struct net_device and place them on separate cache lines in SMP, to lower memory bouncing between multiple CPUs accessing the device (a small illustration follows the list):

        - One part is mostly used on the receive path (including eth_type_trans())
         (poll_list, poll, quota, weight, last_rx, dev_addr, broadcast)

        - One part is mostly used on the queue transmit path (qdisc)
         (queue_lock, qdisc, qdisc_sleeping, qdisc_list, tx_queue_len)

        - One part is mostly used on the device xmit path
         (xmit_lock, xmit_lock_owner, priv, hard_start_xmit, trans_start)
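
To illustrate the mechanism (a made-up example, not part of the patch): ____cacheline_aligned_in_smp forces the marked field to start a fresh cache line on SMP builds, so fields dirtied by different CPUs stop sharing a line and ping-ponging between caches:

	#include <linux/cache.h>
	#include <linux/spinlock.h>

	struct demo_dev {
		/* written by the CPU running the RX softirq */
		unsigned long	rx_packets ____cacheline_aligned_in_smp;

		/* written by CPUs in dev_queue_xmit(); without the
		 * annotation these could share a cache line with
		 * rx_packets, and every TX would steal the line from
		 * the RX CPU */
		spinlock_t	tx_lock ____cacheline_aligned_in_smp;
		unsigned long	tx_bytes;
	};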


Thank you

Signed-off-by: Eric Dumazet <[EMAIL PROTECTED]>
--- net-2.6.14/drivers/net/tg3.c        2005-08-26 02:13:58.000000000 +0200
+++ net-2.6.14-ed/drivers/net/tg3.c     2005-08-26 15:22:01.000000000 +0200
@@ -2847,8 +2847,7 @@
                struct sk_buff *skb = ri->skb;
                int i;
 
-               if (unlikely(skb == NULL))
-                       BUG();
+               BUG_ON(skb == NULL);
 
                pci_unmap_single(tp->pdev,
                                 pci_unmap_addr(ri, mapping),
@@ -2860,12 +2859,10 @@
                sw_idx = NEXT_TX(sw_idx);
 
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-                       if (unlikely(sw_idx == hw_idx))
-                               BUG();
+                       BUG_ON(sw_idx == hw_idx);
 
                        ri = &tp->tx_buffers[sw_idx];
-                       if (unlikely(ri->skb != NULL))
-                               BUG();
+                       BUG_ON(ri->skb != NULL);
 
                        pci_unmap_page(tp->pdev,
                                       pci_unmap_addr(ri, mapping),
@@ -2878,11 +2875,13 @@
                dev_kfree_skb(skb);
        }
 
+       spin_lock(&tp->tx_lock);
        tp->tx_cons = sw_idx;
 
        if (netif_queue_stopped(tp->dev) &&
            (TX_BUFFS_AVAIL(tp) > TG3_TX_WAKEUP_THRESH))
                netif_wake_queue(tp->dev);
+       spin_unlock(&tp->tx_lock);
 }
 
 /* Returns size of skb allocated or < 0 on error.
@@ -3196,13 +3195,6 @@
                }
        }
 
-       /* run TX completion thread */
-       if (sblk->idx[0].tx_consumer != tp->tx_cons) {
-               spin_lock(&tp->tx_lock);
-               tg3_tx(tp);
-               spin_unlock(&tp->tx_lock);
-       }
-
        /* run RX thread, within the bounds set by NAPI.
         * All RX "locking" is done by ensuring outside
         * code synchronizes with dev->poll()
@@ -3220,6 +3212,10 @@
                netdev->quota -= work_done;
        }
 
+       /* run TX completion thread */
+       if (sblk->idx[0].tx_consumer != tp->tx_cons)
+               tg3_tx(tp);
+
        if (tp->tg3_flags & TG3_FLAG_TAGGED_STATUS)
                tp->last_tag = sblk->status_tag;
        rmb();
--- net-2.6.14/net/core/dev.c   2005-08-26 02:14:00.000000000 +0200
+++ net-2.6.14-ed/net/core/dev.c        2005-08-26 16:32:27.000000000 +0200
@@ -1257,6 +1257,8 @@
                if (skb_checksum_help(skb, 0))
                        goto out_kfree_skb;
 
+       spin_lock_prefetch(&dev->queue_lock);
+
        /* Disable soft irqs for various locks below. Also 
         * stops preemption for RCU. 
         */
--- net-2.6.14/include/linux/netdevice.h        2005-08-26 02:13:59.000000000 +0200
+++ net-2.6.14-ed/include/linux/netdevice.h     2005-08-26 16:13:27.000000000 +0200
@@ -317,8 +317,6 @@
         */
 
        /* These may be needed for future network-power-down code. */
-       unsigned long           trans_start;    /* Time (in jiffies) of last Tx */
-       unsigned long           last_rx;        /* Time of last Rx      */
 
        unsigned short          flags;  /* interface flags (a la BSD)   */
        unsigned short          gflags;
@@ -328,15 +326,12 @@
        unsigned                mtu;    /* interface MTU value          */
        unsigned short          type;   /* interface hardware type      */
        unsigned short          hard_header_len;        /* hardware hdr length  */
-       void                    *priv;  /* pointer to private data      */
 
        struct net_device       *master; /* Pointer to master device of a group,
                                          * which this device is member of.
                                          */
 
        /* Interface address info. */
-       unsigned char           broadcast[MAX_ADDR_LEN];        /* hw bcast add */
-       unsigned char           dev_addr[MAX_ADDR_LEN]; /* hw address   */
        unsigned char           perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
        unsigned char           addr_len;       /* hardware address length      */
        unsigned short          dev_id;         /* for shared network cards */
@@ -358,26 +353,49 @@
        void                    *ec_ptr;        /* Econet specific data */
        void                    *ax25_ptr;      /* AX.25 specific data */
 
-       struct list_head        poll_list;      /* Link to poll list    */
+/*
+ * cache line used on receive path
+ */
+       struct list_head        poll_list ____cacheline_aligned_in_smp; /* Link to poll list    */
+       int                     (*poll) (struct net_device *dev, int *quota);
        int                     quota;
        int                     weight;
+       unsigned long           last_rx;        /* Time of last Rx      */
+       /* Interface address info. */
+       unsigned char           dev_addr[MAX_ADDR_LEN]; /* hw address   */
+       unsigned char           broadcast[MAX_ADDR_LEN];        /* hw bcast add */
 
+/*
+ * cache line used on queue transmit path
+ */
+       /* device queue lock */
+       spinlock_t              queue_lock ____cacheline_aligned_in_smp;
        struct Qdisc            *qdisc;
        struct Qdisc            *qdisc_sleeping;
-       struct Qdisc            *qdisc_ingress;
        struct list_head        qdisc_list;
        unsigned long           tx_queue_len;   /* Max frames per queue allowed */
 
        /* ingress path synchronizer */
        spinlock_t              ingress_lock;
+       struct Qdisc            *qdisc_ingress;
+
+
+/*
+ * cache line used on xmit path
+ */
        /* hard_start_xmit synchronizer */
        spinlock_t              xmit_lock;
        /* cpu id of processor entered to hard_start_xmit or -1,
           if nobody entered there.
         */
        int                     xmit_lock_owner;
-       /* device queue lock */
-       spinlock_t              queue_lock;
+       void                    *priv;  /* pointer to private data      */
+       int                     (*hard_start_xmit) (struct sk_buff *skb,
+                                                   struct net_device *dev);
+       unsigned long           trans_start;    /* Time (in jiffies) of last Tx */
+
+
+
        /* Number of references to this device */
        atomic_t                refcnt;
        /* delayed register/unregister */
@@ -419,10 +437,7 @@
        /* Pointers to interface service routines.      */
        int                     (*open)(struct net_device *dev);
        int                     (*stop)(struct net_device *dev);
-       int                     (*hard_start_xmit) (struct sk_buff *skb,
-                                                   struct net_device *dev);
 #define HAVE_NETDEV_POLL
-       int                     (*poll) (struct net_device *dev, int *quota);
        int                     (*hard_header) (struct sk_buff *skb,
                                                struct net_device *dev,
                                                unsigned short type,
