David S. Miller wrote:
> From: Eric Dumazet <[EMAIL PROTECTED]>
> Date: Wed, 24 Aug 2005 01:10:44 +0200
>
>> Looking at tg3_tx() more closely, I am not convinced it really needs
>> to lock tp->tx_lock during the loop. tp->tx_cons (swidx) is changed
>> in this function only, and could be changed to an atomic_t.
>> The tx_lock would be needed only for the final:
>>
>> 	if (netif_queue_stopped(tp->dev) &&
>> 	    (TX_BUFFS_AVAIL(tp) > TG3_TX_WAKEUP_THRESH))
>> 		netif_wake_queue(tp->dev);
>
> If you're going to take the tx_lock anyway, there is no need to make
> tx_cons atomic in any way. Just update it at the end, inside the lock.
>
> dev->poll() executes atomically, and that helps a lot here.
>
> Please toy around with some of these ideas and let us know
> what works well in your testing.
>
> Thanks.
Hi David,

I played with these ideas, and I have very good results with the patch below.
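In short, tg3_tx() now runs its completion loop without tp->tx_lock
(dev->poll() is serialized, so tx_cons has a single writer) and takes the
lock only around the final index update and wake-up check. A minimal sketch
of the pattern (the real function also unmaps DMA buffers and frees skb
fragments):

	static void tg3_tx(struct tg3 *tp)
	{
		u32 hw_idx = tp->hw_status->idx[0].tx_consumer;
		u32 sw_idx = tp->tx_cons;

		/* Lock-free: dev->poll() is the only writer of tx_cons. */
		while (sw_idx != hw_idx) {
			/* ... unmap and free the skb in slot sw_idx ... */
			sw_idx = NEXT_TX(sw_idx);
		}

		/* tx_lock only closes the race with the stop/wake
		 * logic on the xmit path.
		 */
		spin_lock(&tp->tx_lock);
		tp->tx_cons = sw_idx;
		if (netif_queue_stopped(tp->dev) &&
		    (TX_BUFFS_AVAIL(tp) > TG3_TX_WAKEUP_THRESH))
			netif_wake_queue(tp->dev);
		spin_unlock(&tp->tx_lock);
	}

With that change in place, the requeue rate drops dramatically: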
# tc -s -d qdisc show dev eth0 ; sleep 10 ; tc -s -d qdisc show dev eth0
qdisc pfifo_fast 0: bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
Sent 440173135511 bytes 3211378293 pkt (dropped 240817, overlimits 0 requeues 27028035)
backlog 0b 0p requeues 27028035
qdisc pfifo_fast 0: bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
Sent 440216655667 bytes 3211730812 pkt (dropped 240904, overlimits 0 requeues 27031668)
backlog 0b 0p requeues 27031668
(So about 360 requeues per second, much better than the 12000 per second we had before.)
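For context: a "requeue" is counted when qdisc_restart() has dequeued a
packet but finds another CPU holding dev->xmit_lock, so the packet has to
go back into the qdisc. Roughly, as a simplified sketch of the logic in
net/sched/sch_generic.c, with several checks and return paths elided:

	static inline int qdisc_restart(struct net_device *dev)
	{
		struct Qdisc *q = dev->qdisc;
		struct sk_buff *skb = q->dequeue(q);

		if (skb == NULL)
			return 0;

		if (spin_trylock(&dev->xmit_lock)) {
			/* We won the driver lock: transmit the packet. */
			dev->xmit_lock_owner = smp_processor_id();
			spin_unlock(&dev->queue_lock);
			dev->hard_start_xmit(skb, dev);
			dev->xmit_lock_owner = -1;
			spin_unlock(&dev->xmit_lock);
			spin_lock(&dev->queue_lock);
			return 1;
		}

		/* Contended: put the packet back and reschedule.  Each
		 * trip through here bumps the "requeues" counter above.
		 */
		q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 0;
	}

So fewer requeues means fewer wasted dequeue/requeue round trips done under
dev->queue_lock.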
oprofile now shows qdisc_restart at 0.6257 % of samples, down from 2.6452 %.
This patch has three parts:
- [TG3]: the tx_lock spinlock is taken in tg3_tx() only when really needed
  (the pattern sketched above). TX completion (tg3_tx) is also moved after RX
  completion: better to service incoming frames first, then take care of
  re-enabling transmit.
- [NET]: add a spin_lock_prefetch(&dev->queue_lock) in dev_queue_xmit(),
  since the lock is about to be taken.
- [NET]: reorder some hot fields of struct net_device and place them on
  separate cache lines in SMP, to reduce cache line bouncing between the
  multiple CPUs accessing the device (see the sketch after this list):
  - One group is mostly used on the receive path (including eth_type_trans()):
    (poll_list, poll, quota, weight, last_rx, dev_addr, broadcast)
  - One group is mostly used on the queue transmit path (qdisc):
    (queue_lock, qdisc, qdisc_sleeping, qdisc_list, tx_queue_len)
  - One group is mostly used on the xmit path (device):
    (xmit_lock, xmit_lock_owner, priv, hard_start_xmit, trans_start)
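The layout idea, abridged (the full netdevice.h hunk is below, and field
names are taken from it): ____cacheline_aligned_in_smp makes a field start
on a fresh cache line on SMP builds, so each group begins a line of its own
and a CPU busy on one path no longer pulls in lines the other paths keep
dirtying:

	struct net_device {
		/* ... slow-path / configuration fields ... */

		/* receive path: touched by the CPU running dev->poll() */
		struct list_head poll_list ____cacheline_aligned_in_smp;
		int (*poll)(struct net_device *dev, int *quota);
		int quota;
		int weight;

		/* queue transmit path: every sender takes queue_lock */
		spinlock_t queue_lock ____cacheline_aligned_in_smp;
		struct Qdisc *qdisc;
		struct Qdisc *qdisc_sleeping;

		/* xmit path: the xmit_lock owner calls hard_start_xmit */
		spinlock_t xmit_lock;
		int xmit_lock_owner;
		int (*hard_start_xmit)(struct sk_buff *skb,
				       struct net_device *dev);

		/* ... */
	};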
Thank you
Signed-off-by: Eric Dumazet <[EMAIL PROTECTED]>
--- net-2.6.14/drivers/net/tg3.c 2005-08-26 02:13:58.000000000 +0200
+++ net-2.6.14-ed/drivers/net/tg3.c 2005-08-26 15:22:01.000000000 +0200
@@ -2847,8 +2847,7 @@
struct sk_buff *skb = ri->skb;
int i;
- if (unlikely(skb == NULL))
- BUG();
+ BUG_ON(skb == NULL);
pci_unmap_single(tp->pdev,
pci_unmap_addr(ri, mapping),
@@ -2860,12 +2859,10 @@
sw_idx = NEXT_TX(sw_idx);
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- if (unlikely(sw_idx == hw_idx))
- BUG();
+ BUG_ON(sw_idx == hw_idx);
ri = &tp->tx_buffers[sw_idx];
- if (unlikely(ri->skb != NULL))
- BUG();
+ BUG_ON(ri->skb != NULL);
pci_unmap_page(tp->pdev,
pci_unmap_addr(ri, mapping),
@@ -2878,11 +2875,13 @@
dev_kfree_skb(skb);
}
+ spin_lock(&tp->tx_lock);
tp->tx_cons = sw_idx;
if (netif_queue_stopped(tp->dev) &&
(TX_BUFFS_AVAIL(tp) > TG3_TX_WAKEUP_THRESH))
netif_wake_queue(tp->dev);
+ spin_unlock(&tp->tx_lock);
}
/* Returns size of skb allocated or < 0 on error.
@@ -3196,13 +3195,6 @@
}
}
- /* run TX completion thread */
- if (sblk->idx[0].tx_consumer != tp->tx_cons) {
- spin_lock(&tp->tx_lock);
- tg3_tx(tp);
- spin_unlock(&tp->tx_lock);
- }
-
/* run RX thread, within the bounds set by NAPI.
* All RX "locking" is done by ensuring outside
* code synchronizes with dev->poll()
@@ -3220,6 +3212,10 @@
netdev->quota -= work_done;
}
+ /* run TX completion thread */
+ if (sblk->idx[0].tx_consumer != tp->tx_cons)
+ tg3_tx(tp);
+
if (tp->tg3_flags & TG3_FLAG_TAGGED_STATUS)
tp->last_tag = sblk->status_tag;
rmb();
--- net-2.6.14/net/core/dev.c 2005-08-26 02:14:00.000000000 +0200
+++ net-2.6.14-ed/net/core/dev.c 2005-08-26 16:32:27.000000000 +0200
@@ -1257,6 +1257,8 @@
if (skb_checksum_help(skb, 0))
goto out_kfree_skb;
+ spin_lock_prefetch(&dev->queue_lock);
+
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
--- net-2.6.14/include/linux/netdevice.h 2005-08-26 02:13:59.000000000 +0200
+++ net-2.6.14-ed/include/linux/netdevice.h 2005-08-26 16:13:27.000000000 +0200
@@ -317,8 +317,6 @@
*/
/* These may be needed for future network-power-down code. */
- unsigned long trans_start; /* Time (in jiffies) of last Tx */
- unsigned long last_rx; /* Time of last Rx */
unsigned short flags; /* interface flags (a la BSD) */
unsigned short gflags;
@@ -328,15 +326,12 @@
unsigned mtu; /* interface MTU value */
unsigned short type; /* interface hardware type */
unsigned short hard_header_len; /* hardware hdr length */
- void *priv; /* pointer to private data */
struct net_device *master; /* Pointer to master device of a group,
* which this device is member of.
*/
/* Interface address info. */
- unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
- unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address */
unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
unsigned char addr_len; /* hardware address length */
unsigned short dev_id; /* for shared network cards */
@@ -358,26 +353,49 @@
void *ec_ptr; /* Econet specific data */
void *ax25_ptr; /* AX.25 specific data */
- struct list_head poll_list; /* Link to poll list */
+/*
+ * cache line used on receive path
+ */
+ struct list_head poll_list ____cacheline_aligned_in_smp; /* Link to poll list */
+ int (*poll) (struct net_device *dev, int *quota);
int quota;
int weight;
+ unsigned long last_rx; /* Time of last Rx */
+ /* Interface address info. */
+ unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address */
+ unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
+/*
+ * cache line used on queue transmit path
+ */
+ /* device queue lock */
+ spinlock_t queue_lock ____cacheline_aligned_in_smp;
struct Qdisc *qdisc;
struct Qdisc *qdisc_sleeping;
- struct Qdisc *qdisc_ingress;
struct list_head qdisc_list;
unsigned long tx_queue_len; /* Max frames per queue allowed */
/* ingress path synchronizer */
spinlock_t ingress_lock;
+ struct Qdisc *qdisc_ingress;
+
+
+/*
+ * cache line used on xmit path
+ */
/* hard_start_xmit synchronizer */
spinlock_t xmit_lock;
/* cpu id of processor entered to hard_start_xmit or -1,
if nobody entered there.
*/
int xmit_lock_owner;
- /* device queue lock */
- spinlock_t queue_lock;
+ void *priv; /* pointer to private data */
+ int (*hard_start_xmit) (struct sk_buff *skb,
+ struct net_device *dev);
+ unsigned long trans_start; /* Time (in jiffies) of last Tx */
+
+
+
/* Number of references to this device */
atomic_t refcnt;
/* delayed register/unregister */
@@ -419,10 +437,7 @@
/* Pointers to interface service routines. */
int (*open)(struct net_device *dev);
int (*stop)(struct net_device *dev);
- int (*hard_start_xmit) (struct sk_buff *skb,
- struct net_device *dev);
#define HAVE_NETDEV_POLL
- int (*poll) (struct net_device *dev, int *quota);
int (*hard_header) (struct sk_buff *skb,
struct net_device *dev,
unsigned short type,