Hi folks,

I'd like to start some discussion on SMP optimizations for the networking stack. The patch below is one such example: it changes the loopback device to share more of the work with the other CPU on an HT system, which helps on workloads like netperf. Basically, if the sibling CPU is idle, we punt the netif_rx onto it. Using a kernel thread for this is fairly inefficient, so I am wondering if it makes sense to do it at the softirq level instead. This particular patch improves netperf over localhost by ~600Mbit/s (from ~9874Mbit/s to ~10475Mbit/s, while raising CPU usage from ~92% to ~95%, although it varies quite a bit) on a 3GHz P4 with HT (on top of a pile of other patches that optimize task switching on x86-64).
The bigger part of the discussion is probably the question of how we can make the network stack scale across multicore CPUs. For workloads like routing lots of small packets, a single CPU can easily be overwhelmed, so the question becomes: where does partitioning the work make sense? At the very least we probably need to do some preprocessing of incoming packets so that a series of packets destined for a particular flow ends up on the same CPU (a rough sketch of what that steering might look like follows the patch below). This sort of preprocessing probably makes sense for other reasons as well: by processing a group of packets for a particular socket in one go, we can avoid the overhead of repeatedly locking and unlocking the socket (which is fairly expensive given the memory-barrier nature of locks). At this point I'd just like to stir up some discussion, so please comment away with any ideas and concerns.

		-ben
-- 
"Time is of no importance, Mr. President, only life is important."
Don't Email: <[EMAIL PROTECTED]>.

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 690a1aa..ef283a3 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -58,6 +58,69 @@
 #include <linux/tcp.h>
 #include <linux/percpu.h>
 
+#define LOOP_NR_SKBS 256
+static struct sk_buff *loop_skb_ring[LOOP_NR_SKBS];
+static unsigned loop_skb_head, loop_skb_tail;
+static struct task_struct *loop_task;
+
+static void smp_loop_netif_rx(struct sk_buff *skb)
+{
+	unsigned int next = (loop_skb_head + 1) % LOOP_NR_SKBS;
+
+	if (next == loop_skb_tail) {
+		dev_kfree_skb(skb);
+		return;
+	}
+
+	loop_skb_ring[loop_skb_head] = skb;
+	wmb();
+	loop_skb_head = next;
+}
+
+static void smp_loop_wake(void)
+{
+	if (loop_task && loop_task->state != TASK_RUNNING)
+		wake_up_process(loop_task);
+}
+
+static int loop_netif_rx_thread(void *data)
+{
+	loop_task = current;
+
+	for (;;) {
+		int nr = 0;
+		while (loop_skb_tail != loop_skb_head) {
+			unsigned next;
+			struct sk_buff *skb = loop_skb_ring[loop_skb_tail];
+			loop_skb_ring[loop_skb_tail] = NULL;
+			next = (loop_skb_tail + 1) % LOOP_NR_SKBS;
+			barrier();
+			loop_skb_tail = next;
+			netif_rx(skb);
+			if (nr++ >= 96) {
+				do_softirq();
+				nr = 0;
+			}
+		}
+
+		do_softirq();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (loop_skb_tail == loop_skb_head)
+			schedule();
+		set_current_state(TASK_RUNNING);
+	}
+}
+
+static inline int sibling_is_idle(void)
+{
+	int cpu = smp_processor_id() ^ 1;
+	struct x8664_pda *pda = cpu_pda(cpu);
+	if (pda->pcurrent == idle_task(cpu) || pda->pcurrent == loop_task)
+		return 1;
+	return 0;
+}
+
 static DEFINE_PER_CPU(struct net_device_stats, loopback_stats);
 
 #define LOOPBACK_OVERHEAD (128 + MAX_HEADER + 16 + 16)
@@ -69,6 +132,7 @@ static DEFINE_PER_CPU(struct net_device_
  */
 
 #ifdef LOOPBACK_TSO
+static void smp_loop_netif_rx(struct sk_buff *skb);
 static void emulate_large_send_offload(struct sk_buff *skb)
 {
 	struct iphdr *iph = skb->nh.iph;
@@ -76,6 +140,7 @@ static void emulate_large_send_offload(s
 	unsigned int doffset = (iph->ihl + th->doff) * 4;
 	unsigned int mtu = skb_shinfo(skb)->tso_size + doffset;
 	unsigned int offset = 0;
+	int use_sibling = sibling_is_idle();
 	u32 seq = ntohl(th->seq);
 	u16 id = ntohs(iph->id);
 
@@ -112,12 +177,21 @@ static void emulate_large_send_offload(s
 		th->seq = htonl(seq);
 		if (offset + doffset + frag_size < skb->len)
 			th->fin = th->psh = 0;
+#ifdef CONFIG_SMP
+		if (use_sibling)
+			smp_loop_netif_rx(nskb);
+		else
+			netif_rx(nskb);
+#else
 		netif_rx(nskb);
+#endif
 		offset += frag_size;
 		seq += frag_size;
 		id++;
 	}
 
+	if (use_sibling)
+		smp_loop_wake();
 	dev_kfree_skb(skb);
 }
 #endif /* LOOPBACK_TSO */
@@ -156,8 +230,15 @@ static int loopback_xmit(struct sk_buff
 	lb_stats->tx_packets = lb_stats->rx_packets;
 	put_cpu();
 
+#ifdef CONFIG_SMP
+	if (sibling_is_idle()) {
+		smp_loop_netif_rx(skb);
+		smp_loop_wake();
+	} else
+		netif_rx(skb);
+#else
 	netif_rx(skb);
-
+#endif
 	return(0);
 }
 
@@ -225,6 +306,8 @@ int __init loopback_init(void)
 {
 	struct net_device_stats *stats;
 
+	kernel_thread(loop_netif_rx_thread, NULL, 0);
+
 	/* Can survive without statistics */
 	stats = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL);
 	if (stats) {
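
To make the flow-steering idea above a little more concrete, here is a minimal userspace sketch (not part of the patch; flow_key, flow_hash, pick_rx_cpu and NR_RX_CPUS are names made up purely for illustration) of hashing a packet's 4-tuple so that every packet of a given flow is handed to the same CPU:

/*
 * Toy illustration of flow-based RX steering: hash the 4-tuple of each
 * packet and use the hash to pick a CPU, so all packets of one flow are
 * processed on the same CPU and per-socket work can be batched there.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_RX_CPUS 2		/* pretend we steer across two CPUs */

struct flow_key {
	uint32_t saddr;		/* source IPv4 address */
	uint32_t daddr;		/* destination IPv4 address */
	uint16_t sport;		/* source port */
	uint16_t dport;		/* destination port */
};

/* One-at-a-time style hash; any reasonable mixing function would do. */
static uint32_t flow_hash(const struct flow_key *k)
{
	const uint8_t *p = (const uint8_t *)k;
	uint32_t h = 0;
	unsigned int i;

	for (i = 0; i < sizeof(*k); i++) {
		h += p[i];
		h += h << 10;
		h ^= h >> 6;
	}
	h += h << 3;
	h ^= h >> 11;
	h += h << 15;
	return h;
}

/* Packets with the same 4-tuple always map to the same CPU. */
static int pick_rx_cpu(const struct flow_key *k)
{
	return flow_hash(k) % NR_RX_CPUS;
}

int main(void)
{
	struct flow_key a = { 0x0a000001, 0x0a000002, 12345, 80 };
	struct flow_key b = { 0x0a000001, 0x0a000003, 54321, 80 };

	printf("flow a -> cpu %d\n", pick_rx_cpu(&a));
	printf("flow b -> cpu %d\n", pick_rx_cpu(&b));
	return 0;
}

Once packets are steered per-flow like this, batching the socket work also becomes natural: the receiving CPU can take the socket lock once and run a whole burst of packets through it. A real in-kernel version would of course also need a cheap cross-CPU handoff, which is exactly the part the loopback ring/thread above (or a softirq-level equivalent) is poking at.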