Hi folks,

I'd like to start some discussion on SMP optimizations for the networking stack. The patch below is one such example: it changes the loopback device to share more of the work with the other CPU on an HT system, which helps on workloads like netperf. Basically, if the sibling CPU is idle, we punt the netif_rx onto it. Using a kernel thread for this is fairly inefficient, so I am wondering if it makes sense to do it at the softirq level instead. This particular patch improves netperf over localhost by ~600Mbit/s (from ~9874Mbit/s to ~10475Mbit/s, while raising CPU usage from ~92% to ~95%, although it varies quite a bit) on a 3GHz P4 with HT (on top of a pile of other patches that optimize task switching on x86-64).
The bigger part of the discussion is probably the question of how we can make the network stack scale across multicore CPUs. For workloads like routing lots of small packets, a single CPU can easily be overwhelmed, so the question becomes: where does partitioning the work make sense? At the very least we probably need to do some preprocessing of incoming packets so that a series of packets destined for a particular flow ends up on the same CPU (a rough sketch of what that steering might look like follows the patch below). This sort of preprocessing probably makes sense for other reasons as well: by processing a group of packets for a particular socket in one go, we can avoid the overhead of repeatedly locking and unlocking the socket (which is fairly expensive given the memory-barrier nature of locks). At this point I'd just like to stir up some discussion, so please comment away with any ideas and concerns.

		-ben
-- 
"Time is of no importance, Mr. President, only life is important."
Don't Email: <[EMAIL PROTECTED]>.

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 690a1aa..ef283a3 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -58,6 +58,69 @@
 #include <linux/tcp.h>
 #include <linux/percpu.h>
 
+#define LOOP_NR_SKBS 256
+static struct sk_buff *loop_skb_ring[LOOP_NR_SKBS];
+static unsigned loop_skb_head, loop_skb_tail;
+static struct task_struct *loop_task;
+
+static void smp_loop_netif_rx(struct sk_buff *skb)
+{
+	unsigned int next = (loop_skb_head + 1) % LOOP_NR_SKBS;
+
+	if (next == loop_skb_tail) {
+		dev_kfree_skb(skb);
+		return;
+	}
+
+	loop_skb_ring[loop_skb_head] = skb;
+	wmb();
+	loop_skb_head = next;
+}
+
+static void smp_loop_wake(void)
+{
+	if (loop_task && loop_task->state != TASK_RUNNING)
+		wake_up_process(loop_task);
+}
+
+static int loop_netif_rx_thread(void *data)
+{
+	loop_task = current;
+
+	for (;;) {
+		int nr = 0;
+		while (loop_skb_tail != loop_skb_head) {
+			unsigned next;
+			struct sk_buff *skb = loop_skb_ring[loop_skb_tail];
+			loop_skb_ring[loop_skb_tail] = NULL;
+			next = (loop_skb_tail + 1) % LOOP_NR_SKBS;
+			barrier();
+			loop_skb_tail = next;
+			netif_rx(skb);
+			if (nr++ >= 96) {
+				do_softirq();
+				nr = 0;
+			}
+		}
+
+		do_softirq();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (loop_skb_tail == loop_skb_head)
+			schedule();
+		set_current_state(TASK_RUNNING);
+	}
+}
+
+static inline int sibling_is_idle(void)
+{
+	int cpu = smp_processor_id() ^ 1;
+	struct x8664_pda *pda = cpu_pda(cpu);
+	if (pda->pcurrent == idle_task(cpu) || pda->pcurrent == loop_task)
+		return 1;
+	return 0;
+}
+
 static DEFINE_PER_CPU(struct net_device_stats, loopback_stats);
 
 #define LOOPBACK_OVERHEAD (128 + MAX_HEADER + 16 + 16)
@@ -69,6 +132,7 @@ static DEFINE_PER_CPU(struct net_device_
  */
 
 #ifdef LOOPBACK_TSO
+static void smp_loop_netif_rx(struct sk_buff *skb);
 static void emulate_large_send_offload(struct sk_buff *skb)
 {
 	struct iphdr *iph = skb->nh.iph;
@@ -76,6 +140,7 @@ static void emulate_large_send_offload(s
 	unsigned int doffset = (iph->ihl + th->doff) * 4;
 	unsigned int mtu = skb_shinfo(skb)->tso_size + doffset;
 	unsigned int offset = 0;
+	int use_sibling = sibling_is_idle();
 	u32 seq = ntohl(th->seq);
 	u16 id = ntohs(iph->id);
 
@@ -112,12 +177,21 @@ static void emulate_large_send_offload(s
 		th->seq = htonl(seq);
 		if (offset + doffset + frag_size < skb->len)
 			th->fin = th->psh = 0;
+#ifdef CONFIG_SMP
+		if (use_sibling)
+			smp_loop_netif_rx(nskb);
+		else
+			netif_rx(nskb);
+#else
 		netif_rx(nskb);
+#endif
 		offset += frag_size;
 		seq += frag_size;
 		id++;
 	}
 
+	if (use_sibling)
+		smp_loop_wake();
 	dev_kfree_skb(skb);
 }
 #endif /* LOOPBACK_TSO */
@@ -156,8 +230,15 @@ static int loopback_xmit(struct sk_buff
 	lb_stats->tx_packets = lb_stats->rx_packets;
 	put_cpu();
 
+#ifdef CONFIG_SMP
+	if (sibling_is_idle()) {
+		smp_loop_netif_rx(skb);
+		smp_loop_wake();
+	} else
+		netif_rx(skb);
+#else
 	netif_rx(skb);
-
+#endif
 	return(0);
 }
 
@@ -225,6 +306,8 @@ int __init loopback_init(void)
 {
 	struct net_device_stats *stats;
 
+	kernel_thread(loop_netif_rx_thread, NULL, 0);
+
 	/* Can survive without statistics */
 	stats = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL);
 	if (stats) {
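
To make the flow-steering idea above a little more concrete, here is a minimal userspace sketch (not part of the patch; flow_key, flow_hash, pick_rx_cpu and NR_RX_CPUS are names made up purely for illustration) of hashing a packet's 4-tuple so that every packet of a given flow is handed to the same CPU:

/*
 * Toy illustration of flow-based RX steering: hash the 4-tuple of each
 * packet and use the hash to pick a CPU, so all packets of one flow are
 * processed on the same CPU and per-socket work can be batched there.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_RX_CPUS 2		/* pretend we steer across two CPUs */

struct flow_key {
	uint32_t saddr;		/* source IPv4 address */
	uint32_t daddr;		/* destination IPv4 address */
	uint16_t sport;		/* source port */
	uint16_t dport;		/* destination port */
};

/* One-at-a-time style hash; any reasonable mixing function would do. */
static uint32_t flow_hash(const struct flow_key *k)
{
	const uint8_t *p = (const uint8_t *)k;
	uint32_t h = 0;
	unsigned int i;

	for (i = 0; i < sizeof(*k); i++) {
		h += p[i];
		h += h << 10;
		h ^= h >> 6;
	}
	h += h << 3;
	h ^= h >> 11;
	h += h << 15;
	return h;
}

/* Packets with the same 4-tuple always map to the same CPU. */
static int pick_rx_cpu(const struct flow_key *k)
{
	return flow_hash(k) % NR_RX_CPUS;
}

int main(void)
{
	struct flow_key a = { 0x0a000001, 0x0a000002, 12345, 80 };
	struct flow_key b = { 0x0a000001, 0x0a000003, 54321, 80 };

	printf("flow a -> cpu %d\n", pick_rx_cpu(&a));
	printf("flow b -> cpu %d\n", pick_rx_cpu(&b));
	return 0;
}

Once packets are steered per-flow like this, batching the socket work also becomes natural: the receiving CPU can take the socket lock once and run a whole burst of packets through it. A real in-kernel version would of course also need a cheap cross-CPU handoff, which is exactly the part the loopback ring/thread above (or a softirq-level equivalent) is poking at.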