This could maybe be made more efficient if we first split the list based on
 skb->protocol, and then did ptype lookup for each sublist.  Unfortunately,
 there are things like sch_handle_ingress and the rx_handlers that can
 produce different results per packet.

Signed-off-by: Edward Cree <ec...@solarflare.com>
---
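A rough sketch of the rejected pre-split idea from the commit message,
for reviewers following along: this is purely illustrative and not part
of the patch, and the helper name and bucket scheme below are made up.
It founders because sch_handle_ingress and the rx_handlers can steer
individual packets differently even within a single protocol bucket.

/* HYPOTHETICAL: split a list into per-protocol buckets so that the
 * ptype lookup could be done once per bucket.  Not in this patch;
 * per-packet ingress/rx_handler verdicts defeat it.
 */
static void split_list_by_protocol(struct sk_buff_head *list,
				   struct sk_buff_head *buckets,
				   unsigned int nbuckets)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue(list)) != NULL)
		__skb_queue_tail(&buckets[ntohs(skb->protocol) % nbuckets],
				 skb);
}
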
 include/trace/events/net.h |   7 +++
 net/core/dev.c             | 146 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 113 insertions(+), 40 deletions(-)

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 30f359c..7a17a31 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -130,6 +130,13 @@ DEFINE_EVENT(net_dev_template, netif_receive_skb,
        TP_ARGS(skb)
 );
 
+DEFINE_EVENT(net_dev_template, netif_receive_skb_list,
+
+       TP_PROTO(struct sk_buff *skb),
+
+       TP_ARGS(skb)
+);
+
 DEFINE_EVENT(net_dev_template, netif_rx,
 
        TP_PROTO(struct sk_buff *skb),
diff --git a/net/core/dev.c b/net/core/dev.c
index 0f914bf..db1d16a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4061,12 +4061,13 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
        return 0;
 }
 
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
+static int __netif_receive_skb_taps(struct sk_buff *skb, bool pfmemalloc,
+                                   struct packet_type **pt_prev)
 {
-       struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct net_device *orig_dev;
        bool deliver_exact = false;
+       struct packet_type *ptype;
        int ret = NET_RX_DROP;
        __be16 type;
 
@@ -4081,7 +4082,7 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
                skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);
 
-       pt_prev = NULL;
+       *pt_prev = NULL;
 
 another_round:
        skb->skb_iif = skb->dev->ifindex;
@@ -4106,25 +4107,25 @@ another_round:
                goto skip_taps;
 
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
-               if (pt_prev)
-                       ret = deliver_skb(skb, pt_prev, orig_dev);
-               pt_prev = ptype;
+               if (*pt_prev)
+                       ret = deliver_skb(skb, *pt_prev, orig_dev);
+               *pt_prev = ptype;
        }
 
        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
-               if (pt_prev)
-                       ret = deliver_skb(skb, pt_prev, orig_dev);
-               pt_prev = ptype;
+               if (*pt_prev)
+                       ret = deliver_skb(skb, *pt_prev, orig_dev);
+               *pt_prev = ptype;
        }
 
 skip_taps:
 #ifdef CONFIG_NET_INGRESS
        if (static_key_false(&ingress_needed)) {
-               skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+               skb = sch_handle_ingress(skb, pt_prev, &ret, orig_dev);
                if (!skb)
                        goto out;
 
-               if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
+               if (nf_ingress(skb, pt_prev, &ret, orig_dev) < 0)
                        goto out;
        }
 #endif
@@ -4136,9 +4137,9 @@ ncls:
                goto drop;
 
        if (skb_vlan_tag_present(skb)) {
-               if (pt_prev) {
-                       ret = deliver_skb(skb, pt_prev, orig_dev);
-                       pt_prev = NULL;
+               if (*pt_prev) {
+                       ret = deliver_skb(skb, *pt_prev, orig_dev);
+                       *pt_prev = NULL;
                }
                if (vlan_do_receive(&skb))
                        goto another_round;
@@ -4148,9 +4149,9 @@ ncls:
 
        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
-               if (pt_prev) {
-                       ret = deliver_skb(skb, pt_prev, orig_dev);
-                       pt_prev = NULL;
+               if (*pt_prev) {
+                       ret = deliver_skb(skb, *pt_prev, orig_dev);
+                       *pt_prev = NULL;
                }
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
@@ -4181,47 +4182,112 @@ ncls:
 
        /* deliver only exact match when indicated */
        if (likely(!deliver_exact)) {
-               deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+               deliver_ptype_list_skb(skb, pt_prev, orig_dev, type,
                                       &ptype_base[ntohs(type) &
                                                   PTYPE_HASH_MASK]);
        }
 
-       deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+       deliver_ptype_list_skb(skb, pt_prev, orig_dev, type,
                               &orig_dev->ptype_specific);
 
        if (unlikely(skb->dev != orig_dev)) {
-               deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+               deliver_ptype_list_skb(skb, pt_prev, orig_dev, type,
                                       &skb->dev->ptype_specific);
        }
-
-       if (pt_prev) {
-               if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
-                       goto drop;
-               else
-                       ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
-       } else {
+       if (*pt_prev && unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+               goto drop;
+       return ret;
 drop:
-               if (!deliver_exact)
-                       atomic_long_inc(&skb->dev->rx_dropped);
-               else
-                       atomic_long_inc(&skb->dev->rx_nohandler);
-               kfree_skb(skb);
-               /* Jamal, now you will not able to escape explaining
-                * me how you were going to use this. :-)
-                */
-               ret = NET_RX_DROP;
-       }
-
+       if (!deliver_exact)
+               atomic_long_inc(&skb->dev->rx_dropped);
+       else
+               atomic_long_inc(&skb->dev->rx_nohandler);
+       kfree_skb(skb);
+       /* Jamal, now you will not able to escape explaining
+        * me how you were going to use this. :-)
+        */
+       ret = NET_RX_DROP;
 out:
+       *pt_prev = NULL;
        return ret;
 }
 
-static void __netif_receive_skb_list_core(struct sk_buff_head *list, bool pfmemalloc)
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
+{
+       struct net_device *orig_dev = skb->dev;
+       struct packet_type *pt_prev;
+       int ret;
+
+       ret = __netif_receive_skb_taps(skb, pfmemalloc, &pt_prev);
+       if (pt_prev)
+               ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+       return ret;
+}
+
+static inline void __netif_receive_skb_list_ptype(struct sk_buff_head *list,
+                                                 struct packet_type *pt_prev,
+                                                 struct net_device *orig_dev)
 {
        struct sk_buff *skb;
 
        while ((skb = __skb_dequeue(list)) != NULL)
-               __netif_receive_skb_core(skb, pfmemalloc);
+               pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static void __netif_receive_skb_list_core(struct sk_buff_head *list, bool pfmemalloc)
+{
+       /* Fast-path assumptions:
+        * - There is no RX handler.
+        * - Only one packet_type matches.
+        * If either of these fails, we will end up doing some per-packet
+        * processing in-line, then handling the 'last ptype' for the whole
+        * sublist.  This can't cause out-of-order delivery to any single ptype,
+        * because the 'last ptype' must be constant across the sublist, and all
+        * other ptypes are handled per-packet.  Unless, that is, a single
+        * packet can be delivered to the same ptype more than once - but
+        * that seems like it would be a bad idea anyway.
+        * So it should be fine (at least, I think so), but you'll lose the
+        * (putative) performance benefits of batching.
+        */
+       /* Current (common) ptype of sublist */
+       struct packet_type *pt_curr = NULL;
+       /* In the normal (device RX) case, orig_dev should be the same for
+        * every skb in the list.  But as I'm not certain of this, I check
+        * it's constant and split the list if not.
+        * So, od_curr is the current (common) orig_dev of sublist.
+        */
+       struct net_device *od_curr = NULL;
+       struct sk_buff_head sublist;
+       struct sk_buff *skb;
+
+       __skb_queue_head_init(&sublist);
+
+       while ((skb = __skb_dequeue(list)) != NULL) {
+               struct packet_type *pt_prev;
+               struct net_device *orig_dev = skb->dev;
+
+               __netif_receive_skb_taps(skb, pfmemalloc, &pt_prev);
+               if (pt_prev) {
+                       if (skb_queue_empty(&sublist)) {
+                               pt_curr = pt_prev;
+                               od_curr = orig_dev;
+                       } else if (!(pt_curr == pt_prev &&
+                                    od_curr == orig_dev)) {
+                               /* dispatch old sublist */
+                               __netif_receive_skb_list_ptype(&sublist,
+                                                              pt_curr,
+                                                              od_curr);
+                               /* start new sublist */
+                               __skb_queue_head_init(&sublist);
+                               pt_curr = pt_prev;
+                               od_curr = orig_dev;
+                       }
+                       __skb_queue_tail(&sublist, skb);
+               }
+       }
+
+       /* dispatch final sublist */
+       __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 }
 
 static int __netif_receive_skb(struct sk_buff *skb)

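(Aside, after the diff: the grouping in __netif_receive_skb_list_core
above boils down to a classic group-and-flush loop.  Below is a
standalone userspace sketch of just that pattern; every name in it is
illustrative, and the integer 'key' stands in for the (pt_prev,
orig_dev) pair that the real code compares.)

#include <stdio.h>

/* Illustrative stand-in for delivering one sublist to one handler. */
static void dispatch(int key, int count)
{
	printf("deliver %d packet(s) to handler %d\n", count, key);
}

int main(void)
{
	/* Per-packet classification results, in arrival order, as
	 * __netif_receive_skb_taps might produce them.
	 */
	int keys[] = { 1, 1, 2, 2, 2, 1 };
	int n = sizeof(keys) / sizeof(keys[0]);
	int cur = keys[0], count = 0;

	for (int i = 0; i < n; i++) {
		if (keys[i] != cur) {
			dispatch(cur, count);	/* dispatch old sublist */
			cur = keys[i];		/* start new sublist */
			count = 0;
		}
		count++;
	}
	dispatch(cur, count);			/* dispatch final sublist */
	return 0;
}

Note the final flush after the loop, mirroring the 'dispatch final
sublist' call at the end of __netif_receive_skb_list_core.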