Implement the single/dual/quad loop design pattern from FD.IO VPP to
improve cache efficiency in the af_packet PMD receive path.

The original implementation processes packets one at a time in a simple
loop, which can result in cache misses when accessing frame headers and
packet data. The new implementation:

- Processes packets in batches of 4 (quad), 2 (dual), and 1 (single)
- Prefetches the next batch of frame headers while processing the
  current batch
- Prefetches packet data before memcpy to hide memory latency
- Reduces loop overhead through partial unrolling

Two helper functions are introduced:
- af_packet_get_frame(): Returns frame pointer at index with wraparound
- af_packet_rx_one(): Common per-packet processing (mbuf alloc, memcpy,
  VLAN handling, timestamp offload)

The quad loop checks that all 4 frames are available before processing
any of them, and falls through to the dual/single loops when fewer
frames are ready. The early-exit paths (out_advance1/2/3) advance the
frame index past the frames already consumed when mbuf allocation
fails mid-batch.

Prefetch strategy:
- Frame headers: the quad loop prefetches N+4..N+7 while processing
  N..N+3; the dual and single loops prefetch the next 2 and 1 headers
- Packet data: prefetch at tp_mac offset before memcpy

This pattern is well-established in high-performance packet processing
and should improve throughput by making better use of the CPU cache
hierarchy, which is particularly beneficial when processing bursts of
packets.

Signed-off-by: Stephen Hemminger <[email protected]>
---
 drivers/net/af_packet/rte_eth_af_packet.c | 208 +++++++++++++++++-----
 1 file changed, 164 insertions(+), 44 deletions(-)

diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index 5847e14d80..946c21d878 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -14,6 +14,7 @@
 #include <rte_malloc.h>
 #include <rte_kvargs.h>
 #include <bus_vdev_driver.h>
+#include <rte_prefetch.h>
 
 #include <errno.h>
 #include <linux/if_ether.h>
@@ -120,75 +121,194 @@ RTE_LOG_REGISTER_DEFAULT(af_packet_logtype, NOTICE);
        RTE_LOG_LINE(level, AFPACKET, "%s(): " fmt ":%s", __func__, \
                ## __VA_ARGS__, strerror(errno))
 
-static uint16_t
-eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+/*
+ * Return the frame header held in ring slot idx; idx may lie past the end of the ring.
+ */
+static inline struct tpacket2_hdr *
+af_packet_get_frame(struct pkt_rx_queue *pkt_q, unsigned int idx)
+{
+       if (idx >= pkt_q->framecount)
+               idx %= pkt_q->framecount; /* look-ahead can exceed one lap on small rings */
+       return (struct tpacket2_hdr *)pkt_q->rd[idx].iov_base;
+}
+
+/*
+ * Copy one received frame into a new mbuf; returns 0, or -1 if mbuf allocation fails.
+ */
+static inline int
+af_packet_rx_one(struct pkt_rx_queue *pkt_q,
+                struct tpacket2_hdr *ppd,
+                struct rte_mbuf **mbuf_out,
+                unsigned long *rx_bytes)
 {
-       unsigned i;
-       struct tpacket2_hdr *ppd;
        struct rte_mbuf *mbuf;
        uint8_t *pbuf;
+
+       mbuf = rte_pktmbuf_alloc(pkt_q->mb_pool);
+       if (unlikely(mbuf == NULL)) {
+               pkt_q->rx_nombuf++; /* count the allocation failure */
+               return -1; /* ppd has not been modified on this path */
+       }
+
+       rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf) = 
ppd->tp_snaplen;
+       pbuf = (uint8_t *)ppd + ppd->tp_mac; /* packet bytes start tp_mac bytes into the frame */
+       memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, 
rte_pktmbuf_data_len(mbuf));
+
+       if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
+               mbuf->vlan_tci = ppd->tp_vlan_tci;
+               mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | 
RTE_MBUF_F_RX_VLAN_STRIPPED);
+               if (!pkt_q->vlan_strip && rte_vlan_insert(&mbuf)) /* reinsert tag unless stripping */
+                       PMD_LOG(ERR, "Failed to reinsert VLAN tag");
+       }
+
+       if (pkt_q->timestamp_offloading) {
+               *RTE_MBUF_DYNFIELD(mbuf, timestamp_dynfield_offset,
+                       rte_mbuf_timestamp_t *) =
+                               (uint64_t)ppd->tp_sec * 1000000000 + 
ppd->tp_nsec;
+               mbuf->ol_flags |= timestamp_dynflag;
+       }
+
+       mbuf->port = pkt_q->in_port;
+       *mbuf_out = mbuf;
+       *rx_bytes += mbuf->pkt_len; /* accumulate into the caller's byte counter */
+       ppd->tp_status = TP_STATUS_KERNEL; /* release the frame back to the kernel */
+
+       return 0;
+}
+
+/*
+ * Receive packets using VPP-style single/dual/quad loop pattern with 
prefetching.
+ */
+static uint16_t
+eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
        struct pkt_rx_queue *pkt_q = queue;
+       struct tpacket2_hdr *ppd0, *ppd1, *ppd2, *ppd3;
        uint16_t num_rx = 0;
        unsigned long num_rx_bytes = 0;
        unsigned int framecount, framenum;
+       uint16_t n_left;
 
        if (unlikely(nb_pkts == 0))
                return 0;
 
-       /*
-        * Reads the given number of packets from the AF_PACKET socket one by
-        * one and copies the packet data into a newly allocated mbuf.
-        */
        framecount = pkt_q->framecount;
        framenum = pkt_q->framenum;
-       for (i = 0; i < nb_pkts; i++) {
-               /* point at the next incoming frame */
-               ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;
-               if ((ppd->tp_status & TP_STATUS_USER) == 0)
+       n_left = nb_pkts;
+
+       /* Quad loop: Process 4 packets at a time with prefetching */
+       while (n_left >= 4) {
+               ppd0 = af_packet_get_frame(pkt_q, framenum);
+               ppd1 = af_packet_get_frame(pkt_q, framenum + 1);
+               ppd2 = af_packet_get_frame(pkt_q, framenum + 2);
+               ppd3 = af_packet_get_frame(pkt_q, framenum + 3);
+
+               if ((ppd0->tp_status & TP_STATUS_USER) == 0)
                        break;
+               if ((ppd1->tp_status & TP_STATUS_USER) == 0)
+                       goto dual_loop;
+               if ((ppd2->tp_status & TP_STATUS_USER) == 0)
+                       goto dual_loop;
+               if ((ppd3->tp_status & TP_STATUS_USER) == 0)
+                       goto dual_loop;
+
+               /* Prefetch next 4 frame headers */
+               rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 4));
+               rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 5));
+               rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 6));
+               rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 7));
+
+               /* Prefetch packet data */
+               rte_prefetch0((uint8_t *)ppd0 + ppd0->tp_mac);
+               rte_prefetch0((uint8_t *)ppd1 + ppd1->tp_mac);
+               rte_prefetch0((uint8_t *)ppd2 + ppd2->tp_mac);
+               rte_prefetch0((uint8_t *)ppd3 + ppd3->tp_mac);
+
+               if (unlikely(af_packet_rx_one(pkt_q, ppd0, &bufs[num_rx], 
&num_rx_bytes) < 0))
+                       goto out;
+               num_rx++;
+               if (unlikely(af_packet_rx_one(pkt_q, ppd1, &bufs[num_rx], 
&num_rx_bytes) < 0))
+                       goto out_advance1;
+               num_rx++;
+               if (unlikely(af_packet_rx_one(pkt_q, ppd2, &bufs[num_rx], 
&num_rx_bytes) < 0))
+                       goto out_advance2;
+               num_rx++;
+               if (unlikely(af_packet_rx_one(pkt_q, ppd3, &bufs[num_rx], 
&num_rx_bytes) < 0))
+                       goto out_advance3;
+               num_rx++;
 
-               /* allocate the next mbuf */
-               mbuf = rte_pktmbuf_alloc(pkt_q->mb_pool);
-               if (unlikely(mbuf == NULL)) {
-                       pkt_q->rx_nombuf++;
+               framenum += 4;
+               if (framenum >= framecount)
+                       framenum -= framecount;
+               n_left -= 4;
+       }
+
+dual_loop:
+       /* Dual loop: Process 2 packets at a time */
+       while (n_left >= 2) {
+               ppd0 = af_packet_get_frame(pkt_q, framenum);
+               ppd1 = af_packet_get_frame(pkt_q, framenum + 1);
+
+               if ((ppd0->tp_status & TP_STATUS_USER) == 0)
                        break;
-               }
+               if ((ppd1->tp_status & TP_STATUS_USER) == 0)
+                       goto single_loop;
 
-               /* packet will fit in the mbuf, go ahead and receive it */
-               rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf) = 
ppd->tp_snaplen;
-               pbuf = (uint8_t *) ppd + ppd->tp_mac;
-               memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, 
rte_pktmbuf_data_len(mbuf));
+               rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 2));
+               rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 3));
+               rte_prefetch0((uint8_t *)ppd0 + ppd0->tp_mac);
+               rte_prefetch0((uint8_t *)ppd1 + ppd1->tp_mac);
 
-               /* check for vlan info */
-               if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
-                       mbuf->vlan_tci = ppd->tp_vlan_tci;
-                       mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | 
RTE_MBUF_F_RX_VLAN_STRIPPED);
+               if (unlikely(af_packet_rx_one(pkt_q, ppd0, &bufs[num_rx], 
&num_rx_bytes) < 0))
+                       goto out;
+               num_rx++;
+               if (unlikely(af_packet_rx_one(pkt_q, ppd1, &bufs[num_rx], 
&num_rx_bytes) < 0))
+                       goto out_advance1;
+               num_rx++;
 
-                       if (!pkt_q->vlan_strip && rte_vlan_insert(&mbuf))
-                               PMD_LOG(ERR, "Failed to reinsert VLAN tag");
-               }
+               framenum += 2;
+               if (framenum >= framecount)
+                       framenum -= framecount;
+               n_left -= 2;
+       }
 
-               /* add kernel provided timestamp when offloading is enabled */
-               if (pkt_q->timestamp_offloading) {
-                       /* since TPACKET_V2 timestamps are provided in 
nanoseconds resolution */
-                       *RTE_MBUF_DYNFIELD(mbuf, timestamp_dynfield_offset,
-                               rte_mbuf_timestamp_t *) =
-                                       (uint64_t)ppd->tp_sec * 1000000000 + 
ppd->tp_nsec;
+single_loop:
+       /* Single loop: Process remaining packets */
+       while (n_left >= 1) {
+               ppd0 = af_packet_get_frame(pkt_q, framenum);
 
-                       mbuf->ol_flags |= timestamp_dynflag;
-               }
+               if ((ppd0->tp_status & TP_STATUS_USER) == 0)
+                       break;
 
-               /* release incoming frame and advance ring buffer */
-               ppd->tp_status = TP_STATUS_KERNEL;
-               if (++framenum >= framecount)
-                       framenum = 0;
-               mbuf->port = pkt_q->in_port;
+               rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 1));
+               rte_prefetch0((uint8_t *)ppd0 + ppd0->tp_mac);
 
-               /* account for the receive frame */
-               bufs[i] = mbuf;
+               if (unlikely(af_packet_rx_one(pkt_q, ppd0, &bufs[num_rx], 
&num_rx_bytes) < 0))
+                       goto out;
                num_rx++;
-               num_rx_bytes += mbuf->pkt_len;
+
+               if (++framenum >= framecount)
+                       framenum = 0;
+               n_left--;
        }
+
+       goto out;
+
+out_advance3:
+       framenum += 3;
+       if (framenum >= framecount)
+               framenum -= framecount;
+       goto out;
+out_advance2:
+       framenum += 2;
+       if (framenum >= framecount)
+               framenum -= framecount;
+       goto out;
+out_advance1:
+       framenum += 1;
+       if (framenum >= framecount)
+               framenum -= framecount;
+out:
        pkt_q->framenum = framenum;
        pkt_q->rx_pkts += num_rx;
        pkt_q->rx_bytes += num_rx_bytes;
-- 
2.51.0

Reply via email to