From: Vignesh PS <vignesh.purushotham.srini...@ericsson.com>

af_packet PMD uses system calls to transmit packets. Separate the
transmit function into two different calls so it's possible to avoid
syscalls during transmit.

Signed-off-by: Vignesh PS <vignesh.purushotham.srini...@ericsson.com>
---
 .mailmap                                  |  1 +
 doc/guides/nics/af_packet.rst             | 26 ++++++-
 drivers/net/af_packet/rte_eth_af_packet.c | 90 ++++++++++++++++++++++-
 3 files changed, 110 insertions(+), 7 deletions(-)

diff --git a/.mailmap b/.mailmap
index 4a508bafad..5e9462b7cd 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1548,6 +1548,7 @@ Viacheslav Ovsiienko <viachesl...@nvidia.com> 
<viachesl...@mellanox.com>
 Victor Kaplansky <vict...@redhat.com>
 Victor Raj <victor....@intel.com>
 Vidya Sagar Velumuri <vvelum...@marvell.com>
+Vignesh PS <vignesh.purushotham.srini...@ericsson.com>
 Vignesh Sridhar <vignesh.srid...@intel.com>
 Vijayakumar Muthuvel Manickam <mmvi...@gmail.com>
 Vijaya Mohan Guvva <vijay1...@gmail.com>
diff --git a/doc/guides/nics/af_packet.rst b/doc/guides/nics/af_packet.rst
index 66b977e1a2..fe92ef231f 100644
--- a/doc/guides/nics/af_packet.rst
+++ b/doc/guides/nics/af_packet.rst
@@ -29,6 +29,7 @@ Some of these, in turn, will be used to configure the 
PACKET_MMAP settings.
 *   ``framesz`` - PACKET_MMAP frame size (optional, default 2048B; Note: 
multiple
     of 16B);
 *   ``framecnt`` - PACKET_MMAP frame count (optional, default 512).
+*   ``explicit_flush`` - enable two-stage packet transmit (optional, default 0).
 
 Because this implementation is based on PACKET_MMAP, and PACKET_MMAP has its
 own pre-requisites, it should be noted that the inner workings of PACKET_MMAP
@@ -39,6 +40,9 @@ As an example, if one changes ``framesz`` to be 1024B, it is 
expected that
 ``blocksz`` is set to at least 1024B as well (although 2048B in this case would
 allow two "frames" per "block").
 
+When ``explicit_flush`` is enabled, the PMD temporarily buffers mbufs in a
+ring buffer until ``rte_eth_tx_done_cleanup()`` is called on the TX queue.
+
 This restriction happens because PACKET_MMAP expects each single "frame" to fit
 inside of a "block". And although multiple "frames" can fit inside of a single
 "block", a "frame" may not span across two "blocks".
@@ -64,11 +68,25 @@ framecnt=512):
 
 .. code-block:: console
 
-    
--vdev=eth_af_packet0,iface=tap0,blocksz=4096,framesz=2048,framecnt=512,qpairs=1,qdisc_bypass=0
+    
--vdev=eth_af_packet0,iface=tap0,blocksz=4096,framesz=2048,framecnt=512,qpairs=1,qdisc_bypass=0,explicit_flush=1
 
 Features and Limitations
 ------------------------
 
-The PMD will re-insert the VLAN tag transparently to the packet if the kernel
-strips it, as long as the ``RTE_ETH_RX_OFFLOAD_VLAN_STRIP`` is not enabled by 
the
-application.
+* The PMD will re-insert the VLAN tag transparently to the packet if the kernel
+  strips it, as long as the ``RTE_ETH_RX_OFFLOAD_VLAN_STRIP`` is not enabled 
by the
+  application.
+* The PMD relies on the ``sendto()`` system call to transmit packets from the PACKET_MMAP socket.
+  This system call can cause head-of-line blocking. Hence, it's advantageous to buffer the
+  packets in the driver instead of immediately triggering packet transmits on calling
+  ``rte_eth_tx_burst()``. Therefore, the PMD splits the functionality of ``rte_eth_tx_burst()``
+  into two functional stages, where ``rte_eth_tx_burst()`` causes packets to be buffered
+  in the driver, and a subsequent call to ``rte_eth_tx_done_cleanup()`` triggers the actual
+  packet transmits. With such a disaggregated PMD design, it is possible to call
+  ``rte_eth_tx_burst()`` on workers and trigger transmits (by calling
+  ``rte_eth_tx_done_cleanup()``) from a control plane worker and eliminate
+  head-of-line blocking.
+* To enable the two stage packet transmit, the PMD should be started with 
explicit_flush=1
+  (Default explicit_flush=0).
+* When calling ``rte_eth_tx_done_cleanup()`` the free_cnt parameter has no 
effect on how
+  many packets are flushed. The PMD will flush all the packets present in the 
buffer.
diff --git a/drivers/net/af_packet/rte_eth_af_packet.c 
b/drivers/net/af_packet/rte_eth_af_packet.c
index 6b7b16f348..cdbe43313a 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -36,9 +36,11 @@
 #define ETH_AF_PACKET_FRAMESIZE_ARG    "framesz"
 #define ETH_AF_PACKET_FRAMECOUNT_ARG   "framecnt"
 #define ETH_AF_PACKET_QDISC_BYPASS_ARG "qdisc_bypass"
+#define ETH_AF_PACKET_EXPLICIT_FLUSH_ARG       "explicit_flush"
 
 #define DFLT_FRAME_SIZE                (1 << 11)
 #define DFLT_FRAME_COUNT       (1 << 9)
+#define DFLT_FRAME_BURST       (32)
 
 struct __rte_cache_aligned pkt_rx_queue {
        int sockfd;
@@ -62,8 +64,10 @@ struct __rte_cache_aligned pkt_tx_queue {
 
        struct iovec *rd;
        uint8_t *map;
+       struct rte_ring *buf;
        unsigned int framecount;
        unsigned int framenum;
+       unsigned int explicit_flush;
 
        volatile unsigned long tx_pkts;
        volatile unsigned long err_pkts;
@@ -91,6 +95,7 @@ static const char *valid_arguments[] = {
        ETH_AF_PACKET_FRAMESIZE_ARG,
        ETH_AF_PACKET_FRAMECOUNT_ARG,
        ETH_AF_PACKET_QDISC_BYPASS_ARG,
+       ETH_AF_PACKET_EXPLICIT_FLUSH_ARG,
        NULL
 };
 
@@ -198,7 +203,7 @@ tx_ring_status_available(uint32_t tp_status)
  * Callback to handle sending packets through a real NIC.
  */
 static uint16_t
-eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+eth_af_packet_tx_internal(void *queue, struct rte_mbuf **bufs, uint16_t 
nb_pkts)
 {
        struct tpacket2_hdr *ppd;
        struct rte_mbuf *mbuf;
@@ -311,6 +316,59 @@ eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, 
uint16_t nb_pkts)
        return i;
 }
 
+/*
+ * Tx burst callback: enqueue to pkt_q->buf if explicit_flush is set, else call tx_internal.
+ */
+static uint16_t
+eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+       struct pkt_tx_queue *pkt_q = queue;
+
+       if (unlikely(nb_pkts == 0))
+               return 0;
+
+       if (pkt_q->explicit_flush)
+               return rte_ring_enqueue_burst(pkt_q->buf,
+                               (void **)bufs, nb_pkts, NULL);
+
+       return eth_af_packet_tx_internal(queue, bufs, nb_pkts);
+}
+
+/*
+ * Callback to flush previously buffered tx packets; free_cnt is unused. Returns pkts sent.
+ */
+static int
+eth_af_packet_tx_flush(void *queue, uint32_t free_cnt __rte_unused)
+{
+       uint16_t sent, nb_pkts;
+       uint16_t num_flushed = 0;
+
+       struct pkt_tx_queue *pkt_q = queue;
+
+       while (true) {
+               /* flush up to DFLT_FRAME_BURST buffered pkts every iteration */
+               struct rte_mbuf *bufs[DFLT_FRAME_BURST];
+               nb_pkts = rte_ring_dequeue_burst_start(pkt_q->buf,
+                                  (void **)bufs, DFLT_FRAME_BURST, NULL);
+
+               if (unlikely(nb_pkts == 0))
+                       break;
+
+               /* If packets are dropped internally by the function
+                * below, it is okay not to include them in the return
+                * value of this function because err_pkts is updated
+                * internally.
+                */
+               sent = eth_af_packet_tx_internal(queue, bufs, nb_pkts);
+               num_flushed +=  sent;
+
+               /* commit only the sent pkts; NOTE(review): confirm sent cannot stay 0 */
+               rte_ring_dequeue_finish(pkt_q->buf, sent);
+       }
+
+       return num_flushed;
+}
+
 static int
 eth_dev_start(struct rte_eth_dev *dev)
 {
@@ -637,6 +695,7 @@ static const struct eth_dev_ops ops = {
        .link_update = eth_link_update,
        .stats_get = eth_stats_get,
        .stats_reset = eth_stats_reset,
+       .tx_done_cleanup = eth_af_packet_tx_flush,
 };
 
 /*
@@ -668,6 +727,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
                        unsigned int framesize,
                        unsigned int framecnt,
                       unsigned int qdisc_bypass,
+                      unsigned int explicit_flush,
                        struct pmd_internals **internals,
                        struct rte_eth_dev **eth_dev,
                        struct rte_kvargs *kvlist)
@@ -885,6 +945,18 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
                        goto error;
                }
 
+               char buf_name[RTE_RING_NAMESIZE];
+               snprintf(buf_name, RTE_RING_NAMESIZE, "%s:txq%u", name, q);
+               tx_queue->buf = rte_ring_create(buf_name, tx_queue->framecount,
+                                 numa_node, RING_F_SP_ENQ | RING_F_SC_DEQ);
+               if (tx_queue->buf == NULL) {
+                       PMD_LOG(ERR,
+                               "%s: could not create ring buffer. err=%s",
+                               buf_name, rte_strerror(rte_errno));
+                       goto error;
+               }
+               tx_queue->explicit_flush = explicit_flush;
+
 #if defined(PACKET_FANOUT)
                rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT,
                                &fanout_arg, sizeof(fanout_arg));
@@ -962,6 +1034,7 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
        unsigned int framecount = DFLT_FRAME_COUNT;
        unsigned int qpairs = 1;
        unsigned int qdisc_bypass = 1;
+       unsigned int explicit_flush = 0;
 
        /* do some parameter checking */
        if (*sockfd < 0)
@@ -1024,6 +1097,16 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
                        }
                        continue;
                }
+               if (strstr(pair->key, ETH_AF_PACKET_EXPLICIT_FLUSH_ARG) != 
NULL) {
+                       explicit_flush = atoi(pair->value);
+                       if (explicit_flush > 1) {
+                               PMD_LOG(ERR,
+                                       "%s: invalid explicit_flush value",
+                                       name);
+                               return -1;
+                       }
+                       continue;
+               }
        }
 
        if (framesize > blocksize) {
@@ -1049,7 +1132,7 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
        if (rte_pmd_init_internals(dev, *sockfd, qpairs,
                                   blocksize, blockcount,
                                   framesize, framecount,
-                                  qdisc_bypass,
+                                  qdisc_bypass, explicit_flush,
                                   &internals, &eth_dev,
                                   kvlist) < 0)
                return -1;
@@ -1146,4 +1229,5 @@ RTE_PMD_REGISTER_PARAM_STRING(net_af_packet,
        "blocksz=<int> "
        "framesz=<int> "
        "framecnt=<int> "
-       "qdisc_bypass=<0|1>");
+       "qdisc_bypass=<0|1> "
+       "explicit_flush=<0|1>");
-- 
2.34.1

Reply via email to