This commit implements TCP segmentation offload (TSO) in the TAP PMD.
The DPDK rte_gso library is used to segment large TCP payloads (e.g. 64K
bytes) into smaller MTU-sized buffers.
By supporting the TSO offload capability in software, a TAP device can be
used as a fail-safe sub-device and be paired with another PCI device which
supports the TSO capability in HW.

For more details on the DPDK librte_gso implementation, please refer to
the DPDK documentation.
The number of newly generated TSO segments is limited to 64.

Signed-off-by: Ophir Munk <ophi...@mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +-
 drivers/net/tap/rte_eth_tap.c | 157 ++++++++++++++++++++++++++++++++----------
 drivers/net/tap/rte_eth_tap.h |   4 ++
 3 files changed, 126 insertions(+), 37 deletions(-)

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ccc5c5f..3243365 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -24,7 +24,7 @@ CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
-LDLIBS += -lrte_bus_vdev
+LDLIBS += -lrte_bus_vdev -lrte_gso
 
 CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
 
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index f312084..4dda100 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -473,40 +473,37 @@ tap_tx_offload(char *packet, uint64_t ol_flags, unsigned 
int l2_len,
        }
 }
 
-/* Callback to handle sending packets from the tap interface
- */
-static uint16_t
-pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+static void
+tap_mbuf_pool_create(struct rte_mempool **mp)
 {
-       struct tx_queue *txq = queue;
-       uint16_t num_tx = 0;
-       unsigned long num_tx_bytes = 0;
-       uint32_t max_size;
-       int i;
+       *mp = NULL; /* TODO - create mp */
+}
 
-       if (unlikely(nb_pkts == 0))
-               return 0;
+static inline void
+tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
+                       struct rte_mbuf **pmbufs,
+                       uint16_t *num_packets, unsigned long *num_tx_bytes)
+{
+       int i;
 
-       max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
-       for (i = 0; i < nb_pkts; i++) {
-               struct rte_mbuf *mbuf = bufs[num_tx];
-               struct iovec iovecs[mbuf->nb_segs + 1];
+       for (i = 0; i < num_mbufs; i++) {
+               struct rte_mbuf *mbuf = pmbufs[i];
+               struct iovec iovecs[mbuf->nb_segs + 2];
                struct tun_pi pi = { .flags = 0 };
                struct rte_mbuf *seg = mbuf;
                char m_copy[mbuf->data_len];
                int n;
                int j;
-               int k; /* first index in iovecs for copying segments */
+               int k; /* current index in iovecs for copying segments */
                uint16_t l234_len; /* length of layers 2,3,4 headers */
                uint16_t seg_len; /* length of first segment */
+               uint16_t nb_segs;
 
-               /* stats.errs will be incremented */
-               if (rte_pktmbuf_pkt_len(mbuf) > max_size)
-                       break;
-
-               iovecs[0].iov_base = &pi;
-               iovecs[0].iov_len = sizeof(pi);
-               k = 1;
+               k = 0;
+               iovecs[k].iov_base = &pi;
+               iovecs[k].iov_len = sizeof(pi);
+               k++;
+               nb_segs = mbuf->nb_segs;
                if (txq->csum &&
                    ((mbuf->ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4) ||
                     (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM ||
@@ -523,39 +520,99 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, 
uint16_t nb_pkts)
                        /* To change checksums, work on a
                         * copy of l2, l3 l4 headers.
                         */
-                       rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
-                                       l234_len);
+                       rte_memcpy(m_copy,
+                               rte_pktmbuf_mtod(mbuf, void *), l234_len);
                        tap_tx_offload(m_copy, mbuf->ol_flags,
                                       mbuf->l2_len, mbuf->l3_len);
-                       iovecs[1].iov_base = m_copy;
-                       iovecs[1].iov_len = l234_len;
+                       iovecs[k].iov_base = m_copy;
+                       iovecs[k].iov_len = l234_len;
                        k++;
+
                        /* Adjust data pointer beyond l2, l3, l4 headers.
                         * If this segment becomes empty - skip it
                         */
                        if (seg_len > l234_len) {
-                               rte_pktmbuf_adj(mbuf, l234_len);
-                       } else {
-                               seg = seg->next;
-                               mbuf->nb_segs--;
+                               iovecs[k].iov_len = seg_len - l234_len;
+                               iovecs[k].iov_base =
+                                       rte_pktmbuf_mtod(seg, char *) +
+                                               l234_len;
+                               k++;
+                       } else { /* seg_len == l234_len */
+                               nb_segs--;
                        }
+
+                       seg = seg->next;
                }
-               for (j = k; j <= mbuf->nb_segs; j++) {
+               for (j = k; j <= nb_segs; j++) {
                        iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
                        iovecs[j].iov_base = rte_pktmbuf_mtod(seg, void *);
                        seg = seg->next;
                }
                /* copy the tx frame data */
-               n = writev(txq->fd, iovecs, mbuf->nb_segs + 1);
+               n = writev(txq->fd, iovecs, j);
                if (n <= 0)
                        break;
+               (*num_packets)++;
+               (*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
+       }
+}
 
+/* Callback to handle sending packets from the tap interface
+ */
+static uint16_t
+pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+       struct tx_queue *txq = queue;
+       uint16_t num_tx = 0;
+       uint16_t num_packets = 0;
+       unsigned long num_tx_bytes = 0;
+       uint32_t max_size;
+       int i;
+       uint64_t tso;
+       int ret;
+
+       if (unlikely(nb_pkts == 0))
+               return 0;
+
+       struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
+       max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
+       for (i = 0; i < nb_pkts; i++) {
+               struct rte_mbuf *mbuf_in = bufs[num_tx];
+               struct rte_mbuf **mbuf;
+               uint16_t num_mbufs;
+
+               tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
+               if (tso) {
+                       struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
+                       /* gso size is calculated without ETHER_CRC_LEN */
+                       gso_ctx->gso_size = *txq->mtu + ETHER_HDR_LEN;
+                       ret = rte_gso_segment(mbuf_in, /* packet to segment */
+                               gso_ctx, /* gso control block */
+                               (struct rte_mbuf **)&gso_mbufs, /* out mbufs */
+                               RTE_DIM(gso_mbufs)); /* max tso mbufs */
+
+                       /* ret contains the number of new created mbufs */
+                       if (ret < 0)
+                               break;
+
+                       mbuf = gso_mbufs;
+                       num_mbufs = ret;
+               } else {
+                       /* stats.errs will be incremented */
+                       if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
+                               break;
+
+                       mbuf = &mbuf_in;
+                       num_mbufs = 1;
+               }
+
+               tap_write_mbufs(txq, num_mbufs, mbuf,
+                               &num_packets, &num_tx_bytes);
                num_tx++;
-               num_tx_bytes += mbuf->pkt_len;
-               rte_pktmbuf_free(mbuf);
+               rte_pktmbuf_free(mbuf_in);
        }
 
-       txq->stats.opackets += num_tx;
+       txq->stats.opackets += num_packets;
        txq->stats.errs += nb_pkts - num_tx;
        txq->stats.obytes += num_tx_bytes;
 
@@ -996,11 +1053,35 @@ tap_mac_set(struct rte_eth_dev *dev, struct ether_addr 
*mac_addr)
 }
 
 static int
+tap_init_gso_ctx(struct tx_queue *tx)
+{
+       uint32_t gso_types;
+
+       /* Create private mbuf pool with 128 bytes size
+        * use this pool for both direct and indirect mbufs
+        */
+       struct rte_mempool *mp;      /* Mempool for TX/GSO packets */
+       tap_mbuf_pool_create(&mp); /* tx->mp or maybe embedded in gso_ctx */
+
+       /* initialize GSO context */
+       gso_types = DEV_TX_OFFLOAD_TCP_TSO | DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+               DEV_TX_OFFLOAD_GRE_TNL_TSO;
+       tx->gso_ctx.direct_pool = mp;
+       tx->gso_ctx.indirect_pool = mp;
+       tx->gso_ctx.gso_types = gso_types;
+       tx->gso_ctx.gso_size = ETHER_MAX_LEN - ETHER_CRC_LEN;
+       tx->gso_ctx.flag = 0;
+
+       return 0;
+}
+
+static int
 tap_setup_queue(struct rte_eth_dev *dev,
                struct pmd_internals *internals,
                uint16_t qid,
                int is_rx)
 {
+       int ret;
        int *fd;
        int *other_fd;
        const char *dir;
@@ -1048,6 +1129,10 @@ tap_setup_queue(struct rte_eth_dev *dev,
        tx->mtu = &dev->data->mtu;
        rx->rxmode = &dev->data->dev_conf.rxmode;
 
+       ret = tap_init_gso_ctx(tx);
+       if (ret)
+               return -1;
+
        return *fd;
 }
 
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 53a506a..65da5f8 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -15,6 +15,7 @@
 
 #include <rte_ethdev_driver.h>
 #include <rte_ether.h>
+#include <rte_gso.h>
 
 #ifdef IFF_MULTI_QUEUE
 #define RTE_PMD_TAP_MAX_QUEUES TAP_MAX_QUEUES
@@ -22,6 +23,8 @@
 #define RTE_PMD_TAP_MAX_QUEUES 1
 #endif
 
+#define MAX_GSO_MBUFS 64
+
 struct pkt_stats {
        uint64_t opackets;              /* Number of output packets */
        uint64_t ipackets;              /* Number of input packets */
@@ -50,6 +53,7 @@ struct tx_queue {
        uint16_t *mtu;                  /* Pointer to MTU from dev_data */
        uint16_t csum:1;                /* Enable checksum offloading */
        struct pkt_stats stats;         /* Stats for this TX queue */
+       struct rte_gso_ctx gso_ctx;     /* GSO context */
 };
 
 struct pmd_internals {
-- 
2.7.4

Reply via email to