This commit implements TCP segmentation offload (TSO) in the TAP PMD. The DPDK rte_gso library is used to segment large TCP payloads (e.g. 64K bytes) into smaller MTU-sized buffers. By supporting the TSO capability in software, a TAP device can be used as a fail-safe sub-device and be paired with another PCI device which supports TSO in hardware.

For more details on the DPDK librte_gso implementation, please refer to the DPDK documentation. The number of newly generated TSO segments is limited to 64 (MAX_GSO_MBUFS).

Signed-off-by: Ophir Munk <ophi...@mellanox.com>
---
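Note on the tap_mbuf_pool_create() TODO stub in the diff below: a minimal sketch of one possible implementation, assuming rte_pktmbuf_pool_create() with an illustrative pool name and sizing constants (a real implementation would need a unique pool name per device/queue). librte_gso output segments keep the copied headers in direct mbufs and reference the payload through indirect mbufs, so a small data room suffices:

    #include <rte_mbuf.h>
    #include <rte_mempool.h>

    #define GSO_MBUFS_NUM 2047        /* assumed pool size */
    #define GSO_MBUF_CACHE_SIZE 128   /* assumed per-lcore cache size */
    #define GSO_MBUF_SEG_SIZE (128 + RTE_PKTMBUF_HEADROOM) /* headers only */

    static void
    tap_mbuf_pool_create(struct rte_mempool **mp)
    {
        /* Small data room: GSO direct mbufs carry only packet headers */
        *mp = rte_pktmbuf_pool_create("tap_gso_pool", GSO_MBUFS_NUM,
                                      GSO_MBUF_CACHE_SIZE,
                                      0, /* no private area */
                                      GSO_MBUF_SEG_SIZE, SOCKET_ID_ANY);
    }

With such a pool, gso_ctx.direct_pool and gso_ctx.indirect_pool can both point to the same mempool, as tap_init_gso_ctx() in the diff already does.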
 drivers/net/tap/Makefile      |   2 +-
 drivers/net/tap/rte_eth_tap.c | 157 ++++++++++++++++++++++++++++++++----------
 drivers/net/tap/rte_eth_tap.h |   4 ++
 3 files changed, 126 insertions(+), 37 deletions(-)

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ccc5c5f..3243365 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -24,7 +24,7 @@ CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
-LDLIBS += -lrte_bus_vdev
+LDLIBS += -lrte_bus_vdev -lrte_gso
 
 CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index f312084..4dda100 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -473,40 +473,37 @@ tap_tx_offload(char *packet, uint64_t ol_flags, unsigned int l2_len,
 	}
 }
 
-/* Callback to handle sending packets from the tap interface
- */
-static uint16_t
-pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+static void
+tap_mbuf_pool_create(struct rte_mempool **mp)
 {
-	struct tx_queue *txq = queue;
-	uint16_t num_tx = 0;
-	unsigned long num_tx_bytes = 0;
-	uint32_t max_size;
-	int i;
+	*mp = NULL; /* TODO - create mp */
+}
 
-	if (unlikely(nb_pkts == 0))
-		return 0;
+static inline void
+tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
+		struct rte_mbuf **pmbufs,
+		uint16_t *num_packets, unsigned long *num_tx_bytes)
+{
+	int i;
 
-	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
-	for (i = 0; i < nb_pkts; i++) {
-		struct rte_mbuf *mbuf = bufs[num_tx];
-		struct iovec iovecs[mbuf->nb_segs + 1];
+	for (i = 0; i < num_mbufs; i++) {
+		struct rte_mbuf *mbuf = pmbufs[i];
+		struct iovec iovecs[mbuf->nb_segs + 2];
 		struct tun_pi pi = { .flags = 0 };
 		struct rte_mbuf *seg = mbuf;
 		char m_copy[mbuf->data_len];
 		int n;
 		int j;
-		int k; /* first index in iovecs for copying segments */
+		int k; /* current index in iovecs for copying segments */
 		uint16_t l234_len; /* length of layers 2,3,4 headers */
 		uint16_t seg_len; /* length of first segment */
+		uint16_t nb_segs;
 
-		/* stats.errs will be incremented */
-		if (rte_pktmbuf_pkt_len(mbuf) > max_size)
-			break;
-
-		iovecs[0].iov_base = &pi;
-		iovecs[0].iov_len = sizeof(pi);
-		k = 1;
+		k = 0;
+		iovecs[k].iov_base = &pi;
+		iovecs[k].iov_len = sizeof(pi);
+		k++;
+		nb_segs = mbuf->nb_segs;
 		if (txq->csum &&
 		    ((mbuf->ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4) ||
 		     (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM ||
@@ -523,39 +520,99 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 			/* To change checksums, work on a
 			 * copy of l2, l3 l4 headers.
 			 */
-			rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
-					l234_len);
+			rte_memcpy(m_copy,
+				rte_pktmbuf_mtod(mbuf, void *), l234_len);
 			tap_tx_offload(m_copy, mbuf->ol_flags,
 				       mbuf->l2_len, mbuf->l3_len);
-			iovecs[1].iov_base = m_copy;
-			iovecs[1].iov_len = l234_len;
+			iovecs[k].iov_base = m_copy;
+			iovecs[k].iov_len = l234_len;
 			k++;
+
+			/* Adjust data pointer beyond l2, l3, l4 headers.
+			 * If this segment becomes empty - skip it
+			 */
 			if (seg_len > l234_len) {
-				rte_pktmbuf_adj(mbuf, l234_len);
-			} else {
-				seg = seg->next;
-				mbuf->nb_segs--;
+				iovecs[k].iov_len = seg_len - l234_len;
+				iovecs[k].iov_base =
+					rte_pktmbuf_mtod(seg, char *) +
+					l234_len;
+				k++;
+			} else { /* seg_len == l234_len */
+				nb_segs--;
 			}
+
+			seg = seg->next;
 		}
-		for (j = k; j <= mbuf->nb_segs; j++) {
+		for (j = k; j <= nb_segs; j++) {
 			iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
 			iovecs[j].iov_base = rte_pktmbuf_mtod(seg, void *);
 			seg = seg->next;
 		}
+
 		/* copy the tx frame data */
-		n = writev(txq->fd, iovecs, mbuf->nb_segs + 1);
+		n = writev(txq->fd, iovecs, j);
 		if (n <= 0)
 			break;
+		(*num_packets)++;
+		(*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
+	}
+}
+
+/* Callback to handle sending packets from the tap interface
+ */
+static uint16_t
+pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct tx_queue *txq = queue;
+	uint16_t num_tx = 0;
+	uint16_t num_packets = 0;
+	unsigned long num_tx_bytes = 0;
+	uint32_t max_size;
+	int i;
+	uint64_t tso;
+	int ret;
+
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
+	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
+	for (i = 0; i < nb_pkts; i++) {
+		struct rte_mbuf *mbuf_in = bufs[num_tx];
+		struct rte_mbuf **mbuf;
+		uint16_t num_mbufs;
+
+		tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
+		if (tso) {
+			struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
+
+			/* gso size is calculated without ETHER_CRC_LEN */
+			gso_ctx->gso_size = *txq->mtu + ETHER_HDR_LEN;
+			ret = rte_gso_segment(mbuf_in, /* packet to segment */
+				gso_ctx, /* gso control block */
+				(struct rte_mbuf **)&gso_mbufs, /* out mbufs */
+				RTE_DIM(gso_mbufs)); /* max tso mbufs */
+
+			/* ret contains the number of new created mbufs */
+			if (ret < 0)
+				break;
+
+			mbuf = gso_mbufs;
+			num_mbufs = ret;
+		} else {
+			/* stats.errs will be incremented */
+			if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
+				break;
+
+			mbuf = &mbuf_in;
+			num_mbufs = 1;
+		}
+
+		tap_write_mbufs(txq, num_mbufs, mbuf,
+				&num_packets, &num_tx_bytes);
 		num_tx++;
-		num_tx_bytes += mbuf->pkt_len;
-		rte_pktmbuf_free(mbuf);
+		rte_pktmbuf_free(mbuf_in);
 	}
 
-	txq->stats.opackets += num_tx;
+	txq->stats.opackets += num_packets;
 	txq->stats.errs += nb_pkts - num_tx;
 	txq->stats.obytes += num_tx_bytes;
@@ -996,11 +1053,35 @@ tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
 }
 
 static int
+tap_init_gso_ctx(struct tx_queue *tx)
+{
+	uint32_t gso_types;
+
+	/* Create private mbuf pool with 128 bytes size
+	 * use this pool for both direct and indirect mbufs
+	 */
+	struct rte_mempool *mp; /* Mempool for TX/GSO packets */
+	tap_mbuf_pool_create(&mp); /* tx->mp or maybe embedded in gso_ctx */
+
+	/* initialize GSO context */
+	gso_types = DEV_TX_OFFLOAD_TCP_TSO | DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+		DEV_TX_OFFLOAD_GRE_TNL_TSO;
+	tx->gso_ctx.direct_pool = mp;
+	tx->gso_ctx.indirect_pool = mp;
+	tx->gso_ctx.gso_types = gso_types;
+	tx->gso_ctx.gso_size = ETHER_MAX_LEN - ETHER_CRC_LEN;
+	tx->gso_ctx.flag = 0;
+
+	return 0;
+}
+
+static int
 tap_setup_queue(struct rte_eth_dev *dev,
 		struct pmd_internals *internals,
 		uint16_t qid,
 		int is_rx)
 {
+	int ret;
 	int *fd;
 	int *other_fd;
 	const char *dir;
@@ -1048,6 +1129,10 @@ tap_setup_queue(struct rte_eth_dev *dev,
 	tx->mtu = &dev->data->mtu;
 	rx->rxmode = &dev->data->dev_conf.rxmode;
 
+	ret = tap_init_gso_ctx(tx);
+	if (ret)
+		return -1;
+
 	return *fd;
 }
 
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 53a506a..65da5f8 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -15,6 +15,7 @@
 
 #include <rte_ethdev_driver.h>
 #include <rte_ether.h>
+#include <rte_gso.h>
 
 #ifdef IFF_MULTI_QUEUE
 #define RTE_PMD_TAP_MAX_QUEUES	TAP_MAX_QUEUES
@@ -22,6 +23,8 @@
 #define RTE_PMD_TAP_MAX_QUEUES	1
 #endif
 
+#define MAX_GSO_MBUFS 64
+
 struct pkt_stats {
 	uint64_t opackets;              /* Number of output packets */
 	uint64_t ipackets;              /* Number of input packets */
@@ -50,6 +53,7 @@ struct tx_queue {
 	uint16_t *mtu;                  /* Pointer to MTU from dev_data */
 	uint16_t csum:1;                /* Enable checksum offloading */
 	struct pkt_stats stats;         /* Stats for this TX queue */
+	struct rte_gso_ctx gso_ctx;     /* GSO context */
 };
 
 struct pmd_internals {
-- 
2.7.4
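As a usage illustration (not part of the patch): an application transmitting through this PMD would request DEV_TX_OFFLOAD_TCP_TSO at configuration time and mark each large mbuf for segmentation roughly as follows. The function, the MSS value, and the assumption of option-less headers are illustrative only, using the mbuf/ethdev API names of this DPDK release:

    #include <rte_ethdev.h>
    #include <rte_ip.h>
    #include <rte_mbuf.h>
    #include <rte_tcp.h>

    static void
    app_send_tso_packet(uint16_t port_id, uint16_t queue_id,
                        struct rte_mbuf *m)
    {
        /* Mark for TCP segmentation; pmd_tx_burst() above hands such
         * mbufs to rte_gso_segment().
         */
        m->ol_flags |= PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_IP_CKSUM;
        m->l2_len = sizeof(struct ether_hdr);
        m->l3_len = sizeof(struct ipv4_hdr);
        m->l4_len = sizeof(struct tcp_hdr); /* assumes no TCP options */
        m->tso_segsz = 1460;                /* illustrative MSS */

        rte_eth_tx_burst(port_id, queue_id, &m, 1);
    }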