Some of the NICs supported by DPDK can accelerate TCP traffic by using
segmentation offload: the application prepares a packet with a valid
TCP header and a size of up to 64K, and delegates the segmentation to
the NIC.

Implement the generic part of TCP segmentation offload in rte_mbuf.
This introduces two new fields in rte_mbuf: l4_len (length of the L4
header in bytes) and tso_segsz (the MSS of the output packets).
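
Since the new fields share a single 64-bit word (tx_offload, "combined
for easy fetch"), all TX offload metadata can be reset or copied in one
access, as rte_pktmbuf_reset() and rte_pktmbuf_attach() do in this
patch. A hypothetical helper sketching this (copy_tx_offload() is not
part of the patch):

  #include <rte_mbuf.h>

  /* Sketch: a single 64-bit access propagates l2_len, l3_len, l4_len,
   * tso_segsz and the inner lengths at once. */
  static inline void
  copy_tx_offload(struct rte_mbuf *dst, const struct rte_mbuf *src)
  {
          dst->tx_offload = src->tx_offload;
  }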

To delegate the TCP segmentation to the hardware, the user has to take
the following steps (see the sketch after this list):

- set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag implies
  PKT_TX_TCP_CKSUM)
- set the flag PKT_TX_IPV4 or PKT_TX_IPV6
- set PKT_TX_IP_CKSUM if it's IPv4, and set the IP checksum to 0 in
  the packet
- fill the mbuf offload information: l2_len, l3_len, l4_len, tso_segsz
- calculate the pseudo-header checksum without taking ip_len into
  account, and set it in the TCP header, for instance by using
  rte_ipv4_phdr_cksum(ip_hdr, ol_flags)
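
For illustration only (not part of this patch), a minimal sketch of
these steps for an already-built Ethernet/IPv4/TCP packet. The function
name prepare_tso_ipv4() and the mss parameter are hypothetical; the
headers are assumed to be contiguous in the first mbuf segment and the
IPv4 header to carry no options:

  #include <rte_ether.h>
  #include <rte_ip.h>
  #include <rte_tcp.h>
  #include <rte_mbuf.h>

  static void
  prepare_tso_ipv4(struct rte_mbuf *m, uint16_t mss)
  {
          struct ether_hdr *eth = rte_pktmbuf_mtod(m, struct ether_hdr *);
          struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);
          struct tcp_hdr *tcp = (struct tcp_hdr *)
                  ((char *)ip + sizeof(struct ipv4_hdr));

          /* request TSO; PKT_TX_TCP_SEG implies PKT_TX_TCP_CKSUM */
          m->ol_flags |= PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_IP_CKSUM;

          /* the NIC fills the IP checksum: it must be 0 in the packet */
          ip->hdr_checksum = 0;

          /* fill the mbuf offload information */
          m->l2_len = sizeof(struct ether_hdr);
          m->l3_len = sizeof(struct ipv4_hdr);
          m->l4_len = (tcp->data_off & 0xf0) >> 2; /* TCP hdr len in bytes */
          m->tso_segsz = mss;

          /* pseudo-header checksum, without the IP payload length */
          tcp->cksum = rte_ipv4_phdr_cksum(ip, m->ol_flags);
  }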

The API is inspired by the ixgbe hardware (the next commit adds support
for ixgbe), but it seems generic enough to be used for other
hardware/drivers in the future.

This commit also reworks the way l2_len and l3_len are used in the igb
and ixgbe drivers, as l2_l3_len is no longer available in the mbuf.

Signed-off-by: Mirek Walukiewicz <miroslaw.walukiewicz@intel.com>
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test-pmd/testpmd.c            |  2 +-
 examples/ipv4_multicast/main.c    |  2 +-
 lib/librte_mbuf/rte_mbuf.c        |  1 +
 lib/librte_mbuf/rte_mbuf.h        | 45 +++++++++++++++++++++++----------------
 lib/librte_net/rte_ip.h           | 39 +++++++++++++++++++++++++++------
 lib/librte_pmd_e1000/igb_rxtx.c   | 11 +++++++++-
 lib/librte_pmd_ixgbe/ixgbe_rxtx.c | 11 +++++++++-
 7 files changed, 82 insertions(+), 29 deletions(-)

diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 8a4190b..d2d127d 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -408,7 +408,7 @@ testpmd_mbuf_ctor(struct rte_mempool *mp,
        mb->ol_flags     = 0;
        mb->data_off     = RTE_PKTMBUF_HEADROOM;
        mb->nb_segs      = 1;
-       mb->l2_l3_len       = 0;
+       mb->tx_offload   = 0;
        mb->vlan_tci     = 0;
        mb->hash.rss     = 0;
 }
diff --git a/examples/ipv4_multicast/main.c b/examples/ipv4_multicast/main.c
index 590d11a..80c5140 100644
--- a/examples/ipv4_multicast/main.c
+++ b/examples/ipv4_multicast/main.c
@@ -302,7 +302,7 @@ mcast_out_pkt(struct rte_mbuf *pkt, int use_clone)
        /* copy metadata from source packet*/
        hdr->port = pkt->port;
        hdr->vlan_tci = pkt->vlan_tci;
-       hdr->l2_l3_len = pkt->l2_l3_len;
+       hdr->tx_offload = pkt->tx_offload;
        hdr->hash = pkt->hash;

        hdr->ol_flags = pkt->ol_flags;
diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index 9b57b3a..87c2963 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -241,6 +241,7 @@ const char *rte_get_tx_ol_flag_name(uint64_t mask)
        case PKT_TX_UDP_CKSUM: return "PKT_TX_UDP_CKSUM";
        case PKT_TX_IEEE1588_TMST: return "PKT_TX_IEEE1588_TMST";
        case PKT_TX_VXLAN_CKSUM: return "PKT_TX_VXLAN_CKSUM";
+       case PKT_TX_TCP_SEG: return "PKT_TX_TCP_SEG";
        default: return NULL;
        }
 }
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 832fe0a..04cbf41 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -2,6 +2,7 @@
  *   BSD LICENSE
  *
  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   Copyright 2014 6WIND S.A.
  *   All rights reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
@@ -132,6 +133,20 @@ extern "C" {

 #define PKT_TX_VLAN_PKT      (1ULL << 55) /**< TX packet is a 802.1q VLAN packet. */

+/**
+ * TCP segmentation offload. To enable this offload feature for a
+ * packet to be transmitted on hardware supporting TSO:
+ *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag implies
+ *    PKT_TX_TCP_CKSUM)
+ *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and write the IP checksum
+ *    to 0 in the packet
+ *  - fill the mbuf offload information: l2_len, l3_len, l4_len, tso_segsz
+ *  - calculate the pseudo header checksum without taking ip_len into account,
+ *    and set it in the TCP header. Refer to rte_ipv4_phdr_cksum() and
+ *    rte_ipv6_phdr_cksum() that can be used as helpers.
+ */
+#define PKT_TX_TCP_SEG       (1ULL << 49)
+
 /* Use final bit of flags to indicate a control mbuf */
 #define CTRL_MBUF_FLAG       (1ULL << 63) /**< Mbuf contains control data */

@@ -242,22 +257,18 @@ struct rte_mbuf {

        /* fields to support TX offloads */
        union {
-               uint16_t l2_l3_len; /**< combined l2/l3 lengths as single var */
+               uint64_t tx_offload;       /**< combined for easy fetch */
                struct {
-                       uint16_t l3_len:9;      /**< L3 (IP) Header Length. */
-                       uint16_t l2_len:7;      /**< L2 (MAC) Header Length. */
-               };
-       };
+                       uint64_t l2_len:7; /**< L2 (MAC) Header Length. */
+                       uint64_t l3_len:9; /**< L3 (IP) Header Length. */
+                       uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
+                       uint64_t tso_segsz:16; /**< TCP TSO segment size */

-       /* fields for TX offloading of tunnels */
-       union {
-               uint16_t inner_l2_l3_len;
-               /**< combined inner l2/l3 lengths as single var */
-               struct {
-                       uint16_t inner_l3_len:9;
-                       /**< inner L3 (IP) Header Length. */
-                       uint16_t inner_l2_len:7;
-                       /**< inner L2 (MAC) Header Length. */
+                       /* fields for TX offloading of tunnels */
+                       uint64_t inner_l3_len:9; /**< inner L3 (IP) Hdr Length. */
+                       uint64_t inner_l2_len:7; /**< inner L2 (MAC) Hdr Length. */
+
+                       /* uint64_t unused:8; */
                };
        };
 } __rte_cache_aligned;
@@ -609,8 +620,7 @@ static inline void rte_pktmbuf_reset(struct rte_mbuf *m)
 {
        m->next = NULL;
        m->pkt_len = 0;
-       m->l2_l3_len = 0;
-       m->inner_l2_l3_len = 0;
+       m->tx_offload = 0;
        m->vlan_tci = 0;
        m->nb_segs = 1;
        m->port = 0xff;
@@ -679,8 +689,7 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md)
        mi->data_len = md->data_len;
        mi->port = md->port;
        mi->vlan_tci = md->vlan_tci;
-       mi->l2_l3_len = md->l2_l3_len;
-       mi->inner_l2_l3_len = md->inner_l2_l3_len;
+       mi->tx_offload = md->tx_offload;
        mi->hash = md->hash;

        mi->next = NULL;
diff --git a/lib/librte_net/rte_ip.h b/lib/librte_net/rte_ip.h
index 387b06c..20c3ae1 100644
--- a/lib/librte_net/rte_ip.h
+++ b/lib/librte_net/rte_ip.h
@@ -81,6 +81,7 @@

 #include <rte_memcpy.h>
 #include <rte_byteorder.h>
+#include <rte_mbuf.h>

 #ifdef __cplusplus
 extern "C" {
@@ -312,13 +313,21 @@ rte_ipv4_cksum(const struct ipv4_hdr *ipv4_hdr)
  *
  * The checksum field must be set to 0 by the caller.
  *
+ * Depending on the ol_flags, the pseudo-header checksum expected by the
+ * drivers is not the same. For instance, when TSO is enabled, the IP
+ * payload length must not be included in the pseudo-header checksum.
+ *
+ * When ol_flags is 0, it computes the standard pseudo-header checksum.
+ *
  * @param ipv4_hdr
  *   The pointer to the contiguous IPv4 header.
+ * @param ol_flags
+ *   The ol_flags of the associated mbuf.
  * @return
  *   The non-complemented checksum to set in the L4 header.
  */
 static inline uint16_t
-rte_ipv4_phdr_cksum(const struct ipv4_hdr *ipv4_hdr)
+rte_ipv4_phdr_cksum(const struct ipv4_hdr *ipv4_hdr, uint64_t ol_flags)
 {
        struct ipv4_psd_header {
                uint32_t src_addr; /* IP address of source host. */
@@ -332,9 +341,13 @@ rte_ipv4_phdr_cksum(const struct ipv4_hdr *ipv4_hdr)
        psd_hdr.dst_addr = ipv4_hdr->dst_addr;
        psd_hdr.zero = 0;
        psd_hdr.proto = ipv4_hdr->next_proto_id;
-       psd_hdr.len = rte_cpu_to_be_16(
-               (uint16_t)(rte_be_to_cpu_16(ipv4_hdr->total_length)
-                       - sizeof(struct ipv4_hdr)));
+       if (ol_flags & PKT_TX_TCP_SEG) {
+               psd_hdr.len = 0;
+       } else {
+               psd_hdr.len = rte_cpu_to_be_16(
+                       (uint16_t)(rte_be_to_cpu_16(ipv4_hdr->total_length)
+                               - sizeof(struct ipv4_hdr)));
+       }
        return rte_raw_cksum((const char *)&psd_hdr, sizeof(psd_hdr));
 }

@@ -361,7 +374,7 @@ rte_ipv4_udptcp_cksum(const struct ipv4_hdr *ipv4_hdr, const void *l4_hdr)
                sizeof(struct ipv4_hdr);

        cksum = rte_raw_cksum(l4_hdr, l4_len);
-       cksum += rte_ipv4_phdr_cksum(ipv4_hdr);
+       cksum += rte_ipv4_phdr_cksum(ipv4_hdr, 0);

        cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
        cksum = (~cksum) & 0xffff;
@@ -386,13 +399,21 @@ struct ipv6_hdr {
 /**
  * Process the pseudo-header checksum of an IPv6 header.
  *
+ * Depending on the ol_flags, the pseudo-header checksum expected by the
+ * drivers is not the same. For instance, when TSO is enabled, the IPv6
+ * payload length must not be included in the pseudo-header checksum.
+ *
+ * When ol_flags is 0, it computes the standard pseudo-header checksum.
+ *
  * @param ipv6_hdr
  *   The pointer to the contiguous IPv6 header.
+ * @param ol_flags
+ *   The ol_flags of the associated mbuf.
  * @return
  *   The non-complemented checksum to set in the L4 header.
  */
 static inline uint16_t
-rte_ipv6_phdr_cksum(const struct ipv6_hdr *ipv6_hdr)
+rte_ipv6_phdr_cksum(const struct ipv6_hdr *ipv6_hdr, uint64_t ol_flags)
 {
        struct ipv6_psd_header {
                uint8_t src_addr[16]; /* IP address of source host. */
@@ -404,7 +425,11 @@ rte_ipv6_phdr_cksum(const struct ipv6_hdr *ipv6_hdr)
        rte_memcpy(&psd_hdr.src_addr, ipv6_hdr->src_addr,
                sizeof(ipv6_hdr->src_addr) + sizeof(ipv6_hdr->dst_addr));
        psd_hdr.proto = (ipv6_hdr->proto << 24);
-       psd_hdr.len = ipv6_hdr->payload_len;
+       if (ol_flags & PKT_TX_TCP_SEG) {
+               psd_hdr.len = 0;
+       } else {
+               psd_hdr.len = ipv6_hdr->payload_len;
+       }

        return rte_raw_cksum((const char *)&psd_hdr, sizeof(psd_hdr));
 }
diff --git a/lib/librte_pmd_e1000/igb_rxtx.c b/lib/librte_pmd_e1000/igb_rxtx.c
index 433c616..848d5d1 100644
--- a/lib/librte_pmd_e1000/igb_rxtx.c
+++ b/lib/librte_pmd_e1000/igb_rxtx.c
@@ -367,6 +367,13 @@ eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
        struct rte_mbuf     *tx_pkt;
        struct rte_mbuf     *m_seg;
        union igb_vlan_macip vlan_macip_lens;
+       union {
+               uint16_t u16;
+               struct {
+                       uint16_t l3_len:9;
+                       uint16_t l2_len:7;
+               };
+       } l2_l3_len;
        uint64_t buf_dma_addr;
        uint32_t olinfo_status;
        uint32_t cmd_type_len;
@@ -404,8 +411,10 @@ eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
                tx_last = (uint16_t) (tx_id + tx_pkt->nb_segs - 1);

                ol_flags = tx_pkt->ol_flags;
+               l2_l3_len.l2_len = tx_pkt->l2_len;
+               l2_l3_len.l3_len = tx_pkt->l3_len;
                vlan_macip_lens.f.vlan_tci = tx_pkt->vlan_tci;
-               vlan_macip_lens.f.l2_l3_len = tx_pkt->l2_l3_len;
+               vlan_macip_lens.f.l2_l3_len = l2_l3_len.u16;
                tx_ol_req = ol_flags & IGB_TX_OFFLOAD_MASK;

                /* If a Context Descriptor need be built . */
diff --git a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
index ca35db2..2df3385 100644
--- a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
+++ b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
@@ -546,6 +546,13 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
        struct rte_mbuf     *tx_pkt;
        struct rte_mbuf     *m_seg;
        union ixgbe_vlan_macip vlan_macip_lens;
+       union {
+               uint16_t u16;
+               struct {
+                       uint16_t l3_len:9;
+                       uint16_t l2_len:7;
+               };
+       } l2_l3_len;
        uint64_t buf_dma_addr;
        uint32_t olinfo_status;
        uint32_t cmd_type_len;
@@ -588,8 +595,10 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
                /* If hardware offload required */
                tx_ol_req = ol_flags & IXGBE_TX_OFFLOAD_MASK;
                if (tx_ol_req) {
+                       l2_l3_len.l2_len = tx_pkt->l2_len;
+                       l2_l3_len.l3_len = tx_pkt->l3_len;
                        vlan_macip_lens.f.vlan_tci = tx_pkt->vlan_tci;
-                       vlan_macip_lens.f.l2_l3_len = tx_pkt->l2_l3_len;
+                       vlan_macip_lens.f.l2_l3_len = l2_l3_len.u16;

                        /* If new context need be built or reuse the exist ctx. */
                        ctx = what_advctx_update(txq, tx_ol_req,
-- 
2.1.0
