inline

> -----Original Message-----
> From: Matan Azrad
> Sent: Monday, July 9, 2018 4:07 PM
> To: Mordechay Haimovsky <mo...@mellanox.com>; Adrien Mazarguil
> <adrien.mazarg...@6wind.com>
> Cc: dev@dpdk.org
> Subject: RE: [PATCH v5] net/mlx4: support hardware TSO
>
> Hi Moti
>
> Please see some comments below.
>
> From: Mordechay Haimovsky
> > Implement support for hardware TSO.
> >
> > Signed-off-by: Moti Haimovsky <mo...@mellanox.com>
> > ---
> > v5:
> > * Modification to the code according to review inputs from Matan
> >   Azrad.
> > * Code optimization to the TSO header copy routine.
> > * Rearranged the TSO data-segments creation routine.
> > in reply to
> > 1530715998-15703-1-git-send-email-mo...@mellanox.com
> >
> > v4:
> > * Bug fixes in filling TSO data segments.
> > * Modifications according to review inputs from Adrien Mazarguil
> >   and Matan Azrad.
> > in reply to
> > 1530190137-17848-1-git-send-email-mo...@mellanox.com
> >
> > v3:
> > * Fixed compilation errors in compilers without GNU C extensions
> >   caused by a declaration of zero-length array in the code.
> > in reply to
> > 1530187032-6489-1-git-send-email-mo...@mellanox.com
> >
> > v2:
> > * Fixed coding style warning.
> > in reply to
> > 1530184583-30166-1-git-send-email-mo...@mellanox.com
> >
> > v1:
> > * Fixed coding style warnings.
> > in reply to
> > 1530181779-19716-1-git-send-email-mo...@mellanox.com
> > ---
> >  doc/guides/nics/features/mlx4.ini |   1 +
> >  doc/guides/nics/mlx4.rst          |   3 +
> >  drivers/net/mlx4/Makefile         |   5 +
> >  drivers/net/mlx4/mlx4.c           |   9 +
> >  drivers/net/mlx4/mlx4.h           |   5 +
> >  drivers/net/mlx4/mlx4_prm.h       |  15 ++
> >  drivers/net/mlx4/mlx4_rxtx.c      | 372 +++++++++++++++++++++++++++++++++++++-
> >  drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
> >  drivers/net/mlx4/mlx4_txq.c       |   8 +-
> >  9 files changed, 416 insertions(+), 4 deletions(-)
> >
> > diff --git a/doc/guides/nics/features/mlx4.ini
> > b/doc/guides/nics/features/mlx4.ini
> > index f6efd21..98a3f61 100644
> > --- a/doc/guides/nics/features/mlx4.ini
> > +++ b/doc/guides/nics/features/mlx4.ini
> > @@ -13,6 +13,7 @@ Queue start/stop     = Y
> >  MTU update           = Y
> >  Jumbo frame          = Y
> >  Scattered Rx         = Y
> > +TSO                  = Y
> >  Promiscuous mode     = Y
> >  Allmulticast mode    = Y
> >  Unicast MAC filter   = Y
> > diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
> > index 491106a..12adaeb 100644
> > --- a/doc/guides/nics/mlx4.rst
> > +++ b/doc/guides/nics/mlx4.rst
> > @@ -142,6 +142,9 @@ Limitations
> >    The ability to enable/disable CRC stripping requires OFED version
> >    4.3-1.5.0.0 and above or rdma-core version v18 and above.
> >
> > +- TSO (Transmit Segmentation Offload) is supported in OFED version
> > +  4.4 and above or in rdma-core version v18 and above.
> > +
> >  Prerequisites
> >  -------------
> >
> > diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
> > index 73f9d40..63bc003 100644
> > --- a/drivers/net/mlx4/Makefile
> > +++ b/drivers/net/mlx4/Makefile
> > @@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
> >  mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
> >  	$Q $(RM) -f -- '$@'
> >  	$Q : > '$@'
> > +	$Q sh -- '$<' '$@' \
> > +		HAVE_IBV_MLX4_WQE_LSO_SEG \
> > +		infiniband/mlx4dv.h \
> > +		type 'struct mlx4_wqe_lso_seg' \
> > +		$(AUTOCONF_OUTPUT)
> >
> >  # Create mlx4_autoconf.h or update it in case it differs from the new one.
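
For reference, buildtools/auto-config-h.sh probes the system headers at
build time and emits a HAVE_* define into mlx4_autoconf.h when the probe
succeeds. A minimal sketch of the expected result (the exact generated
output is an assumption here) and of how mlx4_prm.h consumes it:

	/* mlx4_autoconf.h (generated), present only with rdma-core v18+: */
	#define HAVE_IBV_MLX4_WQE_LSO_SEG 1

	/* Consumer side, as done in mlx4_prm.h below: */
	#include "mlx4_autoconf.h"

	#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
	/* Fallback definition for rdma-core v17 and below. */
	struct mlx4_wqe_lso_seg {
		rte_be32_t mss_hdr_size;
		rte_be32_t header[];
	};
	#endif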
> >
> > diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
> > index d151a90..5d8c76d 100644
> > --- a/drivers/net/mlx4/mlx4.c
> > +++ b/drivers/net/mlx4/mlx4.c
> > @@ -677,6 +677,15 @@ struct mlx4_conf {
> >  					IBV_RAW_PACKET_CAP_SCATTER_FCS);
> >  		DEBUG("FCS stripping toggling is %ssupported",
> >  		      priv->hw_fcs_strip ? "" : "not ");
> > +		priv->tso =
> > +			((device_attr_ex.tso_caps.max_tso > 0) &&
> > +			 (device_attr_ex.tso_caps.supported_qpts &
> > +			  (1 << IBV_QPT_RAW_PACKET)));
> > +		if (priv->tso)
> > +			priv->tso_max_payload_sz =
> > +					device_attr_ex.tso_caps.max_tso;
> > +		DEBUG("TSO is %ssupported",
> > +		      priv->tso ? "" : "not ");
> >  		/* Configure the first MAC address by default. */
> >  		err = mlx4_get_mac(priv, &mac.addr_bytes);
> >  		if (err) {
> > diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
> > index 300cb4d..89d8c38 100644
> > --- a/drivers/net/mlx4/mlx4.h
> > +++ b/drivers/net/mlx4/mlx4.h
> > @@ -47,6 +47,9 @@
> >  /** Interrupt alarm timeout value in microseconds. */
> >  #define MLX4_INTR_ALARM_TIMEOUT 100000
> >
> > +/* Maximum packet headers size (L2+L3+L4) for TSO. */
> > +#define MLX4_MAX_TSO_HEADER 192
> > +
> >  /** Port parameter. */
> >  #define MLX4_PMD_PORT_KVARG "port"
> >
> > @@ -90,6 +93,8 @@ struct priv {
> >  	uint32_t hw_csum:1; /**< Checksum offload is supported. */
> >  	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
> >  	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
> > +	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
> > +	uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
> >  	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
> >  	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
> >  	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
> > diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
> > index b771d8c..aef77ba 100644
> > --- a/drivers/net/mlx4/mlx4_prm.h
> > +++ b/drivers/net/mlx4/mlx4_prm.h
> > @@ -19,6 +19,7 @@
> >  #ifdef PEDANTIC
> >  #pragma GCC diagnostic error "-Wpedantic"
> >  #endif
> > +#include "mlx4_autoconf.h"
> >
> >  /* ConnectX-3 Tx queue basic block. */
> >  #define MLX4_TXBB_SHIFT 6
> > @@ -40,6 +41,7 @@
> >  /* Work queue element (WQE) flags. */
> >  #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
> >  #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
> > +#define MLX4_WQE_CTRL_RR (1 << 6)
> >
> >  /* CQE checksum flags. */
> >  enum {
> > @@ -98,6 +100,19 @@ struct mlx4_cq {
> >  	int arm_sn; /**< Rx event counter. */
> >  };
> >
> > +#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
> > +/*
> > + * WQE LSO segment structure.
> > + * Defined here as backward compatibility for rdma-core v17 and below.
> > + * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
> > + * and above.
> > + */
> > +struct mlx4_wqe_lso_seg {
> > +	rte_be32_t mss_hdr_size;
> > +	rte_be32_t header[];
> > +};
> > +#endif
> > +
> >  /**
> >   * Retrieve a CQE entry from a CQ.
> >   *
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> > index 78b6dd5..b695539 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > @@ -38,10 +38,29 @@
> >   * DWORD (32 byte) of a TXBB.
> >   */
> >  struct pv {
> > -	volatile struct mlx4_wqe_data_seg *dseg;
> > +	union {
> > +		volatile struct mlx4_wqe_data_seg *dseg;
> > +		volatile uint32_t *dst;
> > +	};
> >  	uint32_t val;
> >  };
> >
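
A note on why the union works: byte_count is the first 32-bit word of
struct mlx4_wqe_data_seg, so whether an entry stashes a data-segment
pointer or a raw TXBB word pointer, a single flush loop can treat both as
"write val at dst". Sketch only (not part of the patch, variable names
hypothetical):

	/* Deferred byte_count of a data segment: */
	pv[cnt].dseg = dseg;
	pv[cnt++].val = byte_count;
	/* Deferred first word of a TXBB holding TSO header bytes: */
	pv[cnt].dst = (volatile uint32_t *)txbb_start;
	pv[cnt++].val = first_dword;
	/* One flush serves both, since byte_count is the first dword: */
	rte_io_wmb();
	for (--cnt; cnt >= 0; cnt--)
		*pv[cnt].dst = pv[cnt].val;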
> > +/** A helper structure for TSO packet handling. */
> > +struct tso_info {
> > +	/** Pointer to the array of saved first DWORD (32-bit) of a TXBB. */
> > +	struct pv *pv;
> > +	/** Current entry in the pv array. */
> > +	int pv_counter;
> > +	/** Total size of the WQE including padding. */
> > +	uint32_t wqe_size;
> > +	/** Size of TSO header to prepend to each packet to send. */
> > +	uint16_t tso_header_size;
> > +	/** Total size of the TSO segment in the WQE. */
> > +	uint16_t wqe_tso_seg_size;
> > +	/** Raw WQE size in units of 16 Bytes and without padding. */
> > +	uint8_t fence_size;
> > +};
> > +
> >  /** A table to translate Rx completion flags to packet type. */
> >  uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
> >  	/*
> > @@ -368,6 +387,345 @@ struct pv {
> >  }
> >
> >  /**
> > + * Obtain and calculate TSO information needed for assembling a TSO WQE.
> > + *
> > + * @param buf
> > + *   Pointer to the first packet mbuf.
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param tinfo
> > + *   Pointer to a structure to fill the info with.
> > + *
> > + * @return
> > + *   0 on success, negative value upon error.
> > + */
> > +static inline int
> > +mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
> > +			     struct txq *txq,
> > +			     struct tso_info *tinfo)
> > +{
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
> > +				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
> > +
> > +	tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
> > +	if (tunneled)
> > +		tinfo->tso_header_size +=
> > +				buf->outer_l2_len + buf->outer_l3_len;
> > +	if (unlikely(buf->tso_segsz == 0 ||
> > +		     tinfo->tso_header_size == 0 ||
> > +		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
> > +		     tinfo->tso_header_size > buf->data_len))
> > +		return -EINVAL;
> > +	/*
> > +	 * Calculate the WQE TSO segment size.
> > +	 * Note:
> > +	 * 1. An LSO segment must be padded such that the subsequent data
> > +	 *    segment is 16-byte aligned.
> > +	 * 2. The start address of the TSO segment is always 16-byte aligned.
> > +	 */
> > +	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
> > +					    tinfo->tso_header_size,
> > +					    sizeof(struct mlx4_wqe_data_seg));
> > +	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
> > +			      tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
> > +			      buf->nb_segs;
> > +	tinfo->wqe_size =
> > +		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
> > +			  MLX4_TXBB_SIZE);
> > +	/* Validate WQE size and WQE space in the send queue. */
> > +	if (sq->remain_size < tinfo->wqe_size ||
> > +	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
> > +		return -ENOMEM;
> > +	/* Init pv. */
> > +	tinfo->pv = (struct pv *)txq->bounce_buf;
> > +	tinfo->pv_counter = 0;
> > +	return 0;
> > +}
> > +
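
To make the size math above concrete, a worked example (illustration only,
assuming sizeof(struct mlx4_wqe_ctrl_seg) == 16, sizeof(struct
mlx4_wqe_lso_seg) == 4 and MLX4_SEG_SHIFT == 4) for a plain TCPv4 packet
with 3 mbuf segments:

	tso_header_size  = 14 + 20 + 20;          /* l2 + l3 + l4 = 54 */
	wqe_tso_seg_size = RTE_ALIGN(4 + 54, 16); /* 64, next dseg aligned */
	fence_size       = ((16 + 64) >> 4) + 3;  /* 5 + nb_segs = 8 (x16B) */
	wqe_size         = RTE_ALIGN(8 << 4, 64); /* 128, two full TXBBs */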
> > +/**
> > + * Fill the TSO WQE data segments with info on buffers to transmit.
> > + *
> > + * @param buf
> > + *   Pointer to the first packet mbuf.
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param tinfo
> > + *   Pointer to TSO info to use.
> > + * @param dseg
> > + *   Pointer to the first data segment in the TSO WQE.
> > + * @param ctrl
> > + *   Pointer to the control segment in the TSO WQE.
> > + *
> > + * @return
> > + *   Pointer to the next WQE control segment on success, NULL otherwise.
> > + */
> > +static inline volatile struct mlx4_wqe_ctrl_seg *
> > +mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
> > +			     struct txq *txq,
> > +			     struct tso_info *tinfo,
> > +			     volatile struct mlx4_wqe_data_seg *dseg,
> > +			     volatile struct mlx4_wqe_ctrl_seg *ctrl)
> > +{
> > +	uint32_t lkey;
> > +	int nb_segs = buf->nb_segs;
> > +	int nb_segs_txbb;
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	struct rte_mbuf *sbuf = buf;
> > +	struct pv *pv = tinfo->pv;
> > +	int *pv_counter = &tinfo->pv_counter;
> > +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
> > +			(volatile struct mlx4_wqe_ctrl_seg *)
> > +				((volatile uint8_t *)ctrl + tinfo->wqe_size);
> > +	uint16_t sb_of = tinfo->tso_header_size;
> > +	uint16_t data_len;
> > +
> > +	do {
> > +		/* How many dseg entries do we have in the current TXBB? */
> > +		nb_segs_txbb = (MLX4_TXBB_SIZE -
> > +				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
> > +			       MLX4_SEG_SHIFT;
> > +		switch (nb_segs_txbb) {
> > +		default:
> > +			/* Should never happen. */
> > +			rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
> > +				  (void *)txq, nb_segs_txbb);
> > +			/* rte_panic never returns. */
>
> Since this default case should not happen because of the above
> calculation, I think we don't need it.
> Just "break" if the compiler complains about the lack of a default case.

Although "default" is not mandatory in a switch statement, it is good
practice to have it, even if just for code clarity, so I will keep it there.
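
For reference, the reason the default case is unreachable: dseg is always
16-byte aligned within a 64-byte TXBB, so the calculation can only yield
4, 3, 2 or 1 (illustration only):

	uintptr_t off = (uintptr_t)dseg & (MLX4_TXBB_SIZE - 1); /* 0/16/32/48 */
	nb_segs_txbb = (MLX4_TXBB_SIZE - off) >> MLX4_SEG_SHIFT; /* 4/3/2/1 */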
> > +		case 4:
> > +			/* Memory region key for this memory pool. */
> > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > +			if (unlikely(lkey == (uint32_t)-1))
> > +				goto err;
> > +			dseg->addr =
> > +			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> > +								     uintptr_t,
> > +								     sb_of));
> > +			dseg->lkey = lkey;
> > +			/*
> > +			 * This data segment starts at the beginning of a new
> > +			 * TXBB, so we need to postpone its byte_count writing
> > +			 * for later.
> > +			 */
> > +			pv[*pv_counter].dseg = dseg;
> > +			/*
> > +			 * Zero length segment is treated as inline segment
> > +			 * with zero data.
> > +			 */
> > +			data_len = sbuf->data_len - sb_of;
>
> Is there a chance that data_len will be negative? Wrapped around in this
> case?

Since we verify beforehand that all the L2, L3 and L4 headers reside in the
same fragment, there is no reason for data_len to become negative. This is
why I use uint16_t, which is the same data type used in struct rte_mbuf for
representing data_len, and the same as we do in mlx4_tx_burst_segs.

> Maybe it is better to change it to int16_t and to replace the next check
> with:
> data_len > 0 ? data_len : 0x80000000

I will keep this the way it is for two reasons:
1. It seems to me more cumbersome than what I wrote.
2. Code-consistency-wise, this is how we also wrote it in
   mlx4_tx_burst_segs; what's good there is also good here.

> And I think I found a way to remove the sb_of calculations for each
> segment:
> Each segment will create the next segment's parameters, while only the
> pre-loop calculation for the first segment will take the header offset
> into account.
> The parameters: data_len and sb_of.
>
> So before the loop:
> sb_of = tinfo->tso_header_size;
> data_len = sbuf->data_len - sb_of;
>
> And inside the loop (after the check of nb_segs):
> sb_of = 0;
> data_len = sbuf->data_len; (of the next sbuf)
>
> So each segment calculates the next segment's parameters and we don't
> need the "- sb_of" calculation per segment.

NICE :)

> > +			pv[(*pv_counter)++].val =
> > +				rte_cpu_to_be_32(data_len ?
> > +						 data_len :
> > +						 0x80000000);
> > +			sb_of = 0;
> > +			sbuf = sbuf->next;
> > +			dseg++;
> > +			if (--nb_segs == 0)
> > +				return ctrl_next;
> > +			/* fallthrough */
> > +		case 3:
> > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > +			if (unlikely(lkey == (uint32_t)-1))
> > +				goto err;
> > +			data_len = sbuf->data_len - sb_of;
> > +			mlx4_fill_tx_data_seg(dseg,
> > +					      lkey,
> > +					      rte_pktmbuf_mtod_offset(sbuf,
> > +								      uintptr_t,
> > +								      sb_of),
> > +					      rte_cpu_to_be_32(data_len ?
> > +							       data_len :
> > +							       0x80000000));
> > +			sb_of = 0;
> > +			sbuf = sbuf->next;
> > +			dseg++;
> > +			if (--nb_segs == 0)
> > +				return ctrl_next;
> > +			/* fallthrough */
> > +		case 2:
> > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > +			if (unlikely(lkey == (uint32_t)-1))
> > +				goto err;
> > +			data_len = sbuf->data_len - sb_of;
> > +			mlx4_fill_tx_data_seg(dseg,
> > +					      lkey,
> > +					      rte_pktmbuf_mtod_offset(sbuf,
> > +								      uintptr_t,
> > +								      sb_of),
> > +					      rte_cpu_to_be_32(data_len ?
> > +							       data_len :
> > +							       0x80000000));
> > +			sb_of = 0;
> > +			sbuf = sbuf->next;
> > +			dseg++;
> > +			if (--nb_segs == 0)
> > +				return ctrl_next;
> > +			/* fallthrough */
> > +		case 1:
> > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > +			if (unlikely(lkey == (uint32_t)-1))
> > +				goto err;
> > +			data_len = sbuf->data_len - sb_of;
> > +			mlx4_fill_tx_data_seg(dseg,
> > +					      lkey,
> > +					      rte_pktmbuf_mtod_offset(sbuf,
> > +								      uintptr_t,
> > +								      sb_of),
> > +					      rte_cpu_to_be_32(data_len ?
> > +							       data_len :
> > +							       0x80000000));
> > +			sb_of = 0;
> > +			sbuf = sbuf->next;
> > +			dseg++;
> > +			if (--nb_segs == 0)
> > +				return ctrl_next;
> > +		}
> > +		/* Wrap dseg if it points at the end of the queue. */
> > +		if ((volatile uint8_t *)dseg >= sq->eob)
> > +			dseg = (volatile struct mlx4_wqe_data_seg *)
> > +				((volatile uint8_t *)dseg - sq->size);
> > +	} while (true);
> > +err:
> > +	return NULL;
> > +}
> > +
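
On the 0x80000000 value used above: bit 31 of byte_count marks an inline
data segment in ConnectX-3 WQEs, so an inline segment of size 0 stands in
for a zero-length scatter entry. A hypothetical helper naming the encoding
(sketch only, not part of the patch):

	static inline rte_be32_t
	mlx4_tso_byte_count(uint16_t data_len)
	{
		/* Zero length becomes an empty inline segment (bit 31). */
		return rte_cpu_to_be_32(data_len ? data_len : 0x80000000);
	}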
> > +/**
> > + * Write the packet's L2, L3 and L4 headers into the WQE.
> > + *
> > + * This header will be used for each TSO segment that is transmitted.
> > + *
> > + * @param buf
> > + *   Pointer to the first packet mbuf.
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param tinfo
> > + *   Pointer to TSO info to use.
> > + * @param ctrl
> > + *   Pointer to the control segment in the TSO WQE.
> > + *
> > + * @return
> > + *   Pointer to the first data segment of the WQE on success,
> > + *   NULL otherwise.
> > + */
> > +static inline volatile struct mlx4_wqe_data_seg *
> > +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
> > +			   struct txq *txq,
> > +			   struct tso_info *tinfo,
> > +			   volatile struct mlx4_wqe_ctrl_seg *ctrl)
> > +{
> > +	volatile struct mlx4_wqe_lso_seg *tseg =
> > +		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	struct pv *pv = tinfo->pv;
> > +	int *pv_counter = &tinfo->pv_counter;
> > +	int remain_size = tinfo->tso_header_size;
> > +	char *from = rte_pktmbuf_mtod(buf, char *);
> > +	uint16_t txbb_avail_space;
> > +	/* Union to overcome volatile constraints when copying TSO header. */
> > +	union {
> > +		volatile uint8_t *vto;
> > +		uint8_t *to;
> > +	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
> > +
> > +	/*
> > +	 * TSO data always starts at offset 20 from the beginning of the TXBB
> > +	 * (16-byte ctrl + 4-byte TSO desc). Since each TXBB is 64-byte
> > +	 * aligned we can write the first 44 TSO header bytes without worrying
> > +	 * about TxQ wrapping or overwriting the first TXBB 32-bit word.
> > +	 */
> > +	txbb_avail_space = MLX4_TXBB_SIZE -
> > +			   (sizeof(struct mlx4_wqe_ctrl_seg) +
> > +			    sizeof(struct mlx4_wqe_lso_seg));
>
> I think a better name would be txbb_tail_size.

I think txbb_avail_space is good enough, so no change here.

> > +	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
> > +		/* Copy to end of txbb. */
> > +		rte_memcpy(thdr.to, from, txbb_avail_space);
> > +		from += txbb_avail_space;
> > +		thdr.to += txbb_avail_space;
> > +		/* New TXBB, check for TxQ wrap. */
> > +		if (thdr.to >= sq->eob)
> > +			thdr.vto = sq->buf;
> > +		/* New TXBB, stash the first 32 bits for later use. */
> > +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> > +		pv[(*pv_counter)++].val = *(uint32_t *)from,
> > +		from += sizeof(uint32_t);
> > +		thdr.to += sizeof(uint32_t);
> > +		remain_size -= (txbb_avail_space + sizeof(uint32_t));
>
> You don't need the () here.

True.

> > +		/* Avail space in new TXBB is TXBB size - 4. */
> > +		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
> > +	}
> > +	if (remain_size > txbb_avail_space) {
> > +		rte_memcpy(thdr.to, from, txbb_avail_space);
> > +		from += txbb_avail_space;
> > +		thdr.to += txbb_avail_space;
> > +		remain_size -= txbb_avail_space;
> > +		/* New TXBB, check for TxQ wrap. */
> > +		if (thdr.to >= sq->eob)
> > +			thdr.vto = sq->buf;
> > +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> > +		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
> > +		(*pv_counter)++;
> > +	} else {
>
> Here it should be else if (remain_size > 0).
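
For reference, the arithmetic behind the copy loop above (illustration
only):

	/* First TXBB: header starts after 16B ctrl + 4B LSO descriptor. */
	txbb_avail_space = 64 - (16 + 4);   /* = 44 bytes, at offset 20 */
	/*
	 * Every following TXBB keeps its first dword stashed in pv and
	 * written only after the barrier (as mlx4_tx_burst_segs does),
	 * so only 64 - 4 bytes of it can be written directly:
	 */
	txbb_avail_space = 64 - 4;          /* = 60 bytes */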
True.

> > +		rte_memcpy(thdr.to, from, remain_size);
> > +	}
> > +
> > +	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
> > +					      tinfo->tso_header_size);
> > +	/* Calculate data segment location. */
> > +	return (volatile struct mlx4_wqe_data_seg *)
> > +				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
> > +}
> > +
> > +/**
> > + * Write data segments and header for TSO uni/multi segment packet.
> > + *
> > + * @param buf
> > + *   Pointer to the first packet mbuf.
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param ctrl
> > + *   Pointer to the WQE control segment.
> > + *
> > + * @return
> > + *   Pointer to the next WQE control segment on success, NULL otherwise.
> > + */
> > +static volatile struct mlx4_wqe_ctrl_seg *
> > +mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
> > +		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
> > +{
> > +	volatile struct mlx4_wqe_data_seg *dseg;
> > +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	struct tso_info tinfo;
> > +	struct pv *pv;
> > +	int pv_counter;
> > +	int ret;
> > +
> > +	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
> > +	if (unlikely(ret))
> > +		goto error;
> > +	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
> > +	if (unlikely(dseg == NULL))
> > +		goto error;
> > +	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
> > +		dseg = (volatile struct mlx4_wqe_data_seg *)
> > +					((uintptr_t)dseg - sq->size);
> > +	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
> > +	if (unlikely(ctrl_next == NULL))
> > +		goto error;
> > +	/* Write the first DWORD of each TXBB saved earlier. */
> > +	pv = tinfo.pv;
> > +	pv_counter = tinfo.pv_counter;
> > +	/* Need a barrier here before writing the first TXBB word. */
> > +	rte_io_wmb();
>
> > +	for (--pv_counter; pv_counter >= 0; pv_counter--)
>
> Since we don't need the first check, a do-while statement is better.
> To be fully safe, you can use a likely() check before the memory barrier.

I will bring back the if statement, but will not change the loop, as it is
the same as in mlx4_tx_burst_segs and I do want to keep the code consistent.

> > +		*pv[pv_counter].dst = pv[pv_counter].val;
> > +	ctrl->fence_size = tinfo.fence_size;
> > +	sq->remain_size -= tinfo.wqe_size;
> > +	return ctrl_next;
> > +error:
> > +	txq->stats.odropped++;
> > +	return NULL;
> > +}
> > +
> > +/**
> >   * Write data segments of multi-segment packet.
> >   *
> >   * @param buf
> > @@ -560,6 +918,7 @@ struct pv {
> >  		uint16_t flags16[2];
> >  	} srcrb;
> >  	uint32_t lkey;
> > +	bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
> >
> >  	/* Clean up old buffer. */
> >  	if (likely(elt->buf != NULL)) {
> > @@ -578,7 +937,16 @@ struct pv {
> >  		} while (tmp != NULL);
> >  	}
> >  	RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> > -	if (buf->nb_segs == 1) {
> > +	if (tso) {
> > +		/* Change opcode to TSO. */
> > +		owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
> > +		owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
> > +		ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
> > +		if (!ctrl_next) {
> > +			elt->buf = NULL;
> > +			break;
> > +		}
> > +	} else if (buf->nb_segs == 1) {
> >  		/* Validate WQE space in the send queue. */
> >  		if (sq->remain_size < MLX4_TXBB_SIZE) {
> >  			elt->buf = NULL;
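
For reference, the shape Matan suggests would be roughly as follows (sketch
only, with the likely() check also guarding the barrier):

	if (likely(pv_counter)) {
		/* Need a barrier here before writing the first TXBB word. */
		rte_io_wmb();
		do {
			--pv_counter;
			*pv[pv_counter].dst = pv[pv_counter].val;
		} while (pv_counter > 0);
	}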
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> > index 4c025e3..ffa8abf 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.h
> > +++ b/drivers/net/mlx4/mlx4_rxtx.h
> > @@ -90,7 +90,7 @@ struct mlx4_txq_stats {
> >  	unsigned int idx; /**< Mapping index. */
> >  	uint64_t opackets; /**< Total of successfully sent packets. */
> >  	uint64_t obytes; /**< Total of successfully sent bytes. */
> > -	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
> > +	uint64_t odropped; /**< Total number of packets that failed to transmit. */
> >  };
> >
> >  /** Tx queue descriptor. */
> > diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
> > index 6edaadb..9aa7440 100644
> > --- a/drivers/net/mlx4/mlx4_txq.c
> > +++ b/drivers/net/mlx4/mlx4_txq.c
> > @@ -116,8 +116,14 @@
> >  			     DEV_TX_OFFLOAD_UDP_CKSUM |
> >  			     DEV_TX_OFFLOAD_TCP_CKSUM);
> >  	}
> > -	if (priv->hw_csum_l2tun)
> > +	if (priv->tso)
> > +		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
> > +	if (priv->hw_csum_l2tun) {
> >  		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> > +		if (priv->tso)
> > +			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
> > +				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
> > +	}
> >  	return offloads;
> >  }
> >
> > --
> > 1.8.3.1
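
For completeness, how an application is expected to consume this offload
once the patch is in; a minimal sketch using the standard DPDK 18.x API
(port setup and mbuf allocation assumed, port_id and mbuf are placeholders):

	struct rte_eth_conf conf = {
		.txmode = { .offloads = DEV_TX_OFFLOAD_TCP_TSO, },
	};

	/* Per-port enable; check dev_info.tx_offload_capa first. */
	rte_eth_dev_configure(port_id, 1, 1, &conf);

	/* Per packet: request TSO and describe the headers. */
	mbuf->ol_flags |= PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_IP_CKSUM;
	mbuf->l2_len = sizeof(struct ether_hdr);
	mbuf->l3_len = sizeof(struct ipv4_hdr);
	mbuf->l4_len = sizeof(struct tcp_hdr); /* incl. TCP options if any */
	mbuf->tso_segsz = 1460;                /* MSS of each output segment */
	rte_eth_tx_burst(port_id, 0, &mbuf, 1);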