Hi Moti, please see my comments inline.
From: Mordechay Haimovsky

> Implement support for hardware TSO.
>
> Signed-off-by: Moti Haimovsky <mo...@mellanox.com>
> ---
> v4:
> * Bug fixes in filling TSO data segments.
> * Modifications according to review inputs from Adrien Mazarguil
>   and Matan Azrad.
> in reply to
> 1530190137-17848-1-git-send-email-mo...@mellanox.com
>
> v3:
> * Fixed compilation errors in compilers without GNU C extensions
>   caused by a declaration of zero-length array in the code.
> in reply to
> 1530187032-6489-1-git-send-email-mo...@mellanox.com
>
> v2:
> * Fixed coding style warning.
> in reply to
> 1530184583-30166-1-git-send-email-mo...@mellanox.com
>
> v1:
> * Fixed coding style warnings.
> in reply to
> 1530181779-19716-1-git-send-email-mo...@mellanox.com
> ---
>  doc/guides/nics/features/mlx4.ini | 1 +
>  doc/guides/nics/mlx4.rst | 3 +
>  drivers/net/mlx4/Makefile | 5 +
>  drivers/net/mlx4/mlx4.c | 9 +
>  drivers/net/mlx4/mlx4.h | 5 +
>  drivers/net/mlx4/mlx4_prm.h | 15 ++
>  drivers/net/mlx4/mlx4_rxtx.c | 362 +++++++++++++++++++++++++++++++++++++-
>  drivers/net/mlx4/mlx4_rxtx.h | 2 +-
>  drivers/net/mlx4/mlx4_txq.c | 8 +-
>  9 files changed, 406 insertions(+), 4 deletions(-)
>
> diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
> index f6efd21..98a3f61 100644
> --- a/doc/guides/nics/features/mlx4.ini
> +++ b/doc/guides/nics/features/mlx4.ini
> @@ -13,6 +13,7 @@ Queue start/stop = Y
>  MTU update           = Y
>  Jumbo frame          = Y
>  Scattered Rx         = Y
> +TSO                  = Y
>  Promiscuous mode     = Y
>  Allmulticast mode    = Y
>  Unicast MAC filter   = Y
> diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
> index 491106a..12adaeb 100644
> --- a/doc/guides/nics/mlx4.rst
> +++ b/doc/guides/nics/mlx4.rst
> @@ -142,6 +142,9 @@ Limitations
>    The ability to enable/disable CRC stripping requires OFED version
>    4.3-1.5.0.0 and above or rdma-core version v18 and above.
>
> +- TSO (Transmit Segmentation Offload) is supported in OFED version
> +  4.4 and above or in rdma-core version v18 and above.
> +
>  Prerequisites
>  -------------
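Maybe worth also mentioning in the guide how an application is expected to
request the offload. Roughly (illustrative only, standard DPDK API, variable
names are mine):

        struct rte_eth_conf conf = { 0 };

        conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
        rte_eth_dev_configure(port_id, 1, 1, &conf);
        ...
        /* Per packet, before rte_eth_tx_burst(): */
        mbuf->ol_flags |= PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_IP_CKSUM;
        mbuf->l2_len = sizeof(struct ether_hdr);
        mbuf->l3_len = sizeof(struct ipv4_hdr);
        mbuf->l4_len = sizeof(struct tcp_hdr); /* including TCP options */
        mbuf->tso_segsz = 1460;                /* MSS */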
> diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
> index 73f9d40..63bc003 100644
> --- a/drivers/net/mlx4/Makefile
> +++ b/drivers/net/mlx4/Makefile
> @@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
>  mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
>         $Q $(RM) -f -- '$@'
>         $Q : > '$@'
> +       $Q sh -- '$<' '$@' \
> +               HAVE_IBV_MLX4_WQE_LSO_SEG \
> +               infiniband/mlx4dv.h \
> +               type 'struct mlx4_wqe_lso_seg' \
> +               $(AUTOCONF_OUTPUT)
>
>  # Create mlx4_autoconf.h or update it in case it differs from the new one.
>
> diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
> index d151a90..5d8c76d 100644
> --- a/drivers/net/mlx4/mlx4.c
> +++ b/drivers/net/mlx4/mlx4.c
> @@ -677,6 +677,15 @@ struct mlx4_conf {
>                                     IBV_RAW_PACKET_CAP_SCATTER_FCS);
>                 DEBUG("FCS stripping toggling is %ssupported",
>                       priv->hw_fcs_strip ? "" : "not ");
> +               priv->tso =
> +                       ((device_attr_ex.tso_caps.max_tso > 0) &&
> +                        (device_attr_ex.tso_caps.supported_qpts &
> +                         (1 << IBV_QPT_RAW_PACKET)));
> +               if (priv->tso)
> +                       priv->tso_max_payload_sz =
> +                                       device_attr_ex.tso_caps.max_tso;
> +               DEBUG("TSO is %ssupported",
> +                     priv->tso ? "" : "not ");
>                 /* Configure the first MAC address by default. */
>                 err = mlx4_get_mac(priv, &mac.addr_bytes);
>                 if (err) {
> diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
> index 300cb4d..89d8c38 100644
> --- a/drivers/net/mlx4/mlx4.h
> +++ b/drivers/net/mlx4/mlx4.h
> @@ -47,6 +47,9 @@
>  /** Interrupt alarm timeout value in microseconds. */
>  #define MLX4_INTR_ALARM_TIMEOUT 100000
>
> +/* Maximum packet headers size (L2+L3+L4) for TSO. */
> +#define MLX4_MAX_TSO_HEADER 192
> +
>  /** Port parameter. */
>  #define MLX4_PMD_PORT_KVARG "port"
>
> @@ -90,6 +93,8 @@ struct priv {
>         uint32_t hw_csum:1; /**< Checksum offload is supported. */
>         uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
>         uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
> +       uint32_t tso:1; /**< Transmit segmentation offload is supported. */
> +       uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
>         uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
>         struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
>         struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
> diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
> index b771d8c..aef77ba 100644
> --- a/drivers/net/mlx4/mlx4_prm.h
> +++ b/drivers/net/mlx4/mlx4_prm.h
> @@ -19,6 +19,7 @@
>  #ifdef PEDANTIC
>  #pragma GCC diagnostic error "-Wpedantic"
>  #endif
> +#include "mlx4_autoconf.h"
>
>  /* ConnectX-3 Tx queue basic block. */
>  #define MLX4_TXBB_SHIFT 6
> @@ -40,6 +41,7 @@
>  /* Work queue element (WQE) flags. */
>  #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
>  #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
> +#define MLX4_WQE_CTRL_RR (1 << 6)
>
>  /* CQE checksum flags. */
>  enum {
> @@ -98,6 +100,19 @@ struct mlx4_cq {
>         int arm_sn; /**< Rx event counter. */
>  };
>
> +#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
> +/*
> + * WQE LSO segment structure.
> + * Defined here as backward compatibility for rdma-core v17 and below.
> + * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
> + * and above.
> + */
> +struct mlx4_wqe_lso_seg {
> +       rte_be32_t mss_hdr_size;
> +       rte_be32_t header[];
> +};
> +#endif
> +
>  /**
>   * Retrieve a CQE entry from a CQ.
>   *
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 78b6dd5..750ad6d 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -38,10 +38,29 @@
>   * DWORD (32 byte) of a TXBB.
>   */
>  struct pv {
> -       volatile struct mlx4_wqe_data_seg *dseg;
> +       union {
> +               volatile struct mlx4_wqe_data_seg *dseg;
> +               volatile uint32_t *dst;
> +       };
>         uint32_t val;
>  };
>
> +/** A helper structure for TSO packet handling. */
> +struct tso_info {
> +       /** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
> +       struct pv *pv;
> +       /** Current entry in the pv array. */
> +       int pv_counter;
> +       /** Total size of the WQE including padding. */
> +       uint32_t wqe_size;
> +       /** size of TSO header to prepend to each packet to send. */

size => Size

> +       uint16_t tso_header_sz;

tso_header_sz => tso_header_size, to spell out "size" like the next field's
name.

> +       /** Total size of the TSO segment in the WQE. */
> +       uint16_t wqe_tso_seg_size;
> +       /** Raw WQE size in units of 16 Bytes and without padding. */
> +       uint8_t fence_size;
> +};
> +
>  /** A table to translate Rx completion flags to packet type. */
>  uint32_t
>  mlx4_ptype_table[0x100] __rte_cache_aligned = {
>         /*
> @@ -368,6 +387,335 @@ struct pv {
>  }
>
>  /**
> + * Obtain and calculate TSO information needed for assembling a TSO WQE.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to a structure to fill the info with.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline int
> +mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
> +                            struct txq *txq,
> +                            struct tso_info *tinfo)
> +{
> +       struct mlx4_sq *sq = &txq->msq;
> +       const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
> +                                (buf->ol_flags & PKT_TX_TUNNEL_MASK);
> +
> +       tinfo->tso_header_sz = buf->l2_len + buf->l3_len + buf->l4_len;
> +       if (tunneled)
> +               tinfo->tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
> +       if (unlikely(buf->tso_segsz == 0 ||
> +                    tinfo->tso_header_sz == 0 ||
> +                    tinfo->tso_header_sz > MLX4_MAX_TSO_HEADER ||
> +                    tinfo->tso_header_sz > buf->data_len))
> +               return -EINVAL;
> +       /*
> +        * Calculate the WQE TSO segment size
> +        * Note:
> +        * 1. An LSO segment must be padded such that the subsequent data
> +        *    segment is 16-byte aligned.
> +        * 2. The start address of the TSO segment is always 16 Bytes aligned.
> +        */
> +       tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
> +                                           tinfo->tso_header_sz,
> +                                           sizeof(struct mlx4_wqe_data_seg));
> +       tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
> +                             tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
> +                             buf->nb_segs;
> +       tinfo->wqe_size =
> +               RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
> +                         MLX4_TXBB_SIZE);
> +       /* Validate WQE size and WQE space in the send queue. */
> +       if (sq->remain_size < tinfo->wqe_size ||
> +           tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
> +               return -ENOMEM;
> +       /* Init pv. */
> +       tinfo->pv = (struct pv *)txq->bounce_buf;
> +       tinfo->pv_counter = 0;
> +       return 0;
> +}
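Just to make sure I follow the size arithmetic, a worked example (illustrative
only, assuming a plain Ethernet/IPv4/TCP packet with a 54-byte header, a
single-segment mbuf, the 16-byte ctrl/data segments and the 4-byte LSO
descriptor mentioned below):

        tso_header_sz    = 14 + 20 + 20 = 54
        wqe_tso_seg_size = RTE_ALIGN(4 + 54, 16) = 64
        fence_size       = ((16 + 64) >> 4) + 1 = 6
        wqe_size         = RTE_ALIGN(6 << 4, 64) = 128

so such a WQE takes two TXBBs.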
> +
> +/**
> + * Fill the TSO WQE data segments with info on buffers to transmit.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to TSO info to use.
> + * @param dseg
> + *   Pointer to the first data segment in the TSO WQE.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline volatile struct mlx4_wqe_ctrl_seg *
> +mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
> +                            struct txq *txq,
> +                            struct tso_info *tinfo,
> +                            volatile struct mlx4_wqe_data_seg *dseg,
> +                            volatile struct mlx4_wqe_ctrl_seg *ctrl)
> +{
> +       uint32_t lkey;
> +       int nb_segs = buf->nb_segs;
> +       int nb_segs_txbb;
> +       struct mlx4_sq *sq = &txq->msq;
> +       struct rte_mbuf *sbuf = buf;
> +       struct pv *pv = tinfo->pv;
> +       int *pv_counter = &tinfo->pv_counter;
> +       uint16_t sb_of = tinfo->tso_header_sz;
> +       uint16_t data_len;
> +
> +       while (nb_segs > 0) {

I think a do-while statement is better here (no need for the check in the
first iteration).

> +               /* how many dseg entries do we have in the current TXBB ? */
> +               nb_segs_txbb =
> +                       (MLX4_TXBB_SIZE / sizeof(struct mlx4_wqe_data_seg)) -
> +                       ((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1)) /
> +                       sizeof(struct mlx4_wqe_data_seg);

Division may be expensive, you can avoid it with:

        nb_segs_txbb = (MLX4_TXBB_SIZE -
                        ((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
                       MLX4_SEG_SHIFT;
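Combining both points, roughly what I have in mind (untested sketch, same
local variables as in your code):

        do {
                /* dseg entries left in the current TXBB. */
                nb_segs_txbb = (MLX4_TXBB_SIZE -
                                ((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
                               MLX4_SEG_SHIFT;
                switch (nb_segs_txbb) {
                ...
                }
                /* Wrap dseg if it points at the end of the queue. */
                if ((volatile uint8_t *)dseg >= sq->eob)
                        dseg = (volatile struct mlx4_wqe_data_seg *)
                                ((volatile uint8_t *)dseg - sq->size);
        } while (nb_segs > 0);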
> +               switch (nb_segs_txbb) {
> +               case 4:
> +                       /* Memory region key for this memory pool. */
> +                       lkey = mlx4_tx_mb2mr(txq, sbuf);
> +                       if (unlikely(lkey == (uint32_t)-1))
> +                               goto lkey_err;
> +                       dseg->addr =
> +                           rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> +                                                                    uintptr_t,
> +                                                                    sb_of));
> +                       dseg->lkey = lkey;
> +                       /*
> +                        * This data segment starts at the beginning of a new
> +                        * TXBB, so we need to postpone its byte_count writing
> +                        * for later.
> +                        */
> +                       pv[*pv_counter].dseg = dseg;
> +                       /*
> +                        * Zero length segment is treated as inline segment
> +                        * with zero data.
> +                        */
> +                       data_len = sbuf->data_len - sb_of;
> +                       pv[(*pv_counter)++].val =
> +                               rte_cpu_to_be_32(data_len ?
> +                                                data_len :
> +                                                0x80000000);
> +                       sb_of = 0;
> +                       sbuf = sbuf->next;
> +                       dseg++;
> +                       if (--nb_segs == 0)
> +                               break;

I think that here and in all the other cases it is better to do "return X"
instead of break, where X is the same return value as now, which can be
calculated at the start.

> +                       /* fallthrough */
> +               case 3:
> +                       lkey = mlx4_tx_mb2mr(txq, sbuf);
> +                       if (unlikely(lkey == (uint32_t)-1))
> +                               goto lkey_err;
> +                       data_len = sbuf->data_len - sb_of;
> +                       mlx4_fill_tx_data_seg(dseg,
> +                                             lkey,
> +                                             rte_pktmbuf_mtod_offset(sbuf,
> +                                                                     uintptr_t,
> +                                                                     sb_of),
> +                                             rte_cpu_to_be_32(data_len ?
> +                                                              data_len :
> +                                                              0x80000000));
> +                       sb_of = 0;
> +                       sbuf = sbuf->next;
> +                       dseg++;
> +                       if (--nb_segs == 0)
> +                               break;
> +                       /* fallthrough */
> +               case 2:
> +                       lkey = mlx4_tx_mb2mr(txq, sbuf);
> +                       if (unlikely(lkey == (uint32_t)-1))
> +                               goto lkey_err;
> +                       data_len = sbuf->data_len - sb_of;
> +                       mlx4_fill_tx_data_seg(dseg,
> +                                             lkey,
> +                                             rte_pktmbuf_mtod_offset(sbuf,
> +                                                                     uintptr_t,
> +                                                                     sb_of),
> +                                             rte_cpu_to_be_32(data_len ?
> +                                                              data_len :
> +                                                              0x80000000));
> +                       sb_of = 0;
> +                       sbuf = sbuf->next;
> +                       dseg++;
> +                       if (--nb_segs == 0)
> +                               break;
> +                       /* fallthrough */
> +               case 1:
> +                       lkey = mlx4_tx_mb2mr(txq, sbuf);
> +                       if (unlikely(lkey == (uint32_t)-1))
> +                               goto lkey_err;
> +                       data_len = sbuf->data_len - sb_of;
> +                       mlx4_fill_tx_data_seg(dseg,
> +                                             lkey,
> +                                             rte_pktmbuf_mtod_offset(sbuf,
> +                                                                     uintptr_t,
> +                                                                     sb_of),
> +                                             rte_cpu_to_be_32(data_len ?
> +                                                              data_len :
> +                                                              0x80000000));
> +                       sb_of = 0;
> +                       sbuf = sbuf->next;
> +                       dseg++;
> +                       --nb_segs;
> +                       break;
> +               default:
> +                       /* Should never happen */
> +                       rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
> +                                 (void *)txq, nb_segs_txbb);

I think we don't need the default case here. Do you have any scenario where
it may really happen?

> +               }
> +               /* Wrap dseg if it points at the end of the queue. */
> +               if ((volatile uint8_t *)dseg >= sq->eob)
> +                       dseg = (volatile struct mlx4_wqe_data_seg *)
> +                               ((volatile uint8_t *)dseg - sq->size);
> +       }
> +       /* Align next WQE address to the next TXBB. */
> +       return (volatile struct mlx4_wqe_ctrl_seg *)
> +               ((volatile uint8_t *)ctrl + tinfo->wqe_size);
> +lkey_err:
> +       return NULL;
> +}
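For example (untested sketch of what I mean), the return value could be
computed once before the loop:

        volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
                (volatile struct mlx4_wqe_ctrl_seg *)
                ((volatile uint8_t *)ctrl + tinfo->wqe_size);

and then every case could simply do:

                if (--nb_segs == 0)
                        return ctrl_next;

instead of breaking out of the switch and re-testing the while condition.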
> +
> +/**
> + * Fill the packet's l2, l3 and l4 headers to the WQE.
> + *
> + * This will be used as the header for each TSO segment that is transmitted.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to TSO info to use.
> + * @param ctrl
> + *   Pointer to the control segment in the TSO WQE.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline volatile struct mlx4_wqe_data_seg *
> +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
> +                          struct txq *txq,
> +                          struct tso_info *tinfo,
> +                          volatile struct mlx4_wqe_ctrl_seg *ctrl)
> +{
> +       volatile struct mlx4_wqe_lso_seg *tseg =
> +               (volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
> +       struct mlx4_sq *sq = &txq->msq;
> +       struct pv *pv = tinfo->pv;
> +       int *pv_counter = &tinfo->pv_counter;
> +       int remain_sz = tinfo->tso_header_sz;
> +       char *from = rte_pktmbuf_mtod(buf, char *);
> +       uint16_t txbb_avail_space;
> +       int copy_sz;
> +       /* Union to overcome volatile constraints when copying TSO header. */
> +       union {
> +               volatile uint8_t *vto;
> +               uint8_t *to;
> +       } thdr = { .vto = (volatile uint8_t *)tseg->header, };
> +
> +       /*
> +        * TSO data always starts at offset 20 from the beginning of the TXBB
> +        * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
> +        * we can write the first 44 TSO header bytes without worry for TxQ
> +        * wrapping or overwriting the first TXBB 32bit word.
> +        */
> +       txbb_avail_space = MLX4_TXBB_SIZE -
> +                          (sizeof(struct mlx4_wqe_ctrl_seg) +
> +                           sizeof(struct mlx4_wqe_lso_seg));
> +       do {
> +               copy_sz = RTE_MIN(txbb_avail_space, remain_sz);
> +               rte_memcpy(thdr.to, from, copy_sz);
> +               remain_sz -= copy_sz;
> +               if (remain_sz <= 0)
> +                       break;
> +               from += copy_sz;
> +               thdr.to += copy_sz;
> +               /* New TXBB, Check for TxQ wrap. */
> +               if (thdr.to >= sq->eob)
> +                       thdr.vto = sq->buf;
> +               /* New TXBB, stash the first 32bits for later use. */
> +               pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
> +               rte_memcpy(&pv[*pv_counter].val, from,
> +                          RTE_MIN((size_t)remain_sz, sizeof(uint32_t)));
> +               (*pv_counter)++;
> +               from += sizeof(uint32_t);
> +               thdr.to += sizeof(uint32_t);
> +               remain_sz -= sizeof(uint32_t);
> +               /* Space in current TXBB is TXBB size - 4 */
> +               txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
> +       } while (remain_sz > 0);

I think the loop can be better - you have now 5 checks per TXBB, we can
reduce it to 2 with something like:

        txbb_data_space = 44 (not including the first 4 bytes of the
                              current txbb)
        while (remain_size >= txbb_data_space + 4)
                /* loop to write the tail of the current txbb + the head
                 * of the next txbb */
                write txbb_data_space bytes to the WQE
                check wrap-around
                write 4 bytes for the next txbb to pv
                remain_size -= txbb_data_space + 4
                txbb_data_space = 60
        if (remain_size > txbb_data_space)
                /* write tail and partially head */
                write txbb_data_space bytes to the WQE
                check wrap-around
                write (remain_size - txbb_data_space) bytes to pv
        else
                /* write only tail */
                write remain_size bytes from the header

Am I missing something?
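Roughly, in C (untested sketch, reusing the local variables above;
txbb_data_space would replace txbb_avail_space/copy_sz):

        uint16_t txbb_data_space = MLX4_TXBB_SIZE -
                                   sizeof(struct mlx4_wqe_ctrl_seg) -
                                   sizeof(struct mlx4_wqe_lso_seg);

        while (remain_sz >= (int)(txbb_data_space + sizeof(uint32_t))) {
                /* Fill the tail of the current TXBB... */
                rte_memcpy(thdr.to, from, txbb_data_space);
                from += txbb_data_space;
                thdr.to += txbb_data_space;
                if (thdr.to >= sq->eob)
                        thdr.vto = sq->buf;
                /* ...and stash the first DWORD of the next TXBB. */
                pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
                rte_memcpy(&pv[(*pv_counter)++].val, from, sizeof(uint32_t));
                from += sizeof(uint32_t);
                thdr.to += sizeof(uint32_t);
                remain_sz -= txbb_data_space + sizeof(uint32_t);
                txbb_data_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
        }
        if (remain_sz > (int)txbb_data_space) {
                /* Tail of the current TXBB plus a partial first DWORD. */
                rte_memcpy(thdr.to, from, txbb_data_space);
                from += txbb_data_space;
                thdr.to += txbb_data_space;
                if (thdr.to >= sq->eob)
                        thdr.vto = sq->buf;
                pv[*pv_counter].dst = (volatile uint32_t *)thdr.vto;
                rte_memcpy(&pv[(*pv_counter)++].val, from,
                           remain_sz - txbb_data_space);
        } else if (remain_sz > 0) {
                /* Header ends inside the current TXBB. */
                rte_memcpy(thdr.to, from, remain_sz);
        }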
> +       tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
> +                                             tinfo->tso_header_sz);
> +       /* Calculate data segment location */
> +       return (volatile struct mlx4_wqe_data_seg *)
> +               ((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
> +}
> +
> +/**
> + * Write data segments and header for TSO uni/multi segment packet.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param ctrl
> + *   Pointer to the WQE control segment.
> + *
> + * @return
> + *   Pointer to the next WQE control segment on success, NULL otherwise.
> + */
> +static volatile struct mlx4_wqe_ctrl_seg *
> +mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
> +                 volatile struct mlx4_wqe_ctrl_seg *ctrl)
> +{
> +       volatile struct mlx4_wqe_data_seg *dseg;
> +       volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
> +       struct mlx4_sq *sq = &txq->msq;
> +       struct tso_info tinfo;
> +       struct pv *pv;
> +       int pv_counter;
> +       int ret;
> +
> +       ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
> +       if (unlikely(ret))
> +               goto error;
> +       dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
> +       if (unlikely(dseg == NULL))
> +               goto error;
> +       if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
> +               dseg = (volatile struct mlx4_wqe_data_seg *)
> +                       ((uintptr_t)dseg - sq->size);
> +       ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
> +       if (unlikely(ctrl_next == NULL))
> +               goto error;
> +       /* Write the first DWORD of each TXBB save earlier. */
> +       if (tinfo.pv_counter) {

I think you can add likely() here. The minimum number of segments is:
1. ctrl, 2. eth header, 3. IP header, 4. TCP header, 5. at least one data
segment. Maybe we don't even need this check.

> +               pv = tinfo.pv;
> +               pv_counter = tinfo.pv_counter;
> +               /* Need a barrier here before writing the first TXBB word. */
> +               rte_io_wmb();
> +               for (--pv_counter; pv_counter >= 0; pv_counter--)
> +                       *pv[pv_counter].dst = pv[pv_counter].val;
> +       }
> +       ctrl->fence_size = tinfo.fence_size;
> +       sq->remain_size -= tinfo.wqe_size;
> +       return ctrl_next;
> +error:
> +       txq->stats.odropped++;
> +       return NULL;
> +}
> +
> +/**
>   * Write data segments of multi-segment packet.
>   *
>   * @param buf
> @@ -560,6 +908,7 @@ struct pv {
>                         uint16_t flags16[2];
>                 } srcrb;
>                 uint32_t lkey;
> +               bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
>
>                 /* Clean up old buffer. */
>                 if (likely(elt->buf != NULL)) {
> @@ -578,7 +927,16 @@ struct pv {
>                         } while (tmp != NULL);
>                 }
>                 RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> -               if (buf->nb_segs == 1) {
> +               if (tso) {
> +                       /* Change opcode to TSO */
> +                       owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
> +                       owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
> +                       ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
> +                       if (!ctrl_next) {
> +                               elt->buf = NULL;
> +                               break;
> +                       }
> +               } else if (buf->nb_segs == 1) {
>                         /* Validate WQE space in the send queue. */
>                         if (sq->remain_size < MLX4_TXBB_SIZE) {
>                                 elt->buf = NULL;
> diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> index 4c025e3..ffa8abf 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.h
> +++ b/drivers/net/mlx4/mlx4_rxtx.h
> @@ -90,7 +90,7 @@ struct mlx4_txq_stats {
>         unsigned int idx; /**< Mapping index. */
>         uint64_t opackets; /**< Total of successfully sent packets. */
>         uint64_t obytes; /**< Total of successfully sent bytes. */
> -       uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
> +       uint64_t odropped; /**< Total number of packets failed to transmit. */
>  };
>
>  /** Tx queue descriptor. */
> diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
> index 6edaadb..9aa7440 100644
> --- a/drivers/net/mlx4/mlx4_txq.c
> +++ b/drivers/net/mlx4/mlx4_txq.c
> @@ -116,8 +116,14 @@
>                              DEV_TX_OFFLOAD_UDP_CKSUM |
>                              DEV_TX_OFFLOAD_TCP_CKSUM);
>         }
> -       if (priv->hw_csum_l2tun)
> +       if (priv->tso)
> +               offloads |= DEV_TX_OFFLOAD_TCP_TSO;
> +       if (priv->hw_csum_l2tun) {
>                 offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +               if (priv->tso)
> +                       offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
> +                                    DEV_TX_OFFLOAD_GRE_TNL_TSO);
> +       }
>         return offloads;
>  }
>
> --
> 1.8.3.1