Add Tx multi-seg support in scalar for cn20k.

Signed-off-by: Nithin Dabilpuram <ndabilpu...@marvell.com>
Signed-off-by: Jerin Jacob <jer...@marvell.com>
Signed-off-by: Rahul Bhansali <rbhans...@marvell.com>
Signed-off-by: Pavan Nikhilesh <pbhagavat...@marvell.com>
---
 drivers/net/cnxk/cn20k_tx.h | 352 +++++++++++++++++++++++++++++++++++-
 1 file changed, 347 insertions(+), 5 deletions(-)

diff --git a/drivers/net/cnxk/cn20k_tx.h b/drivers/net/cnxk/cn20k_tx.h
index 610d64f21b..3f163285f0 100644
--- a/drivers/net/cnxk/cn20k_tx.h
+++ b/drivers/net/cnxk/cn20k_tx.h
@@ -863,6 +863,183 @@ cn20k_nix_xmit_prepare_tstamp(struct cn20k_eth_txq *txq, 
uintptr_t lmt_addr,
        }
 }
 
/* Build the NIX SG/SG2 scatter-gather sub-descriptor list for a (possibly
 * multi-segment) mbuf chain directly into the LMT command buffer @cmd.
 *
 * @param txq   Tx queue, consulted by cn20k_nix_prefree_seg() for completion
 *              handling when MBUF_NOFF is enabled.
 * @param m     Head mbuf of the packet; its chain is consumed (next/nb_segs
 *              are reset on each segment as it is written out).
 * @param extm  Out-chain of external mbufs collected by prefree for deferred
 *              free by the caller.
 * @param cmd   LMT line holding send_hdr (+ optional ext header) and the SG
 *              area this function fills.
 * @param flags Compile-time offload flags; with !NIX_TX_MULTI_SEG_F the
 *              segment loop is optimized out entirely (see goto done).
 * @return Number of LMT dwords (segdw) occupied by the full descriptor,
 *         also written to send_hdr->w0.sizem1 (as segdw - 1).
 */
static __rte_always_inline uint16_t
cn20k_nix_prepare_mseg(struct cn20k_eth_txq *txq, struct rte_mbuf *m, struct rte_mbuf **extm,
		       uint64_t *cmd, const uint16_t flags)
{
	uint64_t prefree = 0, aura0, aura, nb_segs, segdw;
	struct nix_send_hdr_s *send_hdr;
	union nix_send_sg_s *sg, l_sg;
	union nix_send_sg2_s l_sg2;
	struct rte_mbuf *cookie;
	struct rte_mbuf *m_next;
	uint8_t off, is_sg2;
	uint64_t len, dlen;
	uint64_t ol_flags;
	uint64_t *slist;

	send_hdr = (struct nix_send_hdr_s *)cmd;

	/* With an extended header the SG area starts two dwords later */
	if (flags & NIX_TX_NEED_EXT_HDR)
		off = 2;
	else
		off = 0;

	sg = (union nix_send_sg_s *)&cmd[2 + off];
	/* Total bytes from send header; used to compute trailing security
	 * data length at 'done:' below.
	 */
	len = send_hdr->w0.total;
	/* ol_flags only read on the SECURITY path at 'done:' */
	if (flags & NIX_TX_OFFLOAD_SECURITY_F)
		ol_flags = m->ol_flags;

	/* Start from second segment, first segment is already there */
	dlen = m->data_len;
	is_sg2 = 0;
	l_sg.u = sg->u;
	/* Clear l_sg.u first seg length that might be stale from vector path */
	l_sg.u &= ~0xFFFFUL;
	l_sg.u |= dlen;
	len -= dlen;
	nb_segs = m->nb_segs - 1;
	m_next = m->next;
	/* Detach head segment from the chain before it may be freed by HW */
	m->next = NULL;
	m->nb_segs = 1;
	/* First seg iova already placed at cmd[3 + off]; continue after it */
	slist = &cmd[3 + off + 1];

	/* For indirect mbufs the mempool cookie lives on the direct mbuf */
	cookie = RTE_MBUF_DIRECT(m) ? m : rte_mbuf_from_indirect(m);
	/* Set invert df if buffer is not to be freed by H/W */
	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
		aura = send_hdr->w0.aura;
		prefree = cn20k_nix_prefree_seg(m, extm, txq, send_hdr, &aura);
		send_hdr->w0.aura = aura;
		l_sg.i1 = prefree;
	}

#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
	/* Mark mempool object as "put" since it is freed by NIX */
	if (!prefree)
		RTE_MEMPOOL_CHECK_COOKIES(cookie->pool, (void **)&cookie, 1, 0);
	rte_io_wmb();
#else
	RTE_SET_USED(cookie);
#endif

	/* Quickly handle single segmented packets. With this if-condition
	 * compiler will completely optimize out the below do-while loop
	 * from the Tx handler when NIX_TX_MULTI_SEG_F offload is not set.
	 */
	if (!(flags & NIX_TX_MULTI_SEG_F))
		goto done;

	/* aura0 is the head segment's aura; a mismatching segment aura with
	 * DF clear forces an SG2 sub-descriptor below.
	 */
	aura0 = send_hdr->w0.aura;
	m = m_next;
	if (!m)
		goto done;

	/* Fill mbuf segments */
	do {
		uint64_t iova;

		/* Save the current mbuf properties. These can get cleared in
		 * cnxk_nix_prefree_seg()
		 */
		m_next = m->next;
		iova = rte_mbuf_data_iova(m);
		dlen = m->data_len;
		len -= dlen;

		nb_segs--;
		aura = aura0;
		prefree = 0;

		m->next = NULL;

		cookie = RTE_MBUF_DIRECT(m) ? m : rte_mbuf_from_indirect(m);
		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
			aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);
			prefree = cn20k_nix_prefree_seg(m, extm, txq, send_hdr, &aura);
			is_sg2 = aura != aura0 && !prefree;
		}

		if (unlikely(is_sg2)) {
			/* This mbuf belongs to a different pool and
			 * DF bit is not to be set, so use SG2 subdesc
			 * so that it is freed to the appropriate pool.
			 */

			/* Write the previous descriptor out */
			sg->u = l_sg.u;

			/* If the current SG subdc does not have any
			 * iovas in it, then the SG2 subdc can overwrite
			 * that SG subdc.
			 *
			 * If the current SG subdc has 2 iovas in it, then
			 * the current iova word should be left empty.
			 */
			slist += (-1 + (int)l_sg.segs);
			sg = (union nix_send_sg_s *)slist;

			l_sg2.u = l_sg.u & 0xC00000000000000; /* LD_TYPE */
			l_sg2.subdc = NIX_SUBDC_SG2;
			l_sg2.aura = aura;
			l_sg2.seg1_size = dlen;
			l_sg.u = l_sg2.u;

			slist++;
			*slist = iova;
			slist++;
		} else {
			*slist = iova;
			/* Set invert df if buffer is not to be freed by H/W */
			l_sg.u |= (prefree << (l_sg.segs + 55));
			/* Set the segment length */
			l_sg.u |= ((uint64_t)dlen << (l_sg.segs << 4));
			l_sg.segs += 1;
			slist++;
		}

		/* An SG subdc holds at most 3 segments; start a new one when
		 * full (or after an SG2) and more segments remain.
		 */
		if ((is_sg2 || l_sg.segs > 2) && nb_segs) {
			sg->u = l_sg.u;
			/* Next SG subdesc */
			sg = (union nix_send_sg_s *)slist;
			l_sg.u &= 0xC00000000000000; /* LD_TYPE */
			l_sg.subdc = NIX_SUBDC_SG;
			slist++;
		}

#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
		/* Mark mempool object as "put" since it is freed by NIX
		 */
		if (!prefree)
			RTE_MEMPOOL_CHECK_COOKIES(cookie->pool, (void **)&cookie, 1, 0);
#else
		RTE_SET_USED(cookie);
#endif
		m = m_next;
	} while (nb_segs);

done:
	/* Add remaining bytes of security data to last seg */
	if (flags & NIX_TX_OFFLOAD_SECURITY_F && ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD && len) {
		uint8_t shft = (l_sg.subdc == NIX_SUBDC_SG) ? ((l_sg.segs - 1) << 4) : 0;

		dlen = ((l_sg.u >> shft) & 0xFFFFULL) + len;
		l_sg.u = l_sg.u & ~(0xFFFFULL << shft);
		l_sg.u |= dlen << shft;
	}

	/* Write the last subdc out */
	sg->u = l_sg.u;

	segdw = (uint64_t *)slist - (uint64_t *)&cmd[2 + off];
	/* Roundup extra dwords to multiple of 2 */
	segdw = (segdw >> 1) + (segdw & 0x1);
	/* Default dwords */
	segdw += (off >> 1) + 1 + !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
	send_hdr->w0.sizem1 = segdw - 1;

	return segdw;
}
+
 static __rte_always_inline uint16_t
 cn20k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts, 
uint16_t pkts,
                    uint64_t *cmd, const uint16_t flags)
@@ -1010,6 +1187,170 @@ cn20k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, 
struct rte_mbuf **tx_pkts, uin
        return pkts;
 }
 
/* Scalar multi-segment Tx burst for CN20K: prepares per-packet NIX
 * descriptors into LMT lines (via cn20k_nix_prepare_mseg()), optionally
 * diverts inline-IPsec packets to CPT, then submits up to 32 packets per
 * iteration with one or two LMTST (steorl) operations.
 *
 * @param tx_queue Tx queue (struct cn20k_eth_txq *).
 * @param ws       SSO work-slot words; only used under NIX_TX_VWQE_F for
 *                 head-wait before submission.
 * @param tx_pkts  Array of packets to transmit.
 * @param pkts     Number of packets in tx_pkts.
 * @param cmd      Scratch command buffer for the descriptor skeleton.
 * @param flags    Compile-time offload flags selecting the code paths.
 * @return Number of packets queued (always 'pkts' unless the flow-control
 *         check in NIX_XMIT_FC_CHECK_RETURN trims it).
 */
static __rte_always_inline uint16_t
cn20k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts, uint16_t pkts,
			 uint64_t *cmd, const uint16_t flags)
{
	struct cn20k_eth_txq *txq = tx_queue;
	uintptr_t pa0, pa1, lbase = txq->lmt_base;
	const rte_iova_t io_addr = txq->io_addr;
	uint16_t segdw, lmt_id, burst, left, i;
	struct rte_mbuf *extm = NULL;
	uint8_t lnum, c_lnum, c_loff;
	uintptr_t c_lbase = lbase;
	uint64_t lso_tun_fmt = 0;
	uint64_t mark_fmt = 0;
	uint8_t mark_flag = 0;
	uint64_t data0, data1;
	rte_iova_t c_io_addr;
	uint8_t shft, c_shft;
	__uint128_t data128;
	uint16_t c_lmt_id;
	uint64_t sa_base;
	uintptr_t laddr;
	bool sec;

	/* Reap Tx completions first so prefree can recycle mbufs */
	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
		handle_tx_completion_pkts(txq, flags & NIX_TX_VWQE_F);

	if (!(flags & NIX_TX_VWQE_F))
		NIX_XMIT_FC_CHECK_RETURN(txq, pkts);

	/* Get cmd skeleton */
	cn20k_nix_tx_skeleton(txq, cmd, flags, !(flags & NIX_TX_VWQE_F));

	if (flags & NIX_TX_OFFLOAD_TSO_F)
		lso_tun_fmt = txq->lso_tun_fmt;

	if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
		mark_fmt = txq->mark_fmt;
		mark_flag = txq->mark_flag;
	}

	/* Get LMT base address and LMT ID as lcore id */
	ROC_LMT_BASE_ID_GET(lbase, lmt_id);
	if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
		ROC_LMT_CPT_BASE_ID_GET(c_lbase, c_lmt_id);
		c_io_addr = txq->cpt_io_addr;
		sa_base = txq->sa_base;
	}

	left = pkts;
again:
	/* Process at most 32 packets per LMTST submission round */
	burst = left > 32 ? 32 : left;
	/* data128 packs per-line (segdw - 1) sizes, 3 bits each from bit 16 */
	shft = 16;
	data128 = 0;

	lnum = 0;
	if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
		c_lnum = 0;
		c_loff = 0;
		c_shft = 16;
	}

	for (i = 0; i < burst; i++) {
		cn20k_nix_tx_mbuf_validate(tx_pkts[i], flags);

		/* Perform header writes for TSO, barrier at
		 * lmt steorl will suffice.
		 */
		if (flags & NIX_TX_OFFLOAD_TSO_F)
			cn20k_nix_xmit_prepare_tso(tx_pkts[i], flags);

		cn20k_nix_xmit_prepare(txq, tx_pkts[i], &extm, cmd, flags, lso_tun_fmt, &sec,
				       mark_flag, mark_fmt);

		laddr = (uintptr_t)LMT_OFF(lbase, lnum, 0);

		/* Prepare CPT instruction and get nixtx addr */
		if (flags & NIX_TX_OFFLOAD_SECURITY_F && sec)
			cn20k_nix_prep_sec(tx_pkts[i], cmd, &laddr, c_lbase, &c_lnum, &c_loff,
					   &c_shft, sa_base, flags);

		/* Move NIX desc to LMT/NIXTX area */
		cn20k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
		/* Store sg list directly on lmt line */
		segdw = cn20k_nix_prepare_mseg(txq, tx_pkts[i], &extm, (uint64_t *)laddr, flags);
		cn20k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags, segdw, flags);
		/* Security packets go out via CPT, not via the NIX LMTST
		 * below, so only account non-security packets here.
		 */
		if (!(flags & NIX_TX_OFFLOAD_SECURITY_F) || !sec) {
			lnum++;
			data128 |= (((__uint128_t)(segdw - 1)) << shft);
			shft += 3;
		}
	}

	if ((flags & NIX_TX_VWQE_F) && !(ws[3] & BIT_ULL(35)))
		ws[3] = roc_sso_hws_head_wait(ws[0]);

	left -= burst;
	tx_pkts += burst;

	/* Submit CPT instructions if any */
	if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
		uint16_t sec_pkts = ((c_lnum << 1) + c_loff);

		/* Reduce pkts to be sent to CPT */
		burst -= sec_pkts;
		if (flags & NIX_TX_VWQE_F)
			cn20k_nix_vwqe_wait_fc(txq, sec_pkts);
		cn20k_nix_sec_fc_wait(txq, sec_pkts);
		cn20k_nix_sec_steorl(c_io_addr, c_lmt_id, c_lnum, c_loff, c_shft);
	}

	data0 = (uint64_t)data128;
	data1 = (uint64_t)(data128 >> 64);
	/* Make data0 similar to data1 */
	data0 >>= 16;
	/* Trigger LMTST */
	if (burst > 16) {
		/* Two submissions: lines 0-15 then 16-burst */
		pa0 = io_addr | (data0 & 0x7) << 4;
		data0 &= ~0x7ULL;
		/* Move lmtst1..15 sz to bits 63:19 */
		data0 <<= 16;
		data0 |= (15ULL << 12);
		data0 |= (uint64_t)lmt_id;

		if (flags & NIX_TX_VWQE_F)
			cn20k_nix_vwqe_wait_fc(txq, 16);
		/* STEOR0 */
		roc_lmt_submit_steorl(data0, pa0);

		pa1 = io_addr | (data1 & 0x7) << 4;
		data1 &= ~0x7ULL;
		data1 <<= 16;
		data1 |= ((uint64_t)(burst - 17)) << 12;
		data1 |= (uint64_t)(lmt_id + 16);

		if (flags & NIX_TX_VWQE_F)
			cn20k_nix_vwqe_wait_fc(txq, burst - 16);
		/* STEOR1 */
		roc_lmt_submit_steorl(data1, pa1);
	} else if (burst) {
		pa0 = io_addr | (data0 & 0x7) << 4;
		data0 &= ~0x7ULL;
		/* Move lmtst1..15 sz to bits 63:19 */
		data0 <<= 16;
		data0 |= ((burst - 1ULL) << 12);
		data0 |= (uint64_t)lmt_id;

		if (flags & NIX_TX_VWQE_F)
			cn20k_nix_vwqe_wait_fc(txq, burst);
		/* STEOR0 */
		roc_lmt_submit_steorl(data0, pa0);
	}

	rte_io_wmb();
	/* Without Tx completion, external mbufs collected by prefree can be
	 * freed now that the LMTST is ordered by the barrier above.
	 */
	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena) {
		cn20k_nix_free_extmbuf(extm);
		extm = NULL;
	}

	if (left)
		goto again;

	return pkts;
}
+
 #define L3L4CSUM_F   NIX_TX_OFFLOAD_L3_L4_CSUM_F
 #define OL3OL4CSUM_F NIX_TX_OFFLOAD_OL3_OL4_CSUM_F
 #define VLAN_F      NIX_TX_OFFLOAD_VLAN_QINQ_F
@@ -1214,10 +1555,12 @@ NIX_TX_FASTPATH_MODES
        uint16_t __rte_noinline __rte_hot fn(void *tx_queue, struct rte_mbuf 
**tx_pkts,            \
                                             uint16_t pkts)                     
                   \
        {                                                                       
                   \
-               RTE_SET_USED(tx_queue);                                         
                   \
-               RTE_SET_USED(tx_pkts);                                          
                   \
-               RTE_SET_USED(pkts);                                             
                   \
-               return 0;                                                       
                   \
+               uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];            
                   \
+               /* For TSO inner checksum is a must */                          
                   \
+               if (((flags) & NIX_TX_OFFLOAD_TSO_F) && !((flags) & 
NIX_TX_OFFLOAD_L3_L4_CSUM_F))  \
+                       return 0;                                               
                   \
+               return cn20k_nix_xmit_pkts_mseg(tx_queue, NULL, tx_pkts, pkts, 
cmd,                \
+                                               flags | NIX_TX_MULTI_SEG_F);    
                   \
        }
 
 #define NIX_TX_XMIT_VEC(fn, sz, flags)                                         
                    \
@@ -1247,5 +1590,4 @@ uint16_t __rte_noinline __rte_hot 
cn20k_nix_xmit_pkts_all_offload(void *tx_queue
 uint16_t __rte_noinline __rte_hot cn20k_nix_xmit_pkts_vec_all_offload(void 
*tx_queue,
                                                                      struct 
rte_mbuf **tx_pkts,
                                                                      uint16_t 
pkts);
-
 #endif /* __CN20K_TX_H__ */
-- 
2.34.1

Reply via email to