From: Pavan Nikhilesh <pbhagavat...@marvell.com>

Rewrite the DMA fastpath to use NEON instructions and reduce the
number of words read from the per-vchan instruction-header config.

Signed-off-by: Pavan Nikhilesh <pbhagavat...@marvell.com>
---
 drivers/dma/cnxk/cnxk_dmadev.c    | 454 +++--------------------------
 drivers/dma/cnxk/cnxk_dmadev.h    |  89 +++++-
 drivers/dma/cnxk/cnxk_dmadev_fp.c | 455 ++++++++++++++++++++++++++++++
 drivers/dma/cnxk/meson.build      |   2 +-
 4 files changed, 570 insertions(+), 430 deletions(-)
 create mode 100644 drivers/dma/cnxk/cnxk_dmadev_fp.c

diff --git a/drivers/dma/cnxk/cnxk_dmadev.c b/drivers/dma/cnxk/cnxk_dmadev.c
index 35c2b79156..465290ce7a 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.c
+++ b/drivers/dma/cnxk/cnxk_dmadev.c
@@ -2,19 +2,6 @@
  * Copyright (C) 2021 Marvell International Ltd.
  */
 
-#include <string.h>
-#include <unistd.h>
-
-#include <bus_pci_driver.h>
-#include <rte_common.h>
-#include <rte_dmadev.h>
-#include <rte_dmadev_pmd.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_mbuf_pool_ops.h>
-#include <rte_mempool.h>
-#include <rte_pci.h>
-
 #include <cnxk_dmadev.h>
 
 static int cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan);
@@ -166,22 +153,9 @@ cnxk_dmadev_configure(struct rte_dma_dev *dev, const 
struct rte_dma_conf *conf,
        return rc;
 }
 
-static int
-cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-                       const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+static void
+cn9k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct 
rte_dma_vchan_conf *conf)
 {
-       struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-       struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-       union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-       uint16_t max_desc;
-       uint32_t size;
-       int i;
-
-       RTE_SET_USED(conf_sz);
-
-       if (dpivf->flag & CNXK_DPI_DEV_START)
-               return 0;
-
        header->cn9k.pt = DPI_HDR_PT_ZBW_CA;
 
        switch (conf->direction) {
@@ -217,57 +191,11 @@ cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t 
vchan,
                header->cn9k.fport = conf->dst_port.pcie.coreid;
                header->cn9k.pvfe = 0;
        };
-
-       /* Free up descriptor memory before allocating. */
-       cnxk_dmadev_vchan_free(dpivf, vchan);
-
-       max_desc = conf->nb_desc;
-       if (!rte_is_power_of_2(max_desc))
-               max_desc = rte_align32pow2(max_desc);
-
-       if (max_desc > DPI_MAX_DESC)
-               max_desc = DPI_MAX_DESC;
-
-       size = (max_desc * sizeof(struct cnxk_dpi_compl_s *));
-       dpi_conf->c_desc.compl_ptr = rte_zmalloc(NULL, size, 0);
-
-       if (dpi_conf->c_desc.compl_ptr == NULL) {
-               plt_err("Failed to allocate for comp_data");
-               return -ENOMEM;
-       }
-
-       for (i = 0; i < max_desc; i++) {
-               dpi_conf->c_desc.compl_ptr[i] =
-                       rte_zmalloc(NULL, sizeof(struct cnxk_dpi_compl_s), 0);
-               if (!dpi_conf->c_desc.compl_ptr[i]) {
-                       plt_err("Failed to allocate for descriptor memory");
-                       return -ENOMEM;
-               }
-
-               dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
-       }
-
-       dpi_conf->c_desc.max_cnt = (max_desc - 1);
-
-       return 0;
 }
 
-static int
-cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
-                        const struct rte_dma_vchan_conf *conf, uint32_t 
conf_sz)
+static void
+cn10k_dmadev_setup_hdr(union cnxk_dpi_instr_cmd *header, const struct 
rte_dma_vchan_conf *conf)
 {
-       struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
-       struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-       union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-       uint16_t max_desc;
-       uint32_t size;
-       int i;
-
-       RTE_SET_USED(conf_sz);
-
-       if (dpivf->flag & CNXK_DPI_DEV_START)
-               return 0;
-
        header->cn10k.pt = DPI_HDR_PT_ZBW_CA;
 
        switch (conf->direction) {
@@ -303,6 +231,29 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t 
vchan,
                header->cn10k.fport = conf->dst_port.pcie.coreid;
                header->cn10k.pvfe = 0;
        };
+}
+
+static int
+cnxk_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
+                       const struct rte_dma_vchan_conf *conf, uint32_t conf_sz)
+{
+       struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+       struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+       union cnxk_dpi_instr_cmd *header;
+       uint16_t max_desc;
+       uint32_t size;
+       int i;
+
+       RTE_SET_USED(conf_sz);
+
+       header = (union cnxk_dpi_instr_cmd *)&dpi_conf->cmd.u;
+       if (dpivf->flag & CNXK_DPI_DEV_START)
+               return 0;
+
+       if (dpivf->is_cn10k)
+               cn10k_dmadev_setup_hdr(header, conf);
+       else
+               cn9k_dmadev_setup_hdr(header, conf);
 
        /* Free up descriptor memory before allocating. */
        cnxk_dmadev_vchan_free(dpivf, vchan);
@@ -329,6 +280,7 @@ cn10k_dmadev_vchan_setup(struct rte_dma_dev *dev, uint16_t 
vchan,
                        plt_err("Failed to allocate for descriptor memory");
                        return -ENOMEM;
                }
+
                dpi_conf->c_desc.compl_ptr[i]->cdata = DPI_REQ_CDATA;
        }
 
@@ -374,6 +326,11 @@ static int
 cnxk_dmadev_stop(struct rte_dma_dev *dev)
 {
        struct cnxk_dpi_vf_s *dpivf = dev->fp_obj->dev_private;
+       uint64_t reg;
+
+       reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
+       while (!(reg & BIT_ULL(63)))
+               reg = plt_read64(dpivf->rdpi.rbase + DPI_VDMA_SADDR);
 
        roc_dpi_disable(&dpivf->rdpi);
        dpivf->flag &= ~CNXK_DPI_DEV_START;
@@ -396,332 +353,6 @@ cnxk_dmadev_close(struct rte_dma_dev *dev)
        return 0;
 }
 
-static inline int
-__dpi_queue_write(struct cnxk_dpi_vf_s *dpi, uint64_t *cmds, int cmd_count)
-{
-       uint64_t *ptr = dpi->chunk_base;
-
-       if ((cmd_count < DPI_MIN_CMD_SIZE) || (cmd_count > DPI_MAX_CMD_SIZE) || 
cmds == NULL)
-               return -EINVAL;
-
-       /*
-        * Normally there is plenty of room in the current buffer for the
-        * command
-        */
-       if (dpi->chunk_head + cmd_count < dpi->chunk_size_m1) {
-               ptr += dpi->chunk_head;
-               dpi->chunk_head += cmd_count;
-               while (cmd_count--)
-                       *ptr++ = *cmds++;
-       } else {
-               uint64_t *new_buff = NULL;
-               int count;
-
-               if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
-                       plt_dpi_dbg("Failed to alloc next buffer from NPA");
-                       return -ENOMEM;
-               }
-
-               /*
-                * Figure out how many cmd words will fit in this buffer.
-                * One location will be needed for the next buffer pointer.
-                */
-               count = dpi->chunk_size_m1 - dpi->chunk_head;
-               ptr += dpi->chunk_head;
-               cmd_count -= count;
-               while (count--)
-                       *ptr++ = *cmds++;
-
-               /*
-                * chunk next ptr is 2 DWORDS
-                * second DWORD is reserved.
-                */
-               *ptr++ = (uint64_t)new_buff;
-               *ptr = 0;
-
-               /*
-                * The current buffer is full and has a link to the next
-                * buffers. Time to write the rest of the commands into the new
-                * buffer.
-                */
-               dpi->chunk_base = new_buff;
-               dpi->chunk_head = cmd_count;
-               ptr = new_buff;
-               while (cmd_count--)
-                       *ptr++ = *cmds++;
-
-               /* queue index may be greater than pool size */
-               if (dpi->chunk_head == dpi->chunk_size_m1) {
-                       if (rte_mempool_get(dpi->chunk_pool, (void 
**)&new_buff) < 0) {
-                               plt_dpi_dbg("Failed to alloc next buffer from 
NPA");
-                               return -ENOMEM;
-                       }
-                       /* Write next buffer address */
-                       *ptr = (uint64_t)new_buff;
-                       dpi->chunk_base = new_buff;
-                       dpi->chunk_head = 0;
-               }
-       }
-
-       return 0;
-}
-
-static int
-cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t 
dst, uint32_t length,
-                uint64_t flags)
-{
-       struct cnxk_dpi_vf_s *dpivf = dev_private;
-       struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-       union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-       struct cnxk_dpi_compl_s *comp_ptr;
-       uint64_t cmd[DPI_MAX_CMD_SIZE];
-       rte_iova_t fptr, lptr;
-       int num_words = 0;
-       int rc;
-
-       comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-       header->cn9k.ptr = (uint64_t)comp_ptr;
-       STRM_INC(dpi_conf->c_desc, tail);
-
-       if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-               STRM_DEC(dpi_conf->c_desc, tail);
-               return -ENOSPC;
-       }
-
-       header->cn9k.nfst = 1;
-       header->cn9k.nlst = 1;
-
-       /*
-        * For inbound case, src pointers are last pointers.
-        * For all other cases, src pointers are first pointers.
-        */
-       if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-               fptr = dst;
-               lptr = src;
-       } else {
-               fptr = src;
-               lptr = dst;
-       }
-
-       cmd[0] = header->u[0];
-       cmd[1] = header->u[1];
-       cmd[2] = header->u[2];
-       /* word3 is always 0 */
-       num_words += 4;
-       cmd[num_words++] = length;
-       cmd[num_words++] = fptr;
-       cmd[num_words++] = length;
-       cmd[num_words++] = lptr;
-
-       rc = __dpi_queue_write(dpivf, cmd, num_words);
-       if (unlikely(rc)) {
-               STRM_DEC(dpi_conf->c_desc, tail);
-               return rc;
-       }
-
-       if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-               rte_wmb();
-               plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-               dpi_conf->stats.submitted++;
-       } else {
-               dpi_conf->pnum_words += num_words;
-               dpi_conf->pending++;
-       }
-
-       return dpi_conf->desc_idx++;
-}
-
-static int
-cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct 
rte_dma_sge *src,
-                   const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t 
nb_dst, uint64_t flags)
-{
-       struct cnxk_dpi_vf_s *dpivf = dev_private;
-       struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-       union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-       const struct rte_dma_sge *fptr, *lptr;
-       struct cnxk_dpi_compl_s *comp_ptr;
-       uint64_t cmd[DPI_MAX_CMD_SIZE];
-       int num_words = 0;
-       int i, rc;
-
-       comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-       header->cn9k.ptr = (uint64_t)comp_ptr;
-       STRM_INC(dpi_conf->c_desc, tail);
-
-       if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-               STRM_DEC(dpi_conf->c_desc, tail);
-               return -ENOSPC;
-       }
-
-       /*
-        * For inbound case, src pointers are last pointers.
-        * For all other cases, src pointers are first pointers.
-        */
-       if (header->cn9k.xtype == DPI_XTYPE_INBOUND) {
-               header->cn9k.nfst = nb_dst & DPI_MAX_POINTER;
-               header->cn9k.nlst = nb_src & DPI_MAX_POINTER;
-               fptr = &dst[0];
-               lptr = &src[0];
-       } else {
-               header->cn9k.nfst = nb_src & DPI_MAX_POINTER;
-               header->cn9k.nlst = nb_dst & DPI_MAX_POINTER;
-               fptr = &src[0];
-               lptr = &dst[0];
-       }
-
-       cmd[0] = header->u[0];
-       cmd[1] = header->u[1];
-       cmd[2] = header->u[2];
-       num_words += 4;
-       for (i = 0; i < header->cn9k.nfst; i++) {
-               cmd[num_words++] = (uint64_t)fptr->length;
-               cmd[num_words++] = fptr->addr;
-               fptr++;
-       }
-
-       for (i = 0; i < header->cn9k.nlst; i++) {
-               cmd[num_words++] = (uint64_t)lptr->length;
-               cmd[num_words++] = lptr->addr;
-               lptr++;
-       }
-
-       rc = __dpi_queue_write(dpivf, cmd, num_words);
-       if (unlikely(rc)) {
-               STRM_DEC(dpi_conf->c_desc, tail);
-               return rc;
-       }
-
-       if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-               rte_wmb();
-               plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-               dpi_conf->stats.submitted++;
-       } else {
-               dpi_conf->pnum_words += num_words;
-               dpi_conf->pending++;
-       }
-
-       return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, 
rte_iova_t dst,
-                 uint32_t length, uint64_t flags)
-{
-       struct cnxk_dpi_vf_s *dpivf = dev_private;
-       struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-       union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-       struct cnxk_dpi_compl_s *comp_ptr;
-       uint64_t cmd[DPI_MAX_CMD_SIZE];
-       rte_iova_t fptr, lptr;
-       int num_words = 0;
-       int rc;
-
-       comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-       header->cn10k.ptr = (uint64_t)comp_ptr;
-       STRM_INC(dpi_conf->c_desc, tail);
-
-       if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-               STRM_DEC(dpi_conf->c_desc, tail);
-               return -ENOSPC;
-       }
-
-       header->cn10k.nfst = 1;
-       header->cn10k.nlst = 1;
-
-       fptr = src;
-       lptr = dst;
-
-       cmd[0] = header->u[0];
-       cmd[1] = header->u[1];
-       cmd[2] = header->u[2];
-       /* word3 is always 0 */
-       num_words += 4;
-       cmd[num_words++] = length;
-       cmd[num_words++] = fptr;
-       cmd[num_words++] = length;
-       cmd[num_words++] = lptr;
-
-       rc = __dpi_queue_write(dpivf, cmd, num_words);
-       if (unlikely(rc)) {
-               STRM_DEC(dpi_conf->c_desc, tail);
-               return rc;
-       }
-
-       if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-               rte_wmb();
-               plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-               dpi_conf->stats.submitted++;
-       } else {
-               dpi_conf->pnum_words += num_words;
-               dpi_conf->pending++;
-       }
-
-       return dpi_conf->desc_idx++;
-}
-
-static int
-cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct 
rte_dma_sge *src,
-                    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t 
nb_dst,
-                    uint64_t flags)
-{
-       struct cnxk_dpi_vf_s *dpivf = dev_private;
-       struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
-       union dpi_instr_hdr_s *header = &dpi_conf->hdr;
-       const struct rte_dma_sge *fptr, *lptr;
-       struct cnxk_dpi_compl_s *comp_ptr;
-       uint64_t cmd[DPI_MAX_CMD_SIZE];
-       int num_words = 0;
-       int i, rc;
-
-       comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
-       header->cn10k.ptr = (uint64_t)comp_ptr;
-       STRM_INC(dpi_conf->c_desc, tail);
-
-       if (unlikely(dpi_conf->c_desc.tail == dpi_conf->c_desc.head)) {
-               STRM_DEC(dpi_conf->c_desc, tail);
-               return -ENOSPC;
-       }
-
-       header->cn10k.nfst = nb_src & DPI_MAX_POINTER;
-       header->cn10k.nlst = nb_dst & DPI_MAX_POINTER;
-       fptr = &src[0];
-       lptr = &dst[0];
-
-       cmd[0] = header->u[0];
-       cmd[1] = header->u[1];
-       cmd[2] = header->u[2];
-       num_words += 4;
-
-       for (i = 0; i < header->cn10k.nfst; i++) {
-               cmd[num_words++] = (uint64_t)fptr->length;
-               cmd[num_words++] = fptr->addr;
-               fptr++;
-       }
-
-       for (i = 0; i < header->cn10k.nlst; i++) {
-               cmd[num_words++] = (uint64_t)lptr->length;
-               cmd[num_words++] = lptr->addr;
-               lptr++;
-       }
-
-       rc = __dpi_queue_write(dpivf, cmd, num_words);
-       if (unlikely(rc)) {
-               STRM_DEC(dpi_conf->c_desc, tail);
-               return rc;
-       }
-
-       if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
-               rte_wmb();
-               plt_write64(num_words, dpivf->rdpi.rbase + DPI_VDMA_DBELL);
-               dpi_conf->stats.submitted++;
-       } else {
-               dpi_conf->pnum_words += num_words;
-               dpi_conf->pending++;
-       }
-
-       return dpi_conf->desc_idx++;
-}
-
 static uint16_t
 cnxk_dmadev_completed(void *dev_private, uint16_t vchan, const uint16_t 
nb_cpls, uint16_t *last_idx,
                      bool *has_error)
@@ -880,17 +511,6 @@ cnxk_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
        return 0;
 }
 
-static const struct rte_dma_dev_ops cn10k_dmadev_ops = {
-       .dev_close = cnxk_dmadev_close,
-       .dev_configure = cnxk_dmadev_configure,
-       .dev_info_get = cnxk_dmadev_info_get,
-       .dev_start = cnxk_dmadev_start,
-       .dev_stop = cnxk_dmadev_stop,
-       .stats_get = cnxk_stats_get,
-       .stats_reset = cnxk_stats_reset,
-       .vchan_setup = cn10k_dmadev_vchan_setup,
-};
-
 static const struct rte_dma_dev_ops cnxk_dmadev_ops = {
        .dev_close = cnxk_dmadev_close,
        .dev_configure = cnxk_dmadev_configure,
@@ -941,12 +561,8 @@ cnxk_dmadev_probe(struct rte_pci_driver *pci_drv 
__rte_unused, struct rte_pci_de
        dmadev->fp_obj->completed_status = cnxk_dmadev_completed_status;
        dmadev->fp_obj->burst_capacity = cnxk_damdev_burst_capacity;
 
-       if (pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KA ||
-           pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KAS ||
-           pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KA ||
-           pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CNF10KB ||
-           pci_dev->id.subsystem_device_id == PCI_SUBSYSTEM_DEVID_CN10KB) {
-               dmadev->dev_ops = &cn10k_dmadev_ops;
+       if (roc_model_is_cn10k()) {
+               dpivf->is_cn10k = true;
                dmadev->fp_obj->copy = cn10k_dmadev_copy;
                dmadev->fp_obj->copy_sg = cn10k_dmadev_copy_sg;
        }
diff --git a/drivers/dma/cnxk/cnxk_dmadev.h b/drivers/dma/cnxk/cnxk_dmadev.h
index 65f12d844d..c9032de779 100644
--- a/drivers/dma/cnxk/cnxk_dmadev.h
+++ b/drivers/dma/cnxk/cnxk_dmadev.h
@@ -4,14 +4,27 @@
 #ifndef CNXK_DMADEV_H
 #define CNXK_DMADEV_H
 
+#include <string.h>
+#include <unistd.h>
+
+#include <bus_pci_driver.h>
+#include <rte_common.h>
+#include <rte_dmadev.h>
+#include <rte_dmadev_pmd.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_mbuf_pool_ops.h>
+#include <rte_mempool.h>
+#include <rte_pci.h>
+
 #include <roc_api.h>
 
-#define DPI_MAX_POINTER             15
-#define STRM_INC(s, var)     ((s).var = ((s).var + 1) & (s).max_cnt)
-#define STRM_DEC(s, var)     ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : 
((s).var - 1))
-#define DPI_MAX_DESC        2048
-#define DPI_MIN_DESC        2
-#define MAX_VCHANS_PER_QUEUE 4
+#define DPI_MAX_POINTER               15
+#define STRM_INC(s, var)       ((s).var = ((s).var + 1) & (s).max_cnt)
+#define STRM_DEC(s, var)       ((s).var = ((s).var - 1) == -1 ? (s).max_cnt : 
((s).var - 1))
+#define DPI_MAX_DESC          2048
+#define DPI_MIN_DESC          2
+#define MAX_VCHANS_PER_QUEUE   4
 #define DPI_CMD_QUEUE_BUF_SIZE 4096
 #define DPI_CMD_QUEUE_BUFS     1024
 
@@ -21,8 +34,51 @@
 #define DPI_REQ_CDATA 0xFF
 
 #define CNXK_DMA_POOL_MAX_CACHE_SZ (16)
-#define CNXK_DPI_DEV_CONFIG (1ULL << 0)
-#define CNXK_DPI_DEV_START  (1ULL << 1)
+#define CNXK_DPI_DEV_CONFIG       (1ULL << 0)
+#define CNXK_DPI_DEV_START        (1ULL << 1)
+
+union cnxk_dpi_instr_cmd {
+       uint64_t u;
+       struct cn9k_dpi_instr_cmd {
+               uint64_t aura : 20;
+               uint64_t func : 16;
+               uint64_t pt : 2;
+               uint64_t reserved_102 : 1;
+               uint64_t pvfe : 1;
+               uint64_t fl : 1;
+               uint64_t ii : 1;
+               uint64_t fi : 1;
+               uint64_t ca : 1;
+               uint64_t csel : 1;
+               uint64_t reserved_109_111 : 3;
+               uint64_t xtype : 2;
+               uint64_t reserved_114_119 : 6;
+               uint64_t fport : 2;
+               uint64_t reserved_122_123 : 2;
+               uint64_t lport : 2;
+               uint64_t reserved_126_127 : 2;
+               /* Word 1 - End */
+       } cn9k;
+
+       struct cn10k_dpi_instr_cmd {
+               uint64_t nfst : 4;
+               uint64_t reserved_4_5 : 2;
+               uint64_t nlst : 4;
+               uint64_t reserved_10_11 : 2;
+               uint64_t pvfe : 1;
+               uint64_t reserved_13 : 1;
+               uint64_t func : 16;
+               uint64_t aura : 20;
+               uint64_t xtype : 2;
+               uint64_t reserved_52_53 : 2;
+               uint64_t pt : 2;
+               uint64_t fport : 2;
+               uint64_t reserved_58_59 : 2;
+               uint64_t lport : 2;
+               uint64_t reserved_62_63 : 2;
+               /* Word 0 - End */
+       } cn10k;
+};
 
 struct cnxk_dpi_compl_s {
        uint64_t cdata;
@@ -37,26 +93,39 @@ struct cnxk_dpi_cdesc_data_s {
 };
 
 struct cnxk_dpi_conf {
-       union dpi_instr_hdr_s hdr;
+       union cnxk_dpi_instr_cmd cmd;
        struct cnxk_dpi_cdesc_data_s c_desc;
        uint16_t pnum_words;
        uint16_t pending;
        uint16_t desc_idx;
-       uint16_t pad0;
        struct rte_dma_stats stats;
        uint64_t completed_offset;
 };
 
 struct cnxk_dpi_vf_s {
+       /* Fast path*/
        uint64_t *chunk_base;
        uint16_t chunk_head;
        uint16_t chunk_size_m1;
        struct rte_mempool *chunk_pool;
        struct cnxk_dpi_conf conf[MAX_VCHANS_PER_QUEUE];
+       /* Slow path */
        struct roc_dpi rdpi;
        uint32_t aura;
        uint16_t num_vchans;
        uint16_t flag;
+       uint8_t is_cn10k;
 } __plt_cache_aligned;
 
+int cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, 
rte_iova_t dst,
+                    uint32_t length, uint64_t flags);
+int cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct 
rte_dma_sge *src,
+                       const struct rte_dma_sge *dst, uint16_t nb_src, 
uint16_t nb_dst,
+                       uint64_t flags);
+int cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, 
rte_iova_t dst,
+                     uint32_t length, uint64_t flags);
+int cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct 
rte_dma_sge *src,
+                        const struct rte_dma_sge *dst, uint16_t nb_src, 
uint16_t nb_dst,
+                        uint64_t flags);
+
 #endif
diff --git a/drivers/dma/cnxk/cnxk_dmadev_fp.c 
b/drivers/dma/cnxk/cnxk_dmadev_fp.c
new file mode 100644
index 0000000000..db1e57bf51
--- /dev/null
+++ b/drivers/dma/cnxk/cnxk_dmadev_fp.c
@@ -0,0 +1,455 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2021 Marvell International Ltd.
+ */
+
+#include <rte_vect.h>
+
+#include "cnxk_dmadev.h"
+
+#define DMA_DW_PER_SINGLE_CMD 8
+#define DMA_HDR_LEN          4
+#define DMA_CMD_LEN(src, dst) (DMA_HDR_LEN + (src << 1) + (dst << 1))
+
+static __plt_always_inline void
+__dpi_cpy_scalar(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+       uint8_t i;
+
+       for (i = 0; i < n; i++)
+               dst[i] = src[i];
+}
+
+static __plt_always_inline void
+__dpi_cpy_scalar_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+       uint8_t i;
+
+       for (i = 0; i < n; i++) {
+               *dst++ = src[i].length;
+               *dst++ = src[i].addr;
+       }
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_scalar_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t 
n, uint16_t lmt)
+{
+       uint8_t i;
+
+       for (i = 0; i < n && lmt; i++) {
+               *dst++ = src[i].length;
+               *dst++ = src[i].addr;
+               lmt -= 2;
+       }
+
+       return i;
+}
+
+#if defined(RTE_ARCH_ARM64)
+static __plt_always_inline void
+__dpi_cpy_vector(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+       uint64x2_t vec;
+       uint8_t i;
+
+       for (i = 0; i < n; i += 2) {
+               vec = vld1q_u64((const uint64_t *)&src[i]);
+               vst1q_u64(&dst[i], vec);
+       }
+}
+
+static __plt_always_inline void
+__dpi_cpy_vector_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+       uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+       uint64x2_t vec;
+       uint8_t i;
+
+       for (i = 0; i < n; i++) {
+               vec = vld1q_u64((const uint64_t *)&src[i]);
+               vec = vextq_u64(vec, vec, 1);
+               vec = vandq_u64(vec, mask);
+               vst1q_u64(dst, vec);
+               dst += 2;
+       }
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_vector_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t 
n, uint16_t lmt)
+{
+       uint64x2_t mask = {0xFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
+       uint64x2_t vec;
+       uint8_t i;
+
+       for (i = 0; i < n && lmt; i++) {
+               vec = vld1q_u64((const uint64_t *)&src[i]);
+               vec = vextq_u64(vec, vec, 1);
+               vec = vandq_u64(vec, mask);
+               vst1q_u64(dst, vec);
+               dst += 2;
+               lmt -= 2;
+       }
+
+       return i;
+}
+#endif
+
+static __plt_always_inline void
+__dpi_cpy(uint64_t *src, uint64_t *dst, uint8_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+       __dpi_cpy_vector(src, dst, n);
+#else
+       __dpi_cpy_scalar(src, dst, n);
+#endif
+}
+
+static __plt_always_inline void
+__dpi_cpy_sg(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n)
+{
+#if defined(RTE_ARCH_ARM64)
+       __dpi_cpy_vector_sg(src, dst, n);
+#else
+       __dpi_cpy_scalar_sg(src, dst, n);
+#endif
+}
+
+static __plt_always_inline uint8_t
+__dpi_cpy_sg_lmt(const struct rte_dma_sge *src, uint64_t *dst, uint16_t n, 
uint16_t lmt)
+{
+#if defined(RTE_ARCH_ARM64)
+       return __dpi_cpy_vector_sg_lmt(src, dst, n, lmt);
+#else
+       return __dpi_cpy_scalar_sg_lmt(src, dst, n, lmt);
+#endif
+}
+
+static __plt_always_inline int
+__dpi_queue_write_single(struct cnxk_dpi_vf_s *dpi, uint64_t *cmd)
+{
+       uint64_t *ptr = dpi->chunk_base;
+
+       /*
+        * Normally there is plenty of room in the current buffer for the
+        * command
+        */
+       if (dpi->chunk_head + DMA_DW_PER_SINGLE_CMD < dpi->chunk_size_m1) {
+               ptr += dpi->chunk_head;
+
+               __dpi_cpy_scalar(cmd, ptr, DMA_DW_PER_SINGLE_CMD);
+               dpi->chunk_head += DMA_DW_PER_SINGLE_CMD;
+       } else {
+               uint64_t *new_buff = NULL;
+               int count;
+
+               if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+                       plt_dpi_dbg("Failed to alloc next buffer from NPA");
+                       return -ENOSPC;
+               }
+
+               /*
+                * Figure out how many cmd words will fit in this buffer.
+                * One location will be needed for the next buffer pointer.
+                */
+               count = dpi->chunk_size_m1 - dpi->chunk_head;
+               ptr += dpi->chunk_head;
+
+               __dpi_cpy_scalar(cmd, ptr, count);
+
+               ptr += count;
+               *ptr = (uint64_t)new_buff;
+               ptr = new_buff;
+
+               __dpi_cpy_scalar(cmd + count, ptr, DMA_DW_PER_SINGLE_CMD - 
count);
+
+               /*
+                * The current buffer is full and has a link to the next
+                * buffers. Time to write the rest of the commands into
+                * the new buffer.
+                */
+               dpi->chunk_base = new_buff;
+               dpi->chunk_head = DMA_DW_PER_SINGLE_CMD - count;
+       }
+
+       return 0;
+}
+
+static __plt_always_inline int
+__dpi_queue_write_sg(struct cnxk_dpi_vf_s *dpi, uint64_t *hdr, const struct 
rte_dma_sge *src,
+                    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t 
nb_dst)
+{
+       uint8_t cmd_len = DMA_CMD_LEN(nb_src, nb_dst);
+       uint64_t *ptr = dpi->chunk_base;
+
+       /*
+        * Normally there is plenty of room in the current buffer for the
+        * command
+        */
+       if (dpi->chunk_head + cmd_len < dpi->chunk_size_m1) {
+               ptr += dpi->chunk_head;
+
+               __dpi_cpy(hdr, ptr, DMA_HDR_LEN);
+               ptr += DMA_HDR_LEN;
+               __dpi_cpy_sg(src, ptr, nb_src);
+               ptr += (nb_src << 1);
+               __dpi_cpy_sg(dst, ptr, nb_dst);
+
+               dpi->chunk_head += cmd_len;
+       } else {
+               uint64_t *new_buff = NULL, *buf;
+               uint16_t count;
+
+               if (rte_mempool_get(dpi->chunk_pool, (void **)&new_buff) < 0) {
+                       plt_dpi_dbg("Failed to alloc next buffer from NPA");
+                       return -ENOSPC;
+               }
+
+               /*
+                * Figure out how many cmd words will fit in this buffer.
+                * One location will be needed for the next buffer pointer.
+                */
+               count = dpi->chunk_size_m1 - dpi->chunk_head;
+               ptr += dpi->chunk_head;
+               buf = new_buff;
+               if (count <= 4) {
+                       __dpi_cpy(hdr, ptr, count);
+                       ptr += count;
+                       __dpi_cpy(&hdr[count], buf, 4);
+                       buf += (4 - count);
+               } else {
+                       uint8_t i;
+
+                       __dpi_cpy(hdr, ptr, 4);
+                       ptr += 4;
+                       count -= 4;
+
+                       i = __dpi_cpy_sg_lmt(src, ptr, nb_src, count);
+                       src += i;
+                       nb_src -= i;
+                       count -= (i << 1);
+                       ptr += (i << 1);
+
+                       i = __dpi_cpy_sg_lmt(dst, ptr, nb_dst, count);
+                       dst += i;
+                       nb_dst -= i;
+                       ptr += (i << 1);
+               }
+               *ptr = (uint64_t)new_buff;
+
+               __dpi_cpy_sg(src, buf, nb_src);
+               buf += (nb_src << 1);
+
+               __dpi_cpy_sg(dst, buf, nb_dst);
+               buf += (nb_dst << 1);
+
+               /*
+                * The current buffer is full and has a link to the next
+                * buffers. Time to write the rest of the commands into
+                * the new buffer.
+                */
+               dpi->chunk_base = new_buff;
+               dpi->chunk_head = buf - new_buff;
+       }
+
+       return 0;
+}
+
+/* CN9K fast-path: enqueue one src->dst copy as a single 8-dword DPI command.
+ *
+ * dev_private: per-VF state (struct cnxk_dpi_vf_s); vchan selects the
+ * per-channel conf. flags may carry RTE_DMA_OP_FLAG_SUBMIT to ring the
+ * doorbell immediately; otherwise the dword count is accumulated in
+ * pnum_words for a later submit.
+ *
+ * Returns the descriptor ticket (desc_idx) on success, -ENOSPC when the
+ * completion ring is full, or the error from __dpi_queue_write_single.
+ */
+int
+cnxk_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, rte_iova_t 
dst, uint32_t length,
+                uint64_t flags)
+{
+       struct cnxk_dpi_vf_s *dpivf = dev_private;
+       struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+       uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+       struct cnxk_dpi_compl_s *comp_ptr;
+       int rc;
+
+       /* Completion ring is full when advancing tail would meet head. */
+       if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+                    dpi_conf->c_desc.head))
+               return -ENOSPC;
+
+       /* Reserve the completion slot for this transfer. */
+       comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+       STRM_INC(dpi_conf->c_desc, tail);
+
+       /*
+        * Word 0 packs the pointer counts: one last-pointer (bit 54) and one
+        * first-pointer (bit 48), mirroring the SG variant's header layout.
+        * Word 1 is the preconfigured instruction header, word 2 the
+        * completion pointer; words 4/6 carry the length for each side.
+        * NOTE(review): cmd[3] is never initialized here — confirm the
+        * hardware ignores word 3 in this command layout.
+        */
+       cmd[0] = (1UL << 54) | (1UL << 48);
+       cmd[1] = dpi_conf->cmd.u;
+       cmd[2] = (uint64_t)comp_ptr;
+       cmd[4] = length;
+       cmd[6] = length;
+
+       /*
+        * For inbound case, src pointers are last pointers.
+        * For all other cases, src pointers are first pointers.
+        */
+       if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == 
DPI_XTYPE_INBOUND) {
+               cmd[5] = dst;
+               cmd[7] = src;
+       } else {
+               cmd[5] = src;
+               cmd[7] = dst;
+       }
+
+       rc = __dpi_queue_write_single(dpivf, cmd);
+       if (unlikely(rc)) {
+               /* Queue write failed: release the reserved completion slot. */
+               STRM_DEC(dpi_conf->c_desc, tail);
+               return rc;
+       }
+
+       if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+               /* Order command stores before the doorbell write, then kick
+                * the hardware with the total accumulated dword count.
+                */
+               rte_wmb();
+               plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+                           dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+               dpi_conf->stats.submitted += dpi_conf->pending + 1;
+               dpi_conf->pnum_words = 0;
+               dpi_conf->pending = 0;
+       } else {
+               /* Defer the doorbell: remember this command's dword count. */
+               dpi_conf->pnum_words += DMA_DW_PER_SINGLE_CMD;
+               dpi_conf->pending++;
+       }
+
+       return dpi_conf->desc_idx++;
+}
+
+/* CN9K fast-path: enqueue a scatter-gather transfer.
+ *
+ * src/dst are arrays of nb_src/nb_dst rte_dma_sge entries. The 4-dword
+ * header plus both pointer lists are written via __dpi_queue_write_sg.
+ * flags may carry RTE_DMA_OP_FLAG_SUBMIT (ring doorbell now) — otherwise
+ * the dword count accumulates in pnum_words.
+ *
+ * Returns the descriptor ticket (desc_idx) on success, -ENOSPC when the
+ * completion ring is full, or the error from __dpi_queue_write_sg.
+ */
+int
+cnxk_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct 
rte_dma_sge *src,
+                   const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t 
nb_dst, uint64_t flags)
+{
+       struct cnxk_dpi_vf_s *dpivf = dev_private;
+       struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+       const struct rte_dma_sge *fptr, *lptr;
+       struct cnxk_dpi_compl_s *comp_ptr;
+       uint64_t hdr[4];
+       int rc;
+
+       /* Completion ring is full when advancing tail would meet head. */
+       if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+                    dpi_conf->c_desc.head))
+               return -ENOSPC;
+
+       /* Reserve the completion slot for this transfer. */
+       comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+       STRM_INC(dpi_conf->c_desc, tail);
+
+       /* Word 1: preconfigured instruction header; word 2: completion ptr.
+        * NOTE(review): hdr[3] is never set — confirm __dpi_queue_write_sg
+        * only consumes the words initialized here.
+        */
+       hdr[1] = dpi_conf->cmd.u;
+       hdr[2] = (uint64_t)comp_ptr;
+
+       /*
+        * For inbound case, src pointers are last pointers.
+        * For all other cases, src pointers are first pointers.
+        */
+       if (((dpi_conf->cmd.u >> 48) & DPI_HDR_XTYPE_MASK) == 
DPI_XTYPE_INBOUND) {
+               fptr = dst;
+               lptr = src;
+               RTE_SWAP(nb_src, nb_dst);
+       } else {
+               fptr = src;
+               lptr = dst;
+       }
+       /* After the swap above, nb_src counts fptr entries and nb_dst counts
+        * lptr entries: bit 48 holds the first-pointer count, bit 54 the
+        * last-pointer count.
+        */
+       hdr[0] = ((uint64_t)nb_dst << 54) | (uint64_t)nb_src << 48;
+
+       rc = __dpi_queue_write_sg(dpivf, hdr, fptr, lptr, nb_src, nb_dst);
+       if (unlikely(rc)) {
+               /* Queue write failed: release the reserved completion slot. */
+               STRM_DEC(dpi_conf->c_desc, tail);
+               return rc;
+       }
+
+       if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+               /* Order command stores before the doorbell write, then kick
+                * the hardware with the total accumulated dword count.
+                */
+               rte_wmb();
+               plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+                           dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+               dpi_conf->stats.submitted += dpi_conf->pending + 1;
+               dpi_conf->pnum_words = 0;
+               dpi_conf->pending = 0;
+       } else {
+               /* Defer the doorbell: remember this command's dword count. */
+               dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+               dpi_conf->pending++;
+       }
+
+       return dpi_conf->desc_idx++;
+}
+
+/* CN10K fast-path: enqueue one src->dst copy as a single 8-dword DPI command.
+ *
+ * Unlike the CN9K layout, the CN10K header carries the pointer counts in
+ * its low bits (dst count at bit 6, src count at bit 0) and the pointer
+ * order is fixed — no inbound swap is needed. flags may carry
+ * RTE_DMA_OP_FLAG_SUBMIT to ring the doorbell immediately.
+ *
+ * Returns the descriptor ticket (desc_idx) on success, -ENOSPC when the
+ * completion ring is full, or the error from __dpi_queue_write_single.
+ */
+int
+cn10k_dmadev_copy(void *dev_private, uint16_t vchan, rte_iova_t src, 
rte_iova_t dst,
+                 uint32_t length, uint64_t flags)
+{
+       struct cnxk_dpi_vf_s *dpivf = dev_private;
+       struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+       uint64_t cmd[DMA_DW_PER_SINGLE_CMD];
+       struct cnxk_dpi_compl_s *comp_ptr;
+       int rc;
+
+       /* Completion ring is full when advancing tail would meet head. */
+       if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+                    dpi_conf->c_desc.head))
+               return -ENOSPC;
+
+       /* Reserve the completion slot for this transfer. */
+       comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+       STRM_INC(dpi_conf->c_desc, tail);
+
+       /* Header: preconfigured command word with one dst pointer (bit 6)
+        * and one src pointer (bit 0), then the completion pointer.
+        * NOTE(review): cmd[3] is never initialized here — confirm the
+        * hardware ignores word 3 in this command layout.
+        */
+       cmd[0] = dpi_conf->cmd.u | (1U << 6) | 1U;
+       cmd[1] = (uint64_t)comp_ptr;
+       cmd[2] = 0;
+       cmd[4] = length;
+       cmd[5] = src;
+       cmd[6] = length;
+       cmd[7] = dst;
+
+       rc = __dpi_queue_write_single(dpivf, cmd);
+       if (unlikely(rc)) {
+               /* Queue write failed: release the reserved completion slot. */
+               STRM_DEC(dpi_conf->c_desc, tail);
+               return rc;
+       }
+
+       if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+               /* Order command stores before the doorbell write, then kick
+                * the hardware with the total accumulated dword count.
+                */
+               rte_wmb();
+               plt_write64(dpi_conf->pnum_words + DMA_DW_PER_SINGLE_CMD,
+                           dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+               dpi_conf->stats.submitted += dpi_conf->pending + 1;
+               dpi_conf->pnum_words = 0;
+               dpi_conf->pending = 0;
+       } else {
+               /* Defer the doorbell. Use DMA_DW_PER_SINGLE_CMD instead of a
+                * magic 8 for consistency with the doorbell write above and
+                * with cnxk_dmadev_copy() (the macro equals 8: cmd[] holds
+                * indices 0..7).
+                */
+               dpi_conf->pnum_words += DMA_DW_PER_SINGLE_CMD;
+               dpi_conf->pending++;
+       }
+
+       return dpi_conf->desc_idx++;
+}
+
+/* CN10K fast-path: enqueue a scatter-gather transfer.
+ *
+ * src/dst are arrays of nb_src/nb_dst rte_dma_sge entries. The pointer
+ * counts are packed into the low bits of the header (dst count at bit 6,
+ * src count at bit 0), mirroring cn10k_dmadev_copy(); no inbound pointer
+ * swap is needed on CN10K. flags may carry RTE_DMA_OP_FLAG_SUBMIT.
+ *
+ * Returns the descriptor ticket (desc_idx) on success, -ENOSPC when the
+ * completion ring is full, or the error from __dpi_queue_write_sg.
+ */
+int
+cn10k_dmadev_copy_sg(void *dev_private, uint16_t vchan, const struct 
rte_dma_sge *src,
+                    const struct rte_dma_sge *dst, uint16_t nb_src, uint16_t 
nb_dst,
+                    uint64_t flags)
+{
+       struct cnxk_dpi_vf_s *dpivf = dev_private;
+       struct cnxk_dpi_conf *dpi_conf = &dpivf->conf[vchan];
+       struct cnxk_dpi_compl_s *comp_ptr;
+       uint64_t hdr[4];
+       int rc;
+
+       /* Completion ring is full when advancing tail would meet head. */
+       if (unlikely(((dpi_conf->c_desc.tail + 1) & dpi_conf->c_desc.max_cnt) ==
+                    dpi_conf->c_desc.head))
+               return -ENOSPC;
+
+       /* Reserve the completion slot for this transfer. */
+       comp_ptr = dpi_conf->c_desc.compl_ptr[dpi_conf->c_desc.tail];
+       STRM_INC(dpi_conf->c_desc, tail);
+
+       /* NOTE(review): hdr[3] is never set — confirm __dpi_queue_write_sg
+        * only consumes the words initialized here.
+        */
+       hdr[0] = dpi_conf->cmd.u | (nb_dst << 6) | nb_src;
+       hdr[1] = (uint64_t)comp_ptr;
+       hdr[2] = 0;
+
+       rc = __dpi_queue_write_sg(dpivf, hdr, src, dst, nb_src, nb_dst);
+       if (unlikely(rc)) {
+               /* Queue write failed: release the reserved completion slot. */
+               STRM_DEC(dpi_conf->c_desc, tail);
+               return rc;
+       }
+
+       if (flags & RTE_DMA_OP_FLAG_SUBMIT) {
+               /* Order command stores before the doorbell write, then kick
+                * the hardware with the total accumulated dword count.
+                */
+               rte_wmb();
+               plt_write64(dpi_conf->pnum_words + DMA_CMD_LEN(nb_src, nb_dst),
+                           dpivf->rdpi.rbase + DPI_VDMA_DBELL);
+               dpi_conf->stats.submitted += dpi_conf->pending + 1;
+               dpi_conf->pnum_words = 0;
+               dpi_conf->pending = 0;
+       } else {
+               /* Defer the doorbell: remember this command's dword count. */
+               dpi_conf->pnum_words += DMA_CMD_LEN(nb_src, nb_dst);
+               dpi_conf->pending++;
+       }
+
+       return dpi_conf->desc_idx++;
+}
diff --git a/drivers/dma/cnxk/meson.build b/drivers/dma/cnxk/meson.build
index b868fb14cb..a35b3a3b70 100644
--- a/drivers/dma/cnxk/meson.build
+++ b/drivers/dma/cnxk/meson.build
@@ -2,5 +2,5 @@
 # Copyright(C) 2021 Marvell International Ltd.
 
 deps += ['bus_pci', 'common_cnxk', 'dmadev']
-sources = files('cnxk_dmadev.c')
+sources = files('cnxk_dmadev.c', 'cnxk_dmadev_fp.c')
 require_iova_in_mbuf = false
-- 
2.25.1

Reply via email to