MPRQ (Multi-Packet Rx Queue) processes one packet at a time using
simple scalar instructions. MPRQ works by posting a single large buffer
(consisting of multiple fixed-size strides) in order to receive multiple
packets at once from this buffer. An Rx packet is then either copied to
a user-provided mbuf, or the PMD attaches the Rx packet to the mbuf via
a pointer to the external buffer.
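
For reference, a packet's address inside an MPRQ buffer is derived from
its stride index. A minimal sketch of that calculation (the helper name
is hypothetical; the offset math mirrors rxq_copy_mprq_mbuf_v() below):

    /* Sketch: locate stride 'strd_idx' in an MPRQ buffer whose strides
     * are 'strd_sz' bytes each; 'strd_shift' is the optional shift. */
    static inline void *
    mprq_stride_addr(void *buf_addr, unsigned int strd_idx,
                     unsigned int strd_sz, unsigned int strd_shift)
    {
            return (char *)buf_addr + (size_t)strd_idx * strd_sz + strd_shift;
    }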

There is an opportunity to speed up packet receiving by processing
4 packets simultaneously using SIMD (single instruction, multiple data)
extensions: allocate mbufs in batches for every MPRQ buffer and process
the packets in groups of 4 until all the strides are exhausted, then
switch to the next MPRQ buffer and repeat the process.
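
Conceptually, one vectorized MPRQ burst then looks like the following
sketch (helper names are illustrative only, not the actual PMD code):

    /* Conceptual sketch of one burst, not the actual PMD code. */
    if (consumed_strd == strd_n) {          /* current MPRQ buffer done */
            switch_to_next_mprq_buf(rxq);   /* advance rq_ci, replace WQE */
            consumed_strd = 0;
    }
    replenish_mbufs_in_bulk(rxq, strd_n);   /* one mbuf batch per buffer */
    for (pos = 0; pos < n; pos += 4)        /* 4 CQEs per SIMD iteration */
            parse_4_cqes_simd(rxq, &cq[pos], &pkts[pos]);
    copy_or_attach_strides(rxq, pkts, n);   /* memcpy or extbuf attach */
    consumed_strd += n;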

The vectorized MPRQ burst routine is engaged automatically when the
mprq_en=1 devarg is specified and vectorization is not explicitly
disabled with the rx_vec_en=0 devarg. There are two limitations:
- LRO is not supported; scalar MPRQ is selected if it is enabled.
- CQE compression is disabled when vectorized MPRQ is engaged.
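
The resulting Rx burst function selection, simplified from
mlx5_select_rx_function() in this patch:

    if (mlx5_check_vec_rx_support(dev) > 0 && mlx5_mprq_enabled(dev))
            rx_pkt_burst = mlx5_rx_burst_mprq_vec; /* vectorized MPRQ */
    else if (mlx5_check_vec_rx_support(dev) > 0)
            rx_pkt_burst = mlx5_rx_burst_vec;      /* vectorized single-packet RQ */
    else if (mlx5_mprq_enabled(dev))
            rx_pkt_burst = mlx5_rx_burst_mprq;     /* scalar MPRQ */
    else
            rx_pkt_burst = mlx5_rx_burst;          /* scalar default */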

Signed-off-by: Alexander Kozyrev <akozy...@mellanox.com>
---
 drivers/net/mlx5/linux/mlx5_os.c         |   4 +
 drivers/net/mlx5/mlx5_ethdev.c           |  12 +-
 drivers/net/mlx5/mlx5_rxq.c              |  80 +--
 drivers/net/mlx5/mlx5_rxtx.c             |  30 +-
 drivers/net/mlx5/mlx5_rxtx.h             |   9 +-
 drivers/net/mlx5/mlx5_rxtx_vec.c         |  38 +-
 drivers/net/mlx5/mlx5_rxtx_vec.h         |  21 +
 drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 724 +++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 577 ++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 520 ++++++++++++++++
 10 files changed, 1968 insertions(+), 47 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 742e2fba49..927fa07270 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -568,6 +568,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
                cqe_comp = 0;
        else
                cqe_comp = 1;
+       if (config.mprq.enabled)
+               cqe_comp = 0;
        config.cqe_comp = cqe_comp;
 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
        /* Whether device supports 128B Rx CQE padding. */
@@ -973,6 +975,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
                                " setting default value (%u)",
                                1 << config.mprq.stride_num_n);
                }
+               if (config.mprq.stride_size_n)
+                       config.rx_vec_en = false;
                if (config.mprq.stride_size_n &&
                    (config.mprq.stride_size_n > mprq_max_stride_size_n ||
                     config.mprq.stride_size_n < mprq_min_stride_size_n)) {
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index cefb45064e..f48e8ea293 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -421,7 +421,8 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
 
        if (dev->rx_pkt_burst == mlx5_rx_burst ||
            dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
-           dev->rx_pkt_burst == mlx5_rx_burst_vec)
+           dev->rx_pkt_burst == mlx5_rx_burst_vec ||
+           dev->rx_pkt_burst == mlx5_rx_burst_mprq_vec)
                return ptypes;
        return NULL;
 }
@@ -479,12 +480,19 @@ mlx5_select_rx_function(struct rte_eth_dev *dev)
        eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
 
        MLX5_ASSERT(dev != NULL);
-       if (mlx5_check_vec_rx_support(dev) > 0) {
+       if (mlx5_check_vec_rx_support(dev) > 0 &&
+               mlx5_mprq_enabled(dev)) {
+               rx_pkt_burst = mlx5_rx_burst_mprq_vec;
+               DRV_LOG(DEBUG, "port %u selected Multi-Packet Rx vectorized function",
+                       dev->data->port_id);
+       } else if (mlx5_check_vec_rx_support(dev) > 0) {
                rx_pkt_burst = mlx5_rx_burst_vec;
                DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
                        dev->data->port_id);
        } else if (mlx5_mprq_enabled(dev)) {
                rx_pkt_burst = mlx5_rx_burst_mprq;
+               DRV_LOG(DEBUG, "port %u selected Multi-Packet Rx function",
+                       dev->data->port_id);
        }
        return rx_pkt_burst;
 }
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 67d996cabf..06e7650be9 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -164,7 +164,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
                        rxq->mprq_repl = buf;
        }
        DRV_LOG(DEBUG,
-               "port %u Rx queue %u allocated and configured %u segments",
+               "port %u Multi-Packet Rx queue %u allocated and configured %u segments",
                rxq->port_id, rxq->idx, wqe_n);
        return 0;
 error:
@@ -176,7 +176,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
                                        (*rxq->mprq_bufs)[i]);
                (*rxq->mprq_bufs)[i] = NULL;
        }
-       DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
+       DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u failed, freed everything",
                rxq->port_id, rxq->idx);
        rte_errno = err; /* Restore rte_errno. */
        return -rte_errno;
@@ -194,11 +194,14 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 static int
 rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
+       struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
        const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
        unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
        unsigned int i;
        int err;
 
+       if (mlx5_rxq_mprq_enabled(rxq))
+               elts_n *= (1U << rxq_ctrl->rxq.strd_num_n);
        /* Iterate on segments. */
        for (i = 0; (i != elts_n); ++i) {
                struct rte_mbuf *buf;
@@ -284,8 +287,10 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 int
 rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
-       return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
-              rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl);
+       int ret = 0;
+       if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
+               ret = rxq_alloc_elts_mprq(rxq_ctrl);
+       return (ret || rxq_alloc_elts_sprq(rxq_ctrl));
 }
 
 /**
@@ -304,7 +309,6 @@ rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
                rxq->port_id, rxq->idx);
        if (rxq->mprq_bufs == NULL)
                return;
-       MLX5_ASSERT(mlx5_rxq_check_vec_support(rxq) < 0);
        for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
                if ((*rxq->mprq_bufs)[i] != NULL)
                        mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]);
@@ -326,15 +330,19 @@ static void
 rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
        struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
-       const uint16_t q_n = (1 << rxq->elts_n);
-       const uint16_t q_mask = q_n - 1;
-       uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
+       unsigned int q_n = (1 << rxq->elts_n);
+       uint16_t q_mask;
+       uint16_t used;
        uint16_t i;
 
        DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs",
                PORT_ID(rxq_ctrl->priv), rxq->idx);
        if (rxq->elts == NULL)
                return;
+       if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
+               q_n *= (1U << rxq_ctrl->rxq.strd_num_n);
+       q_mask = q_n - 1;
+       used = q_n - (rxq->rq_ci - rxq->rq_pi);
        /**
         * Some mbuf in the Ring belongs to the application.  They cannot be
         * freed.
@@ -344,7 +352,7 @@ rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
                        (*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
                rxq->rq_pi = rxq->rq_ci;
        }
-       for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
+       for (i = 0; (i != q_n); ++i) {
                if ((*rxq->elts)[i] != NULL)
                        rte_pktmbuf_free_seg((*rxq->elts)[i]);
                (*rxq->elts)[i] = NULL;
@@ -362,8 +370,7 @@ rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
        if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
                rxq_free_elts_mprq(rxq_ctrl);
-       else
-               rxq_free_elts_sprq(rxq_ctrl);
+       rxq_free_elts_sprq(rxq_ctrl);
 }
 
 /**
@@ -1793,20 +1800,10 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
        struct mlx5_priv *priv = dev->data->dev_private;
        struct mlx5_rxq_ctrl *tmpl;
        unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
-       unsigned int mprq_stride_nums;
-       unsigned int mprq_stride_size;
-       unsigned int mprq_stride_cap;
        struct mlx5_dev_config *config = &priv->config;
-       /*
-        * Always allocate extra slots, even if eventually
-        * the vector Rx will not be used.
-        */
-       uint16_t desc_n =
-               desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
        uint64_t offloads = conf->offloads |
                           dev->data->dev_conf.rxmode.offloads;
        unsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO);
-       const int mprq_en = mlx5_check_mprq_support(dev) > 0;
        unsigned int max_rx_pkt_len = lro_on_queue ?
                        dev->data->dev_conf.rxmode.max_lro_pkt_size :
                        dev->data->dev_conf.rxmode.max_rx_pkt_len;
@@ -1814,6 +1811,23 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
                                                        RTE_PKTMBUF_HEADROOM;
        unsigned int max_lro_size = 0;
        unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM;
+       const int mprq_en = mlx5_check_mprq_support(dev) > 0;
+       unsigned int mprq_stride_nums = config->mprq.stride_num_n ?
+               config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
+       unsigned int mprq_stride_size = non_scatter_min_mbuf_size <=
+               (1U << config->mprq.max_stride_size_n) ?
+               log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;
+       unsigned int mprq_stride_cap = (config->mprq.stride_num_n ?
+               (1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *
+                       (config->mprq.stride_size_n ?
+               (1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));
+       /*
+        * Always allocate extra slots, even if eventually
+        * the vector Rx will not be used.
+        */
+       uint16_t desc_n = desc +
+               config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP *
+               (desc >> mprq_stride_nums);
 
        if (non_scatter_min_mbuf_size > mb_len && !(offloads &
                                                    DEV_RX_OFFLOAD_SCATTER)) {
@@ -1825,8 +1839,12 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
                rte_errno = ENOSPC;
                return NULL;
        }
-       tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) +
-                          desc_n * sizeof(struct rte_mbuf *), 0, socket);
+       tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
+                                sizeof(*tmpl) +
+                                desc_n * sizeof(struct rte_mbuf *) +
+                                (desc >> mprq_stride_nums) *
+                                sizeof(struct mlx5_mprq_buf *),
+                                0, socket);
        if (!tmpl) {
                rte_errno = ENOMEM;
                return NULL;
@@ -1840,15 +1858,6 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
        tmpl->socket = socket;
        if (dev->data->dev_conf.intr_conf.rxq)
                tmpl->irq = 1;
-       mprq_stride_nums = config->mprq.stride_num_n ?
-               config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
-       mprq_stride_size = non_scatter_min_mbuf_size <=
-               (1U << config->mprq.max_stride_size_n) ?
-               log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;
-       mprq_stride_cap = (config->mprq.stride_num_n ?
-               (1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *
-                       (config->mprq.stride_size_n ?
-               (1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));
        /*
         * This Rx queue can be configured as a Multi-Packet RQ if all of the
         * following conditions are met:
@@ -1996,7 +2005,12 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
        tmpl->rxq.rq_repl_thresh =
                MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);
        tmpl->rxq.elts =
-               (struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
+               (struct rte_mbuf *(*)[desc_n])(tmpl + 1);
+       if (mlx5_rxq_mprq_enabled(&tmpl->rxq)) {
+               tmpl->rxq.rq_repl_thresh = 1;
+               tmpl->rxq.mprq_bufs =
+                       (struct mlx5_mprq_buf *(*)[desc])(tmpl + desc_n + 1);
+       }
 #ifndef RTE_ARCH_64
        tmpl->rxq.uar_lock_cq = &priv->sh->uar_lock_cq;
 #endif
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 65239f9ffe..768a242518 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -614,6 +614,16 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
                snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
 #else
                return -EINVAL;
+#endif
+       } else if (pkt_burst == mlx5_rx_burst_mprq_vec) {
+#if defined RTE_ARCH_X86_64
+               snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ Vector SSE");
+#elif defined RTE_ARCH_ARM64
+               snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ Vector Neon");
+#elif defined RTE_ARCH_PPC_64
+               snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ Vector AltiVec");
+#else
+               return -EINVAL;
 #endif
        } else {
                return -EINVAL;
@@ -1075,7 +1085,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
 {
        const uint16_t cqe_n = 1 << rxq->cqe_n;
        const uint16_t cqe_mask = cqe_n - 1;
-       const unsigned int wqe_n = 1 << rxq->elts_n;
+       unsigned int wqe_n = 1 << rxq->elts_n;
        struct mlx5_rxq_ctrl *rxq_ctrl =
                        container_of(rxq, struct mlx5_rxq_ctrl, rxq);
        union {
@@ -1139,11 +1149,17 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
                                                    &sm))
                                return -1;
                        if (vec) {
-                               const uint16_t q_mask = wqe_n - 1;
+                               uint16_t q_mask;
                                uint16_t elt_idx;
                                struct rte_mbuf **elt;
                                int i;
-                               unsigned int n = wqe_n - (rxq->rq_ci -
+                               unsigned int n;
+
+                               if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
+                                       wqe_n *= (1U <<
+                                                 rxq_ctrl->rxq.strd_num_n);
+                               q_mask = wqe_n - 1;
+                               n = wqe_n - (rxq->rq_ci -
                                                          rxq->rq_pi);
 
                                for (i = 0; i < (int)n; ++i) {
@@ -1982,6 +1998,14 @@ mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,
        return 0;
 }
 
+__rte_weak uint16_t
+mlx5_rx_burst_mprq_vec(void *dpdk_txq __rte_unused,
+                 struct rte_mbuf **pkts __rte_unused,
+                 uint16_t pkts_n __rte_unused)
+{
+       return 0;
+}
+
 __rte_weak int
 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
 {
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 5116a15c33..3c44794d68 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -141,11 +141,8 @@ struct mlx5_rxq_data {
        uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */
        volatile void *wqes;
        volatile struct mlx5_cqe(*cqes)[];
-       RTE_STD_C11
-       union  {
-               struct rte_mbuf *(*elts)[];
-               struct mlx5_mprq_buf *(*mprq_bufs)[];
-       };
+       struct rte_mbuf *(*elts)[];
+       struct mlx5_mprq_buf *(*mprq_bufs)[];
        struct rte_mempool *mp;
        struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */
        struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */
@@ -518,6 +515,8 @@ int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data);
 int mlx5_check_vec_rx_support(struct rte_eth_dev *dev);
 uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
                           uint16_t pkts_n);
+uint16_t mlx5_rx_burst_mprq_vec(void *dpdk_txq, struct rte_mbuf **pkts,
+                          uint16_t pkts_n);
 
 /* mlx5_mr.c */
 
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index 7fae2010f9..53dd229271 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -119,6 +119,40 @@ mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
        return tn;
 }
 
+/**
+ * DPDK callback for MPRQ vectorized RX.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx5_rx_burst_mprq_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+       struct mlx5_rxq_data *rxq = dpdk_rxq;
+       uint16_t nb_rx = 0;
+       uint16_t tn = 0;
+       uint64_t err = 0;
+       bool no_cq = false;
+
+       do {
+               nb_rx = rxq_burst_mprq_v(rxq, pkts + tn, pkts_n - tn,
+                                        &err, &no_cq);
+               if (unlikely(err | rxq->err_state))
+                       nb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx);
+               tn += nb_rx;
+               if (unlikely(no_cq))
+                       break;
+       } while (tn != pkts_n);
+       return tn;
+}
+
 /**
  * Check a RX queue can support vectorized RX.
  *
@@ -134,8 +168,6 @@ mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq)
        struct mlx5_rxq_ctrl *ctrl =
                container_of(rxq, struct mlx5_rxq_ctrl, rxq);
 
-       if (mlx5_mprq_enabled(ETH_DEV(ctrl->priv)))
-               return -ENOTSUP;
        if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0)
                return -ENOTSUP;
        if (rxq->lro)
@@ -160,8 +192,6 @@ mlx5_check_vec_rx_support(struct rte_eth_dev *dev)
 
        if (!priv->config.rx_vec_en)
                return -ENOTSUP;
-       if (mlx5_mprq_enabled(dev))
-               return -ENOTSUP;
        /* All the configured queues should support. */
        for (i = 0; i < priv->rxqs_n; ++i) {
                struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
index 6ddcbfb0ad..305c5a596a 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -122,4 +122,25 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
        *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
 }
 
+static inline void
+mlx5_rx_replenish_bulk_mprq_mbuf(struct mlx5_rxq_data *rxq,
+                                uint16_t n, uint32_t rq_idx)
+{
+       const unsigned int strd_n = 1 << rxq->strd_num_n;
+       uint16_t elts_idx = rq_idx * strd_n +
+               rq_idx * MLX5_VPMD_DESCS_PER_LOOP;
+       struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
+       unsigned int i;
+
+       n = RTE_MIN(n, strd_n - rxq->consumed_strd);
+       if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
+               rxq->stats.rx_nombuf += n;
+               return;
+       }
+       rxq->rq_repl_thresh = 0;
+       /* Prevent overflowing into the next MPRQ mbufs. */
+       for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+               (*rxq->elts)[elts_idx + strd_n + i] = &rxq->fake_mbuf;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
index f5414eebad..8fc3e1fd66 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
@@ -59,6 +59,97 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
                pkts[pos] = elts[pos];
 }
 
+/**
+ * Copy or attach MPRQ buffer strides to the mbufs of received packets.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param n
+ *   Number of packets to be stored.
+ * @param buf
+ *   MPRQ buffer to get packets from.
+ * @param rq_ci
+ *   WQE index.
+ * @param strd_idx
+ *   Stride number.
+ * @param comp
+ *   Whether CQE is compressed or not.
+ */
+static inline void
+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+                    uint16_t n, struct mlx5_mprq_buf *buf,
+                    uint16_t rq_ci, uint16_t strd_idx, bool comp)
+{
+       const unsigned int strd_sz = 1 << rxq->strd_sz_n;
+       const unsigned int strd_n = 1 << rxq->strd_num_n;
+       const unsigned int strd_shift =
+               MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+       uint32_t offset;
+       void *addr;
+       int i = 0;
+
+       if (comp) {
+               const uint16_t q_mask = (1 << rxq->cqe_n) - 1;
+               struct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];
+               unsigned int pos;
+               uint16_t p = n & -2;
+
+               for (pos = 0; pos < p; pos += 2) {
+                       vector unsigned char mbp;
+
+                       mbp = (vector unsigned char)vec_vsx_ld(0,
+                               (signed int const *)&elts[pos +
+                                                         rxq->consumed_strd]);
+                       *(vector unsigned char *)&pkts[pos] = mbp;
+               }
+               if (n & 1)
+                       pkts[pos] = elts[pos];
+       }
+
+       for (i = 0; i < n; ++i) {
+               offset = (strd_idx + i) * strd_sz + strd_shift;
+               addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+               if (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||
+                   rxq->mprq_repl == NULL) {
+                       rte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),
+                                  addr, pkts[i]->pkt_len);
+               } else {
+                       rte_iova_t buf_iova;
+                       struct rte_mbuf_ext_shared_info *shinfo;
+                       uint16_t buf_len = strd_sz;
+                       void *buf_addr;
+                       /* Increment the refcnt of the whole chunk. */
+                       rte_atomic16_add_return(&buf->refcnt, 1);
+                       MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
+                                       strd_n + 1);
+                       buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
+                       /*
+                        * MLX5 device doesn't use iova but it is necessary in a
+                        * case where the Rx packet is transmitted via a
+                        * different PMD.
+                        */
+                       buf_iova = rte_mempool_virt2iova(buf) +
+                               RTE_PTR_DIFF(buf_addr, buf);
+                       shinfo = &buf->shinfos[strd_idx];
+                       rte_mbuf_ext_refcnt_set(shinfo, 1);
+                       /*
+                        * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
+                        * attaching the stride to mbuf and more offload flags
+                        * will be added below by calling rxq_cq_to_mbuf().
+                        * Other fields will be overwritten.
+                        */
+                       rte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,
+                                               buf_len, shinfo);
+                       /* Set mbuf head-room. */
+                       SET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM);
+                       DATA_LEN(pkts[i]) = pkts[i]->pkt_len;
+               }
+       }
+}
+
+
 /**
  * Decompress a compressed completion and fill in mbufs in RX SW ring with data
  * extracted from the title completion descriptor.
@@ -1136,4 +1227,637 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
        return rcvd_pkt;
 }
 
+static inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
+                const unsigned int strd_n)
+{
+       struct mlx5_mprq_buf *rep = rxq->mprq_repl;
+       volatile struct mlx5_wqe_data_seg *wqe =
+               &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
+       void *addr;
+
+       MLX5_ASSERT(rep != NULL);
+       /* Replace MPRQ buf. */
+       (*rxq->mprq_bufs)[rq_idx] = rep;
+       /* Replace WQE. */
+       addr = mlx5_mprq_buf_addr(rep, strd_n);
+       wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
+       /* If there's only one MR, no need to replace LKey in WQE. */
+       if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+               wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
+       /* Stash a mbuf for next replacement. */
+       if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
+               rxq->mprq_repl = rep;
+       else
+               rxq->mprq_repl = NULL;
+}
+
+/**
+ * Receive burst of packets. An errored completion also consumes a mbuf, but the
+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to application.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ * @param[out] err
+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error
+ *   packet to handle.
+ * @param[out] no_cq
+ *  Pointer to a boolean. Set true if no new CQE seen.
+ *
+ * @return
+ *   Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+                uint16_t pkts_n, uint64_t *err, bool *no_cq)
+{
+       const unsigned int strd_n = 1 << rxq->strd_num_n;
+       const uint16_t q_n = 1 << rxq->cqe_n;
+       const uint16_t q_mask = q_n - 1;
+       const uint16_t e_n = 1 << rxq->elts_n;
+       const uint16_t e_mask = e_n - 1;
+       volatile struct mlx5_cqe *cq;
+       struct rte_mbuf **elts;
+       unsigned int pos;
+       uint64_t n;
+       uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+       uint16_t nocmp_n = 0;
+       uint16_t rcvd_pkt = 0;
+       unsigned int cq_ci = rxq->cq_ci;
+       unsigned int cq_idx = cq_ci & q_mask;
+       unsigned int rq_ci = rxq->rq_ci;
+       unsigned int rq_idx = rq_ci & e_mask;
+       struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];
+       unsigned int elts_idx;
+       unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
+       const vector unsigned char zero = (vector unsigned char){0};
+       const vector unsigned char ones = vec_splat_u8(-1);
+       const vector unsigned char owner_check =
+               (vector unsigned char)(vector unsigned long){
+               0x0100000001000000LL, 0x0100000001000000LL};
+       const vector unsigned char opcode_check =
+               (vector unsigned char)(vector unsigned long){
+               0xf0000000f0000000LL, 0xf0000000f0000000LL};
+       const vector unsigned char format_check =
+               (vector unsigned char)(vector unsigned long){
+               0x0c0000000c000000LL, 0x0c0000000c000000LL};
+       const vector unsigned char resp_err_check =
+               (vector unsigned char)(vector unsigned long){
+               0xe0000000e0000000LL, 0xe0000000e0000000LL};
+#ifdef MLX5_PMD_SOFT_COUNTERS
+       uint32_t rcvd_byte = 0;
+       /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+       const vector unsigned char len_shuf_mask = (vector unsigned char){
+                1,  0,  5,  4,
+                9,  8, 13, 12,
+               -1, -1, -1, -1,
+               -1, -1, -1, -1};
+#endif
+       /* Mask to shuffle from extracted CQE to mbuf. */
+       const vector unsigned char shuf_mask = (vector unsigned char){
+                5,  4,           /* bswap16, pkt_len */
+               -1, -1,           /* zero out 2nd half of pkt_len */
+                5,  4,           /* bswap16, data_len */
+               11, 10,           /* bswap16, vlan+tci */
+               15, 14, 13, 12,   /* bswap32, rss */
+                1,  2,  3, -1};  /* fdir.hi */
+       /* Mask to blend from the last Qword to the first DQword. */
+       /* Mask to blend from the last Qword to the first DQword. */
+       const vector unsigned char blend_mask = (vector unsigned char){
+               -1,  0,  0,  0,
+                0,  0,  0,  0,
+               -1, -1, -1, -1,
+               -1, -1, -1, -1};
+       const vector unsigned char crc_adj =
+               (vector unsigned char)(vector unsigned short){
+               rxq->crc_present * RTE_ETHER_CRC_LEN, 0,
+               rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0};
+       const vector unsigned char flow_mark_adj =
+               (vector unsigned char)(vector unsigned int){
+               0, 0, 0, rxq->mark * (-1)};
+       const vector unsigned short cqe_sel_mask1 =
+               (vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0};
+       const vector unsigned short cqe_sel_mask2 =
+               (vector unsigned short){0, 0, 0xffff, 0, 0, 0, 0, 0};
+
+       MLX5_ASSERT(rxq->sges_n == 0);
+       MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
+       if (rxq->consumed_strd == strd_n) {
+               /* Replace WQE only if the buffer is still in use. */
+               if (rte_atomic16_read(&buf->refcnt) > 1) {
+                       mprq_buf_replace(rxq, rq_ci & e_mask, strd_n);
+                       /* Release the old buffer. */
+                       mlx5_mprq_buf_free(buf);
+               } else if (unlikely(rxq->mprq_repl == NULL)) {
+                       struct mlx5_mprq_buf *rep;
+
+                       /*
+                        * Currently, the MPRQ mempool is out of buffer
+                        * and doing memcpy regardless of the size of Rx
+                        * packet. Retry allocation to get back to
+                        * normal.
+                        */
+                       if (!rte_mempool_get(rxq->mprq_mp,
+                                            (void **)&rep))
+                               rxq->mprq_repl = rep;
+               }
+               /* Advance to the next WQE. */
+               rxq->consumed_strd = 0;
+               ++rq_ci;
+               buf = (*rxq->mprq_bufs)[rq_ci & e_mask];
+               rxq->rq_repl_thresh = 1;
+       }
+       if (rxq->rq_repl_thresh)
+               mlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_ci & e_mask);
+
+       cq = &(*rxq->cqes)[cq_idx];
+       rte_prefetch0(cq);
+       rte_prefetch0(cq + 1);
+       rte_prefetch0(cq + 2);
+       rte_prefetch0(cq + 3);
+       elts_idx = (rq_ci & e_mask) * strd_n +
+               (rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP;
+       elts = &(*rxq->elts)[elts_idx];
+       pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+       /* See if there're unreturned mbufs from compressed CQE. */
+       rcvd_pkt = rxq->decompressed;
+       if (rcvd_pkt > 0) {
+               rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+               rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf,
+                                    rq_ci, rxq->consumed_strd, true);
+               rxq->consumed_strd += rcvd_pkt;
+               rxq->rq_pi += rcvd_pkt;
+               rxq->decompressed -= rcvd_pkt;
+               pkts += rcvd_pkt;
+       }
+       /* Not to cross queue end. */
+       pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+       pkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd);
+       if (!pkts_n) {
+               *no_cq = !rcvd_pkt;
+               return rcvd_pkt;
+       }
+       /* At this point, there shouldn't be any remaining packets. */
+       MLX5_ASSERT(rxq->decompressed == 0);
+
+       /*
+        * A. load first Qword (8bytes) in one loop.
+        * B. copy 4 mbuf pointers from elts ring to returning pkts.
+        * C. load remaining CQE data and extract necessary fields.
+        *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
+        *    following structure:
+        *        struct {
+        *          uint8_t  pkt_info;
+        *          uint8_t  flow_tag[3];
+        *          uint16_t byte_cnt;
+        *          uint8_t  rsvd4;
+        *          uint8_t  op_own;
+        *          uint16_t hdr_type_etc;
+        *          uint16_t vlan_info;
+        *          uint32_t rx_has_res;
+        *        } c;
+        * D. fill in mbuf.
+        * E. get valid CQEs.
+        * F. find compressed CQE.
+        */
+       for (pos = 0;
+            pos < pkts_n;
+            pos += MLX5_VPMD_DESCS_PER_LOOP) {
+               vector unsigned char cqes[MLX5_VPMD_DESCS_PER_LOOP];
+               vector unsigned char cqe_tmp1, cqe_tmp2;
+               vector unsigned char pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
+               vector unsigned char op_own, op_own_tmp1, op_own_tmp2;
+               vector unsigned char opcode, owner_mask, invalid_mask;
+               vector unsigned char comp_mask;
+               vector unsigned char mask;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+               const vector unsigned char lower_half = {
+                       0, 1, 4, 5, 8, 9, 12, 13,
+                       16, 17, 20, 21, 24, 25, 28, 29};
+               const vector unsigned char upper_half = {
+                       2, 3, 6, 7, 10, 11, 14, 15,
+                       18, 19, 22, 23, 26, 27, 30, 31};
+               const vector unsigned long shmax = {64, 64};
+               vector unsigned char byte_cnt;
+               vector unsigned short left, right;
+               vector unsigned long lshift;
+               vector __attribute__((altivec(bool__)))
+                       unsigned long shmask;
+#endif
+               vector unsigned char mbp1, mbp2;
+               vector unsigned char p =
+                       (vector unsigned char)(vector unsigned short){
+                               0, 1, 2, 3, 0, 0, 0, 0};
+               unsigned int p1, p2, p3;
+
+               /* Prefetch next 4 CQEs. */
+               if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+                       rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
+                       rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
+                       rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
+                       rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
+               }
+
+               /* A.0 do not cross the end of CQ. */
+               mask = (vector unsigned char)(vector unsigned long){
+                       (pkts_n - pos) * sizeof(uint16_t) * 8, 0};
+
+               {
+                       vector unsigned long lshift;
+                       vector __attribute__((altivec(bool__)))
+                               unsigned long shmask;
+                       const vector unsigned long shmax = {64, 64};
+
+                       lshift = vec_splat((vector unsigned long)mask, 0);
+                       shmask = vec_cmpgt(shmax, lshift);
+                       mask = (vector unsigned char)
+                               vec_sl((vector unsigned long)ones, lshift);
+                       mask = (vector unsigned char)
+                               vec_sel((vector unsigned long)shmask,
+                               (vector unsigned long)mask, shmask);
+               }
+
+               p = (vector unsigned char)
+                       vec_andc((vector unsigned long)p,
+                       (vector unsigned long)mask);
+
+               /* A.1 load cqes. */
+               p3 = (unsigned int)((vector unsigned short)p)[3];
+               cqes[3] = (vector unsigned char)(vector unsigned long){
+                       *(__rte_aligned(8) unsigned long *)
+                       &cq[pos + p3].sop_drop_qpn, 0LL};
+               rte_compiler_barrier();
+
+               p2 = (unsigned int)((vector unsigned short)p)[2];
+               cqes[2] = (vector unsigned char)(vector unsigned long){
+                       *(__rte_aligned(8) unsigned long *)
+                       &cq[pos + p2].sop_drop_qpn, 0LL};
+               rte_compiler_barrier();
+
+               /* B.1 load mbuf pointers. */
+               mbp1 = (vector unsigned char)vec_vsx_ld(0,
+                       (signed int const *)&elts[pos + rxq->consumed_strd]);
+               mbp2 = (vector unsigned char)vec_vsx_ld(0,
+                       (signed int const *)&elts[pos +
+                                                 rxq->consumed_strd + 2]);
+
+               /* A.1 load a block having op_own. */
+               p1 = (unsigned int)((vector unsigned short)p)[1];
+               cqes[1] = (vector unsigned char)(vector unsigned long){
+                       *(__rte_aligned(8) unsigned long *)
+                       &cq[pos + p1].sop_drop_qpn, 0LL};
+               rte_compiler_barrier();
+
+               cqes[0] = (vector unsigned char)(vector unsigned long){
+                       *(__rte_aligned(8) unsigned long *)
+                       &cq[pos].sop_drop_qpn, 0LL};
+               rte_compiler_barrier();
+
+               /* B.2 copy mbuf pointers. */
+               *(vector unsigned char *)&pkts[pos] = mbp1;
+               *(vector unsigned char *)&pkts[pos + 2] = mbp2;
+               rte_cio_rmb();
+
+               /* C.1 load remaining CQE data and extract necessary fields. */
+               cqe_tmp2 = *(vector unsigned char *)
+                       &cq[pos + p3].pkt_info;
+               cqe_tmp1 = *(vector unsigned char *)
+                       &cq[pos + p2].pkt_info;
+               cqes[3] = vec_sel(cqes[3], cqe_tmp2, blend_mask);
+               cqes[2] = vec_sel(cqes[2], cqe_tmp1, blend_mask);
+               cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0,
+                       (signed int const *)&cq[pos + p3].csum);
+               cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0,
+                       (signed int const *)&cq[pos + p2].csum);
+               cqes[3] = (vector unsigned char)
+                       vec_sel((vector unsigned short)cqes[3],
+                       (vector unsigned short)cqe_tmp2, cqe_sel_mask1);
+               cqes[2] = (vector unsigned char)
+                       vec_sel((vector unsigned short)cqes[2],
+                       (vector unsigned short)cqe_tmp1, cqe_sel_mask1);
+               cqe_tmp2 = (vector unsigned char)(vector unsigned long){
+                       *(__rte_aligned(8) unsigned long *)
+                       &cq[pos + p3].rsvd3[9], 0LL};
+               cqe_tmp1 = (vector unsigned char)(vector unsigned long){
+                       *(__rte_aligned(8) unsigned long *)
+                       &cq[pos + p2].rsvd3[9], 0LL};
+               cqes[3] = (vector unsigned char)
+                       vec_sel((vector unsigned short)cqes[3],
+                       (vector unsigned short)cqe_tmp2,
+                       (vector unsigned short)cqe_sel_mask2);
+               cqes[2] = (vector unsigned char)
+                       vec_sel((vector unsigned short)cqes[2],
+                       (vector unsigned short)cqe_tmp1,
+                       (vector unsigned short)cqe_sel_mask2);
+
+               /* C.2 generate final structure for mbuf with swapping bytes. */
+               pkt_mb3 = vec_perm(cqes[3], zero, shuf_mask);
+               pkt_mb2 = vec_perm(cqes[2], zero, shuf_mask);
+
+               /* C.3 adjust CRC length. */
+               pkt_mb3 = (vector unsigned char)
+                       ((vector unsigned short)pkt_mb3 -
+                       (vector unsigned short)crc_adj);
+               pkt_mb2 = (vector unsigned char)
+                       ((vector unsigned short)pkt_mb2 -
+                       (vector unsigned short)crc_adj);
+
+               /* C.4 adjust flow mark. */
+               pkt_mb3 = (vector unsigned char)
+                       ((vector unsigned int)pkt_mb3 +
+                       (vector unsigned int)flow_mark_adj);
+               pkt_mb2 = (vector unsigned char)
+                       ((vector unsigned int)pkt_mb2 +
+                       (vector unsigned int)flow_mark_adj);
+
+               /* D.1 fill in mbuf - rx_descriptor_fields1. */
+               *(vector unsigned char *)
+                       &pkts[pos + 3]->pkt_len = pkt_mb3;
+               *(vector unsigned char *)
+                       &pkts[pos + 2]->pkt_len = pkt_mb2;
+
+               /* E.1 extract op_own field. */
+               op_own_tmp2 = (vector unsigned char)
+                       vec_mergeh((vector unsigned int)cqes[2],
+                       (vector unsigned int)cqes[3]);
+
+               /* C.1 load remaining CQE data and extract necessary fields. */
+               cqe_tmp2 = *(vector unsigned char *)
+                       &cq[pos + p1].pkt_info;
+               cqe_tmp1 = *(vector unsigned char *)
+                       &cq[pos].pkt_info;
+               cqes[1] = vec_sel(cqes[1], cqe_tmp2, blend_mask);
+               cqes[0] = vec_sel(cqes[0], cqe_tmp2, blend_mask);
+               cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0,
+                       (signed int const *)&cq[pos + p1].csum);
+               cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0,
+                       (signed int const *)&cq[pos].csum);
+               cqes[1] = (vector unsigned char)
+                       vec_sel((vector unsigned short)cqes[1],
+                       (vector unsigned short)cqe_tmp2, cqe_sel_mask1);
+               cqes[0] = (vector unsigned char)
+                       vec_sel((vector unsigned short)cqes[0],
+                       (vector unsigned short)cqe_tmp1, cqe_sel_mask1);
+               cqe_tmp2 = (vector unsigned char)(vector unsigned long){
+                       *(__rte_aligned(8) unsigned long *)
+                       &cq[pos + p1].rsvd3[9], 0LL};
+               cqe_tmp1 = (vector unsigned char)(vector unsigned long){
+                       *(__rte_aligned(8) unsigned long *)
+                       &cq[pos].rsvd3[9], 0LL};
+               cqes[1] = (vector unsigned char)
+                       vec_sel((vector unsigned short)cqes[1],
+                       (vector unsigned short)cqe_tmp2, cqe_sel_mask2);
+               cqes[0] = (vector unsigned char)
+                       vec_sel((vector unsigned short)cqes[0],
+                       (vector unsigned short)cqe_tmp1, cqe_sel_mask2);
+
+               /* C.2 generate final structure for mbuf with swapping bytes. */
+               pkt_mb1 = vec_perm(cqes[1], zero, shuf_mask);
+               pkt_mb0 = vec_perm(cqes[0], zero, shuf_mask);
+
+               /* C.3 adjust CRC length. */
+               pkt_mb1 = (vector unsigned char)
+                       ((vector unsigned short)pkt_mb1 -
+                       (vector unsigned short)crc_adj);
+               pkt_mb0 = (vector unsigned char)
+                       ((vector unsigned short)pkt_mb0 -
+                       (vector unsigned short)crc_adj);
+
+               /* C.4 adjust flow mark. */
+               pkt_mb1 = (vector unsigned char)
+                       ((vector unsigned int)pkt_mb1 +
+                       (vector unsigned int)flow_mark_adj);
+               pkt_mb0 = (vector unsigned char)
+                       ((vector unsigned int)pkt_mb0 +
+                       (vector unsigned int)flow_mark_adj);
+
+               /* E.1 extract op_own byte. */
+               op_own_tmp1 = (vector unsigned char)
+                       vec_mergeh((vector unsigned int)cqes[0],
+                       (vector unsigned int)cqes[1]);
+               op_own = (vector unsigned char)
+                       vec_mergel((vector unsigned long)op_own_tmp1,
+                       (vector unsigned long)op_own_tmp2);
+
+               /* D.1 fill in mbuf - rx_descriptor_fields1. */
+               *(vector unsigned char *)
+                       &pkts[pos + 1]->pkt_len = pkt_mb1;
+               *(vector unsigned char *)
+                       &pkts[pos]->pkt_len = pkt_mb0;
+
+               /* E.2 flip owner bit to mark CQEs from last round. */
+               owner_mask = (vector unsigned char)
+                       vec_and((vector unsigned long)op_own,
+                       (vector unsigned long)owner_check);
+               if (ownership)
+                       owner_mask = (vector unsigned char)
+                               vec_xor((vector unsigned long)owner_mask,
+                               (vector unsigned long)owner_check);
+               owner_mask = (vector unsigned char)
+                       vec_cmpeq((vector unsigned int)owner_mask,
+                       (vector unsigned int)owner_check);
+               owner_mask = (vector unsigned char)
+                       vec_packs((vector unsigned int)owner_mask,
+                       (vector unsigned int)zero);
+
+               /* E.3 get mask for invalidated CQEs. */
+               opcode = (vector unsigned char)
+                       vec_and((vector unsigned long)op_own,
+                       (vector unsigned long)opcode_check);
+               invalid_mask = (vector unsigned char)
+                       vec_cmpeq((vector unsigned int)opcode_check,
+                       (vector unsigned int)opcode);
+               invalid_mask = (vector unsigned char)
+                       vec_packs((vector unsigned int)invalid_mask,
+                       (vector unsigned int)zero);
+
+               /* E.4 mask out beyond boundary. */
+               invalid_mask = (vector unsigned char)
+                       vec_or((vector unsigned long)invalid_mask,
+                       (vector unsigned long)mask);
+
+               /* E.5 merge invalid_mask with invalid owner. */
+               invalid_mask = (vector unsigned char)
+                       vec_or((vector unsigned long)invalid_mask,
+                       (vector unsigned long)owner_mask);
+
+               /* F.1 find compressed CQE format. */
+               comp_mask = (vector unsigned char)
+                       vec_and((vector unsigned long)op_own,
+                       (vector unsigned long)format_check);
+               comp_mask = (vector unsigned char)
+                       vec_cmpeq((vector unsigned int)comp_mask,
+                       (vector unsigned int)format_check);
+               comp_mask = (vector unsigned char)
+                       vec_packs((vector unsigned int)comp_mask,
+                       (vector unsigned int)zero);
+
+               /* F.2 mask out invalid entries. */
+               comp_mask = (vector unsigned char)
+                       vec_andc((vector unsigned long)comp_mask,
+                       (vector unsigned long)invalid_mask);
+               comp_idx = ((vector unsigned long)comp_mask)[0];
+
+               /* F.3 get the first compressed CQE. */
+               comp_idx = comp_idx ? __builtin_ctzll(comp_idx) /
+                       (sizeof(uint16_t) * 8) : MLX5_VPMD_DESCS_PER_LOOP;
+
+               /* E.6 mask out entries after the compressed CQE. */
+               mask = (vector unsigned char)(vector unsigned long){
+                       (comp_idx * sizeof(uint16_t) * 8), 0};
+               lshift = vec_splat((vector unsigned long)mask, 0);
+               shmask = vec_cmpgt(shmax, lshift);
+               mask = (vector unsigned char)
+                       vec_sl((vector unsigned long)ones, lshift);
+               mask = (vector unsigned char)
+                       vec_sel((vector unsigned long)shmask,
+                       (vector unsigned long)mask, shmask);
+               invalid_mask = (vector unsigned char)
+                       vec_or((vector unsigned long)invalid_mask,
+                       (vector unsigned long)mask);
+
+               /* E.7 count non-compressed valid CQEs. */
+               n = ((vector unsigned long)invalid_mask)[0];
+               n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
+                       MLX5_VPMD_DESCS_PER_LOOP;
+               nocmp_n += n;
+
+               /* D.2 get the final invalid mask. */
+               mask = (vector unsigned char)(vector unsigned long){
+                       (n * sizeof(uint16_t) * 8), 0};
+               lshift = vec_splat((vector unsigned long)mask, 0);
+               shmask = vec_cmpgt(shmax, lshift);
+               mask = (vector unsigned char)
+                       vec_sl((vector unsigned long)ones, lshift);
+               mask = (vector unsigned char)
+                       vec_sel((vector unsigned long)shmask,
+                       (vector unsigned long)mask, shmask);
+               invalid_mask = (vector unsigned char)
+                       vec_or((vector unsigned long)invalid_mask,
+                       (vector unsigned long)mask);
+
+               /* D.3 check error in opcode. */
+               opcode = (vector unsigned char)
+                       vec_cmpeq((vector unsigned int)resp_err_check,
+                       (vector unsigned int)opcode);
+               opcode = (vector unsigned char)
+                       vec_packs((vector unsigned int)opcode,
+                       (vector unsigned int)zero);
+               opcode = (vector unsigned char)
+                       vec_andc((vector unsigned long)opcode,
+                       (vector unsigned long)invalid_mask);
+
+               /* D.4 mark if any error is set */
+               *err |= ((vector unsigned long)opcode)[0];
+
+               /* D.5 fill in mbuf - rearm_data and packet_type. */
+               rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
+               if (rxq->hw_timestamp) {
+                       pkts[pos]->timestamp =
+                               rte_be_to_cpu_64(cq[pos].timestamp);
+                       pkts[pos + 1]->timestamp =
+                               rte_be_to_cpu_64(cq[pos + p1].timestamp);
+                       pkts[pos + 2]->timestamp =
+                               rte_be_to_cpu_64(cq[pos + p2].timestamp);
+                       pkts[pos + 3]->timestamp =
+                               rte_be_to_cpu_64(cq[pos + p3].timestamp);
+               }
+               if (rxq->dynf_meta) {
+                       uint64_t flag = rxq->flow_meta_mask;
+                       int32_t offs = rxq->flow_meta_offset;
+                       uint32_t metadata;
+
+                       /* This code is subject to further optimization. */
+                       metadata = cq[pos].flow_table_metadata;
+                       *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+                                                               metadata;
+                       pkts[pos]->ol_flags |= metadata ? flag : 0ULL;
+                       metadata = cq[pos + 1].flow_table_metadata;
+                       *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+                                                               metadata;
+                       pkts[pos + 1]->ol_flags |= metadata ? flag : 0ULL;
+                       metadata = cq[pos + 2].flow_table_metadata;
+                       *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+                                                               metadata;
+                       pkts[pos + 2]->ol_flags |= metadata ? flag : 0ULL;
+                       metadata = cq[pos + 3].flow_table_metadata;
+                       *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+                                                               metadata;
+                       pkts[pos + 3]->ol_flags |= metadata ? flag : 0ULL;
+               }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+               /* Add up received bytes count. */
+               byte_cnt = vec_perm(op_own, zero, len_shuf_mask);
+               byte_cnt = (vector unsigned char)
+                       vec_andc((vector unsigned long)byte_cnt,
+                       (vector unsigned long)invalid_mask);
+               left = vec_perm((vector unsigned short)byte_cnt,
+                       (vector unsigned short)zero, lower_half);
+               right = vec_perm((vector unsigned short)byte_cnt,
+                       (vector unsigned short)zero, upper_half);
+               byte_cnt = (vector unsigned char)vec_add(left, right);
+               left = vec_perm((vector unsigned short)byte_cnt,
+                       (vector unsigned short)zero, lower_half);
+               right = vec_perm((vector unsigned short)byte_cnt,
+                       (vector unsigned short)zero, upper_half);
+               byte_cnt = (vector unsigned char)vec_add(left, right);
+               rcvd_byte += ((vector unsigned long)byte_cnt)[0];
+#endif
+
+               /*
+                * Break the loop unless more valid CQE is expected, or if
+                * there's a compressed CQE.
+                */
+               if (n != MLX5_VPMD_DESCS_PER_LOOP)
+                       break;
+       }
+       /* If no new CQE seen, return without updating cq_db. */
+       if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+               *no_cq = true;
+               return rcvd_pkt;
+       }
+       /* Update the consumer indexes for non-compressed CQEs. */
+       MLX5_ASSERT(nocmp_n <= pkts_n);
+       rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,
+                            rq_ci, rxq->consumed_strd, false);
+       rxq->cq_ci += nocmp_n;
+       rxq->consumed_strd += nocmp_n;
+       rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+       rxq->stats.ipackets += nocmp_n;
+       rxq->stats.ibytes += rcvd_byte;
+#endif
+       /* Decompress the last CQE if compressed. */
+       if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+               MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+               rxq->decompressed =
+                       rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);
+               /* Return more packets if needed. */
+               if (nocmp_n < pkts_n) {
+                       uint16_t n = rxq->decompressed;
+
+                       n = RTE_MIN(n, pkts_n - nocmp_n);
+                       rxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,
+                                            rq_ci, rxq->consumed_strd, true);
+                       rxq->consumed_strd += n;
+                       rcvd_pkt += n;
+                       rxq->decompressed -= n;
+               }
+       }
+       rte_compiler_barrier();
+       *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+       if (rq_ci != rxq->rq_ci) {
+               rxq->rq_ci = rq_ci;
+               rte_cio_wmb();
+               *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+       }
+       *no_cq = !rcvd_pkt;
+       return rcvd_pkt;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 555c342626..53c8ed8a9b 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -54,6 +54,95 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
                pkts[pos] = elts[pos];
 }
 
+/**
+ * Copy or attach MPRQ buffer strides to the mbufs of received packets.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param n
+ *   Number of packets to be stored.
+ * @param buf
+ *   MPRQ buffer to get packets from.
+ * @param rq_ci
+ *   WQE index.
+ * @param strd_idx
+ *   Stride number.
+ * @param comp
+ *   Whether CQE is compressed or not.
+ */
+static inline void
+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+                    uint16_t n, struct mlx5_mprq_buf *buf,
+                    uint16_t rq_ci, uint16_t strd_idx, bool comp)
+{
+       const unsigned int strd_sz = 1 << rxq->strd_sz_n;
+       const unsigned int strd_n = 1 << rxq->strd_num_n;
+       const unsigned int strd_shift =
+               MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+       uint32_t offset;
+       void *addr;
+       int i = 0;
+
+       if (comp) {
+               const uint16_t q_mask = (1 << rxq->cqe_n) - 1;
+               struct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];
+               unsigned int pos;
+               uint16_t p = n & -2;
+
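+               /*
+                * Mbufs for compressed CQEs are already filled in elts,
+                * just copy the pointers to pkts two at a time.
+                */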
+               for (pos = 0; pos < p; pos += 2) {
+                       uint64x2_t mbp;
+
+                       mbp = vld1q_u64((void *)&elts[pos +
+                                                     rxq->consumed_strd]);
+                       vst1q_u64((void *)&pkts[pos], mbp);
+               }
+               if (n & 1)
+                       pkts[pos] = elts[pos];
+       }
+
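+       /*
+        * Either copy the packet data into the mbuf or attach the stride
+        * as an external buffer, depending on the packet length and the
+        * availability of a replacement MPRQ buffer.
+        */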
+       for (i = 0; i < n; ++i) {
+               offset = (strd_idx + i) * strd_sz + strd_shift;
+               addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+               if (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||
+                   rxq->mprq_repl == NULL) {
+                       rte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),
+                                  addr, pkts[i]->pkt_len);
+               } else {
+                       rte_iova_t buf_iova;
+                       struct rte_mbuf_ext_shared_info *shinfo;
+                       uint16_t buf_len = strd_sz;
+                       void *buf_addr;
+                       /* Increment the refcnt of the whole chunk. */
+                       rte_atomic16_add_return(&buf->refcnt, 1);
+                       MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
+                                   strd_n + 1);
+                       buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
+                       /*
+                        * The MLX5 device does not use the IOVA, but it is
+                        * necessary in case the Rx packet is transmitted via a
+                        * different PMD.
+                        */
+                       buf_iova = rte_mempool_virt2iova(buf) +
+                               RTE_PTR_DIFF(buf_addr, buf);
+                       shinfo = &buf->shinfos[strd_idx];
+                       rte_mbuf_ext_refcnt_set(shinfo, 1);
+                       /*
+                        * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
+                        * attaching the stride to mbuf and more offload flags
+                        * will be added below by calling rxq_cq_to_mbuf().
+                        * Other fields will be overwritten.
+                        */
+                       rte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,
+                                                 buf_len, shinfo);
+                       /* Set mbuf head-room. */
+                       SET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM);
+                       DATA_LEN(pkts[i]) = pkts[i]->pkt_len;
+               }
+       }
+}
+
 /**
  * Decompress a compressed completion and fill in mbufs in RX SW ring with data
  * extracted from the title completion descriptor.
@@ -806,4 +895,492 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
        return rcvd_pkt;
 }
 
+static inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
+                const unsigned int strd_n)
+{
+       struct mlx5_mprq_buf *rep = rxq->mprq_repl;
+       volatile struct mlx5_wqe_data_seg *wqe =
+               &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
+       void *addr;
+
+       MLX5_ASSERT(rep != NULL);
+       /* Replace MPRQ buf. */
+       (*rxq->mprq_bufs)[rq_idx] = rep;
+       /* Replace WQE. */
+       addr = mlx5_mprq_buf_addr(rep, strd_n);
+       wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
+       /* If there's only one MR, no need to replace LKey in WQE. */
+       if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+               wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
+       /* Stash a mbuf for next replacement. */
+       if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
+               rxq->mprq_repl = rep;
+       else
+               rxq->mprq_repl = NULL;
+}
+
+/**
+ * Receive burst of packets. An errored completion also consumes a mbuf, but the
+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to application.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ * @param[out] err
+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error
+ *   packet to handle.
+ * @param[out] no_cq
+ *   Pointer to a boolean. Set true if no new CQE seen.
+ *
+ * @return
+ *   Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+                uint16_t pkts_n, uint64_t *err, bool *no_cq)
+{
+       const unsigned int strd_n = 1 << rxq->strd_num_n;
+       const uint16_t q_n = 1 << rxq->cqe_n;
+       const uint16_t q_mask = q_n - 1;
+       const uint16_t e_n = 1 << rxq->elts_n;
+       const uint16_t e_mask = e_n - 1;
+       volatile struct mlx5_cqe *cq;
+       struct rte_mbuf **elts;
+       unsigned int pos;
+       uint64_t n;
+       uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+       uint16_t nocmp_n = 0;
+       uint16_t rcvd_pkt = 0;
+       unsigned int cq_ci = rxq->cq_ci;
+       unsigned int cq_idx = cq_ci & q_mask;
+       unsigned int rq_ci = rxq->rq_ci;
+       unsigned int rq_idx = rq_ci & e_mask;
+       struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];
+       unsigned int elts_idx;
+       const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1)));
+       const uint16x4_t owner_check = vcreate_u16(0x0001000100010001);
+       const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0);
+       const uint16x4_t format_check = vcreate_u16(0x000c000c000c000c);
+       const uint16x4_t resp_err_check = vcreate_u16(0x00e000e000e000e0);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+       uint32_t rcvd_byte = 0;
+#endif
+       /* Mask to generate 16B length vector. */
+       const uint8x8_t len_shuf_m = {
+               52, 53,         /* 4th CQE */
+               36, 37,         /* 3rd CQE */
+               20, 21,         /* 2nd CQE */
+                4,  5          /* 1st CQE */
+       };
+       /* Mask to extract 16B data from a 64B CQE. */
+       const uint8x16_t cqe_shuf_m = {
+               28, 29,         /* hdr_type_etc */
+                0,             /* pkt_info */
+               -1,             /* null */
+               47, 46,         /* byte_cnt, bswap16 */
+               31, 30,         /* vlan_info, bswap16 */
+               15, 14, 13, 12, /* rx_hash_res, bswap32 */
+               57, 58, 59,     /* flow_tag */
+               63              /* op_own */
+       };
+       /* Mask to generate 16B data for mbuf. */
+       const uint8x16_t mb_shuf_m = {
+                4,  5, -1, -1, /* pkt_len */
+                4,  5,         /* data_len */
+                6,  7,         /* vlan_tci */
+                8,  9, 10, 11, /* hash.rss */
+               12, 13, 14, -1  /* hash.fdir.hi */
+       };
+       /* Mask to generate 16B owner vector. */
+       const uint8x8_t owner_shuf_m = {
+               63, -1,         /* 4th CQE */
+               47, -1,         /* 3rd CQE */
+               31, -1,         /* 2nd CQE */
+               15, -1          /* 1st CQE */
+       };
+       /* Mask to generate a vector having packet_type/ol_flags. */
+       const uint8x16_t ptype_shuf_m = {
+               48, 49, 50, -1, /* 4th CQE */
+               32, 33, 34, -1, /* 3rd CQE */
+               16, 17, 18, -1, /* 2nd CQE */
+                0,  1,  2, -1  /* 1st CQE */
+       };
+       /* Mask to generate a vector having flow tags. */
+       const uint8x16_t ftag_shuf_m = {
+               60, 61, 62, -1, /* 4th CQE */
+               44, 45, 46, -1, /* 3rd CQE */
+               28, 29, 30, -1, /* 2nd CQE */
+               12, 13, 14, -1  /* 1st CQE */
+       };
+       const uint16x8_t crc_adj = {
+               0, 0, rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0
+       };
+       const uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) };
+
+       MLX5_ASSERT(rxq->sges_n == 0);
+       MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
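+       /*
+        * All the strides of the current MPRQ buffer are consumed,
+        * replace the WQE if needed and advance to the next buffer.
+        */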
+       if (rxq->consumed_strd == strd_n) {
+               /* Replace WQE only if the buffer is still in use. */
+               if (rte_atomic16_read(&buf->refcnt) > 1) {
+                       mprq_buf_replace(rxq, rq_idx, strd_n);
+                       /* Release the old buffer. */
+                       mlx5_mprq_buf_free(buf);
+               } else if (unlikely(rxq->mprq_repl == NULL)) {
+                       struct mlx5_mprq_buf *rep;
+
+                       /*
+                        * Currently, the MPRQ mempool is out of buffer
+                        * and doing memcpy regardless of the size of Rx
+                        * packet. Retry allocation to get back to
+                        * normal.
+                        */
+                       if (!rte_mempool_get(rxq->mprq_mp,
+                                            (void **)&rep))
+                               rxq->mprq_repl = rep;
+               }
+               /* Advance to the next WQE. */
+               rxq->consumed_strd = 0;
+               ++rq_ci;
+               rq_idx = rq_ci & e_mask;
+               buf = (*rxq->mprq_bufs)[rq_idx];
+               rxq->rq_repl_thresh = 1;
+       }
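+       /* Allocate a bulk of mbufs for all the strides of the MPRQ buffer. */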
+       if (rxq->rq_repl_thresh)
+               mlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_idx);
+
+       cq = &(*rxq->cqes)[cq_idx];
+       rte_prefetch_non_temporal(cq);
+       rte_prefetch_non_temporal(cq + 1);
+       rte_prefetch_non_temporal(cq + 2);
+       rte_prefetch_non_temporal(cq + 3);
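+       /*
+        * Each WQE owns strd_n + MLX5_VPMD_DESCS_PER_LOOP entries
+        * in the elts ring.
+        */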
+       elts_idx = (rq_ci & e_mask) * strd_n +
+               (rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP;
+       elts = &(*rxq->elts)[elts_idx];
+       pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+       /* See if there're unreturned mbufs from compressed CQE. */
+       rcvd_pkt = rxq->decompressed;
+       if (rcvd_pkt > 0) {
+               rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+               rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf,
+                                    rq_ci, rxq->consumed_strd, true);
+               rxq->consumed_strd += rcvd_pkt;
+               pkts += rcvd_pkt;
+               rxq->decompressed -= rcvd_pkt;
+       }
+       /* Not to cross queue end. */
+       pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+       pkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd);
+       if (!pkts_n) {
+               *no_cq = !rcvd_pkt;
+               return rcvd_pkt;
+       }
+       /* At this point, there shouldn't be any remaining packets. */
+       MLX5_ASSERT(rxq->decompressed == 0);
+       /*
+        * Note that vectors have reverse order - {v3, v2, v1, v0}, because
+        * there's no instruction to count trailing zeros. __builtin_clzl() is
+        * used instead.
+        *
+        * A. copy 4 mbuf pointers from elts ring to returning pkts.
+        * B. load 64B CQE and extract necessary fields
+        *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
+        *    following structure:
+        *        struct {
+        *          uint16_t hdr_type_etc;
+        *          uint8_t  pkt_info;
+        *          uint8_t  rsvd;
+        *          uint16_t byte_cnt;
+        *          uint16_t vlan_info;
+        *          uint32_t rx_hash_res;
+        *          uint8_t  flow_tag[3];
+        *          uint8_t  op_own;
+        *        } c;
+        * C. fill in mbuf.
+        * D. get valid CQEs.
+        * E. find compressed CQE.
+        */
+       for (pos = 0;
+            pos < pkts_n;
+            pos += MLX5_VPMD_DESCS_PER_LOOP) {
+               uint16x4_t op_own;
+               uint16x4_t opcode, owner_mask, invalid_mask;
+               uint16x4_t comp_mask;
+               uint16x4_t mask;
+               uint16x4_t byte_cnt;
+               uint32x4_t ptype_info, flow_tag;
+               register uint64x2_t c0, c1, c2, c3;
+               uint8_t *p0, *p1, *p2, *p3;
+               uint8_t *e0 = (void *)&elts[pos + rxq->consumed_strd]->pkt_len;
+               uint8_t *e1 = (void *)&elts[pos +
+                                           rxq->consumed_strd + 1]->pkt_len;
+               uint8_t *e2 = (void *)&elts[pos +
+                                           rxq->consumed_strd + 2]->pkt_len;
+               uint8_t *e3 = (void *)&elts[pos +
+                                           rxq->consumed_strd + 3]->pkt_len;
+               void *elts_p = (void *)&elts[pos + rxq->consumed_strd];
+               void *pkts_p = (void *)&pkts[pos];
+
+               /* A.0 do not cross the end of CQ. */
+               mask = vcreate_u16(pkts_n - pos < MLX5_VPMD_DESCS_PER_LOOP ?
+                                  -1UL >> ((pkts_n - pos) *
+                                           sizeof(uint16_t) * 8) : 0);
+               p0 = (void *)&cq[pos].pkt_info;
+               p1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe);
+               p2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe);
+               p3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe);
+               /* B.0 (CQE 3) load a block having op_own. */
+               c3 = vld1q_u64((uint64_t *)(p3 + 48));
+               /* B.0 (CQE 2) load a block having op_own. */
+               c2 = vld1q_u64((uint64_t *)(p2 + 48));
+               /* B.0 (CQE 1) load a block having op_own. */
+               c1 = vld1q_u64((uint64_t *)(p1 + 48));
+               /* B.0 (CQE 0) load a block having op_own. */
+               c0 = vld1q_u64((uint64_t *)(p0 + 48));
+               /* Synchronize for loading the rest of blocks. */
+               rte_cio_rmb();
+               /* Prefetch next 4 CQEs. */
+               if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+                       unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP;
+                       rte_prefetch_non_temporal(&cq[next]);
+                       rte_prefetch_non_temporal(&cq[next + 1]);
+                       rte_prefetch_non_temporal(&cq[next + 2]);
+                       rte_prefetch_non_temporal(&cq[next + 3]);
+               }
+               __asm__ volatile (
+               /* B.1 (CQE 3) load the rest of blocks. */
+               "ld1 {v16.16b - v18.16b}, [%[p3]] \n\t"
+               /* B.2 (CQE 3) move the block having op_own. */
+               "mov v19.16b, %[c3].16b \n\t"
+               /* B.3 (CQE 3) extract 16B fields. */
+               "tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+               /* B.1 (CQE 2) load the rest of blocks. */
+               "ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
+               /* B.4 (CQE 3) adjust CRC length. */
+               "sub v23.8h, v23.8h, %[crc_adj].8h \n\t"
+               /* C.1 (CQE 3) generate final structure for mbuf. */
+               "tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \n\t"
+               /* B.2 (CQE 2) move the block having op_own. */
+               "mov v19.16b, %[c2].16b \n\t"
+               /* B.3 (CQE 2) extract 16B fields. */
+               "tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+               /* B.1 (CQE 1) load the rest of blocks. */
+               "ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
+               /* B.4 (CQE 2) adjust CRC length. */
+               "sub v22.8h, v22.8h, %[crc_adj].8h \n\t"
+               /* C.1 (CQE 2) generate final structure for mbuf. */
+               "tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \n\t"
+               /* B.2 (CQE 1) move the block having op_own. */
+               "mov v19.16b, %[c1].16b \n\t"
+               /* B.3 (CQE 1) extract 16B fields. */
+               "tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+               /* B.1 (CQE 0) load the rest of blocks. */
+               "ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
+               /* B.4 (CQE 1) adjust CRC length. */
+               "sub v21.8h, v21.8h, %[crc_adj].8h \n\t"
+               /* C.1 (CQE 1) generate final structure for mbuf. */
+               "tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \n\t"
+               /* B.2 (CQE 0) move the block having op_own. */
+               "mov v19.16b, %[c0].16b \n\t"
+               /* A.1 load mbuf pointers. */
+               "ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
+               /* B.3 (CQE 0) extract 16B fields. */
+               "tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+               /* B.4 (CQE 0) adjust CRC length. */
+               "sub v20.8h, v20.8h, %[crc_adj].8h \n\t"
+               /* D.1 extract op_own byte. */
+               "tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \n\t"
+               /* C.2 (CQE 3) adjust flow mark. */
+               "add v15.4s, v15.4s, %[flow_mark_adj].4s \n\t"
+               /* C.3 (CQE 3) fill in mbuf - rx_descriptor_fields1. */
+               "st1 {v15.2d}, [%[e3]] \n\t"
+               /* C.2 (CQE 2) adjust flow mark. */
+               "add v14.4s, v14.4s, %[flow_mark_adj].4s \n\t"
+               /* C.3 (CQE 2) fill in mbuf - rx_descriptor_fields1. */
+               "st1 {v14.2d}, [%[e2]] \n\t"
+               /* C.1 (CQE 0) generate final structure for mbuf. */
+               "tbl v12.16b, {v20.16b}, %[mb_shuf_m].16b \n\t"
+               /* C.2 (CQE 1) adjust flow mark. */
+               "add v13.4s, v13.4s, %[flow_mark_adj].4s \n\t"
+               /* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */
+               "st1 {v13.2d}, [%[e1]] \n\t"
+#ifdef MLX5_PMD_SOFT_COUNTERS
+               /* Extract byte_cnt. */
+               "tbl %[byte_cnt].8b, {v20.16b - v23.16b}, %[len_shuf_m].8b \n\t"
+#endif
+               /* Extract ptype_info. */
+               "tbl %[ptype_info].16b, {v20.16b - v23.16b}, %[ptype_shuf_m].16b \n\t"
+               /* Extract flow_tag. */
+               "tbl %[flow_tag].16b, {v20.16b - v23.16b}, %[ftag_shuf_m].16b \n\t"
+               /* A.2 copy mbuf pointers. */
+               "st1 {v24.2d - v25.2d}, [%[pkts_p]] \n\t"
+               /* C.2 (CQE 0) adjust flow mark. */
+               "add v12.4s, v12.4s, %[flow_mark_adj].4s \n\t"
+               /* C.3 (CQE 0) fill in mbuf - rx_descriptor_fields1. */
+               "st1 {v12.2d}, [%[e0]] \n\t"
+               :[op_own]"=&w"(op_own),
+                [byte_cnt]"=&w"(byte_cnt),
+                [ptype_info]"=&w"(ptype_info),
+                [flow_tag]"=&w"(flow_tag)
+               :[p3]"r"(p3), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0),
+                [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0),
+                [c3]"w"(c3), [c2]"w"(c2), [c1]"w"(c1), [c0]"w"(c0),
+                [elts_p]"r"(elts_p),
+                [pkts_p]"r"(pkts_p),
+                [cqe_shuf_m]"w"(cqe_shuf_m),
+                [mb_shuf_m]"w"(mb_shuf_m),
+                [owner_shuf_m]"w"(owner_shuf_m),
+                [len_shuf_m]"w"(len_shuf_m),
+                [ptype_shuf_m]"w"(ptype_shuf_m),
+                [ftag_shuf_m]"w"(ftag_shuf_m),
+                [crc_adj]"w"(crc_adj),
+                [flow_mark_adj]"w"(flow_mark_adj)
+               :"memory",
+                "v12", "v13", "v14", "v15",
+                "v16", "v17", "v18", "v19",
+                "v20", "v21", "v22", "v23",
+                "v24", "v25");
+               /* D.2 flip owner bit to mark CQEs from last round. */
+               owner_mask = vand_u16(op_own, owner_check);
+               owner_mask = vceq_u16(owner_mask, ownership);
+               /* D.3 get mask for invalidated CQEs. */
+               opcode = vand_u16(op_own, opcode_check);
+               invalid_mask = vceq_u16(opcode_check, opcode);
+               /* E.1 find compressed CQE format. */
+               comp_mask = vand_u16(op_own, format_check);
+               comp_mask = vceq_u16(comp_mask, format_check);
+               /* D.4 mask out beyond boundary. */
+               invalid_mask = vorr_u16(invalid_mask, mask);
+               /* D.5 merge invalid_mask with invalid owner. */
+               invalid_mask = vorr_u16(invalid_mask, owner_mask);
+               /* E.2 mask out invalid entries. */
+               comp_mask = vbic_u16(comp_mask, invalid_mask);
+               /* E.3 get the first compressed CQE. */
+               comp_idx = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(
+                                         comp_mask), 0)) /
+                                         (sizeof(uint16_t) * 8);
+               /* D.6 mask out entries after the compressed CQE. */
+               mask = vcreate_u16(comp_idx < MLX5_VPMD_DESCS_PER_LOOP ?
+                                  -1UL >> (comp_idx * sizeof(uint16_t) * 8) :
+                                  0);
+               invalid_mask = vorr_u16(invalid_mask, mask);
+               /* D.7 count non-compressed valid CQEs. */
+               n = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(
+                                  invalid_mask), 0)) / (sizeof(uint16_t) * 8);
+               nocmp_n += n;
+               /* D.2 get the final invalid mask. */
+               mask = vcreate_u16(n < MLX5_VPMD_DESCS_PER_LOOP ?
+                                  -1UL >> (n * sizeof(uint16_t) * 8) : 0);
+               invalid_mask = vorr_u16(invalid_mask, mask);
+               /* D.3 check error in opcode. */
+               opcode = vceq_u16(resp_err_check, opcode);
+               opcode = vbic_u16(opcode, invalid_mask);
+               /* D.4 mark if any error is set */
+               *err |= vget_lane_u64(vreinterpret_u64_u16(opcode), 0);
+               /* C.4 fill in mbuf - rearm_data and packet_type. */
+               rxq_cq_to_ptype_oflags_v(rxq, ptype_info, flow_tag,
+                                        opcode, &elts[pos]);
+               if (rxq->hw_timestamp) {
+                       elts[pos]->timestamp =
+                               rte_be_to_cpu_64(
+                                       container_of(p0, struct mlx5_cqe,
+                                                    pkt_info)->timestamp);
+                       elts[pos + 1]->timestamp =
+                               rte_be_to_cpu_64(
+                                       container_of(p1, struct mlx5_cqe,
+                                                    pkt_info)->timestamp);
+                       elts[pos + 2]->timestamp =
+                               rte_be_to_cpu_64(
+                                       container_of(p2, struct mlx5_cqe,
+                                                    pkt_info)->timestamp);
+                       elts[pos + 3]->timestamp =
+                               rte_be_to_cpu_64(
+                                       container_of(p3, struct mlx5_cqe,
+                                                    pkt_info)->timestamp);
+               }
+               if (!!rxq->flow_meta_mask) {
+                       /* This code is subject to further optimization. */
+                       int32_t offs = rxq->flow_meta_offset;
+
+                       *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+                               container_of(p0, struct mlx5_cqe,
+                                            pkt_info)->flow_table_metadata;
+                       *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+                               container_of(p1, struct mlx5_cqe,
+                                            pkt_info)->flow_table_metadata;
+                       *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+                               container_of(p2, struct mlx5_cqe,
+                                            pkt_info)->flow_table_metadata;
+                       *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+                               container_of(p3, struct mlx5_cqe,
+                                            pkt_info)->flow_table_metadata;
+                       if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
+                               elts[pos]->ol_flags |= rxq->flow_meta_mask;
+                       if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
+                               elts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
+                       if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
+                               elts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
+                       if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
+                               elts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
+               }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+               /* Add up received bytes count. */
+               byte_cnt = vbic_u16(byte_cnt, invalid_mask);
+               rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
+#endif
+               /*
+                * Break the loop unless more valid CQEs are expected, or if
+                * there's a compressed CQE.
+                */
+               if (n != MLX5_VPMD_DESCS_PER_LOOP)
+                       break;
+       }
+       /* If no new CQE seen, return without updating cq_db. */
+       if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+               *no_cq = true;
+               return rcvd_pkt;
+       }
+       /* Update the consumer indexes for non-compressed CQEs. */
+       MLX5_ASSERT(nocmp_n <= pkts_n);
+       rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,
+                            rq_ci, rxq->consumed_strd, false);
+       rxq->cq_ci += nocmp_n;
+       rxq->consumed_strd += nocmp_n;
+       rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+       rxq->stats.ipackets += nocmp_n;
+       rxq->stats.ibytes += rcvd_byte;
+#endif
+       /* Decompress the last CQE if compressed. */
+       if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+               MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+               rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+                                                       &elts[nocmp_n]);
+               /* Return more packets if needed. */
+               if (nocmp_n < pkts_n) {
+                       uint16_t n = rxq->decompressed;
+
+                       n = RTE_MIN(n, pkts_n - nocmp_n);
+                       rxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,
+                                            rq_ci, rxq->consumed_strd, true);
+                       rxq->consumed_strd += n;
+                       rcvd_pkt += n;
+                       rxq->decompressed -= n;
+               }
+       }
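+       /* Update the CQ doorbell record and, if a WQE was consumed, the RQ doorbell record. */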
+       rte_cio_wmb();
+       *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+       if (rq_ci != rxq->rq_ci) {
+               rxq->rq_ci = rq_ci;
+               rte_cio_wmb();
+               *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+       }
+       *no_cq = !rcvd_pkt;
+       return rcvd_pkt;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_VEC_NEON_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 34e3397115..4054614674 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -56,6 +56,95 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
                pkts[pos] = elts[pos];
 }
 
+/**
+ * Copy or attach MPRQ buffers to RX SW ring.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param pkts
+ *   Pointer to array of packets to be stored.
+ * @param n
+ *   Number of packets to be stored.
+ * @param buf
+ *   MPRQ buffer to get packets from.
+ * @param rq_ci
+ *   WQE index.
+ * @param strd_idx
+ *   Stride number.
+ * @param comp
+ *   Whether CQE is compressed or not.
+ */
+static inline void
+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+                    uint16_t n, struct mlx5_mprq_buf *buf,
+                    uint16_t rq_ci, uint16_t strd_idx, bool comp)
+{
+       const unsigned int strd_sz = 1 << rxq->strd_sz_n;
+       const unsigned int strd_n = 1 << rxq->strd_num_n;
+       const unsigned int strd_shift =
+               MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+       uint32_t offset;
+       void *addr;
+       int i = 0;
+
+       if (comp) {
+               const uint16_t q_mask = (1 << rxq->cqe_n) - 1;
+               struct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];
+               unsigned int pos;
+               uint16_t p = n & -2;
+
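+               /*
+                * Mbufs for compressed CQEs are already filled in elts,
+                * just copy the pointers to pkts two at a time.
+                */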
+               for (pos = 0; pos < p; pos += 2) {
+                       __m128i mbp;
+
+                       mbp = _mm_loadu_si128((__m128i *)&elts[pos +
+                                                       rxq->consumed_strd]);
+                       _mm_storeu_si128((__m128i *)&pkts[pos], mbp);
+               }
+               if (n & 1)
+                       pkts[pos] = elts[pos];
+       }
+
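+       /*
+        * Either copy the packet data into the mbuf or attach the stride
+        * as an external buffer, depending on the packet length and the
+        * availability of a replacement MPRQ buffer.
+        */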
+       for (i = 0; i < n; ++i) {
+               offset = (strd_idx + i) * strd_sz + strd_shift;
+               addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+               if (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||
+                   rxq->mprq_repl == NULL) {
+                       rte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),
+                                  addr, pkts[i]->pkt_len);
+               } else {
+                       rte_iova_t buf_iova;
+                       struct rte_mbuf_ext_shared_info *shinfo;
+                       uint16_t buf_len = strd_sz;
+                       void *buf_addr;
+                       /* Increment the refcnt of the whole chunk. */
+                       rte_atomic16_add_return(&buf->refcnt, 1);
+                       MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
+                                   strd_n + 1);
+                       buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
+                       /*
+                        * The MLX5 device does not use the IOVA, but it is
+                        * necessary in case the Rx packet is transmitted via a
+                        * different PMD.
+                        */
+                       buf_iova = rte_mempool_virt2iova(buf) +
+                               RTE_PTR_DIFF(buf_addr, buf);
+                       shinfo = &buf->shinfos[strd_idx];
+                       rte_mbuf_ext_refcnt_set(shinfo, 1);
+                       /*
+                        * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
+                        * attaching the stride to mbuf and more offload flags
+                        * will be added below by calling rxq_cq_to_mbuf().
+                        * Other fields will be overwritten.
+                        */
+                       rte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,
+                                                 buf_len, shinfo);
+                       /* Set mbuf head-room. */
+                       SET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM);
+                       DATA_LEN(pkts[i]) = pkts[i]->pkt_len;
+               }
+       }
+}
+
 /**
  * Decompress a compressed completion and fill in mbufs in RX SW ring with data
  * extracted from the title completion descriptor.
@@ -753,4 +842,435 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
        return rcvd_pkt;
 }
 
+static inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
+                const unsigned int strd_n)
+{
+       struct mlx5_mprq_buf *rep = rxq->mprq_repl;
+       volatile struct mlx5_wqe_data_seg *wqe =
+               &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
+       void *addr;
+
+       MLX5_ASSERT(rep != NULL);
+       /* Replace MPRQ buf. */
+       (*rxq->mprq_bufs)[rq_idx] = rep;
+       /* Replace WQE. */
+       addr = mlx5_mprq_buf_addr(rep, strd_n);
+       wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
+       /* If there's only one MR, no need to replace LKey in WQE. */
+       if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+               wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
+       /* Stash a mbuf for next replacement. */
+       if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
+               rxq->mprq_repl = rep;
+       else
+               rxq->mprq_repl = NULL;
+}
+
+/**
+ * Receive burst of packets. An errored completion also consumes a mbuf, but the
+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to application.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ * @param[out] err
+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error
+ *   packet to handle.
+ * @param[out] no_cq
+ *   Pointer to a boolean. Set true if no new CQE seen.
+ * @return
+ *   Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+                uint16_t pkts_n, uint64_t *err, bool *no_cq)
+{
+       const unsigned int strd_n = 1 << rxq->strd_num_n;
+       const uint16_t q_n = 1 << rxq->cqe_n;
+       const uint16_t q_mask = q_n - 1;
+       const uint16_t e_n = 1 << rxq->elts_n;
+       const uint16_t e_mask = e_n - 1;
+       volatile struct mlx5_cqe *cq;
+       struct rte_mbuf **elts;
+       unsigned int pos;
+       uint64_t n;
+       uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+       uint16_t nocmp_n = 0;
+       uint16_t rcvd_pkt = 0;
+       unsigned int cq_ci = rxq->cq_ci;
+       unsigned int cq_idx = cq_ci & q_mask;
+       unsigned int rq_ci = rxq->rq_ci;
+       unsigned int rq_idx = rq_ci & e_mask;
+       struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];
+       unsigned int elts_idx;
+       unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
+       const __m128i owner_check =
+               _mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL);
+       const __m128i opcode_check =
+               _mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL);
+       const __m128i format_check =
+               _mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL);
+       const __m128i resp_err_check =
+               _mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+       uint32_t rcvd_byte = 0;
+       /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+       const __m128i len_shuf_mask =
+               _mm_set_epi8(-1, -1, -1, -1,
+                            -1, -1, -1, -1,
+                            12, 13,  8,  9,
+                             4,  5,  0,  1);
+#endif
+       /* Mask to shuffle from extracted CQE to mbuf. */
+       const __m128i shuf_mask =
+               _mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */
+                            12, 13, 14, 15, /* rss, bswap32 */
+                            10, 11,         /* vlan_tci, bswap16 */
+                             4,  5,         /* data_len, bswap16 */
+                            -1, -1,         /* zero out 2nd half of pkt_len */
+                             4,  5          /* pkt_len, bswap16 */);
+       /* Mask to blend from the last Qword to the first DQword. */
+       const __m128i blend_mask =
+               _mm_set_epi8(-1, -1, -1, -1,
+                            -1, -1, -1, -1,
+                             0,  0,  0,  0,
+                             0,  0,  0, -1);
+       const __m128i zero = _mm_setzero_si128();
+       const __m128i ones = _mm_cmpeq_epi32(zero, zero);
+       const __m128i crc_adj =
+               _mm_set_epi16(0, 0, 0, 0, 0,
+                             rxq->crc_present * RTE_ETHER_CRC_LEN,
+                             0,
+                             rxq->crc_present * RTE_ETHER_CRC_LEN);
+       const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
+
+       MLX5_ASSERT(rxq->sges_n == 0);
+       MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
+
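+       /*
+        * All the strides of the current MPRQ buffer are consumed,
+        * replace the WQE if needed and advance to the next buffer.
+        */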
+       if (rxq->consumed_strd == strd_n) {
+               /* Replace WQE only if the buffer is still in use. */
+               if (rte_atomic16_read(&buf->refcnt) > 1) {
+                       mprq_buf_replace(rxq, rq_ci & e_mask, strd_n);
+                       /* Release the old buffer. */
+                       mlx5_mprq_buf_free(buf);
+               } else if (unlikely(rxq->mprq_repl == NULL)) {
+                       struct mlx5_mprq_buf *rep;
+
+                       /*
+                        * Currently, the MPRQ mempool is out of buffer
+                        * and doing memcpy regardless of the size of Rx
+                        * packet. Retry allocation to get back to
+                        * normal.
+                        */
+                       if (!rte_mempool_get(rxq->mprq_mp,
+                                            (void **)&rep))
+                               rxq->mprq_repl = rep;
+               }
+               /* Advance to the next WQE. */
+               rxq->consumed_strd = 0;
+               ++rq_ci;
+               buf = (*rxq->mprq_bufs)[rq_ci & e_mask];
+               rxq->rq_repl_thresh = 1;
+       }
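+       /* Allocate a bulk of mbufs for all the strides of the MPRQ buffer. */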
+       if (rxq->rq_repl_thresh)
+               mlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_ci & e_mask);
+
+       cq = &(*rxq->cqes)[cq_idx];
+       rte_prefetch0(cq);
+       rte_prefetch0(cq + 1);
+       rte_prefetch0(cq + 2);
+       rte_prefetch0(cq + 3);
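+       /*
+        * Each WQE owns strd_n + MLX5_VPMD_DESCS_PER_LOOP entries
+        * in the elts ring.
+        */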
+       elts_idx = (rq_ci & e_mask) * strd_n +
+               (rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP;
+       elts = &(*rxq->elts)[elts_idx];
+       pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+       /* See if there're unreturned mbufs from compressed CQE. */
+       rcvd_pkt = rxq->decompressed;
+       if (rcvd_pkt > 0) {
+               rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+               rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf,
+                                    rq_ci, rxq->consumed_strd, true);
+               rxq->consumed_strd += rcvd_pkt;
+               rxq->decompressed -= rcvd_pkt;
+               pkts += rcvd_pkt;
+       }
+       /* Not to cross queue end. */
+       pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+       pkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd);
+       if (!pkts_n) {
+               *no_cq = !rcvd_pkt;
+               return rcvd_pkt;
+       }
+       /* At this point, there shouldn't be any remaining packets. */
+       MLX5_ASSERT(rxq->decompressed == 0);
+       /*
+        * A. load first Qword (8bytes) in one loop.
+        * B. copy 4 mbuf pointers from elts ring to returning pkts.
+        * C. load remained CQE data and extract necessary fields.
+        *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
+        *    following structure:
+        *        struct {
+        *          uint8_t  pkt_info;
+        *          uint8_t  flow_tag[3];
+        *          uint16_t byte_cnt;
+        *          uint8_t  rsvd4;
+        *          uint8_t  op_own;
+        *          uint16_t hdr_type_etc;
+        *          uint16_t vlan_info;
+        *          uint32_t rx_hash_res;
+        *        } c;
+        * D. fill in mbuf.
+        * E. get valid CQEs.
+        * F. find compressed CQE.
+        */
+       for (pos = 0;
+            pos < pkts_n;
+            pos += MLX5_VPMD_DESCS_PER_LOOP) {
+               __m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
+               __m128i cqe_tmp1, cqe_tmp2;
+               __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
+               __m128i op_own, op_own_tmp1, op_own_tmp2;
+               __m128i opcode, owner_mask, invalid_mask;
+               __m128i comp_mask;
+               __m128i mask;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+               __m128i byte_cnt;
+#endif
+               __m128i mbp1, mbp2;
+               __m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
+               unsigned int p1, p2, p3;
+
+               /* Prefetch next 4 CQEs. */
+               if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+                       rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
+                       rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
+                       rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
+                       rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
+               }
+               /* A.0 do not cross the end of CQ. */
+               mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
+               mask = _mm_sll_epi64(ones, mask);
+               p = _mm_andnot_si128(mask, p);
+               /* A.1 load cqes. */
+               p3 = _mm_extract_epi16(p, 3);
+               cqes[3] = _mm_loadl_epi64((__m128i *)
+                                          &cq[pos + p3].sop_drop_qpn);
+               rte_compiler_barrier();
+               p2 = _mm_extract_epi16(p, 2);
+               cqes[2] = _mm_loadl_epi64((__m128i *)
+                                          &cq[pos + p2].sop_drop_qpn);
+               rte_compiler_barrier();
+               /* B.1 load mbuf pointers. */
+               mbp1 = _mm_loadu_si128((__m128i *)&elts[pos +
+                                               rxq->consumed_strd]);
+               mbp2 = _mm_loadu_si128((__m128i *)&elts[pos +
+                                               rxq->consumed_strd + 2]);
+               /* A.1 load a block having op_own. */
+               p1 = _mm_extract_epi16(p, 1);
+               cqes[1] = _mm_loadl_epi64((__m128i *)
+                                          &cq[pos + p1].sop_drop_qpn);
+               rte_compiler_barrier();
+               cqes[0] = _mm_loadl_epi64((__m128i *)
+                                          &cq[pos].sop_drop_qpn);
+               /* B.2 copy mbuf pointers. */
+               _mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
+               _mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
+               rte_cio_rmb();
+               /* C.1 load remained CQE data and extract necessary fields. */
+               cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
+               cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
+               cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
+               cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
+               cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum);
+               cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum);
+               cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
+               cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
+               cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]);
+               cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]);
+               cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
+               cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
+               /* C.2 generate final structure for mbuf with swapping bytes. */
+               pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
+               pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
+               /* C.3 adjust CRC length. */
+               pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
+               pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
+               /* C.4 adjust flow mark. */
+               pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
+               pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
+               /* D.1 fill in mbuf - rx_descriptor_fields1. */
+               _mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
+               _mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
+               /* E.1 extract op_own field. */
+               op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
+               /* C.1 load remained CQE data and extract necessary fields. */
+               cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
+               cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
+               cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
+               cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
+               cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum);
+               cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum);
+               cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
+               cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
+               cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]);
+               cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]);
+               cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
+               cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
+               /* C.2 generate final structure for mbuf with swapping bytes. */
+               pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
+               pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
+               /* C.3 adjust CRC length. */
+               pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
+               pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
+               /* C.4 adjust flow mark. */
+               pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
+               pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
+               /* E.1 extract op_own byte. */
+               op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
+               op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
+               /* D.1 fill in mbuf - rx_descriptor_fields1. */
+               _mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
+               _mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
+               /* E.2 flip owner bit to mark CQEs from last round. */
+               owner_mask = _mm_and_si128(op_own, owner_check);
+               if (ownership)
+                       owner_mask = _mm_xor_si128(owner_mask, owner_check);
+               owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);
+               owner_mask = _mm_packs_epi32(owner_mask, zero);
+               /* E.3 get mask for invalidated CQEs. */
+               opcode = _mm_and_si128(op_own, opcode_check);
+               invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
+               invalid_mask = _mm_packs_epi32(invalid_mask, zero);
+               /* E.4 mask out beyond boundary. */
+               invalid_mask = _mm_or_si128(invalid_mask, mask);
+               /* E.5 merge invalid_mask with invalid owner. */
+               invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
+               /* F.1 find compressed CQE format. */
+               comp_mask = _mm_and_si128(op_own, format_check);
+               comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
+               comp_mask = _mm_packs_epi32(comp_mask, zero);
+               /* F.2 mask out invalid entries. */
+               comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
+               comp_idx = _mm_cvtsi128_si64(comp_mask);
+               /* F.3 get the first compressed CQE. */
+               comp_idx = comp_idx ?
+                               __builtin_ctzll(comp_idx) /
+                                       (sizeof(uint16_t) * 8) :
+                               MLX5_VPMD_DESCS_PER_LOOP;
+               /* E.6 mask out entries after the compressed CQE. */
+               mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
+               mask = _mm_sll_epi64(ones, mask);
+               invalid_mask = _mm_or_si128(invalid_mask, mask);
+               /* E.7 count non-compressed valid CQEs. */
+               n = _mm_cvtsi128_si64(invalid_mask);
+               n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
+                       MLX5_VPMD_DESCS_PER_LOOP;
+               nocmp_n += n;
+               /* D.2 get the final invalid mask. */
+               mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
+               mask = _mm_sll_epi64(ones, mask);
+               invalid_mask = _mm_or_si128(invalid_mask, mask);
+               /* D.3 check error in opcode. */
+               opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
+               opcode = _mm_packs_epi32(opcode, zero);
+               opcode = _mm_andnot_si128(invalid_mask, opcode);
+               /* D.4 mark if any error is set */
+               *err |= _mm_cvtsi128_si64(opcode);
+               /* D.5 fill in mbuf - rearm_data and packet_type. */
+               rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
+               if (rxq->hw_timestamp) {
+                       pkts[pos]->timestamp =
+                               rte_be_to_cpu_64(cq[pos].timestamp);
+                       pkts[pos + 1]->timestamp =
+                               rte_be_to_cpu_64(cq[pos + p1].timestamp);
+                       pkts[pos + 2]->timestamp =
+                               rte_be_to_cpu_64(cq[pos + p2].timestamp);
+                       pkts[pos + 3]->timestamp =
+                               rte_be_to_cpu_64(cq[pos + p3].timestamp);
+               }
+               if (rxq->dynf_meta) {
+                       /* This code is subject to further optimization. */
+                       int32_t offs = rxq->flow_meta_offset;
+
+                       *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+                               cq[pos].flow_table_metadata;
+                       *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+                               cq[pos + p1].flow_table_metadata;
+                       *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+                               cq[pos + p2].flow_table_metadata;
+                       *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+                               cq[pos + p3].flow_table_metadata;
+                       if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
+                               pkts[pos]->ol_flags |= rxq->flow_meta_mask;
+                       if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
+                               pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
+                       if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
+                               pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
+                       if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
+                               pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
+               }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+               /* Add up received bytes count. */
+               byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
+               byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
+               byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
+               rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
+#endif
+               /*
+                * Break the loop unless more valid CQEs are expected, or if
+                * there's a compressed CQE.
+                */
+               if (n != MLX5_VPMD_DESCS_PER_LOOP)
+                       break;
+       }
+       /* If no new CQE seen, return without updating cq_db. */
+       if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+               *no_cq = true;
+               return rcvd_pkt;
+       }
+       /* Update the consumer indexes for non-compressed CQEs. */
+       MLX5_ASSERT(nocmp_n <= pkts_n);
+       rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,
+                            rq_ci, rxq->consumed_strd, false);
+       rxq->cq_ci += nocmp_n;
+       rxq->consumed_strd += nocmp_n;
+       rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+       rxq->stats.ipackets += nocmp_n;
+       rxq->stats.ibytes += rcvd_byte;
+#endif
+       /* Decompress the last CQE if compressed. */
+       if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+               MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+               rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+                                       &elts[nocmp_n + rxq->consumed_strd]);
+               /* Return more packets if needed. */
+               if (nocmp_n < pkts_n) {
+                       uint16_t n = rxq->decompressed;
+
+                       n = RTE_MIN(n, pkts_n - nocmp_n);
+                       rxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,
+                                            rq_ci, rxq->consumed_strd, true);
+                       rxq->consumed_strd += n;
+                       rcvd_pkt += n;
+                       rxq->decompressed -= n;
+               }
+       }
+
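+       /* Update the CQ doorbell record and, if a WQE was consumed, the RQ doorbell record. */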
+       rte_compiler_barrier();
+       *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+       if (rq_ci != rxq->rq_ci) {
+               rxq->rq_ci = rq_ci;
+               rte_cio_wmb();
+               *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+       }
+       *no_cq = !rcvd_pkt;
+       return rcvd_pkt;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */
-- 
2.24.1
