An earlier optimization prefetched all the CQEs before their
invalidation. It sped up the mini-CQE decompression process
by preheating the cache in the vectorized Rx routine.

Prefetching of the next mini-CQE, on the other hand, showed
no difference in performance on the x86 platform, so it was
removed. Unfortunately, this caused a performance drop on ARM.

Prefetch the next mini-CQE as well as all the soon-to-be
invalidated CQEs to get both the CQE and the mini-CQE on the hot path.

Fixes: 28a4b9632 ("net/mlx5: prefetch CQEs for a faster decompression")

Signed-off-by: Alexander Kozyrev <>
Acked-by: Viacheslav Ovsiienko <>
 drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 3 ++-
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 3 +++
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 3 ++-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h 
index f5414eebad..cb4ce1a099 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
@@ -158,7 +158,6 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile 
struct mlx5_cqe *cq,
                for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
                        if (likely(pos + i < mcqe_n))
                                rte_prefetch0((void *)(cq + pos + i));
                /* A.1 load mCQEs into a 128bit register. */
                mcqe1 = (vector unsigned char)vec_vsx_ld(0,
                        (signed int const *)&mcq[pos % 8]);
@@ -287,6 +286,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile 
struct mlx5_cqe *cq,
                pos += MLX5_VPMD_DESCS_PER_LOOP;
                /* Move to next CQE and invalidate consumed CQEs. */
                if (!(pos & 0x7) && pos < mcqe_n) {
+                       if (pos + 8 < mcqe_n)
+                               rte_prefetch0((void *)(cq + pos + 8));
                        mcq = (void *)&(cq + pos)->pkt_info;
                        for (i = 0; i < 8; ++i)
                                cq[inv++].op_own = MLX5_CQE_INVALIDATE;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h 
index 555c342626..6c3149523e 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -145,6 +145,7 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile 
struct mlx5_cqe *cq,
                                    -1UL << ((mcqe_n - pos) *
                                             sizeof(uint16_t) * 8) : 0);
                for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
                        if (likely(pos + i < mcqe_n))
                                rte_prefetch0((void *)(cq + pos + i));
@@ -227,6 +228,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile 
struct mlx5_cqe *cq,
                pos += MLX5_VPMD_DESCS_PER_LOOP;
                /* Move to next CQE and invalidate consumed CQEs. */
                if (!(pos & 0x7) && pos < mcqe_n) {
+                       if (pos + 8 < mcqe_n)
+                               rte_prefetch0((void *)(cq + pos + 8));
                        mcq = (void *)&(cq + pos)->pkt_info;
                        for (i = 0; i < 8; ++i)
                                cq[inv++].op_own = MLX5_CQE_INVALIDATE;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h 
index 34e3397115..554924d7fc 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -135,7 +135,6 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile 
struct mlx5_cqe *cq,
                for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
                        if (likely(pos + i < mcqe_n))
                                rte_prefetch0((void *)(cq + pos + i));
                /* A.1 load mCQEs into a 128bit register. */
                mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
                mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
@@ -214,6 +213,8 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile 
struct mlx5_cqe *cq,
                pos += MLX5_VPMD_DESCS_PER_LOOP;
                /* Move to next CQE and invalidate consumed CQEs. */
                if (!(pos & 0x7) && pos < mcqe_n) {
+                       if (pos + 8 < mcqe_n)
+                               rte_prefetch0((void *)(cq + pos + 8));
                        mcq = (void *)(cq + pos);
                        for (i = 0; i < 8; ++i)
                                cq[inv++].op_own = MLX5_CQE_INVALIDATE;

Reply via email to