From: Alexander Kozyrev
> The maximum burst size of the vectorized Rx burst routine is set to
> MLX5_VPMD_RX_MAX_BURST (64). This limits the performance of any
> application that would like to gather more than 64 packets from a single
> Rx burst for batch processing (e.g. VPP).
>
> The situation gets worse with a mix of zipped and unzipped CQEs:
> they are processed separately and the Rx burst function returns a small
> number of packets on every call.
>
> Repeat the cycle of gathering packets from the vectorized Rx routine
> until the requested number of packets is collected or there are no more
> CQEs left to process.
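
For context (not part of the patch): a vector application such as VPP
gathers a full frame vector before processing it. A minimal sketch of
such a caller, where VEC_SZ and the port/queue identifiers are
illustrative only:

	#include <rte_ethdev.h>
	#include <rte_mbuf.h>

	#define VEC_SZ 256 /* size of the frame vector to gather */

	static uint16_t
	gather_vector(uint16_t port_id, uint16_t queue_id,
		      struct rte_mbuf **pkts)
	{
		uint16_t n = 0;

		/* Poll until the vector is full or the queue is drained.
		 * With the driver burst capped at 64 packets, filling a
		 * 256-packet vector takes at least four calls even when
		 * enough packets are already queued.
		 */
		while (n < VEC_SZ) {
			uint16_t rc = rte_eth_rx_burst(port_id, queue_id,
						       &pkts[n], VEC_SZ - n);

			if (rc == 0)
				break;
			n += rc;
		}
		return n;
	}
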
>
> Fixes: 6cb559d67b83 ("net/mlx5: add vectorized Rx/Tx burst for x86")
> Cc: sta...@dpdk.org
>
> Signed-off-by: Alexander Kozyrev <akozy...@mellanox.com>
> Acked-by: Slava Ovsiienko <viachesl...@mellanox.com>
Acked-by: Matan Azrad <ma...@mellanox.com>
> ---
> drivers/net/mlx5/mlx5_rxtx_vec.c | 19 +++++++++++++------
> drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 13 ++++++++++---
> drivers/net/mlx5/mlx5_rxtx_vec_neon.h | 13 ++++++++++---
> drivers/net/mlx5/mlx5_rxtx_vec_sse.h | 13 ++++++++++---
> 4 files changed, 43 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
> index 1518bdd..b38bd20 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec.c
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
> @@ -103,13 +103,20 @@
>  mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  {
>  	struct mlx5_rxq_data *rxq = dpdk_rxq;
> -	uint16_t nb_rx;
> +	uint16_t nb_rx = 0;
> +	uint16_t tn = 0;
>  	uint64_t err = 0;
> -
> -	nb_rx = rxq_burst_v(rxq, pkts, pkts_n, &err);
> -	if (unlikely(err | rxq->err_state))
> -		nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);
> -	return nb_rx;
> +	bool no_cq = false;
> +
> +	do {
> +		nb_rx = rxq_burst_v(rxq, pkts + tn, pkts_n - tn, &err, &no_cq);
> +		if (unlikely(err | rxq->err_state))
> +			nb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx);
> +		tn += nb_rx;
> +		if (unlikely(no_cq))
> +			break;
> +	} while (tn != pkts_n);
> +	return tn;
>  }
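
The loop above exits either when the requested pkts_n is reached or when
rxq_burst_v() reports no_cq, so a short burst at a compressed-CQE
boundary triggers another iteration while an empty queue returns
immediately instead of spinning. A simplified model of that contract
(burst() and struct rxq are hypothetical, error handling omitted):

	/* Model of the contract the loop relies on: burst() returns up
	 * to (n - tn) packets and sets *no_cq only when it saw no new
	 * CQE, so a short return with *no_cq == false means more
	 * completions may already be pending and polling continues.
	 */
	static uint16_t
	gather(struct rxq *q, struct rte_mbuf **pkts, uint16_t n)
	{
		uint16_t tn = 0;
		bool no_cq = false;

		do {
			tn += burst(q, pkts + tn, n - tn, &no_cq);
		} while (tn != n && !no_cq);
		return tn;
	}
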
>
> /**
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
> index 26715ef..b55138a 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
> @@ -564,13 +564,15 @@
>   * @param[out] err
>   *   Pointer to a flag. Set non-zero value if pkts array has at least one error
>   *   packet to handle.
> + * @param[out] no_cq
> + *   Pointer to a boolean. Set true if no new CQE seen.
>   *
>   * @return
>   *   Number of packets received including errors (<= pkts_n).
>   */
>  static inline uint16_t
>  rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
> -	    uint64_t *err)
> +	    uint64_t *err, bool *no_cq)
>  {
>  	const uint16_t q_n = 1 << rxq->cqe_n;
>  	const uint16_t q_mask = q_n - 1;
> @@ -663,8 +665,10 @@
>  	/* Not to cross queue end. */
>  	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
>  	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
> -	if (!pkts_n)
> +	if (!pkts_n) {
> +		*no_cq = !rcvd_pkt;
>  		return rcvd_pkt;
> +	}
>  	/* At this point, there shouldn't be any remaining packets. */
>  	MLX5_ASSERT(rxq->decompressed == 0);
>
> @@ -1079,8 +1083,10 @@
>  			break;
>  	}
>  	/* If no new CQE seen, return without updating cq_db. */
> -	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
> +	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
> +		*no_cq = true;
>  		return rcvd_pkt;
> +	}
>  	/* Update the consumer indexes for non-compressed CQEs. */
>  	MLX5_ASSERT(nocmp_n <= pkts_n);
>  	rxq->cq_ci += nocmp_n;
> @@ -1108,6 +1114,7 @@
>  	}
>  	rte_compiler_barrier();
>  	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
> +	*no_cq = !rcvd_pkt;
>  	return rcvd_pkt;
>  }
>
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> index ecafbf8..3007c03 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> @@ -378,13 +378,15 @@
>   * @param[out] err
>   *   Pointer to a flag. Set non-zero value if pkts array has at least one error
>   *   packet to handle.
> + * @param[out] no_cq
> + *   Pointer to a boolean. Set true if no new CQE seen.
>   *
>   * @return
>   *   Number of packets received including errors (<= pkts_n).
>   */
>  static inline uint16_t
>  rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
> -	    uint64_t *err)
> +	    uint64_t *err, bool *no_cq)
>  {
>  	const uint16_t q_n = 1 << rxq->cqe_n;
>  	const uint16_t q_mask = q_n - 1;
> @@ -485,8 +487,10 @@
>  	/* Not to cross queue end. */
>  	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
>  	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
> -	if (!pkts_n)
> +	if (!pkts_n) {
> +		*no_cq = !rcvd_pkt;
>  		return rcvd_pkt;
> +	}
>  	/* At this point, there shouldn't be any remained packets. */
>  	MLX5_ASSERT(rxq->decompressed == 0);
>  	/*
> @@ -745,8 +749,10 @@
>  			break;
>  	}
>  	/* If no new CQE seen, return without updating cq_db. */
> -	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
> +	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
> +		*no_cq = true;
>  		return rcvd_pkt;
> +	}
>  	/* Update the consumer indexes for non-compressed CQEs. */
>  	MLX5_ASSERT(nocmp_n <= pkts_n);
>  	rxq->cq_ci += nocmp_n;
> @@ -774,6 +780,7 @@
>  	}
>  	rte_cio_wmb();
>  	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
> +	*no_cq = !rcvd_pkt;
>  	return rcvd_pkt;
>  }
>
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> index 6847ae7..da5960a 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> @@ -385,13 +385,15 @@
>   * @param[out] err
>   *   Pointer to a flag. Set non-zero value if pkts array has at least one error
>   *   packet to handle.
> + * @param[out] no_cq
> + *   Pointer to a boolean. Set true if no new CQE seen.
>   *
>   * @return
>   *   Number of packets received including errors (<= pkts_n).
>   */
>  static inline uint16_t
>  rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
> -	    uint64_t *err)
> +	    uint64_t *err, bool *no_cq)
>  {
>  	const uint16_t q_n = 1 << rxq->cqe_n;
>  	const uint16_t q_mask = q_n - 1;
> @@ -473,8 +475,10 @@
>  	/* Not to cross queue end. */
>  	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
>  	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
> -	if (!pkts_n)
> +	if (!pkts_n) {
> +		*no_cq = !rcvd_pkt;
>  		return rcvd_pkt;
> +	}
>  	/* At this point, there shouldn't be any remained packets. */
>  	MLX5_ASSERT(rxq->decompressed == 0);
>  	/*
> @@ -696,8 +700,10 @@
>  			break;
>  	}
>  	/* If no new CQE seen, return without updating cq_db. */
> -	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
> +	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
> +		*no_cq = true;
>  		return rcvd_pkt;
> +	}
>  	/* Update the consumer indexes for non-compressed CQEs. */
>  	MLX5_ASSERT(nocmp_n <= pkts_n);
>  	rxq->cq_ci += nocmp_n;
> @@ -725,6 +731,7 @@
>  	}
>  	rte_compiler_barrier();
>  	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
> +	*no_cq = !rcvd_pkt;
>  	return rcvd_pkt;
>  }
>
> --
> 1.8.3.1