Configuring the UAR as IO-mapped makes maximum throughput decline by a noticeable amount. If the UAR is configured as a write-combining register, a write memory barrier is needed when ringing a doorbell. rte_wmb() is mostly effective when the size of a burst is comparatively small. Map the UAR back to write-combining and make the barrier conditional: mlx5_tx_dbrec() takes a new argument requesting the barrier, the scalar Tx routines always request it, and the vectorized Tx routines request it only for bursts smaller than MLX5_VPMD_TX_MAX_BURST.
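Note: below is a minimal, self-contained sketch of the doorbell sequence this patch introduces, for illustration only. The struct fields, wmb() stand-in, and max_burst constant are simplified placeholders rather than the real DPDK definitions, and details such as endianness conversion of the doorbell record are omitted.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for rte_wmb(); a full barrier here for simplicity. */
#define wmb() __sync_synchronize()

/* Simplified Tx queue: a doorbell record in host memory plus a 64-bit
 * doorbell (BlueFlame) register, which the PMD maps as write-combining
 * after this patch. */
struct txq {
	volatile uint32_t *qp_db;  /* doorbell record */
	volatile uint64_t *bf_reg; /* UAR doorbell register */
	uint32_t wqe_ci;           /* WQE consumer index */
};

/* Mirrors the shape of mlx5_tx_dbrec() after this patch: update the
 * doorbell record, fence, copy the first 8 bytes of the last WQE to
 * the register, then fence again only if the caller requested it. */
static inline void
tx_dbrec(struct txq *txq, volatile uint64_t *wqe, int mb)
{
	*txq->qp_db = txq->wqe_ci;
	/* Ensure ordering between DB record and BF copy. */
	wmb();
	*txq->bf_reg = *wqe;
	if (mb)
		wmb(); /* flush the write-combining buffer */
}

int main(void)
{
	uint32_t db = 0;
	uint64_t reg = 0, wqe = 0x1234;
	uint16_t pkts_n = 8;
	const uint16_t max_burst = 32; /* stands in for MLX5_VPMD_TX_MAX_BURST */
	struct txq q = { &db, &reg, 1 };

	tx_dbrec(&q, &wqe, 1);                  /* scalar paths always fence */
	q.wqe_ci++;
	tx_dbrec(&q, &wqe, pkts_n < max_burst); /* vector paths fence on partial bursts */
	printf("db=%u reg=%" PRIx64 "\n", db, reg);
	return 0;
}

The vectorized paths pass pkts_n < MLX5_VPMD_TX_MAX_BURST, so the extra flush is paid only for bursts smaller than the maximum, where no immediate follow-up burst is expected.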
Fixes: 9f9bebae5530 ("net/mlx5: don't map doorbell register to write combining")
Cc: sta...@dpdk.org
Cc: Sagi Grimberg <s...@grimberg.me>
Cc: Alexander Solganik <solga...@gmail.com>
Signed-off-by: Yongseok Koh <ys...@mellanox.com>
Acked-by: Shahaf Shuler <shah...@mellanox.com>
---
 drivers/net/mlx5/mlx5.c               | 2 --
 drivers/net/mlx5/mlx5_rxtx.c          | 8 ++++----
 drivers/net/mlx5/mlx5_rxtx.h          | 6 +++++-
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h | 4 ++--
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  | 4 ++--
 5 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 89fdc134f..fcdcbc367 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1037,8 +1037,6 @@ rte_mlx5_pmd_init(void)
 	 * using this PMD, which is not supported in forked processes.
 	 */
 	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
-	/* Don't map UAR to WC if BlueFlame is not used.*/
-	setenv("MLX5_SHUT_UP_BF", "1", 1);
 	/* Match the size of Rx completion entry to the size of a cacheline. */
 	if (RTE_CACHE_LINE_SIZE == 128)
 		setenv("MLX5_CQE_SIZE", "128", 0);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 961967bf4..f54fee9fb 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -732,7 +732,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	txq->stats.opackets += i;
 #endif
 	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
+	mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe, 1);
 	return i;
 }
 
@@ -948,7 +948,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	/* Ring QP doorbell. */
 	if (mpw.state == MLX5_MPW_STATE_OPENED)
 		mlx5_mpw_close(txq, &mpw);
-	mlx5_tx_dbrec(txq, mpw.wqe);
+	mlx5_tx_dbrec(txq, mpw.wqe, 1);
 	txq->elts_head = elts_head;
 	return i;
 }
@@ -1245,7 +1245,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 		mlx5_mpw_inline_close(txq, &mpw);
 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
 		mlx5_mpw_close(txq, &mpw);
-	mlx5_tx_dbrec(txq, mpw.wqe);
+	mlx5_tx_dbrec(txq, mpw.wqe, 1);
 	txq->elts_head = elts_head;
 	return i;
 }
@@ -1596,7 +1596,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	else if (mpw.state == MLX5_MPW_STATE_OPENED)
 		mlx5_mpw_close(txq, &mpw);
 	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, mpw.wqe);
+	mlx5_tx_dbrec(txq, mpw.wqe, 1);
 	txq->elts_head = elts_head;
 	return i;
 }
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index ea037427b..58ae3c85b 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -584,9 +584,11 @@ mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
  *   Pointer to TX queue structure.
  * @param wqe
  *   Pointer to the last WQE posted in the NIC.
+ * @param mb
+ *   Request for write memory barrier after BF update.
  */
 static __rte_always_inline void
-mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
+mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe, int mb)
 {
 	uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
 	volatile uint64_t *src = ((volatile uint64_t *)wqe);
@@ -596,6 +598,8 @@
 	/* Ensure ordering between DB record and BF copy. */
 	rte_wmb();
 	*dst = *src;
+	if (mb)
+		rte_wmb();
 }
 
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 4cb7f2889..7a6e397de 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -225,7 +225,7 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	txq->stats.opackets += n;
 #endif
-	mlx5_tx_dbrec(txq, wqe);
+	mlx5_tx_dbrec(txq, wqe, 1);
 	return n;
 }
 
@@ -345,7 +345,7 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
 		       nb_dword_per_wqebb;
 	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, wqe);
+	mlx5_tx_dbrec(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
 	return pkts_n;
 }
 
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index e9819b762..12d5bed55 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -226,7 +226,7 @@ txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	txq->stats.opackets += n;
 #endif
-	mlx5_tx_dbrec(txq, wqe);
+	mlx5_tx_dbrec(txq, wqe, 1);
 	return n;
 }
 
@@ -344,7 +344,7 @@ txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
 		       nb_dword_per_wqebb;
 	/* Ring QP doorbell. */
-	mlx5_tx_dbrec(txq, wqe);
+	mlx5_tx_dbrec(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
 	return pkts_n;
 }
 
-- 
2.11.0