Acked-by: Viacheslav Ovsiienko <viachesl...@nvidia.com>

> -----Original Message-----
> From: Alexander Kozyrev <akozy...@nvidia.com>
> Sent: Thursday, October 3, 2024 11:26 PM
> To: dev@dpdk.org
> Cc: sta...@dpdk.org; Raslan Darawsheh <rasl...@nvidia.com>; Slava
> Ovsiienko <viachesl...@nvidia.com>; Matan Azrad <ma...@nvidia.com>;
> step...@networkplumber.org
> Subject: [PATCH v2] drivers: fix error CQE handling
>
> The completion queue element size can be independently configured to
> report either 64- or 128-byte CQEs by programming the cqe_sz parameter
> at CQ creation. This parameter depends on the cache line size and
> affects both regular CQEs and error CQEs. But the error handling code
> assumes that an error CQE is 64 bytes and doesn't take the padding into
> consideration on platforms with 128-byte cache lines.
> Fix the error CQE size in all error handling routines in mlx5.
>
> Fixes: 957e45fb7b ("net/mlx5: handle Tx completion with error")
> Cc: sta...@dpdk.org
>
> Signed-off-by: Alexander Kozyrev <akozy...@nvidia.com>
> ---
>  drivers/common/mlx5/mlx5_prm.h              | 29 ++++++++++++++++++++-
>  drivers/common/mlx5/windows/mlx5_win_defs.h | 12 ---------
>  drivers/compress/mlx5/mlx5_compress.c       |  4 +--
>  drivers/crypto/mlx5/mlx5_crypto_gcm.c       |  2 +-
>  drivers/crypto/mlx5/mlx5_crypto_xts.c       |  2 +-
>  drivers/net/mlx5/hws/mlx5dr_send.c          |  2 +-
>  drivers/net/mlx5/mlx5_flow_aso.c            |  6 ++---
>  drivers/net/mlx5/mlx5_rx.c                  |  2 +-
>  drivers/net/mlx5/mlx5_tx.c                  |  8 +++---
>  9 files changed, 41 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
> index 359f02f17c..210158350d 100644
> --- a/drivers/common/mlx5/mlx5_prm.h
> +++ b/drivers/common/mlx5/mlx5_prm.h
> @@ -268,8 +268,12 @@
>  /* Maximum number of DS in WQE. Limited by 6-bit field. */
>  #define MLX5_DSEG_MAX 63
>
> -/* The 32 bit syndrome offset in struct mlx5_err_cqe. */
> +/* The 32 bit syndrome offset in struct mlx5_error_cqe. */
> +#if (RTE_CACHE_LINE_SIZE == 128)
> +#define MLX5_ERROR_CQE_SYNDROME_OFFSET 116
> +#else
>  #define MLX5_ERROR_CQE_SYNDROME_OFFSET 52
> +#endif
>
>  /* The completion mode offset in the WQE control segment line 2. */
>  #define MLX5_COMP_MODE_OFFSET 2
> @@ -415,6 +419,29 @@ struct mlx5_wqe_mprq {
>
>  #define MLX5_MPRQ_STRIDE_SHIFT_BYTE 2
>
> +struct mlx5_error_cqe {
> +#if (RTE_CACHE_LINE_SIZE == 128)
> +	uint8_t padding[64];
> +#endif
> +	uint8_t rsvd0[2];
> +	uint16_t eth_wqe_id;
> +	uint8_t rsvd1[16];
> +	uint16_t ib_stride_index;
> +	uint8_t rsvd2[10];
> +	uint32_t srqn;
> +	uint8_t rsvd3[8];
> +	uint32_t byte_cnt;
> +	uint8_t rsvd4[4];
> +	uint8_t hw_err_synd;
> +	uint8_t hw_synd_type;
> +	uint8_t vendor_err_synd;
> +	uint8_t syndrome;
> +	uint32_t s_wqe_opcode_qpn;
> +	uint16_t wqe_counter;
> +	uint8_t signature;
> +	uint8_t op_own;
> +};
> +
>  /* CQ element structure - should be equal to the cache line size */
>  struct mlx5_cqe {
>  #if (RTE_CACHE_LINE_SIZE == 128)
> diff --git a/drivers/common/mlx5/windows/mlx5_win_defs.h
> b/drivers/common/mlx5/windows/mlx5_win_defs.h
> index 79e7a7f386..d60df6fd37 100644
> --- a/drivers/common/mlx5/windows/mlx5_win_defs.h
> +++ b/drivers/common/mlx5/windows/mlx5_win_defs.h
> @@ -219,18 +219,6 @@ struct mlx5_action {
>  	} dest_tir;
>  };
>
> -struct mlx5_err_cqe {
> -	uint8_t rsvd0[32];
> -	uint32_t srqn;
> -	uint8_t rsvd1[18];
> -	uint8_t vendor_err_synd;
> -	uint8_t syndrome;
> -	uint32_t s_wqe_opcode_qpn;
> -	uint16_t wqe_counter;
> -	uint8_t signature;
> -	uint8_t op_own;
> -};
> -
>  struct mlx5_wqe_srq_next_seg {
>  	uint8_t rsvd0[2];
>  	rte_be16_t next_wqe_index;
> diff --git a/drivers/compress/mlx5/mlx5_compress.c
> b/drivers/compress/mlx5/mlx5_compress.c
> index 5998d060e4..82105bfebd 100644
> --- a/drivers/compress/mlx5/mlx5_compress.c
> +++ b/drivers/compress/mlx5/mlx5_compress.c
> @@ -602,7 +602,7 @@ mlx5_compress_dump_err_objs(volatile uint32_t *cqe, volatile uint32_t *wqe,
>  	size_t i;
>
>  	DRV_LOG(ERR, "Error cqe:");
> -	for (i = 0; i < sizeof(struct mlx5_err_cqe) >> 2; i += 4)
> +	for (i = 0; i < sizeof(struct mlx5_error_cqe) >> 2; i += 4)
>  		DRV_LOG(ERR, "%08X %08X %08X %08X", cqe[i], cqe[i + 1],
>  			cqe[i + 2], cqe[i + 3]);
>  	DRV_LOG(ERR, "\nError wqe:");
> @@ -620,7 +620,7 @@ mlx5_compress_cqe_err_handle(struct mlx5_compress_qp *qp,
>  			     struct rte_comp_op *op)
>  {
>  	const uint32_t idx = qp->ci & (qp->entries_n - 1);
> -	volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
> +	volatile struct mlx5_error_cqe *cqe = (volatile struct mlx5_error_cqe *)
>  							      &qp->cq.cqes[idx];
>  	volatile struct mlx5_gga_wqe *wqes = (volatile struct mlx5_gga_wqe *)
>  							      qp->qp.wqes;
> diff --git a/drivers/crypto/mlx5/mlx5_crypto_gcm.c
> b/drivers/crypto/mlx5/mlx5_crypto_gcm.c
> index f598273873..cd21605bd2 100644
> --- a/drivers/crypto/mlx5/mlx5_crypto_gcm.c
> +++ b/drivers/crypto/mlx5/mlx5_crypto_gcm.c
> @@ -877,7 +877,7 @@ mlx5_crypto_gcm_cqe_err_handle(struct mlx5_crypto_qp *qp, struct rte_crypto_op *
>  {
>  	uint8_t op_code;
>  	const uint32_t idx = qp->cq_ci & (qp->entries_n - 1);
> -	volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
> +	volatile struct mlx5_error_cqe *cqe = (volatile struct mlx5_error_cqe *)
>  							&qp->cq_obj.cqes[idx];
>
>  	op_code = rte_be_to_cpu_32(cqe->s_wqe_opcode_qpn) >> MLX5_CQ_INDEX_WIDTH;
> diff --git a/drivers/crypto/mlx5/mlx5_crypto_xts.c
> b/drivers/crypto/mlx5/mlx5_crypto_xts.c
> index d4e1dd718c..b9214711ac 100644
> --- a/drivers/crypto/mlx5/mlx5_crypto_xts.c
> +++ b/drivers/crypto/mlx5/mlx5_crypto_xts.c
> @@ -363,7 +363,7 @@ static __rte_noinline void
>  mlx5_crypto_xts_cqe_err_handle(struct mlx5_crypto_qp *qp, struct rte_crypto_op *op)
>  {
>  	const uint32_t idx = qp->ci & (qp->entries_n - 1);
> -	volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
> +	volatile struct mlx5_error_cqe *cqe = (volatile struct mlx5_error_cqe *)
>  							&qp->cq_obj.cqes[idx];
>
>  	op->status = RTE_CRYPTO_OP_STATUS_ERROR;
> diff --git a/drivers/net/mlx5/hws/mlx5dr_send.c
> b/drivers/net/mlx5/hws/mlx5dr_send.c
> index e9abf3dddb..e121c7f7ed 100644
> --- a/drivers/net/mlx5/hws/mlx5dr_send.c
> +++ b/drivers/net/mlx5/hws/mlx5dr_send.c
> @@ -599,7 +599,7 @@ static void mlx5dr_send_engine_poll_cq(struct mlx5dr_send_engine *queue,
>  		return;
>
>  	if (unlikely(cqe_opcode != MLX5_CQE_REQ)) {
> -		struct mlx5_err_cqe *err_cqe = (struct mlx5_err_cqe *)cqe;
> +		struct mlx5_error_cqe *err_cqe = (struct mlx5_error_cqe *)cqe;
>
>  		DR_LOG(ERR, "CQE ERR:0x%x, Vendor_ERR:0x%x, OP:0x%x, QPN:0x%x, WQE_CNT:0x%x",
>  		       err_cqe->syndrome, err_cqe->vendor_err_synd, cqe_opcode,
> diff --git a/drivers/net/mlx5/mlx5_flow_aso.c
> b/drivers/net/mlx5/mlx5_flow_aso.c
> index a94b228396..feca8c3e89 100644
> --- a/drivers/net/mlx5/mlx5_flow_aso.c
> +++ b/drivers/net/mlx5/mlx5_flow_aso.c
> @@ -489,7 +489,7 @@ mlx5_aso_dump_err_objs(volatile uint32_t *cqe, volatile uint32_t *wqe)
>  	int i;
>
>  	DRV_LOG(ERR, "Error cqe:");
> -	for (i = 0; i < 16; i += 4)
> +	for (i = 0; i < (int)sizeof(struct mlx5_error_cqe) / 4; i += 4)
>  		DRV_LOG(ERR, "%08X %08X %08X %08X", cqe[i], cqe[i + 1],
>  			cqe[i + 2], cqe[i + 3]);
>  	DRV_LOG(ERR, "\nError wqe:");
> @@ -509,8 +509,8 @@ mlx5_aso_cqe_err_handle(struct mlx5_aso_sq *sq)
>  {
>  	struct mlx5_aso_cq *cq = &sq->cq;
>  	uint32_t idx = cq->cq_ci & ((1 << cq->log_desc_n) - 1);
> -	volatile struct mlx5_err_cqe *cqe =
> -		(volatile struct mlx5_err_cqe *)&cq->cq_obj.cqes[idx];
> +	volatile struct mlx5_error_cqe *cqe =
> +		(volatile struct mlx5_error_cqe *)&cq->cq_obj.cqes[idx];
>
>  	cq->errors++;
>  	idx = rte_be_to_cpu_16(cqe->wqe_counter) & (1u << sq->log_desc_n);
> diff --git a/drivers/net/mlx5/mlx5_rx.c b/drivers/net/mlx5/mlx5_rx.c
> index f241809e08..5e58eb8bc9 100644
> --- a/drivers/net/mlx5/mlx5_rx.c
> +++ b/drivers/net/mlx5/mlx5_rx.c
> @@ -459,7 +459,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec,
>  			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
>  	union {
>  		volatile struct mlx5_cqe *cqe;
> -		volatile struct mlx5_err_cqe *err_cqe;
> +		volatile struct mlx5_error_cqe *err_cqe;
>  	} u = {
>  		.cqe = &(*rxq->cqes)[(rxq->cq_ci - vec) & cqe_mask],
>  	};
> diff --git a/drivers/net/mlx5/mlx5_tx.c b/drivers/net/mlx5/mlx5_tx.c
> index 04f80bb9bd..2f48bbc82e 100644
> --- a/drivers/net/mlx5/mlx5_tx.c
> +++ b/drivers/net/mlx5/mlx5_tx.c
> @@ -55,7 +55,7 @@ tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl)
>
>  /* Return 1 if the error CQE is signed otherwise, sign it and return 0. */
>  static int
> -check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
> +check_err_cqe_seen(volatile struct mlx5_error_cqe *err_cqe)
>  {
>  	static const uint8_t magic[] = "seen";
>  	int ret = 1;
> @@ -83,7 +83,7 @@ check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
>   */
>  static int
>  mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
> -			 volatile struct mlx5_err_cqe *err_cqe)
> +			 volatile struct mlx5_error_cqe *err_cqe)
>  {
>  	if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
>  		const uint16_t wqe_m = ((1 << txq->wqe_n) - 1);
> @@ -107,7 +107,7 @@ mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
>  			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
>  						    (const void *)((uintptr_t)
>  						    txq->cqes),
> -						    sizeof(struct mlx5_cqe) *
> +						    sizeof(struct mlx5_error_cqe) *
>  						    (1 << txq->cqe_n));
>  			mlx5_dump_debug_information(name, "MLX5 Error SQ:",
>  						    (const void *)((uintptr_t)
> @@ -206,7 +206,7 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq,
>  		 */
>  		rte_wmb();
>  		ret = mlx5_tx_error_cqe_handle
> -				(txq, (volatile struct mlx5_err_cqe *)cqe);
> +				(txq, (volatile struct mlx5_error_cqe *)cqe);
>  		if (unlikely(ret < 0)) {
>  			/*
>  			 * Some error occurred on queue error
> --
> 2.18.2
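For reference only (not part of the patch): a minimal standalone sketch that
mirrors the new mlx5_error_cqe layout and checks at compile time that the
error CQE stays exactly one CQE slot wide and that the 32-bit syndrome word
sits at MLX5_ERROR_CQE_SYNDROME_OFFSET for the selected cache line size.
CACHE_LINE_SIZE, struct error_cqe and the main() stub below are illustrative
stand-ins; the driver itself uses RTE_CACHE_LINE_SIZE and the struct added in
mlx5_prm.h above.

/*
 * Standalone sketch (assumptions: plain C11, natural alignment) mirroring
 * the error CQE layout from the patch. Set CACHE_LINE_SIZE to 64 or 128 to
 * model the two platform variants the fix is about.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE_SIZE 128 /* stand-in for RTE_CACHE_LINE_SIZE */

#if (CACHE_LINE_SIZE == 128)
#define ERROR_CQE_SYNDROME_OFFSET 116
#else
#define ERROR_CQE_SYNDROME_OFFSET 52
#endif

struct error_cqe { /* mirrors struct mlx5_error_cqe from mlx5_prm.h */
#if (CACHE_LINE_SIZE == 128)
	uint8_t padding[64]; /* error info fills the second 64B half */
#endif
	uint8_t rsvd0[2];
	uint16_t eth_wqe_id;
	uint8_t rsvd1[16];
	uint16_t ib_stride_index;
	uint8_t rsvd2[10];
	uint32_t srqn;
	uint8_t rsvd3[8];
	uint32_t byte_cnt;
	uint8_t rsvd4[4];
	uint8_t hw_err_synd; /* first byte of the 32-bit syndrome word */
	uint8_t hw_synd_type;
	uint8_t vendor_err_synd;
	uint8_t syndrome;
	uint32_t s_wqe_opcode_qpn;
	uint16_t wqe_counter;
	uint8_t signature;
	uint8_t op_own;
};

/* The error CQE must cover exactly one CQE slot (one cache line here)... */
static_assert(sizeof(struct error_cqe) == CACHE_LINE_SIZE,
	      "error CQE size must match the CQE size");
/* ...and the syndrome word must land at the advertised offset. */
static_assert(offsetof(struct error_cqe, hw_err_synd) ==
	      ERROR_CQE_SYNDROME_OFFSET,
	      "syndrome offset mismatch");

int main(void) { return 0; }

Flipping CACHE_LINE_SIZE to 64 exercises the 52-byte offset branch; both
asserts should hold either way, which is the layout property the error
handling routines rely on after this change.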