PMD now supports transmitting packets spanning an arbitrary number of buffers.
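
Illustration only, not part of the diff below: a minimal standalone sketch of the descriptor write ordering this multi-buffer Tx path relies on. As in mlx4_post_send() in this patch, data segments are filled from last to first and the ownership/opcode word is written only after a write barrier. All types and names here (demo_*) are simplified stand-ins, not the mlx4 PRM structures.

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define DEMO_MAX_SEGS 4

	struct demo_data_seg {		/* stand-in for a WQE data segment */
		uint32_t byte_count;
		uint32_t lkey;
		uint64_t addr;
	};

	struct demo_ctrl_seg {		/* stand-in for a WQE control segment */
		uint32_t owner_opcode;
		uint32_t flags;
	};

	struct demo_wqe {
		struct demo_ctrl_seg ctrl;
		struct demo_data_seg data[DEMO_MAX_SEGS];
	};

	/* In real code this would be rte_wmb(); a compiler barrier suffices here. */
	static inline void demo_wmb(void) { __asm__ volatile ("" ::: "memory"); }

	static void
	demo_post(struct demo_wqe *wqe, const uint64_t *addrs,
		  const uint32_t *lens, int nseg, uint32_t owner_bit)
	{
		int i;

		/* Fill data segments from last to first. */
		for (i = nseg - 1; i >= 0; --i) {
			wqe->data[i].addr = addrs[i];
			wqe->data[i].lkey = 0;	/* would come from the MR lookup */
			wqe->data[i].byte_count = lens[i];
		}
		wqe->ctrl.flags = 0;
		/* Descriptor must be fully visible before HW may own it. */
		demo_wmb();
		wqe->ctrl.owner_opcode = owner_bit | 0x0a /* SEND opcode */;
	}

	int main(void)
	{
		struct demo_wqe wqe;
		uint64_t addrs[2] = { 0x1000, 0x2000 };
		uint32_t lens[2] = { 64, 128 };

		memset(&wqe, 0, sizeof(wqe));
		demo_post(&wqe, addrs, lens, 2, 0x80000000u);
		printf("owner_opcode=0x%08" PRIx32 ", %" PRIu32 "+%" PRIu32 " bytes\n",
		       wqe.ctrl.owner_opcode, lens[0], lens[1]);
		return 0;
	}

Deferring the ownership word in this way is also what allows the patch to build a wrapping WQE in a bounce buffer and copy it back to the SQ before handing the descriptor to hardware.
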
Signed-off-by: Moti Haimovsky <mo...@mellanox.com>
---
 drivers/net/mlx4/mlx4_prm.h  |  16 +---
 drivers/net/mlx4/mlx4_rxtx.c | 213 +++++++++++++++++++++++++++++++------------
 drivers/net/mlx4/mlx4_rxtx.h |   3 +-
 drivers/net/mlx4/mlx4_txq.c  |  12 ++-
 4 files changed, 170 insertions(+), 74 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index c5ce33b..8b0248a 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -61,7 +61,7 @@
 #define MLX4_OPCODE_SEND	0x0a
 #define MLX4_EN_BIT_WQE_OWN	0x80000000
 
-#define SIZE_TO_TXBBS(size)	(RTE_ALIGN((size), (TXBB_SIZE)) / (TXBB_SIZE))
+#define SIZE_TO_TXBBS(size)	(RTE_ALIGN((size), (TXBB_SIZE)) / (TXBB_SIZE))
 
 /**
  * Update the HW with the new CQ consumer value.
@@ -148,6 +148,7 @@
 
 /**
  * Fills the ctrl segment of a WQE with info needed for transmitting the packet.
+ * The owner field is filled in later.
  *
  * @param seg
  *   Pointer to the control structure in the WQE.
@@ -161,8 +162,8 @@
  *   Immediate data/Invalidation key..
  */
 static inline void
-mlx4_set_ctrl_seg(struct mlx4_wqe_ctrl_seg *seg, uint32_t owner,
-		  uint8_t fence_size, uint32_t srcrb_flags, uint32_t imm)
+mlx4_set_ctrl_seg(struct mlx4_wqe_ctrl_seg *seg, uint8_t fence_size,
+		  uint32_t srcrb_flags, uint32_t imm)
 {
 	seg->fence_size = fence_size;
 	seg->srcrb_flags = rte_cpu_to_be_32(srcrb_flags);
@@ -173,13 +174,6 @@
 	 * For the IBV_WR_SEND_WITH_INV, it should be htobe32(imm).
 	 */
 	seg->imm = imm;
-	/*
-	 * Make sure descriptor is fully written before
-	 * setting ownership bit (because HW can start
-	 * executing as soon as we do).
-	 */
-	rte_wmb();
-	seg->owner_opcode = rte_cpu_to_be_32(owner);
 }
 
 /**
@@ -241,7 +235,7 @@
  *   The number of data-segments the WQE contains.
  *
  * @return
- *   WQE size in bytes.
+ *   The calculated WQE size in bytes.
  */
 static inline int
 mlx4_wqe_calc_real_size(unsigned int count)
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 0720e34..e41ea9e 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -309,6 +309,101 @@
 }
 
 /**
+ * Copy a WQE written in the bounce buffer back to the SQ.
+ * Routine is used when a WQE wraps around the SQ and therefore needs
+ * special attention. Note that the WQE is written backwards to the SQ.
+ *
+ * @param txq
+ *   Pointer to mlx4 Tx queue structure.
+ * @param index
+ *   First SQ TXBB index for this WQE.
+ * @param desc_size
+ *   TXBB-aligned size of the WQE.
+ *
+ * @return
+ *   A pointer to the control segment of this WQE in the SQ.
+ */
+static struct mlx4_wqe_ctrl_seg
+*mlx4_bounce_to_desc(struct txq *txq,
+		     uint32_t index,
+		     unsigned int desc_size)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	uint32_t copy = (sq->txbb_cnt - index) * TXBB_SIZE;
+	int i;
+
+	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
+		if ((i & (TXBB_SIZE - 1)) == 0)
+			rte_wmb();
+		*((uint32_t *)(sq->buf + i)) =
+			*((uint32_t *)(txq->bounce_buf + copy + i));
+	}
+	for (i = copy - 4; i >= 4; i -= 4) {
+		if ((i & (TXBB_SIZE - 1)) == 0)
+			rte_wmb();
+		*((uint32_t *)(sq->buf + index * TXBB_SIZE + i)) =
+			*((uint32_t *)(txq->bounce_buf + i));
+	}
+	/* Return real descriptor location */
+	return (struct mlx4_wqe_ctrl_seg *)(sq->buf + index * TXBB_SIZE);
+}
+
+/**
+ * Handle address translation of scattered buffers for mlx4_tx_burst().
+ *
+ * @param txq
+ *   TX queue structure.
+ * @param[in] buf
+ *   Buffer (mbuf chain) to process.
+ * @param[out] sges
+ *   Array filled with SGEs on success.
+ * @param segs
+ *   Number of segments in buf.
+ *
+ * @return
+ *   0 on success, -1 on failure.
+ */
+static inline int
+mlx4_tx_sg_virt_to_lkey(struct txq *txq, struct rte_mbuf *buf,
+			struct ibv_sge *sges, unsigned int segs)
+{
+	unsigned int j;
+
+	/* Register segments as SGEs. */
+	for (j = 0; (j != segs); ++j) {
+		struct ibv_sge *sge = &sges[j];
+		uint32_t lkey;
+
+		/* Retrieve Memory Region key for this memory pool. */
+		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+		if (unlikely(lkey == (uint32_t)-1)) {
+			/* MR does not exist. */
+			DEBUG("%p: unable to get MP <-> MR association",
+			      (void *)txq);
+			goto stop;
+		}
+		/* Update SGE. */
+		sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
+		if (txq->priv->vf)
+			rte_prefetch0((volatile void *)
+				      (uintptr_t)sge->addr);
+		sge->length = buf->data_len;
+		sge->lkey = lkey;
+		buf = buf->next;
+	}
+	return 0;
+stop:
+	return -1;
+}
+
+
+/**
  * Posts a single work requests to a send queue.
  *
  * @param txq
@@ -323,36 +418,53 @@
  */
 static int
 mlx4_post_send(struct txq *txq,
+	       struct rte_mbuf *pkt,
 	       struct ibv_send_wr *wr,
 	       struct ibv_send_wr **bad_wr)
 {
 	struct mlx4_wqe_ctrl_seg *ctrl;
 	struct mlx4_wqe_data_seg *dseg;
 	struct mlx4_sq *sq = &txq->msq;
+	struct ibv_sge sge[wr->num_sge];
 	uint32_t srcrb_flags;
 	uint8_t fence_size;
 	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
 	uint32_t owner_opcode;
-	int wqe_real_size, nr_txbbs;
+	int wqe_real_size, wqe_size, nr_txbbs, i;
+	bool bounce = FALSE;
 
-	/* for now we support pkts with one buf only */
-	if (wr->num_sge != 1)
+	if (unlikely(mlx4_tx_sg_virt_to_lkey(txq, pkt, sge, wr->num_sge)))
 		goto err;
+	wr->sg_list = sge;
 	/* Calc the needed wqe size for this packet */
 	wqe_real_size = mlx4_wqe_calc_real_size(wr->num_sge);
 	if (unlikely(!wqe_real_size))
 		goto err;
+	wqe_size = RTE_ALIGN(wqe_real_size, TXBB_SIZE);
 	nr_txbbs = SIZE_TO_TXBBS(wqe_real_size);
 	/* Are we too big to handle ? */
 	if (unlikely(mlx4_wq_overflow(sq, nr_txbbs)))
 		goto err;
-	/* Get ctrl and single-data wqe entries */
-	ctrl = mlx4_get_send_wqe(sq, head_idx);
+	/* Get ctrl entry */
+	if (likely(head_idx + nr_txbbs <= sq->txbb_cnt)) {
+		ctrl = mlx4_get_send_wqe(sq, head_idx);
+	} else {
+		/* Handle the case where the wqe wraps around the SQ by
+		 * working on a side buffer and copying it back when done
+		 */
+		ctrl = (struct mlx4_wqe_ctrl_seg *)txq->bounce_buf;
+		bounce = TRUE;
+	}
+	/* Get data-seg entry */
 	dseg = (struct mlx4_wqe_data_seg *)(((char *)ctrl) +
 		sizeof(struct mlx4_wqe_ctrl_seg));
-	mlx4_set_data_seg(dseg, wr->sg_list);
-	/* For raw eth, the SOLICIT flag is used
-	 * to indicate that no icrc should be calculated
+	/* Fill in data segments from last to first */
+	for (i = wr->num_sge - 1; i >= 0; --i)
+		mlx4_set_data_seg(dseg + i, wr->sg_list + i);
+	/* Handle control info
+	 *
+	 * For raw eth, the SOLICIT flag is used to indicate that
+	 * no icrc should be calculated
 	 */
 	srcrb_flags = MLX4_WQE_CTRL_SOLICIT |
 		      ((wr->send_flags & IBV_SEND_SIGNALED) ?
@@ -361,7 +473,19 @@
 		      MLX4_WQE_CTRL_FENCE : 0) | ((wqe_real_size / 16) & 0x3f);
 	owner_opcode = MLX4_OPCODE_SEND | ((sq->head & sq->txbb_cnt) ?
 				MLX4_EN_BIT_WQE_OWN : 0);
-	mlx4_set_ctrl_seg(ctrl, owner_opcode, fence_size, srcrb_flags, 0);
+	/* Fill in ctrl info, except the ownership bit */
+	mlx4_set_ctrl_seg(ctrl, fence_size, srcrb_flags, 0);
+	/* If we used a bounce buffer then copy the wqe back into the sq */
+	if (unlikely(bounce))
+		ctrl = mlx4_bounce_to_desc(txq, head_idx, wqe_size);
+	/*
+	 * Make sure descriptor is fully written before
+	 * setting ownership bit (because HW can start
+	 * executing as soon as we do).
+	 */
+	rte_wmb();
+	ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode);
+
+	sq->head += nr_txbbs;
 	rte_wmb();
 	return 0;
@@ -439,62 +563,31 @@
 		/* Request Tx completion. */
 		if (unlikely(--elts_comp_cd == 0)) {
 			elts_comp_cd = txq->elts_comp_cd_init;
-			++elts_comp;
 			send_flags |= IBV_SEND_SIGNALED;
 		}
-		if (likely(segs == 1)) {
-			struct ibv_sge *sge = &elt->sge;
-			uintptr_t addr;
-			uint32_t length;
-			uint32_t lkey;
-
-			/* Retrieve buffer information. */
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			length = buf->data_len;
-			/* Retrieve memory region key for this memory pool. */
-			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
-			if (unlikely(lkey == (uint32_t)-1)) {
-				/* MR does not exist. */
-				DEBUG("%p: unable to get MP <-> MR"
-				      " association", (void *)txq);
-				/* Clean up Tx element. */
-				elt->buf = NULL;
-				goto stop;
-			}
-			if (buf->pkt_len <= txq->max_inline)
-				send_flags |= IBV_SEND_INLINE;
-			/* Update element. */
-			elt->buf = buf;
-			if (txq->priv->vf)
-				rte_prefetch0((volatile void *)
-					      (uintptr_t)addr);
+		if (buf->pkt_len <= txq->max_inline)
+			send_flags |= IBV_SEND_INLINE;
+		/* Update element. */
+		elt->buf = buf;
+		if (txq->priv->vf)
 			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-			sge->addr = addr;
-			sge->length = length;
-			sge->lkey = lkey;
-			sent_size += length;
-			/* Set up WR. */
-			wr->sg_list = sge;
-			wr->num_sge = segs;
-			wr->opcode = IBV_WR_SEND;
-			wr->send_flags = send_flags;
-			wr->next = NULL;
-			/* post the pkt for sending */
-			err = mlx4_post_send(txq, wr, &wr_bad);
-			if (unlikely(err)) {
-				if (unlikely(wr_bad->send_flags &
-					     IBV_SEND_SIGNALED)) {
-					elts_comp_cd = 1;
-					--elts_comp;
-				}
-				elt->buf = NULL;
-				goto stop;
-			}
-			sent_size += length;
-		} else {
-			err = -1;
+		/* Set up WR. */
+		wr->sg_list = NULL; /* handled in post_send */
+		wr->num_sge = segs;
+		wr->opcode = IBV_WR_SEND;
+		wr->send_flags = send_flags;
+		wr->next = NULL;
+		/* post the pkt for sending */
+		err = mlx4_post_send(txq, buf, wr, &wr_bad);
+		if (unlikely(err)) {
+			if (unlikely(wr_bad->send_flags &
+				     IBV_SEND_SIGNALED))
+				elts_comp_cd = 1;
+			elt->buf = NULL;
 			goto stop;
 		}
+		++elts_comp;
+		sent_size += buf->pkt_len;
 		elts_head = elts_head_next;
 		/* Increment sent bytes counter. */
 		txq->stats.obytes += sent_size;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index e442730..7cae7e2 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -139,13 +139,14 @@ struct txq {
 	struct txq_elt (*elts)[]; /**< Tx elements. */
 	unsigned int elts_head; /**< Current index in (*elts)[]. */
 	unsigned int elts_tail; /**< First element awaiting completion. */
-	unsigned int elts_comp; /**< Number of completion requests. */
+	unsigned int elts_comp; /**< Number of pkts waiting for completion. */
 	unsigned int elts_comp_cd; /**< Countdown for next completion. */
 	unsigned int elts_comp_cd_init; /**< Initial value for countdown. */
 	struct mlx4_txq_stats stats; /**< Tx queue counters. */
 	unsigned int socket; /**< CPU socket ID for allocations. */
 	struct mlx4_sq msq; /**< Info for directly manipulating the SQ. */
 	struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */
+	char *bounce_buf; /**< Side memory to be used when the WQE wraps around. */
 };
 
 /* mlx4_rxq.c */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 1273738..6f6ea9c 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -83,8 +83,14 @@
 		rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
 	int ret = 0;
 
-	if (elts == NULL) {
-		ERROR("%p: can't allocate packets array", (void *)txq);
+	/* Allocate bounce buffer memory */
+	txq->bounce_buf = (char *)rte_zmalloc_socket("TXQ",
+						     MAX_WQE_SIZE,
+						     RTE_CACHE_LINE_MIN_SIZE,
+						     txq->socket);
+
+	if ((elts == NULL) || (txq->bounce_buf == NULL)) {
+		ERROR("%p: can't allocate TXQ memory", (void *)txq);
 		ret = ENOMEM;
 		goto error;
 	}
@@ -110,6 +116,8 @@
 	assert(ret == 0);
 	return 0;
 error:
+	if (txq->bounce_buf != NULL)
+		rte_free(txq->bounce_buf);
 	if (elts != NULL)
 		rte_free(elts);
 	DEBUG("%p: failed, freed everything", (void *)txq);
-- 
1.8.3.1