The PRM-based Tx path now supports transmitting packets that span an
arbitrary number of buffers.

Multi-segment mbufs are translated into scatter/gather entries, WQEs
that wrap around the end of the send queue are built in a bounce
buffer and copied back into place, and the ownership bit is set only
once a descriptor is fully written.

Signed-off-by: Moti Haimovsky <mo...@mellanox.com>
---
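Notes:

A WQE consists of one control segment plus one data segment per mbuf
segment. Assuming the usual mlx4 PRM segment sizes (16 bytes for both
the control segment and each data segment), a 3-segment packet needs

    wqe_real_size = 16 + 3 * 16 = 64 bytes

i.e. SIZE_TO_TXBBS(64) = 1 basic block of TXBB_SIZE (64) bytes. A WQE
that would wrap past the end of the SQ ring is first built in
txq->bounce_buf and copied back into place by mlx4_bounce_to_desc(),
with the ownership dword of its control segment written last.
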
 drivers/net/mlx4/mlx4_prm.h  |  16 +---
 drivers/net/mlx4/mlx4_rxtx.c | 213 +++++++++++++++++++++++++++++++------------
 drivers/net/mlx4/mlx4_rxtx.h |   3 +-
 drivers/net/mlx4/mlx4_txq.c  |  12 ++-
 4 files changed, 170 insertions(+), 74 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index c5ce33b..8b0248a 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -61,7 +61,7 @@
 #define MLX4_OPCODE_SEND       0x0a
 #define MLX4_EN_BIT_WQE_OWN    0x80000000
 
-#define SIZE_TO_TXBBS(size)     (RTE_ALIGN((size), (TXBB_SIZE)) / (TXBB_SIZE))
+#define SIZE_TO_TXBBS(size)    (RTE_ALIGN((size), (TXBB_SIZE)) / (TXBB_SIZE))
 
 /**
 * Update the HW with the new CQ consumer value.
@@ -148,6 +148,7 @@
 
 /**
 * Fills the ctrl segment of a WQE with info needed for transmitting the packet.
+ * The owner field is filled in later.
  *
  * @param seg
  *   Pointer to the control structure in the WQE.
@@ -161,8 +162,8 @@
 *   Immediate data/invalidation key.
  */
 static inline void
-mlx4_set_ctrl_seg(struct mlx4_wqe_ctrl_seg *seg, uint32_t owner,
-            uint8_t fence_size, uint32_t srcrb_flags, uint32_t imm)
+mlx4_set_ctrl_seg(struct mlx4_wqe_ctrl_seg *seg, uint8_t fence_size,
+                 uint32_t srcrb_flags, uint32_t imm)
 {
        seg->fence_size = fence_size;
        seg->srcrb_flags = rte_cpu_to_be_32(srcrb_flags);
@@ -173,13 +174,6 @@
         * For the IBV_WR_SEND_WITH_INV, it should be htobe32(imm).
         */
        seg->imm = imm;
-       /*
-        * Make sure descriptor is fully written before
-        * setting ownership bit (because HW can start
-        * executing as soon as we do).
-        */
-       rte_wmb();
-       seg->owner_opcode = rte_cpu_to_be_32(owner);
 }
 
 /**
@@ -241,7 +235,7 @@
  *   The number of data-segments the WQE contains.
  *
  * @return
- *   WQE size in bytes.
+ *   The calculated WQE size in bytes.
  */
 static inline int
 mlx4_wqe_calc_real_size(unsigned int count)
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 0720e34..e41ea9e 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -309,6 +309,101 @@
 }
 
 /**
+ * Copy a WQE written in the bounce buffer back to the SQ.
+ * This routine is used when a WQE wraps around the end of the SQ and
+ * therefore needs special attention. Note that the WQE is written
+ * backward to the SQ.
+ *
+ * @param txq
+ *   Pointer to mlx4 Tx queue structure.
+ * @param index
+ *   First SQ TXBB index for this WQE.
+ * @param desc_size
+ *   TXBB-aligned size of the WQE.
+ *
+ * @return
+ *   A pointer to the control segment of this WQE in the SQ.
+ */
+static struct mlx4_wqe_ctrl_seg *
+mlx4_bounce_to_desc(struct txq *txq,
+                   uint32_t index,
+                   unsigned int desc_size)
+{
+       struct mlx4_sq *sq = &txq->msq;
+       uint32_t copy = (sq->txbb_cnt - index) * TXBB_SIZE;
+       int i;
+
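+       /*
+        * The WQE is copied back in reverse dword order: first the tail
+        * that wrapped to the start of the ring, then the head at its
+        * original place, skipping the first dword, which holds
+        * owner_opcode and is written by the caller. The barrier before
+        * the first dword of each TXBB makes the rest of that TXBB
+        * visible before its initial (ownership) dword.
+        */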
+       for (i = desc_size - copy - 4; i >= 0; i -= 4) {
+               if ((i & (TXBB_SIZE - 1)) == 0)
+                       rte_wmb();
+               *((uint32_t *)(sq->buf + i)) =
+                       *((uint32_t *)(txq->bounce_buf + copy + i));
+       }
+       for (i = copy - 4; i >= 4; i -= 4) {
+               if ((i & (TXBB_SIZE - 1)) == 0)
+                       rte_wmb();
+               *((uint32_t *)(sq->buf + index * TXBB_SIZE + i)) =
+                       *((uint32_t *)(txq->bounce_buf + i));
+       }
+       /* Return real descriptor location */
+       return (struct mlx4_wqe_ctrl_seg *)(sq->buf + index * TXBB_SIZE);
+}
+
+/**
+ * Handle address translation of scattered buffers for mlx4_tx_burst().
+ *
+ * @param txq
+ *   Pointer to mlx4 Tx queue structure.
+ * @param[in] buf
+ *   Buffer to process.
+ * @param[out] sges
+ *   Array filled with SGEs on success.
+ * @param segs
+ *   Number of segments in buf.
+ *
+ * @return
+ *   0 on success, -1 on failure.
+ */
+static inline int
+mlx4_tx_sg_virt_to_lkey(struct txq *txq, struct rte_mbuf *buf,
+                       struct ibv_sge *sges, unsigned int segs)
+{
+       unsigned int j;
+
+       /* Register segments as SGEs. */
+       for (j = 0; (j != segs); ++j) {
+               struct ibv_sge *sge = &sges[j];
+               uint32_t lkey;
+
+               /* Retrieve Memory Region key for this memory pool. */
+               lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+               if (unlikely(lkey == (uint32_t)-1)) {
+                       /* MR does not exist. */
+                       DEBUG("%p: unable to get MP <-> MR association",
+                             (void *)txq);
+                       goto stop;
+               }
+               /* Update SGE. */
+               sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
+               if (txq->priv->vf)
+                       rte_prefetch0((volatile void *)
+                                     (uintptr_t)sge->addr);
+               sge->length = buf->data_len;
+               sge->lkey = lkey;
+               buf = buf->next;
+       }
+       return 0;
+stop:
+       return -1;
+}
+
+/**
 * Posts a single work request to a send queue.
  *
  * @param txq
@@ -323,36 +418,53 @@
  */
 static int
 mlx4_post_send(struct txq *txq,
+              struct rte_mbuf *pkt,
               struct ibv_send_wr *wr,
               struct ibv_send_wr **bad_wr)
 {
        struct mlx4_wqe_ctrl_seg *ctrl;
        struct mlx4_wqe_data_seg *dseg;
        struct mlx4_sq *sq = &txq->msq;
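+       /* One SGE per mbuf segment, built on the stack (VLA). */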
+       struct ibv_sge sge[wr->num_sge];
        uint32_t srcrb_flags;
        uint8_t fence_size;
        uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
        uint32_t owner_opcode;
-       int wqe_real_size, nr_txbbs;
+       int wqe_real_size, wqe_size, nr_txbbs, i;
+       bool bounce = false;
 
-       /* for now we support pkts with one buf only */
-       if (wr->num_sge != 1)
+       if (unlikely(mlx4_tx_sg_virt_to_lkey(txq, pkt, sge, wr->num_sge)))
                goto err;
+       wr->sg_list = sge;
+       /* Calculate the needed WQE size for this packet. */
        wqe_real_size = mlx4_wqe_calc_real_size(wr->num_sge);
        if (unlikely(!wqe_real_size))
                goto err;
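+       /* TXBB-aligned size, used for the bounce buffer copy back. */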
+       wqe_size = RTE_ALIGN(wqe_real_size, TXBB_SIZE);
        nr_txbbs = SIZE_TO_TXBBS(wqe_real_size);
        /* Are we too big to handle? */
        if (unlikely(mlx4_wq_overflow(sq, nr_txbbs)))
                goto err;
-       /* Get ctrl and single-data wqe entries */
-       ctrl = mlx4_get_send_wqe(sq, head_idx);
+       /* Get ctrl entry */
+       if (likely(head_idx + nr_txbbs <= sq->txbb_cnt)) {
+               ctrl = mlx4_get_send_wqe(sq, head_idx);
+       } else {
+               /* Handle the case in which the WQE wraps around the end
+                * of the SQ: build it in a side buffer, then copy it back
+                * into the SQ once done.
+                */
+               ctrl = (struct mlx4_wqe_ctrl_seg *)txq->bounce_buf;
+               bounce = true;
+       }
+       /* Get the first data segment entry. */
        dseg = (struct mlx4_wqe_data_seg *)(((char *)ctrl) +
                sizeof(struct mlx4_wqe_ctrl_seg));
-       mlx4_set_data_seg(dseg, wr->sg_list);
-       /* For raw eth, the SOLICIT flag is used
-        * to indicate that no icrc should be calculated
+       /* Fill in data segments, from last to first. */
+       for (i = wr->num_sge - 1; i >= 0; --i)
+               mlx4_set_data_seg(dseg + i, wr->sg_list + i);
+       /* Handle control info.
+        *
+        * For raw Ethernet, the SOLICIT flag is used to indicate
+        * that no ICRC should be calculated.
         */
        srcrb_flags = MLX4_WQE_CTRL_SOLICIT |
                      ((wr->send_flags & IBV_SEND_SIGNALED) ?
@@ -361,7 +473,19 @@
                MLX4_WQE_CTRL_FENCE : 0) | ((wqe_real_size / 16) & 0x3f);
        owner_opcode = MLX4_OPCODE_SEND |
                       ((sq->head & sq->txbb_cnt) ? MLX4_EN_BIT_WQE_OWN : 0);
-       mlx4_set_ctrl_seg(ctrl, owner_opcode, fence_size, srcrb_flags, 0);
+       /* Fill in ctrl info, except for the ownership bit. */
+       mlx4_set_ctrl_seg(ctrl, fence_size, srcrb_flags, 0);
+       /* If we used a bounce buffer then copy the WQE back into the SQ. */
+       if (unlikely(bounce))
+               ctrl = mlx4_bounce_to_desc(txq, head_idx, wqe_size);
+       /*
+        * Make sure descriptor is fully written before
+        * setting ownership bit (because HW can start
+        * executing as soon as we do).
+        */
+       rte_wmb();
+       ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode);
+
        sq->head += nr_txbbs;
        rte_wmb();
        return 0;
@@ -439,62 +563,31 @@
                /* Request Tx completion. */
                if (unlikely(--elts_comp_cd == 0)) {
                        elts_comp_cd = txq->elts_comp_cd_init;
-                       ++elts_comp;
                        send_flags |= IBV_SEND_SIGNALED;
                }
-               if (likely(segs == 1)) {
-                       struct ibv_sge *sge = &elt->sge;
-                       uintptr_t addr;
-                       uint32_t length;
-                       uint32_t lkey;
-
-                       /* Retrieve buffer information. */
-                       addr = rte_pktmbuf_mtod(buf, uintptr_t);
-                       length = buf->data_len;
-                       /* Retrieve memory region key for this memory pool. */
-                       lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
-                       if (unlikely(lkey == (uint32_t)-1)) {
-                               /* MR does not exist. */
-                               DEBUG("%p: unable to get MP <-> MR"
-                                     " association", (void *)txq);
-                               /* Clean up Tx element. */
-                               elt->buf = NULL;
-                               goto stop;
-                       }
-                       if (buf->pkt_len <= txq->max_inline)
-                               send_flags |= IBV_SEND_INLINE;
-                       /* Update element. */
-                       elt->buf = buf;
-                       if (txq->priv->vf)
-                               rte_prefetch0((volatile void *)
-                                             (uintptr_t)addr);
+               if (buf->pkt_len <= txq->max_inline)
+                       send_flags |= IBV_SEND_INLINE;
+               /* Update element. */
+               elt->buf = buf;
+               if (txq->priv->vf)
                        RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-                       sge->addr = addr;
-                       sge->length = length;
-                       sge->lkey = lkey;
-                       sent_size += length;
-                       /* Set up WR. */
-                       wr->sg_list  = sge;
-                       wr->num_sge  = segs;
-                       wr->opcode   = IBV_WR_SEND;
-                       wr->send_flags = send_flags;
-                       wr->next     = NULL;
-                       /* post the pkt for sending */
-                       err = mlx4_post_send(txq, wr, &wr_bad);
-                       if (unlikely(err)) {
-                               if (unlikely(wr_bad->send_flags &
-                                            IBV_SEND_SIGNALED)) {
-                                       elts_comp_cd = 1;
-                                       --elts_comp;
-                               }
-                               elt->buf = NULL;
-                               goto stop;
-                       }
-                       sent_size += length;
-               } else {
-                       err = -1;
+               /* Set up WR. */
+               wr->sg_list  = NULL; /* Handled in mlx4_post_send(). */
+               wr->num_sge  = segs;
+               wr->opcode   = IBV_WR_SEND;
+               wr->send_flags = send_flags;
+               wr->next     = NULL;
+               /* Post the packet for sending. */
+               err = mlx4_post_send(txq, buf, wr, &wr_bad);
+               if (unlikely(err)) {
+                       if (unlikely(wr_bad->send_flags &
+                                    IBV_SEND_SIGNALED))
+                               elts_comp_cd = 1;
+                       elt->buf = NULL;
                        goto stop;
                }
+               ++elts_comp;
+               sent_size += buf->pkt_len;
                elts_head = elts_head_next;
                /* Increment sent bytes counter. */
                txq->stats.obytes += sent_size;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index e442730..7cae7e2 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -139,13 +139,14 @@ struct txq {
        struct txq_elt (*elts)[]; /**< Tx elements. */
        unsigned int elts_head; /**< Current index in (*elts)[]. */
        unsigned int elts_tail; /**< First element awaiting completion. */
-       unsigned int elts_comp; /**< Number of completion requests. */
+       unsigned int elts_comp; /**< Number of packets awaiting completion. */
        unsigned int elts_comp_cd; /**< Countdown for next completion. */
        unsigned int elts_comp_cd_init; /**< Initial value for countdown. */
        struct mlx4_txq_stats stats; /**< Tx queue counters. */
        unsigned int socket; /**< CPU socket ID for allocations. */
        struct mlx4_sq msq; /**< Info for directly manipulating the SQ. */
        struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. */
+       char *bounce_buf; /**< Side buffer for WQEs that wrap around the SQ. */
 };
 
 /* mlx4_rxq.c */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 1273738..6f6ea9c 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -83,8 +83,14 @@
                rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
        int ret = 0;
 
-       if (elts == NULL) {
-               ERROR("%p: can't allocate packets array", (void *)txq);
+       /*
+        * Allocate the bounce buffer: it holds at most one WQE at a
+        * time, and a WQE never exceeds MAX_WQE_SIZE bytes.
+        */
+       txq->bounce_buf = rte_zmalloc_socket("TXQ", MAX_WQE_SIZE,
+                                            RTE_CACHE_LINE_MIN_SIZE,
+                                            txq->socket);
+
+       if ((elts == NULL) || (txq->bounce_buf == NULL)) {
+               ERROR("%p: can't allocate TXQ memory", (void *)txq);
                ret = ENOMEM;
                goto error;
        }
@@ -110,6 +116,8 @@
        assert(ret == 0);
        return 0;
 error:
+       if (txq->bounce_buf != NULL)
+               rte_free(txq->bounce_buf);
        if (elts != NULL)
                rte_free(elts);
        DEBUG("%p: failed, freed everything", (void *)txq);
-- 
1.8.3.1
