Signed-off-by: Nelio Laranjeiro <nelio.laranje...@6wind.com>
---
 drivers/net/mlx5/mlx5.c      |   3 +
 drivers/net/mlx5/mlx5.h      |   2 +-
 drivers/net/mlx5/mlx5_flow.c |  97 +++-----
 drivers/net/mlx5/mlx5_rxq.c  | 564 ++++++++++++++++++++++++++-----------------
 drivers/net/mlx5/mlx5_rxtx.h |  26 +-
 drivers/net/mlx5/mlx5_vlan.c |   2 +-
 6 files changed, 401 insertions(+), 293 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 0d8ca52..c158d8e 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -200,6 +200,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
        }
        if (priv->reta_idx != NULL)
                rte_free(priv->reta_idx);
+       i = mlx5_priv_rxq_ibv_verify(priv);
+       if (i)
+               WARN("%p: some Verbs Rx queue still remain", (void*)priv);
        i = priv_flow_verify(priv);
        if (i)
                WARN("%p: some flows still remain", (void*)priv);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 1ae5f59..228fd34 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -146,6 +146,7 @@ struct priv {
        struct rte_flow_drop *flow_drop_queue; /* Flow drop queue. */
        TAILQ_HEAD(mlx5_flows, rte_flow) flows; /* RTE Flow rules. */
        LIST_HEAD(mr, mlx5_mr) mr; /* Memory region. */
+       LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
        uint32_t link_speed_capa; /* Link speed capabilities. */
        struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */
        rte_spinlock_t lock; /* Lock for control functions. */
@@ -287,7 +288,6 @@ int mlx5_flow_flush(struct rte_eth_dev *, struct rte_flow_error *);
 int mlx5_flow_isolate(struct rte_eth_dev *, int, struct rte_flow_error *);
 int priv_flow_start(struct priv *);
 void priv_flow_stop(struct priv *);
-int priv_flow_rxq_in_use(struct priv *, struct mlx5_rxq_data *);
 int priv_flow_verify(struct priv *);
 
 /* mlx5_mr.c */
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index bcbb984..9ed8d05 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -95,11 +95,11 @@ struct rte_flow {
        struct ibv_exp_flow *ibv_flow; /**< Verbs flow. */
        struct ibv_exp_wq *wq; /**< Verbs work queue. */
        struct ibv_cq *cq; /**< Verbs completion queue. */
-       uint16_t rxqs_n; /**< Number of queues in this flow, 0 if drop queue. */
        uint32_t mark:1; /**< Set if the flow is marked. */
        uint32_t drop:1; /**< Drop queue. */
        uint64_t hash_fields; /**< Fields that participate in the hash. */
-       struct mlx5_rxq_data *rxqs[]; /**< Pointer to the queues array. */
+       uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< List of queues. */
+       uint16_t queues_n; /**< Number of queues in the list. */
 };
 
 /** Static initializer for items. */
@@ -1097,23 +1097,21 @@ priv_flow_create_action_queue(struct priv *priv,
        assert(priv->pd);
        assert(priv->ctx);
        assert(!flow->actions.drop);
-       rte_flow = rte_calloc(__func__, 1, sizeof(*rte_flow) +
-                             sizeof(*rte_flow->rxqs) * flow->actions.queues_n,
-                             0);
+       rte_flow = rte_calloc(__func__, 1, sizeof(*rte_flow), 0);
        if (!rte_flow) {
                rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
                                   NULL, "cannot allocate flow memory");
                return NULL;
        }
        for (i = 0; i < flow->actions.queues_n; ++i) {
-               struct mlx5_rxq_ctrl *rxq;
+               struct mlx5_rxq_ibv *rxq =
+                       mlx5_priv_rxq_ibv_get(priv, flow->actions.queues[i]);
 
-               rxq = container_of((*priv->rxqs)[flow->actions.queues[i]],
-                                  struct mlx5_rxq_ctrl, rxq);
                wqs[i] = rxq->wq;
-               rte_flow->rxqs[i] = &rxq->rxq;
-               ++rte_flow->rxqs_n;
-               rxq->rxq.mark |= flow->actions.mark;
+               rte_flow->queues[i] = flow->actions.queues[i];
+               ++rte_flow->queues_n;
+               (*priv->rxqs)[flow->actions.queues[i]]->mark |=
+                       flow->actions.mark;
        }
        /* finalise indirection table. */
        for (j = 0; i < wqs_n; ++i, ++j) {
@@ -1294,6 +1292,8 @@ static void
 priv_flow_destroy(struct priv *priv,
                  struct rte_flow *flow)
 {
+       unsigned int i;
+
        TAILQ_REMOVE(&priv->flows, flow, next);
        if (flow->ibv_flow)
                claim_zero(ibv_exp_destroy_flow(flow->ibv_flow));
@@ -1303,37 +1303,33 @@ priv_flow_destroy(struct priv *priv,
                claim_zero(ibv_destroy_qp(flow->qp));
        if (flow->ind_table)
                claim_zero(ibv_exp_destroy_rwq_ind_table(flow->ind_table));
-       if (flow->mark) {
+       for (i = 0; i != flow->queues_n; ++i) {
                struct rte_flow *tmp;
-               struct mlx5_rxq_data *rxq;
-               uint32_t mark_n = 0;
-               uint32_t queue_n;
+               struct mlx5_rxq_data *rxq = (*priv->rxqs)[flow->queues[i]];
+               struct mlx5_rxq_ctrl *rxq_ctrl =
+                       container_of(rxq, struct mlx5_rxq_ctrl, rxq);
 
                /*
                 * To remove the mark from the queue, the queue must not be
                 * present in any other marked flow (RSS or not).
                 */
-               for (queue_n = 0; queue_n < flow->rxqs_n; ++queue_n) {
-                       rxq = flow->rxqs[queue_n];
-                       for (tmp = TAILQ_FIRST(&priv->flows);
-                            tmp;
-                            tmp = TAILQ_NEXT(tmp, next)) {
-                               uint32_t tqueue_n;
+               if (flow->mark) {
+                       int mark = 0;
+
+                       TAILQ_FOREACH(tmp, &priv->flows, next) {
+                               unsigned int j;
 
                                if (tmp->drop)
                                        continue;
-                               for (tqueue_n = 0;
-                                    tqueue_n < tmp->rxqs_n;
-                                    ++tqueue_n) {
-                                       struct mlx5_rxq_data *trxq;
-
-                                       trxq = tmp->rxqs[tqueue_n];
-                                       if (rxq == trxq)
-                                               ++mark_n;
-                               }
+                               if (!tmp->mark)
+                                       continue;
+                               for (j = 0; (j != tmp->queues_n) && !mark; j++)
+                                       if (tmp->queues[j] == flow->queues[i])
+                                               mark = 1;
                        }
-                       rxq->mark = !!mark_n;
+                       rxq->mark = mark;
                }
+               mlx5_priv_rxq_ibv_release(priv, rxq_ctrl->ibv);
        }
 free:
        rte_free(flow->ibv_attr);
@@ -1532,8 +1528,8 @@ priv_flow_stop(struct priv *priv)
                if (flow->mark) {
                        unsigned int n;
 
-                       for (n = 0; n < flow->rxqs_n; ++n)
-                               flow->rxqs[n]->mark = 0;
+                       for (n = 0; n < flow->queues_n; ++n)
+                               (*priv->rxqs)[flow->queues[n]]->mark = 0;
                }
                DEBUG("Flow %p removed", (void *)flow);
        }
@@ -1575,39 +1571,8 @@ priv_flow_start(struct priv *priv)
                if (flow->mark) {
                        unsigned int n;
 
-                       for (n = 0; n < flow->rxqs_n; ++n)
-                               flow->rxqs[n]->mark = 1;
-               }
-       }
-       return 0;
-}
-
-/**
- * Verify if the Rx queue is used in a flow.
- *
- * @param priv
- *   Pointer to private structure.
- * @param rxq
- *   Pointer to the queue to search.
- *
- * @return
- *   Nonzero if the queue is used by a flow.
- */
-int
-priv_flow_rxq_in_use(struct priv *priv, struct mlx5_rxq_data *rxq)
-{
-       struct rte_flow *flow;
-
-       for (flow = TAILQ_FIRST(&priv->flows);
-            flow;
-            flow = TAILQ_NEXT(flow, next)) {
-               unsigned int n;
-
-               if (flow->drop)
-                       continue;
-               for (n = 0; n < flow->rxqs_n; ++n) {
-                       if (flow->rxqs[n] == rxq)
-                               return 1;
+                       for (n = 0; n < flow->queues_n; ++n)
+                               (*priv->rxqs)[flow->queues[n]]->mark = 1;
                }
        }
        return 0;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 80cfd96..1663734 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -378,7 +378,7 @@ priv_create_hash_rxqs(struct priv *priv)
 
                rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
                                        struct mlx5_rxq_ctrl, rxq);
-               wqs[i] = rxq_ctrl->wq;
+               wqs[i] = rxq_ctrl->ibv->wq;
        }
        /* Get number of hash RX queues to configure. */
        for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
@@ -647,8 +647,6 @@ rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl, unsigned int elts_n)
        /* Iterate on segments. */
        for (i = 0; (i != elts_n); ++i) {
                struct rte_mbuf *buf;
-               volatile struct mlx5_wqe_data_seg *scat =
-                       &(*rxq_ctrl->rxq.wqes)[i];
 
                buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
                if (buf == NULL) {
@@ -669,13 +667,6 @@ rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl, unsigned int elts_n)
                DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
                PKT_LEN(buf) = DATA_LEN(buf);
                NB_SEGS(buf) = 1;
-               /* scat->addr must be able to store a pointer. */
-               assert(sizeof(scat->addr) >= sizeof(uintptr_t));
-               *scat = (struct mlx5_wqe_data_seg){
-                       .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
-                       .byte_count = htonl(DATA_LEN(buf)),
-                       .lkey = rxq_ctrl->mr->lkey,
-               };
                (*rxq_ctrl->rxq.elts)[i] = buf;
        }
        if (rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
@@ -761,65 +752,12 @@ mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
        DEBUG("cleaning up %p", (void *)rxq_ctrl);
        rxq_free_elts(rxq_ctrl);
-       if (rxq_ctrl->wq != NULL)
-               claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
-       if (rxq_ctrl->cq != NULL)
-               claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
-       if (rxq_ctrl->channel != NULL)
-               claim_zero(ibv_destroy_comp_channel(rxq_ctrl->channel));
-       if (rxq_ctrl->mr != NULL)
-               priv_mr_release(rxq_ctrl->priv, rxq_ctrl->mr);
+       if (rxq_ctrl->ibv)
+               mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv);
        memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
 }
 
 /**
- * Initialize RX queue.
- *
- * @param tmpl
- *   Pointer to RX queue control template.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static inline int
-rxq_setup(struct mlx5_rxq_ctrl *tmpl)
-{
-       struct ibv_cq *ibcq = tmpl->cq;
-       struct ibv_mlx5_cq_info cq_info;
-       struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
-       const uint16_t desc_n =
-               (1 << tmpl->rxq.elts_n) + tmpl->priv->rx_vec_en *
-               MLX5_VPMD_DESCS_PER_LOOP;
-       struct rte_mbuf *(*elts)[desc_n] =
-               rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
-       if (ibv_mlx5_exp_get_cq_info(ibcq, &cq_info)) {
-               ERROR("Unable to query CQ info. check your OFED.");
-               return ENOTSUP;
-       }
-       if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
-               ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
-                     "it should be set to %u", RTE_CACHE_LINE_SIZE);
-               return EINVAL;
-       }
-       if (elts == NULL)
-               return ENOMEM;
-       tmpl->rxq.rq_db = rwq->rq.db;
-       tmpl->rxq.cqe_n = log2above(cq_info.cqe_cnt);
-       tmpl->rxq.cq_ci = 0;
-       tmpl->rxq.rq_ci = 0;
-       tmpl->rxq.rq_pi = 0;
-       tmpl->rxq.cq_db = cq_info.dbrec;
-       tmpl->rxq.wqes =
-               (volatile struct mlx5_wqe_data_seg (*)[])
-               (uintptr_t)rwq->rq.buff;
-       tmpl->rxq.cqes =
-               (volatile struct mlx5_cqe (*)[])
-               (uintptr_t)cq_info.buf;
-       tmpl->rxq.elts = elts;
-       return 0;
-}
-
-/**
  * Configure a RX queue.
  *
  * @param dev
@@ -848,25 +786,24 @@ mlx5_rxq_ctrl_setup(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
                .priv = priv,
                .socket = socket,
                .rxq = {
+                       .elts = rte_calloc_socket("RXQ", 1,
+                                                 desc *
+                                                 sizeof(struct rte_mbuf *), 0,
+                                                 socket),
                        .elts_n = log2above(desc),
                        .mp = mp,
                        .rss_hash = priv->rxqs_n > 1,
                },
        };
-       struct ibv_exp_wq_attr mod;
-       union {
-               struct ibv_exp_cq_init_attr cq;
-               struct ibv_exp_wq_init_attr wq;
-               struct ibv_exp_cq_attr cq_attr;
-       } attr;
        unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
-       unsigned int cqe_n = desc - 1;
        const uint16_t desc_n =
                desc + priv->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
        struct rte_mbuf *(*elts)[desc_n] = NULL;
        int ret = 0;
 
        (void)conf; /* Thresholds configuration (ignored). */
+       if (dev->data->dev_conf.intr_conf.rxq)
+               tmpl.memory_channel = 1;
        /* Enable scattered packets support for this queue if necessary. */
        assert(mb_len >= RTE_PKTMBUF_HEADROOM);
        if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
@@ -919,78 +856,13 @@ mlx5_rxq_ctrl_setup(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
        if (priv->hw_csum_l2tun)
                tmpl.rxq.csum_l2tun =
                        !!dev->data->dev_conf.rxmode.hw_ip_checksum;
-       /* Use the entire RX mempool as the memory region. */
-       tmpl.mr = priv_mr_get(priv, mp);
-       if (tmpl.mr == NULL) {
-               tmpl.mr = priv_mr_new(priv, mp);
-               if (tmpl.mr == NULL) {
-                       ret = EINVAL;
-                       ERROR("%p: MR creation failure: %s",
-                             (void *)dev, strerror(ret));
-                       goto error;
-               }
-       }
-       if (dev->data->dev_conf.intr_conf.rxq) {
-               tmpl.channel = ibv_create_comp_channel(priv->ctx);
-               if (tmpl.channel == NULL) {
-                       ret = ENOMEM;
-                       ERROR("%p: Rx interrupt completion channel creation"
-                             " failure: %s",
-                             (void *)dev, strerror(ret));
-                       goto error;
-               }
-       }
-       attr.cq = (struct ibv_exp_cq_init_attr){
-               .comp_mask = 0,
-       };
-       if (priv->cqe_comp) {
-               attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
-               attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
-               /*
-                * For vectorized Rx, it must not be doubled in order to
-                * make cq_ci and rq_ci aligned.
-                */
-               if (rxq_check_vec_support(&tmpl.rxq) < 0)
-                       cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
-       }
-       tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, tmpl.channel, 0,
-                                   &attr.cq);
-       if (tmpl.cq == NULL) {
-               ret = ENOMEM;
-               ERROR("%p: CQ creation failure: %s",
-                     (void *)dev, strerror(ret));
-               goto error;
-       }
-       DEBUG("priv->device_attr.max_qp_wr is %d",
-             priv->device_attr.max_qp_wr);
-       DEBUG("priv->device_attr.max_sge is %d",
-             priv->device_attr.max_sge);
        /* Configure VLAN stripping. */
        tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
                               !!dev->data->dev_conf.rxmode.hw_vlan_strip);
-       attr.wq = (struct ibv_exp_wq_init_attr){
-               .wq_context = NULL, /* Could be useful in the future. */
-               .wq_type = IBV_EXP_WQT_RQ,
-               /* Max number of outstanding WRs. */
-               .max_recv_wr = desc >> tmpl.rxq.sges_n,
-               /* Max number of scatter/gather elements in a WR. */
-               .max_recv_sge = 1 << tmpl.rxq.sges_n,
-               .pd = priv->pd,
-               .cq = tmpl.cq,
-               .comp_mask =
-                       IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
-                       0,
-               .vlan_offloads = (tmpl.rxq.vlan_strip ?
-                                 IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
-                                 0),
-       };
        /* By default, FCS (CRC) is stripped by hardware. */
        if (dev->data->dev_conf.rxmode.hw_strip_crc) {
                tmpl.rxq.crc_present = 0;
        } else if (priv->hw_fcs_strip) {
-               /* Ask HW/Verbs to leave CRC in place when supported. */
-               attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
-               attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
                tmpl.rxq.crc_present = 1;
        } else {
                WARN("%p: CRC stripping has been disabled but will still"
@@ -1004,59 +876,9 @@ mlx5_rxq_ctrl_setup(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
              (void *)dev,
              tmpl.rxq.crc_present ? "disabled" : "enabled",
              tmpl.rxq.crc_present << 2);
-       if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
-               ; /* Nothing else to do. */
-       else if (priv->hw_padding) {
-               INFO("%p: enabling packet padding on queue %p",
-                    (void *)dev, (void *)rxq_ctrl);
-               attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
-               attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
-       } else
-               WARN("%p: packet padding has been requested but is not"
-                    " supported, make sure MLNX_OFED and firmware are"
-                    " up to date",
-                    (void *)dev);
-
-       tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
-       if (tmpl.wq == NULL) {
-               ret = (errno ? errno : EINVAL);
-               ERROR("%p: WQ creation failure: %s",
-                     (void *)dev, strerror(ret));
-               goto error;
-       }
-       /*
-        * Make sure number of WRs*SGEs match expectations since a queue
-        * cannot allocate more than "desc" buffers.
-        */
-       if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
-           ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
-               ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
-                     (void *)dev,
-                     (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
-                     attr.wq.max_recv_wr, attr.wq.max_recv_sge);
-               ret = EINVAL;
-               goto error;
-       }
        /* Save port ID. */
        tmpl.rxq.port_id = dev->data->port_id;
        DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
-       /* Change queue state to ready. */
-       mod = (struct ibv_exp_wq_attr){
-               .attr_mask = IBV_EXP_WQ_ATTR_STATE,
-               .wq_state = IBV_EXP_WQS_RDY,
-       };
-       ret = ibv_exp_modify_wq(tmpl.wq, &mod);
-       if (ret) {
-               ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
-                     (void *)dev, strerror(ret));
-               goto error;
-       }
-       ret = rxq_setup(&tmpl);
-       if (ret) {
-               ERROR("%p: cannot initialize RX queue structure: %s",
-                     (void *)dev, strerror(ret));
-               goto error;
-       }
        ret = rxq_alloc_elts(&tmpl, desc);
        if (ret) {
                ERROR("%p: RXQ allocation failed: %s",
@@ -1075,17 +897,12 @@ mlx5_rxq_ctrl_setup(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
        rte_free(tmpl.rxq.elts);
        tmpl.rxq.elts = elts;
        *rxq_ctrl = tmpl;
-       /* Update doorbell counter. */
-       rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
-       rte_wmb();
-       *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
        DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
        assert(ret == 0);
        return 0;
 error:
-       elts = tmpl.rxq.elts;
+       rte_free(tmpl.rxq.elts);
        mlx5_rxq_cleanup(&tmpl);
-       rte_free(elts);
        assert(ret > 0);
        return ret;
 }
@@ -1175,14 +992,20 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
                }
        }
        ret = mlx5_rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
-       if (ret)
+       if (ret) {
                rte_free(rxq_ctrl);
-       else {
-               rxq_ctrl->rxq.stats.idx = idx;
-               DEBUG("%p: adding RX queue %p to list",
-                     (void *)dev, (void *)rxq_ctrl);
-               (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
+               goto out;
        }
+       rxq_ctrl->rxq.stats.idx = idx;
+       DEBUG("%p: adding RX queue %p to list",
+             (void *)dev, (void *)rxq_ctrl);
+       (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
+       rxq_ctrl->ibv = mlx5_priv_rxq_ibv_new(priv, idx);
+       if (!rxq_ctrl->ibv) {
+               ret = EAGAIN;
+               goto out;
+       }
+out:
        priv_unlock(priv);
        return -ret;
 }
@@ -1209,7 +1032,7 @@ mlx5_rx_queue_release(void *dpdk_rxq)
        rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
        priv = rxq_ctrl->priv;
        priv_lock(priv);
-       if (priv_flow_rxq_in_use(priv, rxq))
+       if (!mlx5_priv_rxq_ibv_releasable(priv, rxq_ctrl->ibv))
                rte_panic("Rx queue %p is still used by a flow and cannot be"
                          " removed\n", (void *)rxq_ctrl);
        for (i = 0; (i != priv->rxqs_n); ++i)
@@ -1253,15 +1076,14 @@ priv_rx_intr_vec_enable(struct priv *priv)
        }
        intr_handle->type = RTE_INTR_HANDLE_EXT;
        for (i = 0; i != n; ++i) {
-               struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
-               struct mlx5_rxq_ctrl *rxq_ctrl =
-                       container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+               /* This rxq ibv must not be released in this function. */
+               struct mlx5_rxq_ibv *rxq = mlx5_priv_rxq_ibv_get(priv, i);
                int fd;
                int flags;
                int rc;
 
                /* Skip queues that cannot request interrupts. */
-               if (!rxq || !rxq_ctrl->channel) {
+               if (!rxq || !rxq->channel) {
                        /* Use invalid intr_vec[] index to disable entry. */
                        intr_handle->intr_vec[i] =
                                RTE_INTR_VEC_RXTX_OFFSET +
@@ -1275,7 +1097,7 @@ priv_rx_intr_vec_enable(struct priv *priv)
                        priv_rx_intr_vec_disable(priv);
                        return -1;
                }
-               fd = rxq_ctrl->channel->fd;
+               fd = rxq->channel->fd;
                flags = fcntl(fd, F_GETFL);
                rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
                if (rc < 0) {
@@ -1305,7 +1127,27 @@ void
 priv_rx_intr_vec_disable(struct priv *priv)
 {
        struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
+       unsigned int i;
+       unsigned int rxqs_n = priv->rxqs_n;
+       unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
 
+       if (!priv->dev->data->dev_conf.intr_conf.rxq)
+               return;
+       for (i = 0; i != n; ++i) {
+               struct mlx5_rxq_ctrl *ctrl;
+               struct mlx5_rxq_data *rxq;
+
+               if (intr_handle->intr_vec[i] == RTE_INTR_VEC_RXTX_OFFSET +
+                   RTE_MAX_RXTX_INTR_VEC_ID)
+                       continue;
+               /*
+                * Access the queue directly to release the reference kept
+                * in priv_rx_intr_vec_enable().
+                */
+               rxq = (*priv->rxqs)[i];
+               ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+               mlx5_priv_rxq_ibv_release(priv, ctrl->ibv);
+       }
        rte_intr_free_epoll_fd(intr_handle);
        free(intr_handle->intr_vec);
        intr_handle->nb_efd = 0;
@@ -1329,19 +1171,19 @@ int
 mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
 {
        struct priv *priv = mlx5_get_priv(dev);
-       struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id];
-       struct mlx5_rxq_ctrl *rxq_ctrl =
-               container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+       struct mlx5_rxq_ibv *rxq = mlx5_priv_rxq_ibv_get(priv, rx_queue_id);
        int ret;
 
-       if (!rxq || !rxq_ctrl->channel) {
+       if (!rxq || !rxq->channel) {
                ret = EINVAL;
        } else {
-               ibv_mlx5_exp_update_cq_ci(rxq_ctrl->cq, rxq->cq_ci);
-               ret = ibv_req_notify_cq(rxq_ctrl->cq, 0);
+               ibv_mlx5_exp_update_cq_ci(rxq->cq,
+                                         (*priv->rxqs)[rx_queue_id]->cq_ci);
+               ret = ibv_req_notify_cq(rxq->cq, 0);
        }
        if (ret)
                WARN("unable to arm interrupt on rx queue %d", rx_queue_id);
+       mlx5_priv_rxq_ibv_release(priv, rxq);
        return -ret;
 }
 
@@ -1360,26 +1202,312 @@ int
 mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
 {
        struct priv *priv = mlx5_get_priv(dev);
-       struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id];
-       struct mlx5_rxq_ctrl *rxq_ctrl =
-               container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+       struct mlx5_rxq_ibv *rxq = mlx5_priv_rxq_ibv_get(priv, rx_queue_id);
        struct ibv_cq *ev_cq;
        void *ev_ctx;
        int ret;
 
-       if (!rxq || !rxq_ctrl->channel) {
+       if (!rxq || !rxq->channel) {
                ret = EINVAL;
        } else {
-               ret = ibv_get_cq_event(rxq_ctrl->cq->channel, &ev_cq, &ev_ctx);
-               if (ret || ev_cq != rxq_ctrl->cq)
+               ret = ibv_get_cq_event(rxq->cq->channel, &ev_cq, &ev_ctx);
+               if (ret || ev_cq != rxq->cq)
                        ret = EINVAL;
        }
        if (ret)
                WARN("unable to disable interrupt on rx queue %d",
                     rx_queue_id);
        else
-               ibv_ack_cq_events(rxq_ctrl->cq, 1);
+               ibv_ack_cq_events(rxq->cq, 1);
+       /*
+        * The lookup above may have returned NULL; only drop the reference
+        * when one was actually obtained.
+        */
+       if (rxq)
+               mlx5_priv_rxq_ibv_release(priv, rxq);
        return -ret;
 }
 
 #endif /* HAVE_UPDATE_CQ_CI */
+
+/**
+ * Create the Rx queue Verbs object.
+ *
+ * Allocates the object, obtains the memory region of the queue's mempool,
+ * creates the optional completion channel, the CQ and the WQ, moves the WQ
+ * to the ready state and fills the WQ ring from rxq->elts. On success the
+ * object holds one reference and is linked into priv->rxqsibv.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   Queue index in DPDK Rx queue array
+ *
+ * @return
+ *   The Verbs object initialised if it can be created, NULL otherwise.
+ *   On failure every resource acquired so far is released.
+ */
+struct mlx5_rxq_ibv*
+mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx)
+{
+       struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+       struct mlx5_rxq_ctrl *rxq_ctrl =
+               container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+       struct ibv_exp_wq_attr mod;
+       union {
+               struct ibv_exp_cq_init_attr cq;
+               struct ibv_exp_wq_init_attr wq;
+               struct ibv_exp_cq_attr cq_attr;
+       } attr;
+       unsigned int cqe_n = (1 << rxq->elts_n) - 1;
+       struct mlx5_rxq_ibv *tmpl;
+       struct ibv_mlx5_cq_info cq_info;
+       struct mlx5_rwq *rwq;
+       unsigned int i;
+       int ret = 0;
+
+       assert(!rxq_ctrl->ibv);
+       tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
+                                rxq_ctrl->socket);
+       if (!tmpl) {
+               ERROR("%p: cannot allocate verbs ressources",
+                      (void*)rxq_ctrl);
+               /*
+                * Nothing has been acquired yet and the error label
+                * dereferences tmpl, so leave directly.
+                */
+               return NULL;
+       }
+       /* Use the entire RX mempool as the memory region. */
+       tmpl->mr = priv_mr_get(priv, rxq->mp);
+       if (!tmpl->mr) {
+               tmpl->mr = priv_mr_new(priv, rxq->mp);
+               if (!tmpl->mr) {
+                       ERROR("%p: MR creation failure", (void *)rxq_ctrl);
+                       goto error;
+               }
+       }
+       if (rxq_ctrl->memory_channel) {
+               tmpl->channel = ibv_create_comp_channel(priv->ctx);
+               if (!tmpl->channel) {
+                       ERROR("%p: Comp Channel creation failure",
+                             (void *)rxq_ctrl);
+                       goto error;
+               }
+       }
+       attr.cq = (struct ibv_exp_cq_init_attr){
+               .comp_mask = 0,
+       };
+       if (priv->cqe_comp) {
+               attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
+               attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
+               /*
+                * For vectorized Rx, it must not be doubled in order to
+                * make cq_ci and rq_ci aligned.
+                */
+               if (rxq_check_vec_support(rxq) < 0)
+                       cqe_n *= 2;
+       }
+       tmpl->cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, tmpl->channel, 0,
+                                    &attr.cq);
+       if (tmpl->cq == NULL) {
+               ERROR("%p: CQ creation failure", (void *)rxq_ctrl);
+               goto error;
+       }
+       if (ibv_mlx5_exp_get_cq_info(tmpl->cq, &cq_info)) {
+               ERROR("Unable to query CQ info. check your OFED.");
+               goto error;
+       }
+       if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
+               ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
+                     "it should be set to %u", RTE_CACHE_LINE_SIZE);
+               goto error;
+       }
+       DEBUG("priv->device_attr.max_qp_wr is %d",
+             priv->device_attr.max_qp_wr);
+       DEBUG("priv->device_attr.max_sge is %d",
+             priv->device_attr.max_sge);
+       attr.wq = (struct ibv_exp_wq_init_attr){
+               .wq_context = NULL, /* Could be useful in the future. */
+               .wq_type = IBV_EXP_WQT_RQ,
+               /* Max number of outstanding WRs. */
+               .max_recv_wr = (1 << rxq->elts_n) >> rxq->sges_n,
+               /* Max number of scatter/gather elements in a WR. */
+               .max_recv_sge = 1 << rxq->sges_n,
+               .pd = priv->pd,
+               .cq = tmpl->cq,
+               .comp_mask =
+                       IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
+                       0,
+               .vlan_offloads = (rxq->vlan_strip ?
+                                 IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
+                                 0),
+       };
+       /* By default, FCS (CRC) is stripped by hardware. */
+       if (rxq->crc_present) {
+               attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
+               attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
+       }
+       if (priv->hw_padding) {
+               attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
+               attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
+       }
+       tmpl->wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
+       if (tmpl->wq == NULL) {
+               ERROR("%p: WQ creation failure", (void *)rxq_ctrl);
+               goto error;
+       }
+       /*
+        * Make sure number of WRs*SGEs match expectations since a queue
+        * cannot allocate more than "desc" buffers.
+        */
+       if (((int)attr.wq.max_recv_wr != ((1 << rxq->elts_n) >> rxq->sges_n)) ||
+           ((int)attr.wq.max_recv_sge != (1 << rxq->sges_n))) {
+               ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
+                     (void *)rxq_ctrl,
+                     ((1 << rxq->elts_n) >> rxq->sges_n),
+                     (1 << rxq->sges_n),
+                     attr.wq.max_recv_wr, attr.wq.max_recv_sge);
+               goto error;
+       }
+       /* Change queue state to ready. */
+       mod = (struct ibv_exp_wq_attr){
+               .attr_mask = IBV_EXP_WQ_ATTR_STATE,
+               .wq_state = IBV_EXP_WQS_RDY,
+       };
+       ret = ibv_exp_modify_wq(tmpl->wq, &mod);
+       if (ret) {
+               ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed",
+                     (void *)rxq_ctrl);
+               goto error;
+       }
+       /* Fill the rings. */
+       rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
+       rxq->wqes = (volatile struct mlx5_wqe_data_seg (*)[])
+               (uintptr_t)rwq->rq.buff;
+       for (i = 0; (i != (unsigned int)(1 << rxq->elts_n)); ++i) {
+               struct rte_mbuf *buf = (*rxq->elts)[i];
+               volatile struct mlx5_wqe_data_seg *scat = &(*rxq->wqes)[i];
+
+               /* scat->addr must be able to store a pointer. */
+               assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+               *scat = (struct mlx5_wqe_data_seg){
+                       .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
+                       .byte_count = htonl(DATA_LEN(buf)),
+                       .lkey = tmpl->mr->lkey,
+               };
+       }
+       rxq->rq_db = rwq->rq.db;
+       rxq->cqe_n = log2above(cq_info.cqe_cnt);
+       rxq->cq_ci = 0;
+       rxq->rq_ci = 0;
+       rxq->cq_db = cq_info.dbrec;
+       rxq->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf;
+       /* Update doorbell counter. */
+       rxq->rq_ci = (1 << rxq->elts_n) >> rxq->sges_n;
+       rte_wmb();
+       *rxq->rq_db = htonl(rxq->rq_ci);
+       /* Trace the object itself, not the address of the local pointer. */
+       DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)tmpl);
+       rte_atomic32_inc(&tmpl->refcnt);
+       DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void*)priv,
+             (void*)tmpl, rte_atomic32_read(&tmpl->refcnt));
+       LIST_INSERT_HEAD(&priv->rxqsibv, tmpl, next);
+       return tmpl;
+error:
+       /* tmpl is non-NULL here; undo in reverse order of acquisition. */
+       if (tmpl->wq)
+               claim_zero(ibv_exp_destroy_wq(tmpl->wq));
+       if (tmpl->cq)
+               claim_zero(ibv_destroy_cq(tmpl->cq));
+       if (tmpl->channel)
+               claim_zero(ibv_destroy_comp_channel(tmpl->channel));
+       if (tmpl->mr)
+               priv_mr_release(priv, tmpl->mr);
+       /* The object was never published, free it to avoid a leak. */
+       rte_free(tmpl);
+       return NULL;
+}
+
+/**
+ * Get an Rx queue Verbs object.
+ *
+ * When the object exists, its reference counter is incremented; the caller
+ * must balance this with mlx5_priv_rxq_ibv_release().
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   Queue index in DPDK Rx queue array
+ *
+ * @return
+ *   The Verbs object if it exists, NULL otherwise (including when idx is
+ *   out of range or no queue is configured at that index).
+ */
+struct mlx5_rxq_ibv*
+mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx)
+{
+       struct mlx5_rxq_data *rxq;
+       struct mlx5_rxq_ctrl *ctrl;
+       struct mlx5_mr *mr __rte_unused;
+
+       /* Validate the index before touching the queue array. */
+       if (idx >= priv->rxqs_n)
+               return NULL;
+       rxq = (*priv->rxqs)[idx];
+       /* container_of() on a NULL queue would yield a bogus pointer. */
+       if (!rxq)
+               return NULL;
+       ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+       if (ctrl->ibv) {
+               /*
+                * Result intentionally unused; presumably this takes the MR
+                * reference that mlx5_priv_rxq_ibv_release() drops through
+                * priv_mr_release() -- confirm against priv_mr_get().
+                */
+               mr = priv_mr_get(priv, rxq->mp);
+               rte_atomic32_inc(&ctrl->ibv->refcnt);
+               DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void*)priv,
+                     (void*)ctrl->ibv, rte_atomic32_read(&ctrl->ibv->refcnt));
+       }
+       return ctrl->ibv;
+}
+
+/**
+ * Drop one reference on an Rx queue Verbs object.
+ *
+ * The memory region is released first, then the reference counter is
+ * decremented. When it reaches zero the WQ, the CQ and the optional
+ * completion channel are destroyed, and the object is unlinked from its
+ * list and freed.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param rxq
+ *   Verbs Rx queue object.
+ *
+ * @return
+ *   0 when the object has been destroyed, EBUSY while references remain.
+ */
+int
+mlx5_priv_rxq_ibv_release(struct priv *priv, struct mlx5_rxq_ibv *rxq)
+{
+       assert(rxq->wq);
+       assert(rxq->cq);
+       assert(rxq->mr);
+       /* Forget the MR pointer once priv_mr_release() reports 0. */
+       if (priv_mr_release(priv, rxq->mr) == 0)
+               rxq->mr = NULL;
+       /* The counter is traced before being decremented. */
+       DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void*)priv,
+             (void*)rxq, rte_atomic32_read(&rxq->refcnt));
+       if (!rte_atomic32_dec_and_test(&rxq->refcnt))
+               return EBUSY;
+       /* Last reference gone: tear down the Verbs resources. */
+       claim_zero(ibv_exp_destroy_wq(rxq->wq));
+       claim_zero(ibv_destroy_cq(rxq->cq));
+       if (rxq->channel)
+               claim_zero(ibv_destroy_comp_channel(rxq->channel));
+       LIST_REMOVE(rxq, next);
+       rte_free(rxq);
+       return 0;
+}
+
+/**
+ * Count the Verbs Rx queue objects still present in the list.
+ *
+ * Each remaining object is reported through a DEBUG trace.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   The number of objects not released, 0 when the list is empty.
+ */
+int
+mlx5_priv_rxq_ibv_verify(struct priv *priv)
+{
+       struct mlx5_rxq_ibv *rxq_ibv;
+       int remaining = 0;
+
+       LIST_FOREACH(rxq_ibv, &priv->rxqsibv, next) {
+               DEBUG("%p: Verbs Rx queue %p still referenced", (void*)priv,
+                     (void*)rxq_ibv);
+               remaining += 1;
+       }
+       return remaining;
+}
+
+/**
+ * Tell whether exactly one reference is held on the object.
+ *
+ * @param priv
+ *   Pointer to private structure (unused).
+ * @param rxq
+ *   Verbs Rx queue object.
+ *
+ * @return
+ *   1 if the reference counter equals 1, 0 otherwise.
+ */
+int
+mlx5_priv_rxq_ibv_releasable(struct priv *priv, struct mlx5_rxq_ibv *rxq)
+{
+       const int32_t refs = rte_atomic32_read(&rxq->refcnt);
+
+       (void)priv;
+       return refs == 1;
+}
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index c7c7518..abdbf6a 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -130,15 +130,24 @@ struct mlx5_rxq_data {
        struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. */
 } __rte_cache_aligned;
 
-/* RX queue control descriptor. */
-struct mlx5_rxq_ctrl {
-       struct priv *priv; /* Back pointer to private data. */
+/* Verbs Rx queue elements. */
+struct mlx5_rxq_ibv {
+       LIST_ENTRY(mlx5_rxq_ibv) next; /* Pointer to the next element. */
+       rte_atomic32_t refcnt; /* Reference counter. */
+       struct mlx5_rxq_ctrl *rxq_ctrl; /* Back pointer to parent. */
        struct ibv_cq *cq; /* Completion Queue. */
        struct ibv_exp_wq *wq; /* Work Queue. */
-       struct mlx5_mr *mr; /* Memory Region (for mp). */
        struct ibv_comp_channel *channel;
-       unsigned int socket; /* CPU socket ID for allocations. */
+       struct mlx5_mr *mr; /* Memory Region (for mp). */
+};
+
+/* RX queue control descriptor. */
+struct mlx5_rxq_ctrl {
+       struct priv *priv; /* Back pointer to private data. */
+       struct mlx5_rxq_ibv *ibv; /* Verbs elements. */
        struct mlx5_rxq_data rxq; /* Data path structure. */
+       unsigned int socket; /* CPU socket ID for allocations. */
+       unsigned int memory_channel:1; /* Need memory channel. */
 };
 
 /* Hash RX queue types. */
@@ -298,7 +307,6 @@ void priv_destroy_hash_rxqs(struct priv *);
 int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type);
 int priv_rehash_flows(struct priv *);
 void mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *);
-int mlx5_rxq_rehash(struct rte_eth_dev *, struct mlx5_rxq_ctrl *);
 int mlx5_rxq_ctrl_setup(struct rte_eth_dev *, struct mlx5_rxq_ctrl *,
                        uint16_t, unsigned int, const struct rte_eth_rxconf *,
                        struct rte_mempool *);
@@ -311,6 +319,11 @@ void priv_rx_intr_vec_disable(struct priv *priv);
 int mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id);
 int mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id);
 #endif /* HAVE_UPDATE_CQ_CI */
+struct mlx5_rxq_ibv* mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx);
+struct mlx5_rxq_ibv* mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx);
+int mlx5_priv_rxq_ibv_release(struct priv *priv, struct mlx5_rxq_ibv *rxq);
+int mlx5_priv_rxq_ibv_releasable(struct priv *priv, struct mlx5_rxq_ibv *rxq);
+int mlx5_priv_rxq_ibv_verify(struct priv *priv);
 
 /* mlx5_txq.c */
 
@@ -347,7 +360,6 @@ uint16_t mlx5_rx_burst_vec(void *, struct rte_mbuf **, 
uint16_t);
 
 /* mlx5_mr.c */
 
-struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *);
 void mlx5_txq_mp2mr_iter(struct rte_mempool *, void *);
 uint32_t mlx5_txq_mp2mr_reg(struct mlx5_txq_data *, struct rte_mempool *,
                            unsigned int);
diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c
index 512052a..dffa1cd 100644
--- a/drivers/net/mlx5/mlx5_vlan.c
+++ b/drivers/net/mlx5/mlx5_vlan.c
@@ -153,7 +153,7 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, 
int on)
                .vlan_offloads = vlan_offloads,
        };
 
-       err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
+       err = ibv_exp_modify_wq(rxq_ctrl->ibv->wq, &mod);
        if (err) {
                ERROR("%p: failed to modified stripping mode: %s",
                      (void *)priv, strerror(err));
-- 
2.1.4

Reply via email to