Hi Mou,

Please see below for one small comment.
Please update the patch, and feel free to add my ack.
> -----Original Message-----
> From: Suanming Mou <suanmi...@nvidia.com>
> 
> A UMR WQE can convert the memory spaces of multiple mkeys into one
> contiguous space. By taking advantage of the UMR WQE, the scattered mbuf
> in one operation can be converted to an indirect mkey. The RegEx, which
> only accepts one mkey, can now process the whole scattered mbuf.
> 
> The maximum number of scattered mbufs that can be supported in one UMR
> WQE is now defined as 64. Scattered mbufs from multiple operations can be
> added to one UMR WQE if there is enough space in the KLM array, since the
> operations can address their own mbuf's content by the mkey's address and
> length. However, one operation's scattered mbufs can't be placed in two
> different UMR WQEs' KLM arrays; if the UMR WQE's KLM array does not have
> enough free space for one operation, a new UMR WQE will be required.
> 
> To prevent the UMR WQE's indirect mkey from being overwritten when the
> SQ's WQEs wrap around, the mkey's index used by the UMR WQE should be the
> index of the last RegEx WQE in the operations. As one operation consumes
> one WQE set, building the RegEx WQEs in reverse order helps address the
> mkey more efficiently. When the operations in one burst consume multiple
> mkeys, each time an mkey's KLM array becomes full, the reverse WQE set
> index will always be the last one of the new mkey for the new UMR WQE.
> 
> In GGA mode, the SQ WQE's memory layout becomes interleaved UMR/NOP and
> RegEx WQEs. The UMR and RegEx WQEs together can be called a WQE set. The
> SQ's pi and ci will also be increased per WQE set, not per WQE.
> 
> For operations that don't have a scattered mbuf and use the mbuf's mkey
> directly, the WQE set combination is NOP + RegEx.
> For operations that have a scattered mbuf but share the UMR WQE with
> others, the WQE set combination is NOP + RegEx.
> For operations that complete the UMR WQE, the WQE set combination is
> UMR + RegEx.
> 
> Signed-off-by: Suanming Mou <suanmi...@nvidia.com>
> ---
>  doc/guides/regexdevs/mlx5.rst            |   5 +
>  doc/guides/rel_notes/release_21_05.rst   |   4 +
>  doc/guides/tools/testregex.rst           |   3 +
>  drivers/regex/mlx5/mlx5_regex.c          |   9 +
>  drivers/regex/mlx5/mlx5_regex.h          |  26 +-
>  drivers/regex/mlx5/mlx5_regex_control.c  |  43 ++-
>  drivers/regex/mlx5/mlx5_regex_fastpath.c | 378 +++++++++++++++++++++--
>  7 files changed, 410 insertions(+), 58 deletions(-)
> 
> diff --git a/doc/guides/regexdevs/mlx5.rst b/doc/guides/regexdevs/mlx5.rst
> index faaa6ac11d..45a0b96980 100644
> --- a/doc/guides/regexdevs/mlx5.rst
> +++ b/doc/guides/regexdevs/mlx5.rst
> @@ -35,6 +35,11 @@ be specified as device parameter. The RegEx device can
> be probed and used with
>  other Mellanox devices, by adding more options in the class.
>  For example: ``class=net:regex`` will probe both the net PMD and the RegEx
> PMD.
> 
> +Features
> +--------
> +
> +- Multi segments mbuf support.
> +
>  Supported NICs
>  --------------
> 
> diff --git a/doc/guides/rel_notes/release_21_05.rst
> b/doc/guides/rel_notes/release_21_05.rst
> index 3d4b061686..281d4aaa64 100644
> --- a/doc/guides/rel_notes/release_21_05.rst
> +++ b/doc/guides/rel_notes/release_21_05.rst
> @@ -113,6 +113,10 @@ New Features
>    * Added command to display Rx queue used descriptor count.
>      ``show port (port_id) rxq (queue_id) desc used count``
> 
> +* **Updated Mellanox RegEx PMD.**
> +
> +  * Added support for multi segments mbuf.
> +
> 
>  Removed Items
>  -------------
> diff --git a/doc/guides/tools/testregex.rst b/doc/guides/tools/testregex.rst
> index a59acd919f..cdb1ffd6ee 100644
> --- a/doc/guides/tools/testregex.rst
> +++ b/doc/guides/tools/testregex.rst
> @@ -68,6 +68,9 @@ Application Options
>  ``--nb_iter N``
>    number of iteration to run
> 
> +``--nb_segs N``
> +  number of mbuf segment
> +
>  ``--help``
>    print application options

I don't think this testregex.rst change (the ``--nb_segs`` option) is part of this patch.
It should belong to the app patch.

> diff --git a/drivers/regex/mlx5/mlx5_regex.c
> b/drivers/regex/mlx5/mlx5_regex.c
> index ac5b205fa9..82c485e50c 100644
> --- a/drivers/regex/mlx5/mlx5_regex.c
> +++ b/drivers/regex/mlx5/mlx5_regex.c
> @@ -199,6 +199,13 @@ mlx5_regex_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>       }
>       priv->regexdev->dev_ops = &mlx5_regexdev_ops;
>       priv->regexdev->enqueue = mlx5_regexdev_enqueue;
> +#ifdef HAVE_MLX5_UMR_IMKEY
> +     if (!attr.umr_indirect_mkey_disabled &&
> +         !attr.umr_modify_entity_size_disabled)
> +             priv->has_umr = 1;
> +     if (priv->has_umr)
> +             priv->regexdev->enqueue = mlx5_regexdev_enqueue_gga;
> +#endif
>       priv->regexdev->dequeue = mlx5_regexdev_dequeue;
>       priv->regexdev->device = (struct rte_device *)pci_dev;
>       priv->regexdev->data->dev_private = priv;
> @@ -213,6 +220,8 @@ mlx5_regex_pci_probe(struct rte_pci_driver *pci_drv
> __rte_unused,
>           rte_errno = ENOMEM;
>               goto error;
>       }
> +     DRV_LOG(INFO, "RegEx GGA is %s.",
> +             priv->has_umr ? "supported" : "unsupported");
>       return 0;
> 
>  error:
> diff --git a/drivers/regex/mlx5/mlx5_regex.h
> b/drivers/regex/mlx5/mlx5_regex.h
> index a2b3f0d9f3..51a2101e53 100644
> --- a/drivers/regex/mlx5/mlx5_regex.h
> +++ b/drivers/regex/mlx5/mlx5_regex.h
> @@ -15,6 +15,7 @@
>  #include <mlx5_common_devx.h>
> 
>  #include "mlx5_rxp.h"
> +#include "mlx5_regex_utils.h"
> 
>  struct mlx5_regex_sq {
>       uint16_t log_nb_desc; /* Log 2 number of desc for this object. */
> @@ -40,6 +41,7 @@ struct mlx5_regex_qp {
>       struct mlx5_regex_job *jobs;
>       struct ibv_mr *metadata;
>       struct ibv_mr *outputs;
> +     struct ibv_mr *imkey_addr; /* Indirect mkey array region. */
>       size_t ci, pi;
>       struct mlx5_mr_ctrl mr_ctrl;
>  };
> @@ -71,8 +73,29 @@ struct mlx5_regex_priv {
>       struct mlx5_mr_share_cache mr_scache; /* Global shared MR cache.
> */
>       uint8_t is_bf2; /* The device is BF2 device. */
>       uint8_t sq_ts_format; /* Whether SQ supports timestamp formats. */
> +     uint8_t has_umr; /* The device supports UMR. */
>  };
> 
> +#ifdef HAVE_IBV_FLOW_DV_SUPPORT
> +static inline int
> +regex_get_pdn(void *pd, uint32_t *pdn)
> +{
> +     struct mlx5dv_obj obj;
> +     struct mlx5dv_pd pd_info;
> +     int ret = 0;
> +
> +     obj.pd.in = pd;
> +     obj.pd.out = &pd_info;
> +     ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
> +     if (ret) {
> +             DRV_LOG(DEBUG, "Fail to get PD object info");
> +             return ret;
> +     }
> +     *pdn = pd_info.pdn;
> +     return 0;
> +}
> +#endif
> +
>  /* mlx5_regex.c */
>  int mlx5_regex_start(struct rte_regexdev *dev);
>  int mlx5_regex_stop(struct rte_regexdev *dev);
> @@ -108,5 +131,6 @@ uint16_t mlx5_regexdev_enqueue(struct rte_regexdev
> *dev, uint16_t qp_id,
>                      struct rte_regex_ops **ops, uint16_t nb_ops);
>  uint16_t mlx5_regexdev_dequeue(struct rte_regexdev *dev, uint16_t qp_id,
>                      struct rte_regex_ops **ops, uint16_t nb_ops);
> -
> +uint16_t mlx5_regexdev_enqueue_gga(struct rte_regexdev *dev, uint16_t
> qp_id,
> +                    struct rte_regex_ops **ops, uint16_t nb_ops);
>  #endif /* MLX5_REGEX_H */
> diff --git a/drivers/regex/mlx5/mlx5_regex_control.c
> b/drivers/regex/mlx5/mlx5_regex_control.c
> index 55fbb419ed..eef0fe579d 100644
> --- a/drivers/regex/mlx5/mlx5_regex_control.c
> +++ b/drivers/regex/mlx5/mlx5_regex_control.c
> @@ -27,6 +27,9 @@
> 
>  #define MLX5_REGEX_NUM_WQE_PER_PAGE (4096/64)
> 
> +#define MLX5_REGEX_WQE_LOG_NUM(has_umr, log_desc) \
> +             ((has_umr) ? ((log_desc) + 2) : (log_desc))
> +
>  /**
>   * Returns the number of qp obj to be created.
>   *
> @@ -91,26 +94,6 @@ regex_ctrl_create_cq(struct mlx5_regex_priv *priv,
> struct mlx5_regex_cq *cq)
>       return 0;
>  }
> 
> -#ifdef HAVE_IBV_FLOW_DV_SUPPORT
> -static int
> -regex_get_pdn(void *pd, uint32_t *pdn)
> -{
> -     struct mlx5dv_obj obj;
> -     struct mlx5dv_pd pd_info;
> -     int ret = 0;
> -
> -     obj.pd.in = pd;
> -     obj.pd.out = &pd_info;
> -     ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
> -     if (ret) {
> -             DRV_LOG(DEBUG, "Fail to get PD object info");
> -             return ret;
> -     }
> -     *pdn = pd_info.pdn;
> -     return 0;
> -}
> -#endif
> -
>  /**
>   * Destroy the SQ object.
>   *
> @@ -168,14 +151,16 @@ regex_ctrl_create_sq(struct mlx5_regex_priv *priv,
> struct mlx5_regex_qp *qp,
>       int ret;
> 
>       sq->log_nb_desc = log_nb_desc;
> +     sq->sqn = q_ind;
>       sq->ci = 0;
>       sq->pi = 0;
>       ret = regex_get_pdn(priv->pd, &pd_num);
>       if (ret)
>               return ret;
>       attr.wq_attr.pd = pd_num;
> -     ret = mlx5_devx_sq_create(priv->ctx, &sq->sq_obj, log_nb_desc, &attr,
> -                               SOCKET_ID_ANY);
> +     ret = mlx5_devx_sq_create(priv->ctx, &sq->sq_obj,
> +                     MLX5_REGEX_WQE_LOG_NUM(priv->has_umr,
> log_nb_desc),
> +                     &attr, SOCKET_ID_ANY);
>       if (ret) {
>               DRV_LOG(ERR, "Can't create SQ object.");
>               rte_errno = ENOMEM;
> @@ -225,10 +210,18 @@ mlx5_regex_qp_setup(struct rte_regexdev *dev,
> uint16_t qp_ind,
> 
>       qp = &priv->qps[qp_ind];
>       qp->flags = cfg->qp_conf_flags;
> -     qp->cq.log_nb_desc = rte_log2_u32(cfg->nb_desc);
> -     qp->nb_desc = 1 << qp->cq.log_nb_desc;
> +     log_desc = rte_log2_u32(cfg->nb_desc);
> +     /*
> +      * UMR mode requires two WQEs(UMR and RegEx WQE) for one
> descriptor.
> +      * For CQ, expand the CQE number multiple with 2.
> +      * For SQ, the UMR and RegEx WQE for one descriptor consumes 4
> WQEBBS,
> +      * expand the WQE number multiple with 4.
> +      */
> +     qp->cq.log_nb_desc = log_desc + (!!priv->has_umr);
> +     qp->nb_desc = 1 << log_desc;
>       if (qp->flags & RTE_REGEX_QUEUE_PAIR_CFG_OOS_F)
> -             qp->nb_obj = regex_ctrl_get_nb_obj(qp->nb_desc);
> +             qp->nb_obj = regex_ctrl_get_nb_obj
> +                     (1 << MLX5_REGEX_WQE_LOG_NUM(priv->has_umr,
> log_desc));
>       else
>               qp->nb_obj = 1;
>       qp->sqs = rte_malloc(NULL,
> diff --git a/drivers/regex/mlx5/mlx5_regex_fastpath.c
> b/drivers/regex/mlx5/mlx5_regex_fastpath.c
> index beaea7b63f..4f9402c583 100644
> --- a/drivers/regex/mlx5/mlx5_regex_fastpath.c
> +++ b/drivers/regex/mlx5/mlx5_regex_fastpath.c
> @@ -32,6 +32,15 @@
>  #define MLX5_REGEX_WQE_GATHER_OFFSET 32
>  #define MLX5_REGEX_WQE_SCATTER_OFFSET 48
>  #define MLX5_REGEX_METADATA_OFF 32
> +#define MLX5_REGEX_UMR_WQE_SIZE 192
> +/* The maximum KLMs can be added to one UMR indirect mkey. */
> +#define MLX5_REGEX_MAX_KLM_NUM 128
> +/* The KLM array size for one job. */
> +#define MLX5_REGEX_KLMS_SIZE \
> +     ((MLX5_REGEX_MAX_KLM_NUM) * sizeof(struct mlx5_klm))
> +/* In WQE set mode, the pi should be quarter of the
> MLX5_REGEX_MAX_WQE_INDEX. */
> +#define MLX5_REGEX_UMR_SQ_PI_IDX(pi, ops) \
> +     (((pi) + (ops)) & (MLX5_REGEX_MAX_WQE_INDEX >> 2))
> 
>  static inline uint32_t
>  sq_size_get(struct mlx5_regex_sq *sq)
> @@ -49,6 +58,8 @@ struct mlx5_regex_job {
>       uint64_t user_id;
>       volatile uint8_t *output;
>       volatile uint8_t *metadata;
> +     struct mlx5_klm *imkey_array; /* Indirect mkey's KLM array. */
> +     struct mlx5_devx_obj *imkey; /* UMR WQE's indirect meky. */
>  } __rte_cached_aligned;
> 
>  static inline void
> @@ -99,12 +110,13 @@ set_wqe_ctrl_seg(struct mlx5_wqe_ctrl_seg *seg,
> uint16_t pi, uint8_t opcode,
>  }
> 
>  static inline void
> -prep_one(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *qp,
> -      struct mlx5_regex_sq *sq, struct rte_regex_ops *op,
> -      struct mlx5_regex_job *job)
> +__prep_one(struct mlx5_regex_priv *priv, struct mlx5_regex_sq *sq,
> +        struct rte_regex_ops *op, struct mlx5_regex_job *job,
> +        size_t pi, struct mlx5_klm *klm)
>  {
> -     size_t wqe_offset = (sq->pi & (sq_size_get(sq) - 1)) *
> MLX5_SEND_WQE_BB;
> -     uint32_t lkey;
> +     size_t wqe_offset = (pi & (sq_size_get(sq) - 1)) *
> +                         (MLX5_SEND_WQE_BB << (priv->has_umr ? 2 : 0)) +
> +                         (priv->has_umr ? MLX5_REGEX_UMR_WQE_SIZE :
> 0);
>       uint16_t group0 = op->req_flags &
> RTE_REGEX_OPS_REQ_GROUP_ID0_VALID_F ?
>                               op->group_id0 : 0;
>       uint16_t group1 = op->req_flags &
> RTE_REGEX_OPS_REQ_GROUP_ID1_VALID_F ?
> @@ -122,14 +134,11 @@ prep_one(struct mlx5_regex_priv *priv, struct
> mlx5_regex_qp *qp,
>                              RTE_REGEX_OPS_REQ_GROUP_ID2_VALID_F |
>                              RTE_REGEX_OPS_REQ_GROUP_ID3_VALID_F)))
>               group0 = op->group_id0;
> -     lkey = mlx5_mr_addr2mr_bh(priv->pd, 0,
> -                               &priv->mr_scache, &qp->mr_ctrl,
> -                               rte_pktmbuf_mtod(op->mbuf, uintptr_t),
> -                               !!(op->mbuf->ol_flags &
> EXT_ATTACHED_MBUF));
>       uint8_t *wqe = (uint8_t *)(uintptr_t)sq->sq_obj.wqes + wqe_offset;
>       int ds = 4; /*  ctrl + meta + input + output */
> 
> -     set_wqe_ctrl_seg((struct mlx5_wqe_ctrl_seg *)wqe, sq->pi,
> +     set_wqe_ctrl_seg((struct mlx5_wqe_ctrl_seg *)wqe,
> +                      (priv->has_umr ? (pi * 4 + 3) : pi),
>                        MLX5_OPCODE_MMO,
> MLX5_OPC_MOD_MMO_REGEX,
>                        sq->sq_obj.sq->id, 0, ds, 0, 0);
>       set_regex_ctrl_seg(wqe + 12, 0, group0, group1, group2, group3,
> @@ -137,36 +146,54 @@ prep_one(struct mlx5_regex_priv *priv, struct
> mlx5_regex_qp *qp,
>       struct mlx5_wqe_data_seg *input_seg =
>               (struct mlx5_wqe_data_seg *)(wqe +
> 
> MLX5_REGEX_WQE_GATHER_OFFSET);
> -     input_seg->byte_count =
> -             rte_cpu_to_be_32(rte_pktmbuf_data_len(op->mbuf));
> -     input_seg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(op->mbuf,
> -                                                         uintptr_t));
> -     input_seg->lkey = lkey;
> +     input_seg->byte_count = rte_cpu_to_be_32(klm->byte_count);
> +     input_seg->addr = rte_cpu_to_be_64(klm->address);
> +     input_seg->lkey = klm->mkey;
>       job->user_id = op->user_id;
> +}
> +
> +static inline void
> +prep_one(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *qp,
> +      struct mlx5_regex_sq *sq, struct rte_regex_ops *op,
> +      struct mlx5_regex_job *job)
> +{
> +     struct mlx5_klm klm;
> +
> +     klm.byte_count = rte_pktmbuf_data_len(op->mbuf);
> +     klm.mkey = mlx5_mr_addr2mr_bh(priv->pd, 0,
> +                               &priv->mr_scache, &qp->mr_ctrl,
> +                               rte_pktmbuf_mtod(op->mbuf, uintptr_t),
> +                               !!(op->mbuf->ol_flags &
> EXT_ATTACHED_MBUF));
> +     klm.address = rte_pktmbuf_mtod(op->mbuf, uintptr_t);
> +     __prep_one(priv, sq, op, job, sq->pi, &klm);
>       sq->db_pi = sq->pi;
>       sq->pi = (sq->pi + 1) & MLX5_REGEX_MAX_WQE_INDEX;
>  }
> 
>  static inline void
> -send_doorbell(struct mlx5dv_devx_uar *uar, struct mlx5_regex_sq *sq)
> +send_doorbell(struct mlx5_regex_priv *priv, struct mlx5_regex_sq *sq)
>  {
> +     struct mlx5dv_devx_uar *uar = priv->uar;
>       size_t wqe_offset = (sq->db_pi & (sq_size_get(sq) - 1)) *
> -             MLX5_SEND_WQE_BB;
> +             (MLX5_SEND_WQE_BB << (priv->has_umr ? 2 : 0)) +
> +             (priv->has_umr ? MLX5_REGEX_UMR_WQE_SIZE : 0);
>       uint8_t *wqe = (uint8_t *)(uintptr_t)sq->sq_obj.wqes + wqe_offset;
> -     ((struct mlx5_wqe_ctrl_seg *)wqe)->fm_ce_se =
> MLX5_WQE_CTRL_CQ_UPDATE;
> +     /* Or the fm_ce_se instead of set, avoid the fence be cleared. */
> +     ((struct mlx5_wqe_ctrl_seg *)wqe)->fm_ce_se |=
> MLX5_WQE_CTRL_CQ_UPDATE;
>       uint64_t *doorbell_addr =
>               (uint64_t *)((uint8_t *)uar->base_addr + 0x800);
>       rte_io_wmb();
> -     sq->sq_obj.db_rec[MLX5_SND_DBR] = rte_cpu_to_be_32((sq->db_pi +
> 1) &
> -
> MLX5_REGEX_MAX_WQE_INDEX);
> +     sq->sq_obj.db_rec[MLX5_SND_DBR] = rte_cpu_to_be_32((priv-
> >has_umr ?
> +                                     (sq->db_pi * 4 + 3) : sq->db_pi) &
> +                                     MLX5_REGEX_MAX_WQE_INDEX);
>       rte_wmb();
>       *doorbell_addr = *(volatile uint64_t *)wqe;
>       rte_wmb();
>  }
> 
>  static inline int
> -can_send(struct mlx5_regex_sq *sq) {
> -     return ((uint16_t)(sq->pi - sq->ci) < sq_size_get(sq));
> +get_free(struct mlx5_regex_sq *sq) {
> +     return (sq_size_get(sq) - (uint16_t)(sq->pi - sq->ci));
>  }
> 
>  static inline uint32_t
> @@ -174,6 +201,211 @@ job_id_get(uint32_t qid, size_t sq_size, size_t index) {
>       return qid * sq_size + (index & (sq_size - 1));
>  }
> 
> +#ifdef HAVE_MLX5_UMR_IMKEY
> +static inline int
> +mkey_klm_available(struct mlx5_klm *klm, uint32_t pos, uint32_t new)
> +{
> +     return (klm && ((pos + new) <= MLX5_REGEX_MAX_KLM_NUM));
> +}
> +
> +static inline void
> +complete_umr_wqe(struct mlx5_regex_qp *qp, struct mlx5_regex_sq *sq,
> +              struct mlx5_regex_job *mkey_job,
> +              size_t umr_index, uint32_t klm_size, uint32_t total_len)
> +{
> +     size_t wqe_offset = (umr_index & (sq_size_get(sq) - 1)) *
> +             (MLX5_SEND_WQE_BB * 4);
> +     struct mlx5_wqe_ctrl_seg *wqe = (struct mlx5_wqe_ctrl_seg
> *)((uint8_t *)
> +                                (uintptr_t)sq->sq_obj.wqes + wqe_offset);
> +     struct mlx5_wqe_umr_ctrl_seg *ucseg =
> +                             (struct mlx5_wqe_umr_ctrl_seg *)(wqe + 1);
> +     struct mlx5_wqe_mkey_context_seg *mkc =
> +                             (struct mlx5_wqe_mkey_context_seg *)(ucseg
> + 1);
> +     struct mlx5_klm *iklm = (struct mlx5_klm *)(mkc + 1);
> +     uint16_t klm_align = RTE_ALIGN(klm_size, 4);
> +
> +     memset(wqe, 0, MLX5_REGEX_UMR_WQE_SIZE);
> +     /* Set WQE control seg. Non-inline KLM UMR WQE size must be 9
> WQE_DS. */
> +     set_wqe_ctrl_seg(wqe, (umr_index * 4), MLX5_OPCODE_UMR,
> +                      0, sq->sq_obj.sq->id, 0, 9, 0,
> +                      rte_cpu_to_be_32(mkey_job->imkey->id));
> +     /* Set UMR WQE control seg. */
> +     ucseg->mkey_mask |=
> rte_cpu_to_be_64(MLX5_WQE_UMR_CTRL_MKEY_MASK_LEN |
> +
>       MLX5_WQE_UMR_CTRL_FLAG_TRNSLATION_OFFSET |
> +
>       MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_LOCAL_WRITE);
> +     ucseg->klm_octowords = rte_cpu_to_be_16(klm_align);
> +     /* Set mkey context seg. */
> +     mkc->len = rte_cpu_to_be_64(total_len);
> +     mkc->qpn_mkey = rte_cpu_to_be_32(0xffffff00 |
> +                                     (mkey_job->imkey->id & 0xff));
> +     /* Set UMR pointer to data seg. */
> +     iklm->address = rte_cpu_to_be_64
> +                             ((uintptr_t)((char *)mkey_job->imkey_array));
> +     iklm->mkey = rte_cpu_to_be_32(qp->imkey_addr->lkey);
> +     iklm->byte_count = rte_cpu_to_be_32(klm_align);
> +     /* Clear the padding memory. */
> +     memset((uint8_t *)&mkey_job->imkey_array[klm_size], 0,
> +            sizeof(struct mlx5_klm) * (klm_align - klm_size));
> +
> +     /* Add the following RegEx WQE with fence. */
> +     wqe = (struct mlx5_wqe_ctrl_seg *)
> +                             (((uint8_t *)wqe) +
> MLX5_REGEX_UMR_WQE_SIZE);
> +     wqe->fm_ce_se |= MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
> +}
> +
> +static inline void
> +prep_nop_regex_wqe_set(struct mlx5_regex_priv *priv, struct mlx5_regex_sq
> *sq,
> +                    struct rte_regex_ops *op, struct mlx5_regex_job *job,
> +                    size_t pi, struct mlx5_klm *klm)
> +{
> +     size_t wqe_offset = (pi & (sq_size_get(sq) - 1)) *
> +                         (MLX5_SEND_WQE_BB << 2);
> +     struct mlx5_wqe_ctrl_seg *wqe = (struct mlx5_wqe_ctrl_seg
> *)((uint8_t *)
> +                                (uintptr_t)sq->sq_obj.wqes + wqe_offset);
> +
> +     /* Clear the WQE memory used as UMR WQE previously. */
> +     if ((rte_be_to_cpu_32(wqe->opmod_idx_opcode) & 0xff) !=
> MLX5_OPCODE_NOP)
> +             memset(wqe, 0, MLX5_REGEX_UMR_WQE_SIZE);
> +     /* UMR WQE size is 9 DS, align nop WQE to 3 WQEBBS(12 DS). */
> +     set_wqe_ctrl_seg(wqe, pi * 4, MLX5_OPCODE_NOP, 0, sq->sq_obj.sq-
> >id,
> +                      0, 12, 0, 0);
> +     __prep_one(priv, sq, op, job, pi, klm);
> +}
> +
> +static inline void
> +prep_regex_umr_wqe_set(struct mlx5_regex_priv *priv, struct
> mlx5_regex_qp *qp,
> +      struct mlx5_regex_sq *sq, struct rte_regex_ops **op, size_t nb_ops)
> +{
> +     struct mlx5_regex_job *job = NULL;
> +     size_t sqid = sq->sqn, mkey_job_id = 0;
> +     size_t left_ops = nb_ops;
> +     uint32_t klm_num = 0, len;
> +     struct mlx5_klm *mkey_klm = NULL;
> +     struct mlx5_klm klm;
> +
> +     sqid = sq->sqn;
> +     while (left_ops--)
> +             rte_prefetch0(op[left_ops]);
> +     left_ops = nb_ops;
> +     /*
> +      * Build the WQE set by reverse. In case the burst may consume
> +      * multiple mkeys, build the WQE set as normal will hard to
> +      * address the last mkey index, since we will only know the last
> +      * RegEx WQE's index when finishes building.
> +      */
> +     while (left_ops--) {
> +             struct rte_mbuf *mbuf = op[left_ops]->mbuf;
> +             size_t pi = MLX5_REGEX_UMR_SQ_PI_IDX(sq->pi, left_ops);
> +
> +             if (mbuf->nb_segs > 1) {
> +                     size_t scatter_size = 0;
> +
> +                     if (!mkey_klm_available(mkey_klm, klm_num,
> +                                             mbuf->nb_segs)) {
> +                             /*
> +                              * The mkey's KLM is full, create the UMR
> +                              * WQE in the next WQE set.
> +                              */
> +                             if (mkey_klm)
> +                                     complete_umr_wqe(qp, sq,
> +                                             &qp->jobs[mkey_job_id],
> +
>       MLX5_REGEX_UMR_SQ_PI_IDX(pi, 1),
> +                                             klm_num, len);
> +                             /*
> +                              * Get the indircet mkey and KLM array index
> +                              * from the last WQE set.
> +                              */
> +                             mkey_job_id = job_id_get(sqid,
> +                                                      sq_size_get(sq), pi);
> +                             mkey_klm = qp-
> >jobs[mkey_job_id].imkey_array;
> +                             klm_num = 0;
> +                             len = 0;
> +                     }
> +                     /* Build RegEx WQE's data segment KLM. */
> +                     klm.address = len;
> +                     klm.mkey = rte_cpu_to_be_32
> +                                     (qp->jobs[mkey_job_id].imkey->id);
> +                     while (mbuf) {
> +                             /* Build indirect mkey seg's KLM. */
> +                             mkey_klm->mkey =
> mlx5_mr_addr2mr_bh(priv->pd,
> +                                     NULL, &priv->mr_scache, &qp-
> >mr_ctrl,
> +                                     rte_pktmbuf_mtod(mbuf, uintptr_t),
> +                                     !!(mbuf->ol_flags &
> EXT_ATTACHED_MBUF));
> +                             mkey_klm->address = rte_cpu_to_be_64
> +                                     (rte_pktmbuf_mtod(mbuf, uintptr_t));
> +                             mkey_klm->byte_count = rte_cpu_to_be_32
> +
>       (rte_pktmbuf_data_len(mbuf));
> +                             /*
> +                              * Save the mbuf's total size for RegEx data
> +                              * segment.
> +                              */
> +                             scatter_size += rte_pktmbuf_data_len(mbuf);
> +                             mkey_klm++;
> +                             klm_num++;
> +                             mbuf = mbuf->next;
> +                     }
> +                     len += scatter_size;
> +                     klm.byte_count = scatter_size;
> +             } else {
> +                     /* The single mubf case. Build the KLM directly. */
> +                     klm.mkey = mlx5_mr_addr2mr_bh(priv->pd, NULL,
> +                                     &priv->mr_scache, &qp->mr_ctrl,
> +                                     rte_pktmbuf_mtod(mbuf, uintptr_t),
> +                                     !!(mbuf->ol_flags &
> EXT_ATTACHED_MBUF));
> +                     klm.address = rte_pktmbuf_mtod(mbuf, uintptr_t);
> +                     klm.byte_count = rte_pktmbuf_data_len(mbuf);
> +             }
> +             job = &qp->jobs[job_id_get(sqid, sq_size_get(sq), pi)];
> +             /*
> +              * Build the nop + RegEx WQE set by default. The fist nop WQE
> +              * will be updated later as UMR WQE if scattered mubf exist.
> +              */
> +             prep_nop_regex_wqe_set(priv, sq, op[left_ops], job, pi, &klm);
> +     }
> +     /*
> +      * Scattered mbuf have been added to the KLM array. Complete the
> build
> +      * of UMR WQE, update the first nop WQE as UMR WQE.
> +      */
> +     if (mkey_klm)
> +             complete_umr_wqe(qp, sq, &qp->jobs[mkey_job_id], sq->pi,
> +                              klm_num, len);
> +     sq->db_pi = MLX5_REGEX_UMR_SQ_PI_IDX(sq->pi, nb_ops - 1);
> +     sq->pi = MLX5_REGEX_UMR_SQ_PI_IDX(sq->pi, nb_ops);
> +}
> +
> +uint16_t
> +mlx5_regexdev_enqueue_gga(struct rte_regexdev *dev, uint16_t qp_id,
> +                       struct rte_regex_ops **ops, uint16_t nb_ops)
> +{
> +     struct mlx5_regex_priv *priv = dev->data->dev_private;
> +     struct mlx5_regex_qp *queue = &priv->qps[qp_id];
> +     struct mlx5_regex_sq *sq;
> +     size_t sqid, nb_left = nb_ops, nb_desc;
> +
> +     while ((sqid = ffs(queue->free_sqs))) {
> +             sqid--; /* ffs returns 1 for bit 0 */
> +             sq = &queue->sqs[sqid];
> +             nb_desc = get_free(sq);
> +             if (nb_desc) {
> +                     /* The ops be handled can't exceed nb_ops. */
> +                     if (nb_desc > nb_left)
> +                             nb_desc = nb_left;
> +                     else
> +                             queue->free_sqs &= ~(1 << sqid);
> +                     prep_regex_umr_wqe_set(priv, queue, sq, ops,
> nb_desc);
> +                     send_doorbell(priv, sq);
> +                     nb_left -= nb_desc;
> +             }
> +             if (!nb_left)
> +                     break;
> +             ops += nb_desc;
> +     }
> +     nb_ops -= nb_left;
> +     queue->pi += nb_ops;
> +     return nb_ops;
> +}
> +#endif
> +
>  uint16_t
>  mlx5_regexdev_enqueue(struct rte_regexdev *dev, uint16_t qp_id,
>                     struct rte_regex_ops **ops, uint16_t nb_ops)
> @@ -186,17 +418,17 @@ mlx5_regexdev_enqueue(struct rte_regexdev *dev,
> uint16_t qp_id,
>       while ((sqid = ffs(queue->free_sqs))) {
>               sqid--; /* ffs returns 1 for bit 0 */
>               sq = &queue->sqs[sqid];
> -             while (can_send(sq)) {
> +             while (get_free(sq)) {
>                       job_id = job_id_get(sqid, sq_size_get(sq), sq->pi);
>                       prep_one(priv, queue, sq, ops[i], &queue-
> >jobs[job_id]);
>                       i++;
>                       if (unlikely(i == nb_ops)) {
> -                             send_doorbell(priv->uar, sq);
> +                             send_doorbell(priv, sq);
>                               goto out;
>                       }
>               }
>               queue->free_sqs &= ~(1 << sqid);
> -             send_doorbell(priv->uar, sq);
> +             send_doorbell(priv, sq);
>       }
> 
>  out:
> @@ -308,6 +540,10 @@ mlx5_regexdev_dequeue(struct rte_regexdev *dev,
> uint16_t qp_id,
>                         MLX5_REGEX_MAX_WQE_INDEX;
>               size_t sqid = cqe->rsvd3[2];
>               struct mlx5_regex_sq *sq = &queue->sqs[sqid];
> +
> +             /* UMR mode WQE counter move as WQE set(4 WQEBBS).*/
> +             if (priv->has_umr)
> +                     wq_counter >>= 2;
>               while (sq->ci != wq_counter) {
>                       if (unlikely(i == nb_ops)) {
>                               /* Return without updating cq->ci */
> @@ -316,7 +552,9 @@ mlx5_regexdev_dequeue(struct rte_regexdev *dev,
> uint16_t qp_id,
>                       uint32_t job_id = job_id_get(sqid, sq_size_get(sq),
>                                                    sq->ci);
>                       extract_result(ops[i], &queue->jobs[job_id]);
> -                     sq->ci = (sq->ci + 1) &
> MLX5_REGEX_MAX_WQE_INDEX;
> +                     sq->ci = (sq->ci + 1) & (priv->has_umr ?
> +                              (MLX5_REGEX_MAX_WQE_INDEX >> 2) :
> +                               MLX5_REGEX_MAX_WQE_INDEX);
>                       i++;
>               }
>               cq->ci = (cq->ci + 1) & 0xffffff;
> @@ -331,7 +569,7 @@ mlx5_regexdev_dequeue(struct rte_regexdev *dev,
> uint16_t qp_id,
>  }
> 
>  static void
> -setup_sqs(struct mlx5_regex_qp *queue)
> +setup_sqs(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *queue)
>  {
>       size_t sqid, entry;
>       uint32_t job_id;
> @@ -342,6 +580,14 @@ setup_sqs(struct mlx5_regex_qp *queue)
>                       job_id = sqid * sq_size_get(sq) + entry;
>                       struct mlx5_regex_job *job = &queue->jobs[job_id];
> 
> +                     /* Fill UMR WQE with NOP in advanced. */
> +                     if (priv->has_umr) {
> +                             set_wqe_ctrl_seg
> +                                     ((struct mlx5_wqe_ctrl_seg *)wqe,
> +                                      entry * 2, MLX5_OPCODE_NOP, 0,
> +                                      sq->sq_obj.sq->id, 0, 12, 0, 0);
> +                             wqe += MLX5_REGEX_UMR_WQE_SIZE;
> +                     }
>                       set_metadata_seg((struct mlx5_wqe_metadata_seg *)
>                                        (wqe +
> MLX5_REGEX_WQE_METADATA_OFFSET),
>                                        0, queue->metadata->lkey,
> @@ -358,8 +604,9 @@ setup_sqs(struct mlx5_regex_qp *queue)
>  }
> 
>  static int
> -setup_buffers(struct mlx5_regex_qp *qp, struct ibv_pd *pd)
> +setup_buffers(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *qp)
>  {
> +     struct ibv_pd *pd = priv->pd;
>       uint32_t i;
>       int err;
> 
> @@ -395,6 +642,24 @@ setup_buffers(struct mlx5_regex_qp *qp, struct
> ibv_pd *pd)
>               goto err_output;
>       }
> 
> +     if (priv->has_umr) {
> +             ptr = rte_calloc(__func__, qp->nb_desc,
> MLX5_REGEX_KLMS_SIZE,
> +                              MLX5_REGEX_KLMS_SIZE);
> +             if (!ptr) {
> +                     err = -ENOMEM;
> +                     goto err_imkey;
> +             }
> +             qp->imkey_addr = mlx5_glue->reg_mr(pd, ptr,
> +                                     MLX5_REGEX_KLMS_SIZE * qp-
> >nb_desc,
> +                                     IBV_ACCESS_LOCAL_WRITE);
> +             if (!qp->imkey_addr) {
> +                     rte_free(ptr);
> +                     DRV_LOG(ERR, "Failed to register output");
> +                     err = -EINVAL;
> +                     goto err_imkey;
> +             }
> +     }
> +
>       /* distribute buffers to jobs */
>       for (i = 0; i < qp->nb_desc; i++) {
>               qp->jobs[i].output =
> @@ -403,9 +668,18 @@ setup_buffers(struct mlx5_regex_qp *qp, struct
> ibv_pd *pd)
>               qp->jobs[i].metadata =
>                       (uint8_t *)qp->metadata->addr +
>                       (i % qp->nb_desc) * MLX5_REGEX_METADATA_SIZE;
> +             if (qp->imkey_addr)
> +                     qp->jobs[i].imkey_array = (struct mlx5_klm *)
> +                             qp->imkey_addr->addr +
> +                             (i % qp->nb_desc) *
> MLX5_REGEX_MAX_KLM_NUM;
>       }
> +
>       return 0;
> 
> +err_imkey:
> +     ptr = qp->outputs->addr;
> +     rte_free(ptr);
> +     mlx5_glue->dereg_mr(qp->outputs);
>  err_output:
>       ptr = qp->metadata->addr;
>       rte_free(ptr);
> @@ -417,23 +691,57 @@ int
>  mlx5_regexdev_setup_fastpath(struct mlx5_regex_priv *priv, uint32_t qp_id)
>  {
>       struct mlx5_regex_qp *qp = &priv->qps[qp_id];
> -     int err;
> +     struct mlx5_klm klm = { 0 };
> +     struct mlx5_devx_mkey_attr attr = {
> +             .klm_array = &klm,
> +             .klm_num = 1,
> +             .umr_en = 1,
> +     };
> +     uint32_t i;
> +     int err = 0;
> 
>       qp->jobs = rte_calloc(__func__, qp->nb_desc, sizeof(*qp->jobs), 64);
>       if (!qp->jobs)
>               return -ENOMEM;
> -     err = setup_buffers(qp, priv->pd);
> +     err = setup_buffers(priv, qp);
>       if (err) {
>               rte_free(qp->jobs);
>               return err;
>       }
> -     setup_sqs(qp);
> -     return 0;
> +
> +     setup_sqs(priv, qp);
> +
> +     if (priv->has_umr) {
> +#ifdef HAVE_IBV_FLOW_DV_SUPPORT
> +             if (regex_get_pdn(priv->pd, &attr.pd)) {
> +                     err = -rte_errno;
> +                     DRV_LOG(ERR, "Failed to get pdn.");
> +                     mlx5_regexdev_teardown_fastpath(priv, qp_id);
> +                     return err;
> +             }
> +#endif
> +             for (i = 0; i < qp->nb_desc; i++) {
> +                     attr.klm_num = MLX5_REGEX_MAX_KLM_NUM;
> +                     attr.klm_array = qp->jobs[i].imkey_array;
> +                     qp->jobs[i].imkey =
> mlx5_devx_cmd_mkey_create(priv->ctx,
> +                                                                   &attr);
> +                     if (!qp->jobs[i].imkey) {
> +                             err = -rte_errno;
> +                             DRV_LOG(ERR, "Failed to allocate imkey.");
> +                             mlx5_regexdev_teardown_fastpath(priv,
> qp_id);
> +                     }
> +             }
> +     }
> +     return err;
>  }
> 
>  static void
>  free_buffers(struct mlx5_regex_qp *qp)
>  {
> +     if (qp->imkey_addr) {
> +             mlx5_glue->dereg_mr(qp->imkey_addr);
> +             rte_free(qp->imkey_addr->addr);
> +     }
>       if (qp->metadata) {
>               mlx5_glue->dereg_mr(qp->metadata);
>               rte_free(qp->metadata->addr);
> @@ -448,8 +756,14 @@ void
>  mlx5_regexdev_teardown_fastpath(struct mlx5_regex_priv *priv, uint32_t
> qp_id)
>  {
>       struct mlx5_regex_qp *qp = &priv->qps[qp_id];
> +     uint32_t i;
> 
>       if (qp) {
> +             for (i = 0; i < qp->nb_desc; i++) {
> +                     if (qp->jobs[i].imkey)
> +                             claim_zero(mlx5_devx_cmd_destroy
> +                                                     (qp->jobs[i].imkey));
> +             }
>               free_buffers(qp);
>               if (qp->jobs)
>                       rte_free(qp->jobs);
> --
> 2.25.1

Reply via email to