[RFC]: mempool: zero-copy cache get bulk

2022-11-05 Thread Morten Brørup
Zero-copy access to the mempool cache is beneficial for PMD performance, and 
must be provided by the mempool library to fix [Bug 1052] without a performance 
regression.

[Bug 1052]: https://bugs.dpdk.org/show_bug.cgi?id=1052


This RFC offers two conceptual variants of zero-copy get:
1. A simple version.
2. A version where existing (hot) objects in the cache are moved to the top of 
the cache before new objects from the backend driver are pulled in.

I would like some early feedback. Also, which variant do you prefer?

Notes:
* Allowing the 'cache' parameter to be NULL, and getting it from the mempool 
instead, was inspired by rte_mempool_cache_flush().
* Asserting that the 'mp' parameter is not NULL is not done by other functions, 
so I omitted it here too.

NB: Please ignore formatting. Also, this code has not even been compile tested.


PS: No promises, but I expect to offer an RFC for zero-copy put too. :-)


1. Simple version:

/**
 * Get objects from a mempool via zero-copy access to a user-owned mempool cache.
 *
 * @param cache
 *   A pointer to the mempool cache.
 * @param mp
 *   A pointer to the mempool.
 * @param n
 *   The number of objects to prefetch into the mempool cache.
 * @return
 *   The pointer to the objects in the mempool cache.
 *   NULL on error, with rte_errno set appropriately.
 */
static __rte_always_inline void *
rte_mempool_cache_get_bulk(struct rte_mempool_cache *cache,
		struct rte_mempool *mp,
		unsigned int n)
{
	unsigned int len;
	int ret;

	if (cache == NULL)
		cache = rte_mempool_default_cache(mp, rte_lcore_id());
	if (cache == NULL) {
		rte_errno = EINVAL;
		goto fail;
	}

	rte_mempool_trace_cache_get_bulk(cache, mp, n);

	len = cache->len;

	if (unlikely(n > len)) {
		unsigned int size;

		if (unlikely(n > RTE_MEMPOOL_CACHE_MAX_SIZE)) {
			rte_errno = EINVAL;
			goto fail;
		}

		/* Fill the cache from the backend; fetch size + requested - len objects. */
		size = cache->size;

		ret = rte_mempool_ops_dequeue_bulk(mp, &cache->objs[len],
				size + n - len);
		if (unlikely(ret < 0)) {
			/*
			 * We are buffer constrained.
			 * Do not fill the cache, just satisfy the request.
			 */
			ret = rte_mempool_ops_dequeue_bulk(mp, &cache->objs[len],
					n - len);
			if (unlikely(ret < 0)) {
				rte_errno = -ret;
				goto fail;
			}

			len = 0;
		} else
			len = size;
	} else
		len -= n;

	cache->len = len;

	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);

	return &cache->objs[len];

fail:

	RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
	RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);

	return NULL;
}
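
For context, a hypothetical caller could look like the sketch below (the structure
and field names are invented for illustration only; 'cache' is passed as NULL so
the function resolves the lcore's default cache itself, per the note above):

#include <rte_mbuf.h>
#include <rte_mempool.h>

/* Hypothetical RX queue; not from any real PMD. */
struct example_rxq {
	struct rte_mempool *mp;
	struct rte_mbuf *sw_ring[512];
	unsigned int rearm_start;
};

static inline int
example_rxq_rearm(struct example_rxq *rxq, unsigned int n)
{
	void **objs;
	unsigned int i;

	/* Zero-copy: read n object pointers directly from the mempool cache. */
	objs = rte_mempool_cache_get_bulk(NULL, rxq->mp, n);
	if (unlikely(objs == NULL))
		return -rte_errno;

	for (i = 0; i < n; i++)
		rxq->sw_ring[rxq->rearm_start + i] = (struct rte_mbuf *)objs[i];

	return 0;
}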


2. Advanced version:

/**
 * Get objects from a mempool via zero-copy access to a user-owned mempool cache.
 *
 * @param cache
 *   A pointer to the mempool cache.
 * @param mp
 *   A pointer to the mempool.
 * @param n
 *   The number of objects to prefetch into the mempool cache.
 * @return
 *   The pointer to the objects in the mempool cache.
 *   NULL on error, with rte_errno set appropriately.
 */
static __rte_always_inline void *
rte_mempool_cache_get_bulk(struct rte_mempool_cache *cache,
		struct rte_mempool *mp,
		unsigned int n)
{
	unsigned int len;
	int ret;

	if (cache == NULL)
		cache = rte_mempool_default_cache(mp, rte_lcore_id());
	if (cache == NULL) {
		rte_errno = EINVAL;
		goto fail;
	}

	rte_mempool_trace_cache_get_bulk(cache, mp, n);

	len = cache->len;

	if (unlikely(n > len)) {
		unsigned int size;

		if (unlikely(n > RTE_MEMPOOL_CACHE_MAX_SIZE)) {
			rte_errno = EINVAL;
			goto fail;
		}

		/* Fill the cache from the backend; fetch size + requested - len objects. */
		size = cache->size;

		if (likely(size + n >= 2 * len)) {
			/*
			 * No overlap when copying (dst >= len): size + n - len >= len.
			 * Move (i.e. copy) the existing objects in the cache to the
			 * coming top of the cache, to make room for new objects below.
			 */
			rte_memcpy(&cache->objs[size + n - len], &cache->objs[0],
					len * sizeof(void *));

			/* Fill the cache below the existing objects in the cache. */
			ret = rte_mempool_ops_dequeue_bulk(mp, &cache->objs[0],
					size + n - len);
			if (unlikely(ret < 0)) {
				goto constrained;
			} else
				len = size;
		} else {
			/* Fill the cache on top of any objects in it. */
			ret = rte_mempool_ops_dequeue_bulk(mp, &cache->objs[len],
					size + n - len);
			if (unlikely(ret < 0)) {

constrained:
				/*
				 * We are buffer constrained.
				 * Do not fill the cache, just satisfy the request.
				 */
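				/*
				 * [The archived message is truncated above. By analogy
				 *  with the simple version, the constrained path and the
				 *  remainder of the function would presumably continue
				 *  along these lines; this is a reconstruction, not the
				 *  author's verbatim code.]
				 */
				ret = rte_mempool_ops_dequeue_bulk(mp, &cache->objs[len],
						n - len);
				if (unlikely(ret < 0)) {
					rte_errno = -ret;
					goto fail;
				}

				len = 0;
			} else
				len = size;
		}
	} else
		len -= n;

	cache->len = len;

	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);

	return &cache->objs[len];

fail:

	RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
	RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);

	return NULL;
}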

[RFC] mempool: zero-copy cache put bulk

2022-11-05 Thread Morten Brørup
Zero-copy access to the mempool cache is beneficial for PMD performance, and 
must be provided by the mempool library to fix [Bug 1052] without a performance 
regression.

[Bug 1052]: https://bugs.dpdk.org/show_bug.cgi?id=1052


This RFC offers a conceptual zero-copy put function, where the application 
promises to store some objects, and in return gets an address where to store 
them.

I would like some early feedback.

Notes:
* Allowing the 'cache' parameter to be NULL, and getting it from the mempool 
instead, was inspired by rte_mempool_cache_flush().
* Asserting that the 'mp' parameter is not NULL is not done by other functions, 
so I omitted it here too.

NB: Please ignore formatting. Also, this code has not even been compile tested.

/**
 * Promise to put objects in a mempool via zero-copy access to a user-owned mempool cache.
 *
 * @param cache
 *   A pointer to the mempool cache.
 * @param mp
 *   A pointer to the mempool.
 * @param n
 *   The number of objects to be put in the mempool cache.
 * @return
 *   The pointer to where to put the objects in the mempool cache.
 *   NULL on error, with rte_errno set appropriately.
 */
static __rte_always_inline void *
rte_mempool_cache_put_bulk_promise(struct rte_mempool_cache *cache,
		struct rte_mempool *mp,
		unsigned int n)
{
	void **cache_objs;

	if (cache == NULL)
		cache = rte_mempool_default_cache(mp, rte_lcore_id());
	if (cache == NULL) {
		rte_errno = EINVAL;
		return NULL;
	}

	rte_mempool_trace_cache_put_bulk_promise(cache, mp, n);

	/* The request itself is too big for the cache. */
	if (unlikely(n > cache->flushthresh)) {
		rte_errno = EINVAL;
		return NULL;
	}

	/*
	 * The cache follows the following algorithm:
	 *   1. If the objects cannot be added to the cache without crossing
	 *      the flush threshold, flush the cache to the backend.
	 *   2. Add the objects to the cache.
	 */

	if (cache->len + n <= cache->flushthresh) {
		cache_objs = &cache->objs[cache->len];
		cache->len += n;
	} else {
		cache_objs = &cache->objs[0];
		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
		cache->len = n;
	}

	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);

	return cache_objs;
}
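
For context, a hypothetical Tx completion path could use the promise like the
sketch below (the structure and field names are invented for illustration only,
and all mbufs are assumed to come from the same mempool; on NULL the caller
falls back to a conventional bulk put):

#include <rte_mbuf.h>
#include <rte_mempool.h>

/* Hypothetical TX queue; not from any real PMD. */
struct example_txq {
	struct rte_mempool *mp;
	struct rte_mbuf *sw_ring[512];
	unsigned int next_dd;
};

static inline void
example_txq_free_bufs(struct example_txq *txq, unsigned int n)
{
	void **cache_objs;
	unsigned int i;

	/* Promise to store n object pointers directly in the mempool cache. */
	cache_objs = rte_mempool_cache_put_bulk_promise(NULL, txq->mp, n);
	if (likely(cache_objs != NULL)) {
		for (i = 0; i < n; i++)
			cache_objs[i] = txq->sw_ring[txq->next_dd + i];
	} else {
		/* Zero-copy access unavailable; fall back to a conventional bulk put. */
		rte_mempool_put_bulk(txq->mp,
				(void **)&txq->sw_ring[txq->next_dd], n);
	}
	txq->next_dd += n;
}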


Med venlig hilsen / Kind regards,
-Morten Brørup




RE: [RFC] mempool: zero-copy cache put bulk

2022-11-05 Thread Honnappa Nagarahalli
+ Akshitha, she is working on a similar patch

A few comments inline

> -Original Message-
> From: Morten Brørup 
> Sent: Saturday, November 5, 2022 8:40 AM
> To: dev@dpdk.org; olivier.m...@6wind.com;
> andrew.rybche...@oktetlabs.ru; Honnappa Nagarahalli
> 
> Subject: [RFC] mempool: zero-copy cache put bulk
> 
> Zero-copy access to the mempool cache is beneficial for PMD performance,
> and must be provided by the mempool library to fix [Bug 1052] without a
> performance regression.
> 
> [Bug 1052]: https://bugs.dpdk.org/show_bug.cgi?id=1052
> 
> 
> This RFC offers a conceptual zero-copy put function, where the application
> promises to store some objects, and in return gets an address where to store
> them.
> 
> I would like some early feedback.
> 
> Notes:
> * Allowing the 'cache' parameter to be NULL, and getting it from the
> mempool instead, was inspired by rte_mempool_cache_flush().
I am not sure why the 'cache' parameter is required for this API. This API 
should take the mempool as the parameter.

We have based our API on 'rte_mempool_do_generic_put' and removed the 'cache' 
parameter. This new API, on success, returns the pointer to the memory where 
the objects are to be copied. On failure it returns NULL, and the caller has to 
call 'rte_mempool_ops_enqueue_bulk'. Alternatively, the new API could do this 
as well, and the PMD would not need to do anything if it gets a NULL pointer.
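
For illustration, a rough sketch of the signature being described might look
like this (hypothetical; the actual patch may differ), with the caller falling
back to 'rte_mempool_ops_enqueue_bulk' when NULL is returned:

#include <rte_mempool.h>

/* Hypothetical cache-less variant; not the actual Arm patch. */
static __rte_always_inline void *
example_mempool_put_zc_bulk(struct rte_mempool *mp, unsigned int n)
{
	struct rte_mempool_cache *cache;
	void **cache_objs;

	/* Look up the cache internally instead of taking it as a parameter. */
	cache = rte_mempool_default_cache(mp, rte_lcore_id());
	if (cache == NULL || unlikely(n > cache->flushthresh))
		return NULL; /* Caller falls back to rte_mempool_ops_enqueue_bulk(). */

	if (cache->len + n > cache->flushthresh) {
		/* Flush the cache to the backend to make room for n objects. */
		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[0], cache->len);
		cache->len = 0;
	}

	cache_objs = &cache->objs[cache->len];
	cache->len += n;
	return cache_objs;
}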

We should think about providing a similar API on the Rx side to keep it 
symmetric.

> * Asserting that the 'mp' parameter is not NULL is not done by other
> functions, so I omitted it here too.
> 
> NB: Please ignore formatting. Also, this code has not even been compile
> tested.
We are a little bit ahead: we have tested the changes with the i40e PF PMD, 
written unit test cases, and are going through internal review; we will send 
out an RFC on Monday.

> 
> /**
>  * Promise to put objects in a mempool via zero-copy access to a user-owned
> mempool cache.
>  *
>  * @param cache
>  *   A pointer to the mempool cache.
>  * @param mp
>  *   A pointer to the mempool.
>  * @param n
>  *   The number of objects to be put in the mempool cache.
>  * @return
>  *   The pointer to where to put the objects in the mempool cache.
>  *   NULL on error
>  *   with rte_errno set appropriately.
>  */
> static __rte_always_inline void *
> rte_mempool_cache_put_bulk_promise(struct rte_mempool_cache *cache,
> struct rte_mempool *mp,
> unsigned int n)
> {
> void **cache_objs;
> 
> if (cache == NULL)
> cache = rte_mempool_default_cache(mp, rte_lcore_id());
> if (cache == NULL) {
> rte_errno = EINVAL;
> return NULL;
> }
> 
> rte_mempool_trace_cache_put_bulk_promise(cache, mp, n);
> 
> /* The request itself is too big for the cache */
> if (unlikely(n > cache->flushthresh)) {
> rte_errno = EINVAL;
> return NULL;
> }
> 
> /*
>  * The cache follows the following algorithm:
>  *   1. If the objects cannot be added to the cache without crossing
>  *  the flush threshold, flush the cache to the backend.
>  *   2. Add the objects to the cache.
>  */
> 
> if (cache->len + n <= cache->flushthresh) {
> cache_objs = &cache->objs[cache->len];
> cache->len += n;
> } else {
> cache_objs = &cache->objs[0];
> rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
> cache->len = n;
> }
> 
> RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);
> 
> return cache_objs;
> }
> 
> 
> Med venlig hilsen / Kind regards,
> -Morten Brørup
> 



[PATCH v2] net/mlx5: fix Windows flow table and queue routine

2022-11-05 Thread Suanming Mou
The macro HAVE_MLX5_HWS_SUPPORT was introduced for HWS only, and HWS is not
supported on Windows. Therefore, HAVE_MLX5_HWS_SUPPORT should only wrap the
code which HWS uses, and must not enclose the code blocks shared by Linux
and Windows.

Fixes: 22681deead3e ("net/mlx5/hws: enable hardware steering")

Signed-off-by: Suanming Mou 
Acked-by: Viacheslav Ovsiienko 
---
v2:
- add ack info.
- fix typo.
---
 drivers/net/mlx5/mlx5.c  | 4 +++-
 drivers/net/mlx5/mlx5_devx.c | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 78234b116c..3f87f83ff0 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1813,12 +1813,14 @@ mlx5_alloc_table_hash_list(struct mlx5_priv *priv __rte_unused)
int err = 0;
 
/* Tables are only used in DV and DR modes. */
-#ifdef HAVE_MLX5_HWS_SUPPORT
+#if defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_INFINIBAND_VERBS_H)
struct mlx5_dev_ctx_shared *sh = priv->sh;
char s[MLX5_NAME_SIZE];
 
+#ifdef HAVE_MLX5_HWS_SUPPORT
if (priv->sh->config.dv_flow_en == 2)
return mlx5_alloc_hw_group_hash_list(priv);
+#endif
MLX5_ASSERT(sh);
snprintf(s, sizeof(s), "%s_flow_table", priv->sh->ibdev_name);
sh->flow_tbls = mlx5_hlist_create(s, MLX5_FLOW_TABLE_HLIST_ARRAY_SIZE,
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index 137e7dd4ac..c1305836cf 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -907,6 +907,7 @@ mlx5_devx_hrxq_new(struct rte_eth_dev *dev, struct mlx5_hrxq *hrxq,
rte_errno = errno;
goto error;
}
+#if defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_INFINIBAND_VERBS_H)
 #ifdef HAVE_MLX5_HWS_SUPPORT
if (hrxq->hws_flags) {
hrxq->action = mlx5dr_action_create_dest_tir
@@ -916,6 +917,7 @@ mlx5_devx_hrxq_new(struct rte_eth_dev *dev, struct mlx5_hrxq *hrxq,
goto error;
return 0;
}
+#endif
if (mlx5_flow_os_create_flow_action_dest_devx_tir(hrxq->tir,
  &hrxq->action)) {
rte_errno = errno;
-- 
2.25.1



RE: [RFC] mempool: zero-copy cache put bulk

2022-11-05 Thread Morten Brørup
> From: Honnappa Nagarahalli [mailto:honnappa.nagaraha...@arm.com]
> Sent: Sunday, 6 November 2022 00.11
> 
> + Akshitha, she is working on similar patch
> 
> Few comments inline
> 
> > From: Morten Brørup 
> > Sent: Saturday, November 5, 2022 8:40 AM
> >
> > Zero-copy access to the mempool cache is beneficial for PMD
> performance,
> > and must be provided by the mempool library to fix [Bug 1052] without
> a
> > performance regression.
> >
> > [Bug 1052]: https://bugs.dpdk.org/show_bug.cgi?id=1052
> >
> >
> > This RFC offers a conceptual zero-copy put function, where the
> application
> > promises to store some objects, and in return gets an address where
> to store
> > them.
> >
> > I would like some early feedback.
> >
> > Notes:
> > * Allowing the 'cache' parameter to be NULL, and getting it from the
> > mempool instead, was inspired by rte_mempool_cache_flush().
> I am not sure why the 'cache' parameter is required for this API. This
> API should take the mem pool as the parameter.
> 
> We have based our API on 'rte_mempool_do_generic_put' and removed the
> 'cache' parameter.

I thoroughly considered omitting the 'cache' parameter, but included it for two 
reasons:

1. The function is a "mempool cache" function (i.e. primarily working on the 
mempool cache), not a "mempool" function.

So it is appropriate to have a pointer directly to the structure it is working 
on. Following this through, I also made 'cache' the first parameter and 'mp' 
the second, like in rte_mempool_cache_flush().

2. In most cases, the function only accesses the mempool structure in order to 
get the cache pointer. Skipping this step improves performance.

And since the cache is created along with the mempool itself (and thus never 
changes for a mempool), it would be safe for the PMD to store the 'cache' 
pointer along with the 'mp' pointer in the PMD's queue structure.

E.g. in the i40e PMD, the i40e_rx_queue structure could include a "struct 
rte_mempool_cache *cache" field, which could be used in i40e_rxq_rearm() [1] 
instead of "cache = rte_mempool_default_cache(rxq->mp, rte_lcore_id())".

[1] 
https://elixir.bootlin.com/dpdk/v22.11-rc2/source/drivers/net/i40e/i40e_rxtx_vec_avx512.c#L31
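
A hedged sketch of that idea (the structure and field names are illustrative,
not an actual i40e change); the per-lcore cache pointer is resolved once on the
datapath lcore and then reused:

#include <rte_mempool.h>

/* Hypothetical queue structure holding both pointers. */
struct example_rx_queue {
	struct rte_mempool *mp;
	struct rte_mempool_cache *cache; /* Resolved once on the datapath lcore. */
};

static inline void *
example_rxq_get_objs(struct example_rx_queue *rxq, unsigned int n)
{
	/* The cache never changes for a mempool, so it is safe to store it. */
	if (unlikely(rxq->cache == NULL))
		rxq->cache = rte_mempool_default_cache(rxq->mp, rte_lcore_id());

	/* Passing the stored pointer skips the per-call cache lookup. */
	return rte_mempool_cache_get_bulk(rxq->cache, rxq->mp, n);
}

The same pattern would apply to the put-side function on a Tx queue.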

> This new API, on success, returns the pointer to
> memory where the objects are copied. On failure it returns NULL and the
> caller has to call 'rte_mempool_ops_enqueue_bulk'. Alternatively, the
> new API could do this as well and PMD does not need to do anything if
> it gets a NULL pointer.

Yes, we agree about these two details:

1. The function should return a pointer, not an integer.
It would be a waste to use another CPU register to convey a success/error 
integer value, when the success/failure information is just as easily conveyed 
by the pointer return value (non-NULL/NULL), with rte_errno carrying the 
specific error value in the unlikely failure cases.

2. The function should leave it up to the PMD what to do if direct access to 
the cache is unavailable.

> 
> We should think about providing  similar API on the RX side to keep it
> symmetric.

I sent an RFC for that too:
http://inbox.dpdk.org/dev/98cbd80474fa8b44bf855df32c47dc35d87...@smartserver.smartshare.dk/T/#u


> 
> > * Asserting that the 'mp' parameter is not NULL is not done by other
> > functions, so I omitted it here too.
> >
> > NB: Please ignore formatting. Also, this code has not even been
> compile
> > tested.
> We are little bit ahead, tested the changes with i40e PF PMD, wrote
> unit test cases, going through internal review, will send out RFC on
> Monday

Sounds good. Looking forward to reviewing it.

> 
> >
> > /**
> >  * Promise to put objects in a mempool via zero-copy access to a
> user-owned
> > mempool cache.
> >  *
> >  * @param cache
> >  *   A pointer to the mempool cache.
> >  * @param mp
> >  *   A pointer to the mempool.
> >  * @param n
> >  *   The number of objects to be put in the mempool cache.
> >  * @return
> >  *   The pointer to where to put the objects in the mempool cache.
> >  *   NULL on error
> >  *   with rte_errno set appropriately.
> >  */
> > static __rte_always_inline void *
> > rte_mempool_cache_put_bulk_promise(struct rte_mempool_cache *cache,
> > struct rte_mempool *mp,
> > unsigned int n)
> > {
> > void **cache_objs;
> >
> > if (cache == NULL)
> > cache = rte_mempool_default_cache(mp, rte_lcore_id());
> > if (cache == NULL) {
> > rte_errno = EINVAL;
> > return NULL;
> > }
> >
> > rte_mempool_trace_cache_put_bulk_promise(cache, mp, n);
> >
> > /* The request itself is too big for the cache */
> > if (unlikely(n > cache->flushthresh)) {
> > rte_errno = EINVAL;
> > return NULL;
> > }
> >
> > /*
> >  * The cache follows the following algorithm:
> >  *   1. If the objects cannot be added to the cache without
> crossing
> >  *  the flush threshold, flush the cache to the backend.
> >  *   2. Add the objects to the cache.
> >  */
> >
> > if