On Thu, Jan 16, 2020 at 01:04:27PM +0000, Viacheslav Ovsiienko wrote:
> The dedicated routine rte_pktmbuf_pool_create_extbuf() is
> provided to create mbuf pool with data buffers located in
> the pinned external memory. The application provides the
> external memory description and routine initializes each
> mbuf with appropriate virtual and physical buffer address.
> It is entirely application responsibility to register
> external memory with rte_extmem_register() API, map this
> memory, etc.
> 
> The new introduced flag RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF
> is set in private pool structure, specifying the new special
> pool type. The allocated mbufs from pool of this kind will
> have the EXT_ATTACHED_MBUF flag set and initialiazed shared
> info structure, allowing cloning with regular mbufs (without
> attached external buffers of any kind).
> 
> Signed-off-by: Viacheslav Ovsiienko <viachesl...@mellanox.com>
> ---
>  lib/librte_mbuf/rte_mbuf.c           | 178 
> ++++++++++++++++++++++++++++++++++-
>  lib/librte_mbuf/rte_mbuf.h           |  84 +++++++++++++++++
>  lib/librte_mbuf/rte_mbuf_version.map |   1 +
>  3 files changed, 260 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
> index 8fa7f49..b9d89d0 100644
> --- a/lib/librte_mbuf/rte_mbuf.c
> +++ b/lib/librte_mbuf/rte_mbuf.c
> @@ -59,9 +59,12 @@
>       }
>  
>       RTE_ASSERT(mp->elt_size >= sizeof(struct rte_mbuf) +
> -             user_mbp_priv->mbuf_data_room_size +
> +             ((user_mbp_priv->flags & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) ?
> +                     sizeof(struct rte_mbuf_ext_shared_info) :
> +                     user_mbp_priv->mbuf_data_room_size) +
>               user_mbp_priv->mbuf_priv_size);
> -     RTE_ASSERT(user_mbp_priv->flags == 0);
> +     RTE_ASSERT((user_mbp_priv->flags &
> +                 ~RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) == 0);
>  
>       mbp_priv = rte_mempool_get_priv(mp);
>       memcpy(mbp_priv, user_mbp_priv, sizeof(*mbp_priv));
> @@ -107,6 +110,89 @@
>       m->next = NULL;
>  }
>  
> +/*
> + * @internal
> + * The callback routine called when reference counter in shinfo for mbufs
> + * with pinned external buffer reaches zero. It means there is no more
> + * references to buffer backing mbuf and this one should be freed.
> + */
> +static void rte_pktmbuf_free_pinned_extmem(void *addr, void *opaque)
> +{
> +     struct rte_mbuf *m = opaque;
> +
> +     RTE_SET_USED(addr);
> +     RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m));
> +     RTE_ASSERT(RTE_MBUF_HAS_PINNED_EXTBUF(m));
> +     RTE_ASSERT(m->shinfo->fcb_opaque == m);
> +
> +     rte_mbuf_ext_refcnt_set(m->shinfo, 1);
> +     rte_pktmbuf_free_seg(m);

I think it should be rte_mbuf_raw_free(m) instead.

Else, the corresponding code can fail:

  - alloc packet from pinned_pool
  - increase ref counter (this is legal)
  - free packet on 2 cores at the same time

The refcnt will reach 0 on one of the core, then is calls
rte_pktmbuf_free_pinned_extmem(), then rte_pktmbuf_free_seg(), then
rte_pktmbuf_prefree_seg() which will do nothing because refcnt is not 1
-> mem leak



> +}
> +
> +/*
> + * pktmbuf constructor for the pool with pinned external buffer,
> + * given as a callback function to rte_mempool_obj_iter() in
> + * rte_pktmbuf_pool_create_extbuf(). Set the fields of a packet
> + * mbuf to their default values.
> + */
> +void
> +rte_pktmbuf_init_extmem(struct rte_mempool *mp,
> +                     void *opaque_arg,
> +                     void *_m,
> +                     __attribute__((unused)) unsigned int i)
> +{

Can it be static?
If it is public, it should be experimental and in .map

> +     struct rte_mbuf *m = _m;
> +     struct rte_pktmbuf_extmem_init_ctx *ctx = opaque_arg;
> +     struct rte_pktmbuf_extmem *ext_mem;
> +     uint32_t mbuf_size, buf_len, priv_size;
> +     struct rte_mbuf_ext_shared_info *shinfo;
> +
> +     priv_size = rte_pktmbuf_priv_size(mp);
> +     mbuf_size = sizeof(struct rte_mbuf) + priv_size;
> +     buf_len = rte_pktmbuf_data_room_size(mp);
> +
> +     RTE_ASSERT(RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) == priv_size);
> +     RTE_ASSERT(mp->elt_size >= mbuf_size);
> +     RTE_ASSERT(buf_len <= UINT16_MAX);
> +
> +     memset(m, 0, mbuf_size);
> +     m->priv_size = priv_size;
> +     m->buf_len = (uint16_t)buf_len;
> +
> +     /* set the data buffer pointers to external memory */
> +     ext_mem = ctx->ext_mem + ctx->ext;
> +
> +     RTE_ASSERT(ctx->ext < ctx->ext_num);
> +     RTE_ASSERT(ctx->off < ext_mem->buf_len);
> +
> +     m->buf_addr = RTE_PTR_ADD(ext_mem->buf_ptr, ctx->off);
> +     m->buf_iova = ext_mem->buf_iova == RTE_BAD_IOVA ?
> +                   RTE_BAD_IOVA : (ext_mem->buf_iova + ctx->off);
> +
> +     ctx->off += ext_mem->elt_size;
> +     if (ctx->off >= ext_mem->buf_len) {
> +             ctx->off = 0;
> +             ++ctx->ext;
> +     }
> +     /* keep some headroom between start of buffer and data */
> +     m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, (uint16_t)m->buf_len);
> +
> +     /* init some constant fields */
> +     m->pool = mp;
> +     m->nb_segs = 1;
> +     m->port = MBUF_INVALID_PORT;
> +     m->ol_flags = EXT_ATTACHED_MBUF;
> +     rte_mbuf_refcnt_set(m, 1);
> +     m->next = NULL;
> +
> +     /* init external buffer shared info items */
> +     shinfo = RTE_PTR_ADD(m, mbuf_size);

I think it should be
shinfo = RTE_PTR_ADD(m, mbuf_size + priv_size);

> +     m->shinfo = shinfo;
> +     shinfo->free_cb = rte_pktmbuf_free_pinned_extmem;
> +     shinfo->fcb_opaque = m;
> +     rte_mbuf_ext_refcnt_set(shinfo, 1);

To me, it is not very clear that free_cb() will only be called
for mbuf from non-pinned pools that are attached to a mbuf from
a pinned pool, i.e. the free_cb is not called at all if attach()
is not used.


> +}
> +
>  /* Helper to create a mbuf pool with given mempool ops name*/
>  struct rte_mempool *
>  rte_pktmbuf_pool_create_by_ops(const char *name, unsigned int n,
> @@ -169,6 +255,92 @@ struct rte_mempool *
>                       data_room_size, socket_id, NULL);
>  }
>  
> +/* Helper to create a mbuf pool with pinned external data buffers. */
> +struct rte_mempool *
> +rte_pktmbuf_pool_create_extbuf(const char *name, unsigned int n,
> +     unsigned int cache_size, uint16_t priv_size,
> +     uint16_t data_room_size, int socket_id,
> +     struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num)

struct rte_pktmbuf_extmem *ext_mem can be const

> +{
> +     struct rte_mempool *mp;
> +     struct rte_pktmbuf_pool_private mbp_priv;
> +     struct rte_pktmbuf_extmem_init_ctx init_ctx;
> +     const char *mp_ops_name;
> +     unsigned int elt_size;
> +     unsigned int i, n_elts = 0;
> +     int ret;
> +
> +     if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
> +             RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
> +                     priv_size);
> +             rte_errno = EINVAL;
> +             return NULL;
> +     }
> +     /* Check the external memory descriptors. */
> +     for (i = 0; i < ext_num; i++) {
> +             struct rte_pktmbuf_extmem *extm = ext_mem + i;
> +
> +             if (!extm->elt_size || !extm->buf_len || !extm->buf_ptr) {
> +                     RTE_LOG(ERR, MBUF, "invalid extmem descriptor\n");
> +                     rte_errno = EINVAL;
> +                     return NULL;
> +             }
> +             if (data_room_size > extm->elt_size) {
> +                     RTE_LOG(ERR, MBUF, "ext elt_size=%u is too small\n",
> +                             priv_size);
> +                     rte_errno = EINVAL;
> +                     return NULL;
> +             }
> +             n_elts += extm->buf_len / extm->elt_size;
> +     }

Maybe it's just me, but I think there is a source of confusion between:

- elt_size: the size of mempool element (i.e a sizeof(mbuf) +
  priv_size + sizeof(shinfo))
- extm->elt_size: the size of data buffer in external memory

The structure fields could be renamed like this:

struct rte_pktmbuf_extmem {
        void *ptr;              /**< The virtual address of data buffer. */
        rte_iova_t iova;        /**< The IO address of the data buffer. */
        size_t len;             /**< External buffer length in bytes. */
        uint16_t mbuf_data_size; /**< mbuf data size in bytes. */
};


> +     /* Check whether enough external memory provided. */
> +     if (n_elts < n) {
> +             RTE_LOG(ERR, MBUF, "not enough extmem\n");
> +             rte_errno = ENOMEM;
> +             return NULL;
> +     }
> +     elt_size = sizeof(struct rte_mbuf) +
> +                (unsigned int)priv_size +
> +                sizeof(struct rte_mbuf_ext_shared_info);
> +
> +     memset(&mbp_priv, 0, sizeof(mbp_priv));
> +     mbp_priv.mbuf_data_room_size = data_room_size;
> +     mbp_priv.mbuf_priv_size = priv_size;
> +     mbp_priv.flags = RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF;
> +
> +     mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
> +              sizeof(struct rte_pktmbuf_pool_private), socket_id, 0);
> +     if (mp == NULL)
> +             return NULL;
> +
> +     mp_ops_name = rte_mbuf_best_mempool_ops();
> +     ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL);
> +     if (ret != 0) {
> +             RTE_LOG(ERR, MBUF, "error setting mempool handler\n");
> +             rte_mempool_free(mp);
> +             rte_errno = -ret;
> +             return NULL;
> +     }
> +     rte_pktmbuf_pool_init(mp, &mbp_priv);
> +
> +     ret = rte_mempool_populate_default(mp);
> +     if (ret < 0) {
> +             rte_mempool_free(mp);
> +             rte_errno = -ret;
> +             return NULL;
> +     }

I still think that these ~20 lines could be simplified into a call to
rte_pktmbuf_pool_create(). I don't think that using rte_mempool_obj_iter()
twice would cause a performance issue.

> +
> +     init_ctx = (struct rte_pktmbuf_extmem_init_ctx){
> +             .ext_mem = ext_mem,
> +             .ext_num = ext_num,
> +             .ext = 0,
> +             .off = 0,
> +     };
> +     rte_mempool_obj_iter(mp, rte_pktmbuf_init_extmem, &init_ctx);
> +
> +     return mp;
> +}
> +
>  /* do some sanity checks on a mbuf: panic if it fails */
>  void
>  rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header)
7> @@ -247,7 +419,7 @@ int rte_mbuf_check(const struct rte_mbuf *m, int 
is_header,
>       return 0;
>  }
>  
> -/**
> +/*
>   * @internal helper function for freeing a bulk of packet mbuf segments
>   * via an array holding the packet mbuf segments from the same mempool
>   * pending to be freed.

I agree with this change, but it should not go in that patch.

> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 52d57d1..093210e 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -637,6 +637,34 @@ static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct 
> rte_mempool *mp)
>  void rte_pktmbuf_init(struct rte_mempool *mp, void *opaque_arg,
>                     void *m, unsigned i);
>  
> +/** The context to initialize the mbufs with pinned external buffers. */
> +struct rte_pktmbuf_extmem_init_ctx {
> +     struct rte_pktmbuf_extmem *ext_mem; /* pointer to descriptor array. */
> +     unsigned int ext_num; /* number of descriptors in array. */
> +     unsigned int ext; /* loop descriptor index. */
> +     size_t off; /* loop buffer offset. */
> +};

I think it could be private.

> +
> +/**
> + * The packet mbuf constructor for pools with pinned external memory.
> + *
> + * This function initializes some fields in the mbuf structure that are
> + * not modified by the user once created (origin pool, buffer start
> + * address, and so on). This function is given as a callback function to
> + * rte_mempool_obj_iter() called from rte_mempool_create_extmem().
> + *
> + * @param mp
> + *   The mempool from which mbufs originate.
> + * @param opaque_arg
> + *   A pointer to the rte_pktmbuf_extmem_init_ctx - initialization
> + *   context structure
> + * @param m
> + *   The mbuf to initialize.
> + * @param i
> + *   The index of the mbuf in the pool table.
> + */
> +void rte_pktmbuf_init_extmem(struct rte_mempool *mp, void *opaque_arg,
> +                          void *m, unsigned int i);
>  

Except if I missing something, it could be private too

>  /**
>   * A  packet mbuf pool constructor.
> @@ -738,6 +766,62 @@ struct rte_mempool *
>       unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
>       int socket_id, const char *ops_name);
>  
> +/** A structure that describes the pinned external buffer segment. */
> +struct rte_pktmbuf_extmem {
> +     void *buf_ptr;          /**< The virtual address of data buffer. */
> +     rte_iova_t buf_iova;    /**< The IO address of the data buffer. */
> +     size_t buf_len;         /**< External buffer length in bytes. */
> +     uint16_t elt_size;      /**< mbuf element size in bytes. */
> +};

See my proposition above about fields renaming.

> +
> +/**
> + * Create a mbuf pool with external pinned data buffers.
> + *
> + * This function creates and initializes a packet mbuf pool that contains
> + * only mbufs with external buffer. It is a wrapper to rte_mempool functions.
> + *
> + * @param name
> + *   The name of the mbuf pool.
> + * @param n
> + *   The number of elements in the mbuf pool. The optimum size (in terms
> + *   of memory usage) for a mempool is when n is a power of two minus one:
> + *   n = (2^q - 1).
> + * @param cache_size
> + *   Size of the per-core object cache. See rte_mempool_create() for
> + *   details.
> + * @param priv_size
> + *   Size of application private are between the rte_mbuf structure
> + *   and the data buffer. This value must be aligned to RTE_MBUF_PRIV_ALIGN.
> + * @param data_room_size
> + *   Size of data buffer in each mbuf, including RTE_PKTMBUF_HEADROOM.
> + * @param socket_id
> + *   The socket identifier where the memory should be allocated. The
> + *   value can be *SOCKET_ID_ANY* if there is no NUMA constraint for the
> + *   reserved zone.
> + * @param ext_mem
> + *   Pointer to the array of structures describing the external memory
> + *   for data buffers. It is caller responsibility to register this memory
> + *   with rte_extmem_register() (if needed), map this memory to appropriate
> + *   physical device, etc.
> + * @param ext_num
> + *   Number of elements in the ext_mem array.
> + * @return
> + *   The pointer to the new allocated mempool, on success. NULL on error
> + *   with rte_errno set appropriately. Possible rte_errno values include:
> + *    - E_RTE_NO_CONFIG - function could not get pointer to rte_config 
> structure
> + *    - E_RTE_SECONDARY - function was called from a secondary process 
> instance
> + *    - EINVAL - cache size provided is too large, or priv_size is not 
> aligned.
> + *    - ENOSPC - the maximum number of memzones has already been allocated
> + *    - EEXIST - a memzone with the same name already exists
> + *    - ENOMEM - no appropriate memory area found in which to create memzone
> + */
> +__rte_experimental
> +struct rte_mempool *
> +rte_pktmbuf_pool_create_extbuf(const char *name, unsigned int n,
> +     unsigned int cache_size, uint16_t priv_size,
> +     uint16_t data_room_size, int socket_id,
> +     struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num);
> +
>  /**
>   * Get the data room size of mbufs stored in a pktmbuf_pool
>   *
> diff --git a/lib/librte_mbuf/rte_mbuf_version.map 
> b/lib/librte_mbuf/rte_mbuf_version.map
> index 3bbb476..ab161bc 100644
> --- a/lib/librte_mbuf/rte_mbuf_version.map
> +++ b/lib/librte_mbuf/rte_mbuf_version.map
> @@ -44,5 +44,6 @@ EXPERIMENTAL {
>       rte_mbuf_dyn_dump;
>       rte_pktmbuf_copy;
>       rte_pktmbuf_free_bulk;
> +     rte_pktmbuf_pool_create_extbuf;
>  
>  };
> -- 
> 1.8.3.1
> 

One more thing: it would be nice to have a functional test for this new
feature. I did a very minimal one to check the basic alloc/free/attach
feature, you can restart from that if you want.

static int
test_ext_pinned(int test_case)
{
        struct rte_pktmbuf_extmem ext_mem;
        struct rte_mempool *pinned_pool = NULL;
        struct rte_mempool *std_pool = NULL;
        const struct rte_memzone *mz = NULL;
        struct rte_mbuf *m = NULL, *m2 = NULL;

        printf("Test mbuf pool with mbufs data pinned to external buffer 
(%d)\n", test_case);

        std_pool = rte_pktmbuf_pool_create("std_pool",
                        NB_MBUF, MEMPOOL_CACHE_SIZE, 0, MBUF_DATA_SIZE,
                        SOCKET_ID_ANY);
        if (std_pool == NULL)
                GOTO_FAIL("std_pool alloc failed");

        mz = rte_memzone_reserve("std_pool",
                                NB_MBUF * sizeof(struct rte_mbuf),
                                SOCKET_ID_ANY,
                                RTE_MEMZONE_2MB|RTE_MEMZONE_SIZE_HINT_ONLY);
        if (mz == NULL)
                GOTO_FAIL("memzone alloc failed");

        ext_mem.buf_ptr = mz->addr;
        ext_mem.buf_iova = mz->iova;
        ext_mem.buf_len = mz->len;
        ext_mem.elt_size = sizeof(struct rte_mbuf);

        pinned_pool = rte_pktmbuf_pool_create_extbuf("pinned_pool",
                                        NB_MBUF, MEMPOOL_CACHE_SIZE,
                                        0, 0, SOCKET_ID_ANY, &ext_mem, 1);
        if (pinned_pool == NULL)
                GOTO_FAIL("pinned_pool alloc failed");

        m = rte_pktmbuf_alloc(pinned_pool);
        if (unlikely(m == NULL))
                goto fail;

        if (test_case != 0) {
                m2 = rte_pktmbuf_alloc(std_pool);
                if (unlikely(m == NULL))
                        goto fail;
                rte_pktmbuf_attach(m2, m);
        }

        if (test_case == 0) {
                rte_pktmbuf_free(m);
        } else if (test_case == 1) {
                rte_pktmbuf_free(m);
                rte_pktmbuf_free(m2);
        } else if (test_case == 2) {
                rte_pktmbuf_free(m2);
                rte_pktmbuf_free(m);
        }


        rte_mempool_free(pinned_pool);
        rte_memzone_free(mz);
        rte_mempool_free(std_pool);
        return 0;

fail:
        rte_pktmbuf_free(m2);
        rte_pktmbuf_free(m);
        rte_mempool_free(pinned_pool);
        rte_memzone_free(mz);
        rte_mempool_free(std_pool);
        return -1;
}

Reply via email to