> -----Original Message----- > From: Olivier Matz <olivier.m...@6wind.com> > Sent: Tuesday, January 14, 2020 18:05 > To: Slava Ovsiienko <viachesl...@mellanox.com> > Cc: dev@dpdk.org; Matan Azrad <ma...@mellanox.com>; Raslan Darawsheh > <rasl...@mellanox.com>; Ori Kam <or...@mellanox.com>; Shahaf Shuler > <shah...@mellanox.com>; step...@networkplumber.org > Subject: Re: [PATCH v3 2/4] mbuf: create packet pool with external memory > buffers > > On Tue, Jan 14, 2020 at 09:15:03AM +0000, Viacheslav Ovsiienko wrote: > > The dedicated routine rte_pktmbuf_pool_create_extbuf() is provided to > > create mbuf pool with data buffers located in the pinned external > > memory. The application provides the external memory description and > > routine initialises each mbuf with appropriate virtual and physical > > buffer address. > > It is entirely application responsibility to register external memory > > with rte_extmem_register() API, map this memory, etc. > > > > The new introduced flag RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF is set in > > private pool structure, specifying the new special pool type. The > > allocated mbufs from pool of this kind will have the EXT_ATTACHED_MBUF > > flag set and NULL shared info pointer, because external buffers are > > not supposed to be freed and sharing management is not needed. Also, > > these mbufs can not be attached to other mbufs (not intended to be > > indirect). > > > > Signed-off-by: Viacheslav Ovsiienko <viachesl...@mellanox.com> > > --- > > lib/librte_mbuf/rte_mbuf.c | 144 > ++++++++++++++++++++++++++++++++++- > > lib/librte_mbuf/rte_mbuf.h | 86 ++++++++++++++++++++- > > lib/librte_mbuf/rte_mbuf_version.map | 1 + > > 3 files changed, 228 insertions(+), 3 deletions(-) > > > > diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c > > index 8fa7f49..d151469 100644 > > --- a/lib/librte_mbuf/rte_mbuf.c > > +++ b/lib/librte_mbuf/rte_mbuf.c > > @@ -59,9 +59,9 @@ > > } > > > > RTE_ASSERT(mp->elt_size >= sizeof(struct rte_mbuf) + > > - user_mbp_priv->mbuf_data_room_size + > > + ((user_mbp_priv->flags & > RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) ? > > + 0 : user_mbp_priv->mbuf_data_room_size) + > > user_mbp_priv->mbuf_priv_size); > > Is this check really needed? It seems so, it is in separated routine, which might be called externally.
> > > - RTE_ASSERT(user_mbp_priv->flags == 0); > > We can keep > RTE_ASSERT(user_mbp_priv->flags & > ~RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF == 0); OK, thanks. > > > > > mbp_priv = rte_mempool_get_priv(mp); > > memcpy(mbp_priv, user_mbp_priv, sizeof(*mbp_priv)); @@ -107,6 > > +107,63 @@ > > m->next = NULL; > > } > > > > +/* > > + * pktmbuf constructor for the pool with pinned external buffer, > > + * given as a callback function to rte_mempool_obj_iter() in > > + * rte_pktmbuf_pool_create_extbuf(). Set the fields of a packet > > + * mbuf to their default values. > > + */ > > +void > > +rte_pktmbuf_init_extmem(struct rte_mempool *mp, > > + void *opaque_arg, > > + void *_m, > > + __attribute__((unused)) unsigned int i) { > > + struct rte_mbuf *m = _m; > > + struct rte_pktmbuf_extmem_init_ctx *ctx = opaque_arg; > > + struct rte_pktmbuf_extmem *ext_mem; > > + uint32_t mbuf_size, buf_len, priv_size; > > + > > + priv_size = rte_pktmbuf_priv_size(mp); > > + mbuf_size = sizeof(struct rte_mbuf) + priv_size; > > + buf_len = rte_pktmbuf_data_room_size(mp); > > + > > + RTE_ASSERT(RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) == > priv_size); > > + RTE_ASSERT(mp->elt_size >= mbuf_size); > > + RTE_ASSERT(buf_len <= UINT16_MAX); > > + > > + memset(m, 0, mbuf_size); > > + m->priv_size = priv_size; > > + m->buf_len = (uint16_t)buf_len; > > + > > + /* set the data buffer pointers to external memory */ > > + ext_mem = ctx->ext_mem + ctx->ext; > > + > > + RTE_ASSERT(ctx->ext < ctx->ext_num); > > + RTE_ASSERT(ctx->off < ext_mem->buf_len); > > + > > + m->buf_addr = RTE_PTR_ADD(ext_mem->buf_ptr, ctx->off); > > + m->buf_iova = ext_mem->buf_iova == RTE_BAD_IOVA ? > > + RTE_BAD_IOVA : (ext_mem->buf_iova + ctx->off); > > + > > + ctx->off += ext_mem->elt_size; > > + if (ctx->off >= ext_mem->buf_len) { > > + ctx->off = 0; > > + ++ctx->ext; > > + } > > + /* keep some headroom between start of buffer and data */ > > + m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, (uint16_t)m- > >buf_len); > > + > > + /* init some constant fields */ > > + m->pool = mp; > > + m->nb_segs = 1; > > + m->port = MBUF_INVALID_PORT; > > + m->ol_flags = EXT_ATTACHED_MBUF; > > + rte_mbuf_refcnt_set(m, 1); > > + m->next = NULL; > > +} > > + > > + > > /* Helper to create a mbuf pool with given mempool ops name*/ struct > > rte_mempool * rte_pktmbuf_pool_create_by_ops(const char *name, > > unsigned int n, @@ -169,6 +226,89 @@ struct rte_mempool * > > data_room_size, socket_id, NULL); > > } > > > > +/* Helper to create a mbuf pool with pinned external data buffers. */ > > +struct rte_mempool * rte_pktmbuf_pool_create_extbuf(const char *name, > > +unsigned int n, > > + unsigned int cache_size, uint16_t priv_size, > > + uint16_t data_room_size, int socket_id, > > + struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num) { > > + struct rte_mempool *mp; > > + struct rte_pktmbuf_pool_private mbp_priv; > > + struct rte_pktmbuf_extmem_init_ctx init_ctx; > > + const char *mp_ops_name; > > + unsigned int elt_size; > > + unsigned int i, n_elts = 0; > > + int ret; > > + > > + if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) { > > + RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n", > > + priv_size); > > + rte_errno = EINVAL; > > + return NULL; > > + } > > + /* Check the external memory descriptors. */ > > + for (i = 0; i < ext_num; i++) { > > + struct rte_pktmbuf_extmem *extm = ext_mem + i; > > + > > + if (!extm->elt_size || !extm->buf_len || !extm->buf_ptr) { > > + RTE_LOG(ERR, MBUF, "invalid extmem descriptor\n"); > > + rte_errno = EINVAL; > > + return NULL; > > + } > > + if (data_room_size > extm->elt_size) { > > + RTE_LOG(ERR, MBUF, "ext elt_size=%u is too small\n", > > + priv_size); > > + rte_errno = EINVAL; > > + return NULL; > > + } > > + n_elts += extm->buf_len / extm->elt_size; > > + } > > + /* Check whether enough external memory provided. */ > > + if (n_elts < n) { > > + RTE_LOG(ERR, MBUF, "not enough extmem\n"); > > + rte_errno = ENOMEM; > > + return NULL; > > + } > > + elt_size = sizeof(struct rte_mbuf) + (unsigned int)priv_size; > > + memset(&mbp_priv, 0, sizeof(mbp_priv)); > > + mbp_priv.mbuf_data_room_size = data_room_size; > > + mbp_priv.mbuf_priv_size = priv_size; > > + mbp_priv.flags = RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF; > > + > > + mp = rte_mempool_create_empty(name, n, elt_size, cache_size, > > + sizeof(struct rte_pktmbuf_pool_private), socket_id, 0); > > + if (mp == NULL) > > + return NULL; > > + > > + mp_ops_name = rte_mbuf_best_mempool_ops(); > > + ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL); > > + if (ret != 0) { > > + RTE_LOG(ERR, MBUF, "error setting mempool handler\n"); > > + rte_mempool_free(mp); > > + rte_errno = -ret; > > + return NULL; > > + } > > + rte_pktmbuf_pool_init(mp, &mbp_priv); > > + > > + ret = rte_mempool_populate_default(mp); > > + if (ret < 0) { > > + rte_mempool_free(mp); > > + rte_errno = -ret; > > + return NULL; > > + } > > + > > + init_ctx = (struct rte_pktmbuf_extmem_init_ctx){ > > + .ext_mem = ext_mem, > > + .ext_num = ext_num, > > + .ext = 0, > > + .off = 0, > > + }; > > + rte_mempool_obj_iter(mp, rte_pktmbuf_init_extmem, &init_ctx); > > + > > + return mp; > > +} > > Instead of duplicating some code, would it be possible to do: > > int > rte_pktmbuf_pool_attach_extbuf(struct rte_mempool *mp, > struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num) { > struct rte_pktmbuf_extmem_init_ctx init_ctx = { 0 }; > struct rte_pktmbuf_pool_private *priv; > > /* XXX assert mempool is fully populated? */ > > priv = rte_mempool_get_priv(mp); > mbp_priv.flags |= RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF; > > rte_mempool_obj_iter(mp, rte_pktmbuf_init_extmem, &init_ctx); > > return init_ctx.ret; > } > > The application would have to call: > > rte_pktmbuf_pool_create(...); > rte_pktmbuf_pool_attach_extbuf(...); > It seems there are some disadvantages: - no data_room_size check (we should remove asserts from rte_pktmbuf_pool_init) - rte_mempool_obj_iter would be called twice, it might involve rte_mempool_virt2iova() and it would take some time The code duplication is not so large as it could be seen from the diff - the part of the rte_pktmbuf_pool_create_extbuf() is related to checking extmem, and the main part of job is done in the rte_pktmbuf_init_extmem(). > > > + > > /* do some sanity checks on a mbuf: panic if it fails */ void > > rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header) diff > > --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h index > > 8f486af..7bde297 100644 > > --- a/lib/librte_mbuf/rte_mbuf.h > > +++ b/lib/librte_mbuf/rte_mbuf.h > > @@ -642,6 +642,34 @@ static inline struct rte_mbuf > > *rte_mbuf_raw_alloc(struct rte_mempool *mp) void > rte_pktmbuf_init(struct rte_mempool *mp, void *opaque_arg, > > void *m, unsigned i); > > > > +/** The context to initialize the mbufs with pinned external buffers. > > +*/ struct rte_pktmbuf_extmem_init_ctx { > > + struct rte_pktmbuf_extmem *ext_mem; /* pointer to descriptor > array. */ > > + unsigned int ext_num; /* number of descriptors in array. */ > > + unsigned int ext; /* loop descriptor index. */ > > + size_t off; /* loop buffer offset. */ }; > > + > > +/** > > + * The packet mbuf constructor for pools with pinned external memory. > > + * > > + * This function initializes some fields in the mbuf structure that > > +are > > + * not modified by the user once created (origin pool, buffer start > > + * address, and so on). This function is given as a callback function > > +to > > + * rte_mempool_obj_iter() called from rte_mempool_create_extmem(). > > + * > > + * @param mp > > + * The mempool from which mbufs originate. > > + * @param opaque_arg > > + * A pointer to the rte_pktmbuf_extmem_init_ctx - initialization > > + * context structure > > + * @param m > > + * The mbuf to initialize. > > + * @param i > > + * The index of the mbuf in the pool table. > > + */ > > +void rte_pktmbuf_init_extmem(struct rte_mempool *mp, void > *opaque_arg, > > + void *m, unsigned int i); > > > > /** > > * A packet mbuf pool constructor. > > @@ -743,6 +771,62 @@ struct rte_mempool * > > unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size, > > int socket_id, const char *ops_name); > > > > +/** A structure that describes the pinned external buffer segment. */ > > +struct rte_pktmbuf_extmem { > > + void *buf_ptr; /**< The virtual address of data buffer. */ > > + rte_iova_t buf_iova; /**< The IO address of the data buffer. */ > > + size_t buf_len; /**< External buffer length in bytes. */ > > + uint16_t elt_size; /**< mbuf element size in bytes. */ > > +}; > > + > > +/** > > + * Create a mbuf pool with external pinned data buffers. > > + * > > + * This function creates and initializes a packet mbuf pool that > > +contains > > + * only mbufs with external buffer. It is a wrapper to rte_mempool > functions. > > + * > > + * @param name > > + * The name of the mbuf pool. > > + * @param n > > + * The number of elements in the mbuf pool. The optimum size (in terms > > + * of memory usage) for a mempool is when n is a power of two minus > one: > > + * n = (2^q - 1). > > + * @param cache_size > > + * Size of the per-core object cache. See rte_mempool_create() for > > + * details. > > + * @param priv_size > > + * Size of application private are between the rte_mbuf structure > > + * and the data buffer. This value must be aligned to > RTE_MBUF_PRIV_ALIGN. > > + * @param data_room_size > > + * Size of data buffer in each mbuf, including RTE_PKTMBUF_HEADROOM. > > + * @param socket_id > > + * The socket identifier where the memory should be allocated. The > > + * value can be *SOCKET_ID_ANY* if there is no NUMA constraint for the > > + * reserved zone. > > + * @param ext_mem > > + * Pointer to the array of structures describing the external memory > > + * for data buffers. It is caller responsibility to register this memory > > + * with rte_extmem_register() (if needed), map this memory to > appropriate > > + * physical device, etc. > > + * @param ext_num > > + * Number of elements in the ext_mem array. > > + * @return > > + * The pointer to the new allocated mempool, on success. NULL on error > > + * with rte_errno set appropriately. Possible rte_errno values include: > > + * - E_RTE_NO_CONFIG - function could not get pointer to rte_config > structure > > + * - E_RTE_SECONDARY - function was called from a secondary process > instance > > + * - EINVAL - cache size provided is too large, or priv_size is not > > aligned. > > + * - ENOSPC - the maximum number of memzones has already been > allocated > > + * - EEXIST - a memzone with the same name already exists > > + * - ENOMEM - no appropriate memory area found in which to create > memzone > > + */ > > +__rte_experimental > > +struct rte_mempool * > > +rte_pktmbuf_pool_create_extbuf(const char *name, unsigned int n, > > + unsigned int cache_size, uint16_t priv_size, > > + uint16_t data_room_size, int socket_id, > > + struct rte_pktmbuf_extmem *ext_mem, unsigned int ext_num); > > + > > /** > > * Get the data room size of mbufs stored in a pktmbuf_pool > > * > > @@ -818,7 +902,7 @@ static inline void rte_pktmbuf_reset(struct rte_mbuf > *m) > > m->nb_segs = 1; > > m->port = MBUF_INVALID_PORT; > > > > - m->ol_flags = 0; > > + m->ol_flags &= EXT_ATTACHED_MBUF; > > m->packet_type = 0; > > rte_pktmbuf_reset_headroom(m); > > > > I wonder if it should go in previous patch? Mmm... Definitely - yes 😊 Thanks, will move this line. > > > diff --git a/lib/librte_mbuf/rte_mbuf_version.map > > b/lib/librte_mbuf/rte_mbuf_version.map > > index 3bbb476..ab161bc 100644 > > --- a/lib/librte_mbuf/rte_mbuf_version.map > > +++ b/lib/librte_mbuf/rte_mbuf_version.map > > @@ -44,5 +44,6 @@ EXPERIMENTAL { > > rte_mbuf_dyn_dump; > > rte_pktmbuf_copy; > > rte_pktmbuf_free_bulk; > > + rte_pktmbuf_pool_create_extbuf; > > > > }; > > -- > > 1.8.3.1 > > With best regards, Slava