> > This patch introduces a new way of attaching an external buffer to a mbuf. > > Attaching an external buffer is quite similar to mbuf indirection in > replacing buffer addresses and length of a mbuf, but a few differences: > - When an indirect mbuf is attached, refcnt of the direct mbuf would be > 2 as long as the direct mbuf itself isn't freed after the attachment. > In such cases, the buffer area of a direct mbuf must be read-only. But > external buffer has its own refcnt and it starts from 1. Unless > multiple mbufs are attached to a mbuf having an external buffer, the > external buffer is writable. > - There's no need to allocate buffer from a mempool. Any buffer can be > attached with appropriate free callback. > - Smaller metadata is required to maintain shared data such as refcnt. > > Signed-off-by: Yongseok Koh <ys...@mellanox.com> > --- > > ** This patch can pass the mbuf_autotest. ** > > Submitting only non-mlx5 patches to meet deadline for RC1. mlx5 patches > will be submitted separately rebased on a differnet patchset which > accommodates new memory hotplug design to mlx PMDs. > > v5: > * rte_pktmbuf_attach_extbuf() sets headroom to 0. > * if shinfo is provided when attaching, user should initialize it. > * minor changes from review. > > v4: > * rte_pktmbuf_attach_extbuf() takes new arguments - buf_iova and shinfo. > user can pass memory for shared data via shinfo argument. > * minor changes from review. > > v3: > * implement external buffer attachment instead of introducing buf_off for > mbuf indirection. > > lib/librte_mbuf/rte_mbuf.h | 303 > ++++++++++++++++++++++++++++++++++++++++----- > 1 file changed, 274 insertions(+), 29 deletions(-) > > diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h > index 43aaa9c5f..e2c12874a 100644 > --- a/lib/librte_mbuf/rte_mbuf.h > +++ b/lib/librte_mbuf/rte_mbuf.h > @@ -344,7 +344,10 @@ extern "C" { > PKT_TX_MACSEC | \ > PKT_TX_SEC_OFFLOAD) > > -#define __RESERVED (1ULL << 61) /**< reserved for future mbuf use > */ > +/** > + * Mbuf having an external buffer attached. shinfo in mbuf must be filled. > + */ > +#define EXT_ATTACHED_MBUF (1ULL << 61) > > #define IND_ATTACHED_MBUF (1ULL << 62) /**< Indirect attached mbuf */ > > @@ -584,8 +587,27 @@ struct rte_mbuf { > /** Sequence number. See also rte_reorder_insert(). */ > uint32_t seqn; > > + /** Shared data for external buffer attached to mbuf. See > + * rte_pktmbuf_attach_extbuf(). > + */ > + struct rte_mbuf_ext_shared_info *shinfo; > + > } __rte_cache_aligned; > > +/** > + * Function typedef of callback to free externally attached buffer. > + */ > +typedef void (*rte_mbuf_extbuf_free_callback_t)(void *addr, void *opaque); > + > +/** > + * Shared data at the end of an external buffer. > + */ > +struct rte_mbuf_ext_shared_info { > + rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */ > + void *fcb_opaque; /**< Free callback argument */ > + rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */ > +}; > + > /**< Maximum number of nb_segs allowed. */ > #define RTE_MBUF_MAX_NB_SEGS UINT16_MAX > > @@ -706,14 +728,33 @@ rte_mbuf_to_baddr(struct rte_mbuf *md) > } > > /** > + * Returns TRUE if given mbuf is cloned by mbuf indirection, or FALSE > + * otherwise. > + * > + * If a mbuf has its data in another mbuf and references it by mbuf > + * indirection, this mbuf can be defined as a cloned mbuf. > + */ > +#define RTE_MBUF_CLONED(mb) ((mb)->ol_flags & IND_ATTACHED_MBUF) > + > +/** > * Returns TRUE if given mbuf is indirect, or FALSE otherwise. > */ > -#define RTE_MBUF_INDIRECT(mb) ((mb)->ol_flags & IND_ATTACHED_MBUF) > +#define RTE_MBUF_INDIRECT(mb) RTE_MBUF_CLONED(mb) > + > +/** > + * Returns TRUE if given mbuf has an external buffer, or FALSE otherwise. > + * > + * External buffer is a user-provided anonymous buffer. > + */ > +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF) > > /** > * Returns TRUE if given mbuf is direct, or FALSE otherwise. > + * > + * If a mbuf embeds its own data after the rte_mbuf structure, this mbuf > + * can be defined as a direct mbuf. > */ > -#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb)) > +#define RTE_MBUF_DIRECT(mb) (!(RTE_MBUF_CLONED(mb) || > RTE_MBUF_HAS_EXTBUF(mb))) > > /** > * Private data in case of pktmbuf pool. > @@ -839,6 +880,58 @@ rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t > new_value) > > #endif /* RTE_MBUF_REFCNT_ATOMIC */ > > +/** > + * Reads the refcnt of an external buffer. > + * > + * @param shinfo > + * Shared data of the external buffer. > + * @return > + * Reference count number. > + */ > +static inline uint16_t > +rte_mbuf_ext_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo) > +{ > + return (uint16_t)(rte_atomic16_read(&shinfo->refcnt_atomic)); > +} > + > +/** > + * Set refcnt of an external buffer. > + * > + * @param shinfo > + * Shared data of the external buffer. > + * @param new_value > + * Value set > + */ > +static inline void > +rte_mbuf_ext_refcnt_set(struct rte_mbuf_ext_shared_info *shinfo, > + uint16_t new_value) > +{ > + rte_atomic16_set(&shinfo->refcnt_atomic, new_value); > +} > + > +/** > + * Add given value to refcnt of an external buffer and return its new > + * value. > + * > + * @param shinfo > + * Shared data of the external buffer. > + * @param value > + * Value to add/subtract > + * @return > + * Updated value > + */ > +static inline uint16_t > +rte_mbuf_ext_refcnt_update(struct rte_mbuf_ext_shared_info *shinfo, > + int16_t value) > +{ > + if (likely(rte_mbuf_ext_refcnt_read(shinfo) == 1)) { > + rte_mbuf_ext_refcnt_set(shinfo, 1 + value); > + return 1 + value; > + } > + > + return (uint16_t)rte_atomic16_add_return(&shinfo->refcnt_atomic, value); > +} > + > /** Mbuf prefetch */ > #define RTE_MBUF_PREFETCH_TO_FREE(m) do { \ > if ((m) != NULL) \ > @@ -1213,11 +1306,127 @@ static inline int rte_pktmbuf_alloc_bulk(struct > rte_mempool *pool, > } > > /** > + * Attach an external buffer to a mbuf. > + * > + * User-managed anonymous buffer can be attached to an mbuf. When attaching > + * it, corresponding free callback function and its argument should be > + * provided. This callback function will be called once all the mbufs are > + * detached from the buffer. > + * > + * The headroom for the attaching mbuf will be set to zero and this can be > + * properly adjusted after attachment. For example, ``rte_pktmbuf_adj()`` > + * or ``rte_pktmbuf_reset_headroom()`` can be used. > + * > + * More mbufs can be attached to the same external buffer by > + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by > + * this API. > + * > + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or > + * ``rte_pktmbuf_detach()``. > + * > + * Attaching an external buffer is quite similar to mbuf indirection in > + * replacing buffer addresses and length of a mbuf, but a few differences: > + * - When an indirect mbuf is attached, refcnt of the direct mbuf would be > + * 2 as long as the direct mbuf itself isn't freed after the attachment. > + * In such cases, the buffer area of a direct mbuf must be read-only. But > + * external buffer has its own refcnt and it starts from 1. Unless > + * multiple mbufs are attached to a mbuf having an external buffer, the > + * external buffer is writable. > + * - There's no need to allocate buffer from a mempool. Any buffer can be > + * attached with appropriate free callback and its IO address. > + * - Smaller metadata is required to maintain shared data such as refcnt. > + * > + * @warning > + * @b EXPERIMENTAL: This API may change without prior notice. > + * Once external buffer is enabled by allowing experimental API, > + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer > + * exclusive. A mbuf can be considered direct if it is neither indirect nor > + * having external buffer. > + * > + * @param m > + * The pointer to the mbuf. > + * @param buf_addr > + * The pointer to the external buffer we're attaching to. > + * @param buf_iova > + * IO address of the external buffer we're attaching to. > + * @param buf_len > + * The size of the external buffer we're attaching to. If memory for > + * shared data is not provided, buf_len must be larger than the size of > + * ``struct rte_mbuf_ext_shared_info`` and padding for alignment. If not > + * enough, this function will return NULL. > + * @param shinfo > + * User-provided memory for shared data. If NULL, a few bytes in the > + * trailer of the provided buffer will be dedicated for shared data and > + * the shared data will be properly initialized. Otherwise, user must > + * initialize the content except for free callback and its argument. The > + * pointer of shared data will be stored in m->shinfo. > + * @param free_cb > + * Free callback function to call when the external buffer needs to be > + * freed. > + * @param fcb_opaque > + * Argument for the free callback function. > + * > + * @return > + * A pointer to the new start of the data on success, return NULL > + * otherwise. > + */ > +static inline char * __rte_experimental > +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr, > + rte_iova_t buf_iova, uint16_t buf_len, > + struct rte_mbuf_ext_shared_info *shinfo, > + rte_mbuf_extbuf_free_callback_t free_cb, void *fcb_opaque) > +{ > + /* Additional attachment should be done by rte_pktmbuf_attach() */ > + RTE_ASSERT(!RTE_MBUF_HAS_EXTBUF(m));
Shouldn't we have here something like: RTE_ASSERT(RTE_MBUF_DIRECT(m) && rte_mbuf_refcnt_read(m) == 1); ? > + > + m->buf_addr = buf_addr; > + m->buf_iova = buf_iova; > + > + if (shinfo == NULL) { Instead of allocating shinfo ourselves - wound's it be better to rely on caller always allocating afeeling it for us (he can do that at the end/start of buffer, or whenever he likes to. Again in that case - caller can provide one shinfo to several mbufs (with different buf_addrs) and would know for sure that free_cb wouldn't be overwritten by mistake. I.E. mbuf code will only update refcnt inside shinfo. > + void *buf_end = RTE_PTR_ADD(buf_addr, buf_len); > + > + shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, > + sizeof(*shinfo)), sizeof(uintptr_t)); > + if ((void *)shinfo <= buf_addr) > + return NULL; > + > + m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr); > + rte_mbuf_ext_refcnt_set(shinfo, 1); > + } else { > + m->buf_len = buf_len; I think you need to update shinfo>refcnt here too. > + } > + > + m->data_len = 0; > + m->data_off = 0; > + > + m->ol_flags |= EXT_ATTACHED_MBUF; > + m->shinfo = shinfo; > + > + shinfo->free_cb = free_cb; > + shinfo->fcb_opaque = fcb_opaque; > + > + return (char *)m->buf_addr + m->data_off; > +} > + > +/** > + * Detach the external buffer attached to a mbuf, same as > + * ``rte_pktmbuf_detach()`` > + * > + * @param m > + * The mbuf having external buffer. > + */ > +#define rte_pktmbuf_detach_extbuf(m) rte_pktmbuf_detach(m) > + > +/** > * Attach packet mbuf to another packet mbuf. > * > - * After attachment we refer the mbuf we attached as 'indirect', > - * while mbuf we attached to as 'direct'. > - * The direct mbuf's reference counter is incremented. > + * If the mbuf we are attaching to isn't a direct buffer and is attached to > + * an external buffer, the mbuf being attached will be attached to the > + * external buffer instead of mbuf indirection. > + * > + * Otherwise, the mbuf will be indirectly attached. After attachment we > + * refer the mbuf we attached as 'indirect', while mbuf we attached to as > + * 'direct'. The direct mbuf's reference counter is incremented. > * > * Right now, not supported: > * - attachment for already indirect mbuf (e.g. - mi has to be direct). > @@ -1231,19 +1440,19 @@ static inline int rte_pktmbuf_alloc_bulk(struct > rte_mempool *pool, > */ > static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf > *m) > { > - struct rte_mbuf *md; > - > RTE_ASSERT(RTE_MBUF_DIRECT(mi) && > rte_mbuf_refcnt_read(mi) == 1); > > - /* if m is not direct, get the mbuf that embeds the data */ > - if (RTE_MBUF_DIRECT(m)) > - md = m; > - else > - md = rte_mbuf_from_indirect(m); > + if (RTE_MBUF_HAS_EXTBUF(m)) { > + rte_mbuf_ext_refcnt_update(m->shinfo, 1); > + mi->ol_flags = m->ol_flags; > + } else { > + /* if m is not direct, get the mbuf that embeds the data */ > + rte_mbuf_refcnt_update(rte_mbuf_from_indirect(m), 1); > + mi->priv_size = m->priv_size; > + mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF; > + } > > - rte_mbuf_refcnt_update(md, 1); > - mi->priv_size = m->priv_size; > mi->buf_iova = m->buf_iova; > mi->buf_addr = m->buf_addr; > mi->buf_len = m->buf_len; > @@ -1259,7 +1468,6 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf > *mi, struct rte_mbuf *m) > mi->next = NULL; > mi->pkt_len = mi->data_len; > mi->nb_segs = 1; > - mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF; > mi->packet_type = m->packet_type; > mi->timestamp = m->timestamp; > > @@ -1268,12 +1476,52 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf > *mi, struct rte_mbuf *m) > } > > /** > - * Detach an indirect packet mbuf. > + * @internal used by rte_pktmbuf_detach(). > * > + * Decrement the reference counter of the external buffer. When the > + * reference counter becomes 0, the buffer is freed by pre-registered > + * callback. > + */ > +static inline void > +__rte_pktmbuf_free_extbuf(struct rte_mbuf *m) > +{ > + RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m)); > + RTE_ASSERT(m->shinfo != NULL); > + > + if (rte_mbuf_ext_refcnt_update(m->shinfo, -1) == 0) > + m->shinfo->free_cb(m->buf_addr, m->shinfo->fcb_opaque); > +} > + > +/** > + * @internal used by rte_pktmbuf_detach(). > + * > + * Decrement the direct mbuf's reference counter. When the reference > + * counter becomes 0, the direct mbuf is freed. > + */ > +static inline void > +__rte_pktmbuf_free_direct(struct rte_mbuf *m) > +{ > + struct rte_mbuf *md; > + > + RTE_ASSERT(RTE_MBUF_INDIRECT(m)); > + > + md = rte_mbuf_from_indirect(m); > + > + if (rte_mbuf_refcnt_update(md, -1) == 0) { > + md->next = NULL; > + md->nb_segs = 1; > + rte_mbuf_refcnt_set(md, 1); > + rte_mbuf_raw_free(md); > + } > +} > + > +/** > + * Detach a packet mbuf from external buffer or direct buffer. > + * > + * - decrement refcnt and free the external/direct buffer if refcnt > + * becomes zero. > * - restore original mbuf address and length values. > * - reset pktmbuf data and data_len to their default values. > - * - decrement the direct mbuf's reference counter. When the > - * reference counter becomes 0, the direct mbuf is freed. > * > * All other fields of the given packet mbuf will be left intact. > * > @@ -1282,10 +1530,14 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf > *mi, struct rte_mbuf *m) > */ > static inline void rte_pktmbuf_detach(struct rte_mbuf *m) > { > - struct rte_mbuf *md = rte_mbuf_from_indirect(m); > struct rte_mempool *mp = m->pool; > uint32_t mbuf_size, buf_len, priv_size; > > + if (RTE_MBUF_HAS_EXTBUF(m)) > + __rte_pktmbuf_free_extbuf(m); > + else > + __rte_pktmbuf_free_direct(m); > + > priv_size = rte_pktmbuf_priv_size(mp); > mbuf_size = sizeof(struct rte_mbuf) + priv_size; > buf_len = rte_pktmbuf_data_room_size(mp); > @@ -1297,13 +1549,6 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf > *m) > rte_pktmbuf_reset_headroom(m); > m->data_len = 0; > m->ol_flags = 0; > - > - if (rte_mbuf_refcnt_update(md, -1) == 0) { > - md->next = NULL; > - md->nb_segs = 1; > - rte_mbuf_refcnt_set(md, 1); > - rte_mbuf_raw_free(md); > - } > } > > /** > @@ -1327,7 +1572,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m) > > if (likely(rte_mbuf_refcnt_read(m) == 1)) { > > - if (RTE_MBUF_INDIRECT(m)) > + if (!RTE_MBUF_DIRECT(m)) > rte_pktmbuf_detach(m); > > if (m->next != NULL) { > @@ -1339,7 +1584,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m) > > } else if (__rte_mbuf_refcnt_update(m, -1) == 0) { > > - if (RTE_MBUF_INDIRECT(m)) > + if (!RTE_MBUF_DIRECT(m)) > rte_pktmbuf_detach(m); > > if (m->next != NULL) { > -- > 2.11.0