> > This patch introduces a new way of attaching an external buffer to a mbuf. > > Attaching an external buffer is quite similar to mbuf indirection in > replacing buffer addresses and length of a mbuf, but a few differences: > - As refcnt of a direct mbuf is at least 2, the buffer area of a direct > mbuf must be read-only. But external buffer has its own refcnt and it > starts from 1. Unless multiple mbufs are attached to a mbuf having an > external buffer, the external buffer is writable. > - There's no need to allocate buffer from a mempool. Any buffer can be > attached with appropriate free callback. > - Smaller metadata is required to maintain shared data such as refcnt. > > Signed-off-by: Yongseok Koh <ys...@mellanox.com> > --- > > Submitting only non-mlx5 patches to meet deadline for RC1. mlx5 patches > will be submitted separately rebased on a differnet patchset which > accommodates new memory hotplug design to mlx PMDs. > > v3: > * implement external buffer attachment instead of introducing buf_off for > mbuf indirection. > > lib/librte_mbuf/rte_mbuf.h | 276 > ++++++++++++++++++++++++++++++++++++++++----- > 1 file changed, 248 insertions(+), 28 deletions(-) > > diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h > index 06eceba37..e64160c81 100644 > --- a/lib/librte_mbuf/rte_mbuf.h > +++ b/lib/librte_mbuf/rte_mbuf.h > @@ -326,7 +326,7 @@ extern "C" { > PKT_TX_MACSEC | \ > PKT_TX_SEC_OFFLOAD) > > -#define __RESERVED (1ULL << 61) /**< reserved for future mbuf use > */ > +#define EXT_ATTACHED_MBUF (1ULL << 61) /**< Mbuf having external buffer */ > > #define IND_ATTACHED_MBUF (1ULL << 62) /**< Indirect attached mbuf */ > > @@ -568,6 +568,24 @@ struct rte_mbuf { > > } __rte_cache_aligned; > > +/** > + * Function typedef of callback to free externally attached buffer. > + */ > +typedef void (*rte_mbuf_extbuf_free_callback_t)(void *addr, void *opaque); > + > +/** > + * Shared data at the end of an external buffer. > + */ > +struct rte_mbuf_ext_shared_info { > + rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */ > + void *fcb_opaque; /**< Free callback argument */ > + RTE_STD_C11 > + union { > + rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */ > + uint16_t refcnt; /**< Non-atomically accessed refcnt */ > + }; > +}; > + > /**< Maximum number of nb_segs allowed. */ > #define RTE_MBUF_MAX_NB_SEGS UINT16_MAX > > @@ -693,9 +711,14 @@ rte_mbuf_to_baddr(struct rte_mbuf *md) > #define RTE_MBUF_INDIRECT(mb) ((mb)->ol_flags & IND_ATTACHED_MBUF) > > /** > + * Returns TRUE if given mbuf has external buffer, or FALSE otherwise. > + */ > +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF) > + > +/** > * Returns TRUE if given mbuf is direct, or FALSE otherwise. > */ > -#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb)) > +#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb) && > !RTE_MBUF_HAS_EXTBUF(mb))
As a nit: RTE_MBUF_DIRECT(mb) (((mb)->ol_flags & (IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF)) == 0) > > /** > * Private data in case of pktmbuf pool. > @@ -821,6 +844,58 @@ rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t > new_value) > > #endif /* RTE_MBUF_REFCNT_ATOMIC */ > > +/** > + * Reads the refcnt of an external buffer. > + * > + * @param shinfo > + * Shared data of the external buffer. > + * @return > + * Reference count number. > + */ > +static inline uint16_t > +rte_extbuf_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo) > +{ > + return (uint16_t)(rte_atomic16_read(&shinfo->refcnt_atomic)); > +} > + > +/** > + * Set refcnt of an external buffer. > + * > + * @param shinfo > + * Shared data of the external buffer. > + * @param new_value > + * Value set > + */ > +static inline void > +rte_extbuf_refcnt_set(struct rte_mbuf_ext_shared_info *shinfo, > + uint16_t new_value) > +{ > + rte_atomic16_set(&shinfo->refcnt_atomic, new_value); > +} > + > +/** > + * Add given value to refcnt of an external buffer and return its new > + * value. > + * > + * @param shinfo > + * Shared data of the external buffer. > + * @param value > + * Value to add/subtract > + * @return > + * Updated value > + */ > +static inline uint16_t > +rte_extbuf_refcnt_update(struct rte_mbuf_ext_shared_info *shinfo, > + int16_t value) > +{ > + if (likely(rte_extbuf_refcnt_read(shinfo) == 1)) { > + rte_extbuf_refcnt_set(shinfo, 1 + value); > + return 1 + value; > + } > + > + return (uint16_t)rte_atomic16_add_return(&shinfo->refcnt_atomic, value); > +} > + > /** Mbuf prefetch */ > #define RTE_MBUF_PREFETCH_TO_FREE(m) do { \ > if ((m) != NULL) \ > @@ -1195,11 +1270,120 @@ static inline int rte_pktmbuf_alloc_bulk(struct > rte_mempool *pool, > } > > /** > + * Return shared data of external buffer of a mbuf. > + * > + * @param m > + * The pointer to the mbuf. > + * @return > + * The address of the shared data. > + */ > +static inline struct rte_mbuf_ext_shared_info * > +rte_mbuf_ext_shinfo(struct rte_mbuf *m) > +{ > + return (struct rte_mbuf_ext_shared_info *) > + RTE_PTR_ADD(m->buf_addr, m->buf_len); > +} > + > +/** > + * Attach an external buffer to a mbuf. > + * > + * User-managed anonymous buffer can be attached to an mbuf. When attaching > + * it, corresponding free callback function and its argument should be > + * provided. This callback function will be called once all the mbufs are > + * detached from the buffer. > + * > + * More mbufs can be attached to the same external buffer by > + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by > + * this API. > + * > + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or > + * ``rte_pktmbuf_detach()``. > + * > + * A few bytes in the trailer of the provided buffer will be dedicated for > + * shared data (``struct rte_mbuf_ext_shared_info``) to store refcnt, > + * callback function and so on. The shared data can be referenced by > + * ``rte_mbuf_ext_shinfo()`` > + * > + * Attaching an external buffer is quite similar to mbuf indirection in > + * replacing buffer addresses and length of a mbuf, but a few differences: > + * - As refcnt of a direct mbuf is at least 2, the buffer area of a direct > + * mbuf must be read-only. But external buffer has its own refcnt and it > + * starts from 1. Unless multiple mbufs are attached to a mbuf having an > + * external buffer, the external buffer is writable. > + * - There's no need to allocate buffer from a mempool. Any buffer can be > + * attached with appropriate free callback. > + * - Smaller metadata is required to maintain shared data such as refcnt. > + * > + * @warning > + * @b EXPERIMENTAL: This API may change without prior notice. > + * Once external buffer is enabled by allowing experimental API, > + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer > + * exclusive. A mbuf can be consiered direct if it is neither indirect nor > + * having external buffer. > + * > + * @param m > + * The pointer to the mbuf. > + * @param buf_addr > + * The pointer to the external buffer we're attaching to. > + * @param buf_len > + * The size of the external buffer we're attaching to. This must be larger > + * than the size of ``struct rte_mbuf_ext_shared_info`` and padding for > + * alignment. If not enough, this function will return NULL. > + * @param free_cb > + * Free callback function to call when the external buffer needs to be > freed. > + * @param fcb_opaque > + * Argument for the free callback function. > + * @return > + * A pointer to the new start of the data on success, return NULL > otherwise. > + */ > +static inline char * __rte_experimental > +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr, > + uint16_t buf_len, rte_mbuf_extbuf_free_callback_t free_cb, > + void *fcb_opaque) > +{ > + void *buf_end = RTE_PTR_ADD(buf_addr, buf_len); > + struct rte_mbuf_ext_shared_info *shinfo; > + > + shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, sizeof(*shinfo)), > + sizeof(uintptr_t)); > + > + if ((void *)shinfo <= buf_addr) > + return NULL; > + > + m->buf_addr = buf_addr; > + m->buf_iova = rte_mempool_virt2iova(buf_addr); That wouldn't work for arbitrary extern buffer. Only for the one that is an element in some other mempool. For arbitrary external buffer - callee has to provide PA for it plus guarantee that it's VA would be locked down. >From other side - if your intention is just to use only elements of other >mempools - No need to have free_cb(). mempool_put should do. > + m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr); > + m->data_len = 0; > + > + rte_pktmbuf_reset_headroom(m); > + m->ol_flags |= EXT_ATTACHED_MBUF; > + > + rte_extbuf_refcnt_set(shinfo, 1); > + shinfo->free_cb = free_cb; > + shinfo->fcb_opaque = fcb_opaque; > + > + return (char *)m->buf_addr + m->data_off; > +} > + > +/** > + * Detach the external buffer attached to a mbuf, same as > + * ``rte_pktmbuf_detach()`` > + * > + * @param m > + * The mbuf having external buffer. > + */ > +#define rte_pktmbuf_detach_extbuf(m) rte_pktmbuf_detach(m) > + > +/** > * Attach packet mbuf to another packet mbuf. > * > - * After attachment we refer the mbuf we attached as 'indirect', > - * while mbuf we attached to as 'direct'. > - * The direct mbuf's reference counter is incremented. > + * If the mbuf we are attaching to isn't a direct buffer and is attached to > + * an external buffer, the mbuf being attached will be attached to the > + * external buffer instead of mbuf indirection. > + * > + * Otherwise, the mbuf will be indirectly attached. After attachment we > + * refer the mbuf we attached as 'indirect', while mbuf we attached to as > + * 'direct'. The direct mbuf's reference counter is incremented. > * > * Right now, not supported: > * - attachment for already indirect mbuf (e.g. - mi has to be direct). > @@ -1213,19 +1397,18 @@ static inline int rte_pktmbuf_alloc_bulk(struct > rte_mempool *pool, > */ > static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf > *m) > { > - struct rte_mbuf *md; > - > RTE_ASSERT(RTE_MBUF_DIRECT(mi) && > rte_mbuf_refcnt_read(mi) == 1); > > - /* if m is not direct, get the mbuf that embeds the data */ > - if (RTE_MBUF_DIRECT(m)) > - md = m; > - else > - md = rte_mbuf_from_indirect(m); > + if (RTE_MBUF_HAS_EXTBUF(m)) { > + rte_extbuf_refcnt_update(rte_mbuf_ext_shinfo(m), 1); > + mi->ol_flags = m->ol_flags; > + } else { > + rte_mbuf_refcnt_update(rte_mbuf_from_indirect(m), 1); > + mi->priv_size = m->priv_size; > + mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF; > + } > > - rte_mbuf_refcnt_update(md, 1); > - mi->priv_size = m->priv_size; > mi->buf_iova = m->buf_iova; > mi->buf_addr = m->buf_addr; > mi->buf_len = m->buf_len; > @@ -1241,7 +1424,6 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf > *mi, struct rte_mbuf *m) > mi->next = NULL; > mi->pkt_len = mi->data_len; > mi->nb_segs = 1; > - mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF; > mi->packet_type = m->packet_type; > mi->timestamp = m->timestamp; > > @@ -1250,12 +1432,53 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf > *mi, struct rte_mbuf *m) > } > > /** > - * Detach an indirect packet mbuf. > + * @internal used by rte_pktmbuf_detach(). > + * > + * Decrement the reference counter of the external buffer. When the > + * reference counter becomes 0, the buffer is freed by pre-registered > + * callback. > + */ > +static inline void > +__rte_pktmbuf_free_extbuf(struct rte_mbuf *m) > +{ > + struct rte_mbuf_ext_shared_info *shinfo; > + > + RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m)); > + > + shinfo = rte_mbuf_ext_shinfo(m); > + > + if (rte_extbuf_refcnt_update(shinfo, -1) == 0) > + shinfo->free_cb(m->buf_addr, shinfo->fcb_opaque); I understand the reason but extra function call for each external mbuf - seems quite expensive. Wonder is it possible to group them somehow and amortize the cost? > +} > + > +/** > + * @internal used by rte_pktmbuf_detach(). > * > + * Decrement the direct mbuf's reference counter. When the reference > + * counter becomes 0, the direct mbuf is freed. > + */ > +static inline void > +__rte_pktmbuf_free_direct(struct rte_mbuf *m) > +{ > + struct rte_mbuf *md = rte_mbuf_from_indirect(m); > + > + RTE_ASSERT(RTE_MBUF_INDIRECT(m)); > + > + if (rte_mbuf_refcnt_update(md, -1) == 0) { > + md->next = NULL; > + md->nb_segs = 1; > + rte_mbuf_refcnt_set(md, 1); > + rte_mbuf_raw_free(md); > + } > +} > + > +/** > + * Detach a packet mbuf from external buffer or direct buffer. > + * > + * - decrement refcnt and free the external/direct buffer if refcnt > + * becomes zero. > * - restore original mbuf address and length values. > * - reset pktmbuf data and data_len to their default values. > - * - decrement the direct mbuf's reference counter. When the > - * reference counter becomes 0, the direct mbuf is freed. > * > * All other fields of the given packet mbuf will be left intact. > * > @@ -1264,10 +1487,14 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf > *mi, struct rte_mbuf *m) > */ > static inline void rte_pktmbuf_detach(struct rte_mbuf *m) > { > - struct rte_mbuf *md = rte_mbuf_from_indirect(m); > struct rte_mempool *mp = m->pool; > uint32_t mbuf_size, buf_len, priv_size; > > + if (RTE_MBUF_HAS_EXTBUF(m)) > + __rte_pktmbuf_free_extbuf(m); > + else > + __rte_pktmbuf_free_direct(m); > + > priv_size = rte_pktmbuf_priv_size(mp); > mbuf_size = sizeof(struct rte_mbuf) + priv_size; > buf_len = rte_pktmbuf_data_room_size(mp); > @@ -1279,13 +1506,6 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf > *m) > rte_pktmbuf_reset_headroom(m); > m->data_len = 0; > m->ol_flags = 0; > - > - if (rte_mbuf_refcnt_update(md, -1) == 0) { > - md->next = NULL; > - md->nb_segs = 1; > - rte_mbuf_refcnt_set(md, 1); > - rte_mbuf_raw_free(md); > - } > } > > /** > @@ -1309,7 +1529,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m) > > if (likely(rte_mbuf_refcnt_read(m) == 1)) { > > - if (RTE_MBUF_INDIRECT(m)) > + if (!RTE_MBUF_DIRECT(m)) > rte_pktmbuf_detach(m); > > if (m->next != NULL) { > @@ -1321,7 +1541,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m) > > } else if (__rte_mbuf_refcnt_update(m, -1) == 0) { > > - if (RTE_MBUF_INDIRECT(m)) > + if (!RTE_MBUF_DIRECT(m)) > rte_pktmbuf_detach(m); > > if (m->next != NULL) { > -- > 2.11.0