This patch introduces a new way of attaching an external buffer to a mbuf. Attaching an external buffer is quite similar to mbuf indirection in replacing buffer addresses and length of a mbuf, but a few differences: - As refcnt of a direct mbuf is at least 2, the buffer area of a direct mbuf must be read-only. But external buffer has its own refcnt and it starts from 1. Unless multiple mbufs are attached to a mbuf having an external buffer, the external buffer is writable. - There's no need to allocate buffer from a mempool. Any buffer can be attached with appropriate free callback. - Smaller metadata is required to maintain shared data such as refcnt.
Signed-off-by: Yongseok Koh <ys...@mellanox.com> --- Submitting only non-mlx5 patches to meet deadline for RC1. mlx5 patches will be submitted separately rebased on a differnet patchset which accommodates new memory hotplug design to mlx PMDs. v3: * implement external buffer attachment instead of introducing buf_off for mbuf indirection. lib/librte_mbuf/rte_mbuf.h | 276 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 248 insertions(+), 28 deletions(-) diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h index 06eceba37..e64160c81 100644 --- a/lib/librte_mbuf/rte_mbuf.h +++ b/lib/librte_mbuf/rte_mbuf.h @@ -326,7 +326,7 @@ extern "C" { PKT_TX_MACSEC | \ PKT_TX_SEC_OFFLOAD) -#define __RESERVED (1ULL << 61) /**< reserved for future mbuf use */ +#define EXT_ATTACHED_MBUF (1ULL << 61) /**< Mbuf having external buffer */ #define IND_ATTACHED_MBUF (1ULL << 62) /**< Indirect attached mbuf */ @@ -568,6 +568,24 @@ struct rte_mbuf { } __rte_cache_aligned; +/** + * Function typedef of callback to free externally attached buffer. + */ +typedef void (*rte_mbuf_extbuf_free_callback_t)(void *addr, void *opaque); + +/** + * Shared data at the end of an external buffer. + */ +struct rte_mbuf_ext_shared_info { + rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */ + void *fcb_opaque; /**< Free callback argument */ + RTE_STD_C11 + union { + rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */ + uint16_t refcnt; /**< Non-atomically accessed refcnt */ + }; +}; + /**< Maximum number of nb_segs allowed. */ #define RTE_MBUF_MAX_NB_SEGS UINT16_MAX @@ -693,9 +711,14 @@ rte_mbuf_to_baddr(struct rte_mbuf *md) #define RTE_MBUF_INDIRECT(mb) ((mb)->ol_flags & IND_ATTACHED_MBUF) /** + * Returns TRUE if given mbuf has external buffer, or FALSE otherwise. + */ +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF) + +/** * Returns TRUE if given mbuf is direct, or FALSE otherwise. */ -#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb)) +#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb) && !RTE_MBUF_HAS_EXTBUF(mb)) /** * Private data in case of pktmbuf pool. @@ -821,6 +844,58 @@ rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value) #endif /* RTE_MBUF_REFCNT_ATOMIC */ +/** + * Reads the refcnt of an external buffer. + * + * @param shinfo + * Shared data of the external buffer. + * @return + * Reference count number. + */ +static inline uint16_t +rte_extbuf_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo) +{ + return (uint16_t)(rte_atomic16_read(&shinfo->refcnt_atomic)); +} + +/** + * Set refcnt of an external buffer. + * + * @param shinfo + * Shared data of the external buffer. + * @param new_value + * Value set + */ +static inline void +rte_extbuf_refcnt_set(struct rte_mbuf_ext_shared_info *shinfo, + uint16_t new_value) +{ + rte_atomic16_set(&shinfo->refcnt_atomic, new_value); +} + +/** + * Add given value to refcnt of an external buffer and return its new + * value. + * + * @param shinfo + * Shared data of the external buffer. + * @param value + * Value to add/subtract + * @return + * Updated value + */ +static inline uint16_t +rte_extbuf_refcnt_update(struct rte_mbuf_ext_shared_info *shinfo, + int16_t value) +{ + if (likely(rte_extbuf_refcnt_read(shinfo) == 1)) { + rte_extbuf_refcnt_set(shinfo, 1 + value); + return 1 + value; + } + + return (uint16_t)rte_atomic16_add_return(&shinfo->refcnt_atomic, value); +} + /** Mbuf prefetch */ #define RTE_MBUF_PREFETCH_TO_FREE(m) do { \ if ((m) != NULL) \ @@ -1195,11 +1270,120 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool, } /** + * Return shared data of external buffer of a mbuf. + * + * @param m + * The pointer to the mbuf. + * @return + * The address of the shared data. + */ +static inline struct rte_mbuf_ext_shared_info * +rte_mbuf_ext_shinfo(struct rte_mbuf *m) +{ + return (struct rte_mbuf_ext_shared_info *) + RTE_PTR_ADD(m->buf_addr, m->buf_len); +} + +/** + * Attach an external buffer to a mbuf. + * + * User-managed anonymous buffer can be attached to an mbuf. When attaching + * it, corresponding free callback function and its argument should be + * provided. This callback function will be called once all the mbufs are + * detached from the buffer. + * + * More mbufs can be attached to the same external buffer by + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by + * this API. + * + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or + * ``rte_pktmbuf_detach()``. + * + * A few bytes in the trailer of the provided buffer will be dedicated for + * shared data (``struct rte_mbuf_ext_shared_info``) to store refcnt, + * callback function and so on. The shared data can be referenced by + * ``rte_mbuf_ext_shinfo()`` + * + * Attaching an external buffer is quite similar to mbuf indirection in + * replacing buffer addresses and length of a mbuf, but a few differences: + * - As refcnt of a direct mbuf is at least 2, the buffer area of a direct + * mbuf must be read-only. But external buffer has its own refcnt and it + * starts from 1. Unless multiple mbufs are attached to a mbuf having an + * external buffer, the external buffer is writable. + * - There's no need to allocate buffer from a mempool. Any buffer can be + * attached with appropriate free callback. + * - Smaller metadata is required to maintain shared data such as refcnt. + * + * @warning + * @b EXPERIMENTAL: This API may change without prior notice. + * Once external buffer is enabled by allowing experimental API, + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer + * exclusive. A mbuf can be consiered direct if it is neither indirect nor + * having external buffer. + * + * @param m + * The pointer to the mbuf. + * @param buf_addr + * The pointer to the external buffer we're attaching to. + * @param buf_len + * The size of the external buffer we're attaching to. This must be larger + * than the size of ``struct rte_mbuf_ext_shared_info`` and padding for + * alignment. If not enough, this function will return NULL. + * @param free_cb + * Free callback function to call when the external buffer needs to be freed. + * @param fcb_opaque + * Argument for the free callback function. + * @return + * A pointer to the new start of the data on success, return NULL otherwise. + */ +static inline char * __rte_experimental +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr, + uint16_t buf_len, rte_mbuf_extbuf_free_callback_t free_cb, + void *fcb_opaque) +{ + void *buf_end = RTE_PTR_ADD(buf_addr, buf_len); + struct rte_mbuf_ext_shared_info *shinfo; + + shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, sizeof(*shinfo)), + sizeof(uintptr_t)); + + if ((void *)shinfo <= buf_addr) + return NULL; + + m->buf_addr = buf_addr; + m->buf_iova = rte_mempool_virt2iova(buf_addr); + m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr); + m->data_len = 0; + + rte_pktmbuf_reset_headroom(m); + m->ol_flags |= EXT_ATTACHED_MBUF; + + rte_extbuf_refcnt_set(shinfo, 1); + shinfo->free_cb = free_cb; + shinfo->fcb_opaque = fcb_opaque; + + return (char *)m->buf_addr + m->data_off; +} + +/** + * Detach the external buffer attached to a mbuf, same as + * ``rte_pktmbuf_detach()`` + * + * @param m + * The mbuf having external buffer. + */ +#define rte_pktmbuf_detach_extbuf(m) rte_pktmbuf_detach(m) + +/** * Attach packet mbuf to another packet mbuf. * - * After attachment we refer the mbuf we attached as 'indirect', - * while mbuf we attached to as 'direct'. - * The direct mbuf's reference counter is incremented. + * If the mbuf we are attaching to isn't a direct buffer and is attached to + * an external buffer, the mbuf being attached will be attached to the + * external buffer instead of mbuf indirection. + * + * Otherwise, the mbuf will be indirectly attached. After attachment we + * refer the mbuf we attached as 'indirect', while mbuf we attached to as + * 'direct'. The direct mbuf's reference counter is incremented. * * Right now, not supported: * - attachment for already indirect mbuf (e.g. - mi has to be direct). @@ -1213,19 +1397,18 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool, */ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m) { - struct rte_mbuf *md; - RTE_ASSERT(RTE_MBUF_DIRECT(mi) && rte_mbuf_refcnt_read(mi) == 1); - /* if m is not direct, get the mbuf that embeds the data */ - if (RTE_MBUF_DIRECT(m)) - md = m; - else - md = rte_mbuf_from_indirect(m); + if (RTE_MBUF_HAS_EXTBUF(m)) { + rte_extbuf_refcnt_update(rte_mbuf_ext_shinfo(m), 1); + mi->ol_flags = m->ol_flags; + } else { + rte_mbuf_refcnt_update(rte_mbuf_from_indirect(m), 1); + mi->priv_size = m->priv_size; + mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF; + } - rte_mbuf_refcnt_update(md, 1); - mi->priv_size = m->priv_size; mi->buf_iova = m->buf_iova; mi->buf_addr = m->buf_addr; mi->buf_len = m->buf_len; @@ -1241,7 +1424,6 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m) mi->next = NULL; mi->pkt_len = mi->data_len; mi->nb_segs = 1; - mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF; mi->packet_type = m->packet_type; mi->timestamp = m->timestamp; @@ -1250,12 +1432,53 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m) } /** - * Detach an indirect packet mbuf. + * @internal used by rte_pktmbuf_detach(). + * + * Decrement the reference counter of the external buffer. When the + * reference counter becomes 0, the buffer is freed by pre-registered + * callback. + */ +static inline void +__rte_pktmbuf_free_extbuf(struct rte_mbuf *m) +{ + struct rte_mbuf_ext_shared_info *shinfo; + + RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m)); + + shinfo = rte_mbuf_ext_shinfo(m); + + if (rte_extbuf_refcnt_update(shinfo, -1) == 0) + shinfo->free_cb(m->buf_addr, shinfo->fcb_opaque); +} + +/** + * @internal used by rte_pktmbuf_detach(). * + * Decrement the direct mbuf's reference counter. When the reference + * counter becomes 0, the direct mbuf is freed. + */ +static inline void +__rte_pktmbuf_free_direct(struct rte_mbuf *m) +{ + struct rte_mbuf *md = rte_mbuf_from_indirect(m); + + RTE_ASSERT(RTE_MBUF_INDIRECT(m)); + + if (rte_mbuf_refcnt_update(md, -1) == 0) { + md->next = NULL; + md->nb_segs = 1; + rte_mbuf_refcnt_set(md, 1); + rte_mbuf_raw_free(md); + } +} + +/** + * Detach a packet mbuf from external buffer or direct buffer. + * + * - decrement refcnt and free the external/direct buffer if refcnt + * becomes zero. * - restore original mbuf address and length values. * - reset pktmbuf data and data_len to their default values. - * - decrement the direct mbuf's reference counter. When the - * reference counter becomes 0, the direct mbuf is freed. * * All other fields of the given packet mbuf will be left intact. * @@ -1264,10 +1487,14 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m) */ static inline void rte_pktmbuf_detach(struct rte_mbuf *m) { - struct rte_mbuf *md = rte_mbuf_from_indirect(m); struct rte_mempool *mp = m->pool; uint32_t mbuf_size, buf_len, priv_size; + if (RTE_MBUF_HAS_EXTBUF(m)) + __rte_pktmbuf_free_extbuf(m); + else + __rte_pktmbuf_free_direct(m); + priv_size = rte_pktmbuf_priv_size(mp); mbuf_size = sizeof(struct rte_mbuf) + priv_size; buf_len = rte_pktmbuf_data_room_size(mp); @@ -1279,13 +1506,6 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m) rte_pktmbuf_reset_headroom(m); m->data_len = 0; m->ol_flags = 0; - - if (rte_mbuf_refcnt_update(md, -1) == 0) { - md->next = NULL; - md->nb_segs = 1; - rte_mbuf_refcnt_set(md, 1); - rte_mbuf_raw_free(md); - } } /** @@ -1309,7 +1529,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m) if (likely(rte_mbuf_refcnt_read(m) == 1)) { - if (RTE_MBUF_INDIRECT(m)) + if (!RTE_MBUF_DIRECT(m)) rte_pktmbuf_detach(m); if (m->next != NULL) { @@ -1321,7 +1541,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m) } else if (__rte_mbuf_refcnt_update(m, -1) == 0) { - if (RTE_MBUF_INDIRECT(m)) + if (!RTE_MBUF_DIRECT(m)) rte_pktmbuf_detach(m); if (m->next != NULL) { -- 2.11.0