> -----Original Message----- > From: Yongseok Koh [mailto:ys...@mellanox.com] > Sent: Thursday, April 26, 2018 2:10 AM > To: Lu, Wenzhuo <wenzhuo...@intel.com>; Wu, Jingjing <jingjing...@intel.com>; > olivier.m...@6wind.com > Cc: dev@dpdk.org; Ananyev, Konstantin <konstantin.anan...@intel.com>; > arybche...@solarflare.com; step...@networkplumber.org; > tho...@monjalon.net; adrien.mazarg...@6wind.com; nelio.laranje...@6wind.com; > Yongseok Koh <ys...@mellanox.com> > Subject: [PATCH v6 1/2] mbuf: support attaching external buffer to mbuf > > This patch introduces a new way of attaching an external buffer to a mbuf. > > Attaching an external buffer is quite similar to mbuf indirection in > replacing buffer addresses and length of a mbuf, but a few differences: > - When an indirect mbuf is attached, refcnt of the direct mbuf would be > 2 as long as the direct mbuf itself isn't freed after the attachment. > In such cases, the buffer area of a direct mbuf must be read-only. But > external buffer has its own refcnt and it starts from 1. Unless > multiple mbufs are attached to a mbuf having an external buffer, the > external buffer is writable. > - There's no need to allocate buffer from a mempool. Any buffer can be > attached with appropriate free callback. > - Smaller metadata is required to maintain shared data such as refcnt. > > Signed-off-by: Yongseok Koh <ys...@mellanox.com> > ---
Acked-by: Konstantin Ananyev <konstantin.anan...@intel.com> > > ** This patch can pass the mbuf_autotest. ** > > Submitting only non-mlx5 patches to meet deadline for RC1. mlx5 patches > will be submitted separately rebased on a differnet patchset which > accommodates new memory hotplug design to mlx PMDs. > > v6: > * rte_pktmbuf_attach_extbuf() doesn't take NULL shinfo. Instead, > rte_pktmbuf_ext_shinfo_init_helper() is added. > * bug fix in rte_pktmbuf_attach() - shinfo wasn't saved to mi. > * minor changes from review. > > v5: > * rte_pktmbuf_attach_extbuf() sets headroom to 0. > * if shinfo is provided when attaching, user should initialize it. > * minor changes from review. > > v4: > * rte_pktmbuf_attach_extbuf() takes new arguments - buf_iova and shinfo. > user can pass memory for shared data via shinfo argument. > * minor changes from review. > > v3: > * implement external buffer attachment instead of introducing buf_off for > mbuf indirection. > > lib/librte_mbuf/rte_mbuf.h | 335 > +++++++++++++++++++++++++++++++++++++++++---- > 1 file changed, 306 insertions(+), 29 deletions(-) > > diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h > index 43aaa9c5f..0a6885281 100644 > --- a/lib/librte_mbuf/rte_mbuf.h > +++ b/lib/librte_mbuf/rte_mbuf.h > @@ -344,7 +344,10 @@ extern "C" { > PKT_TX_MACSEC | \ > PKT_TX_SEC_OFFLOAD) > > -#define __RESERVED (1ULL << 61) /**< reserved for future mbuf use > */ > +/** > + * Mbuf having an external buffer attached. shinfo in mbuf must be filled. > + */ > +#define EXT_ATTACHED_MBUF (1ULL << 61) > > #define IND_ATTACHED_MBUF (1ULL << 62) /**< Indirect attached mbuf */ > > @@ -584,8 +587,27 @@ struct rte_mbuf { > /** Sequence number. See also rte_reorder_insert(). */ > uint32_t seqn; > > + /** Shared data for external buffer attached to mbuf. See > + * rte_pktmbuf_attach_extbuf(). > + */ > + struct rte_mbuf_ext_shared_info *shinfo; > + > } __rte_cache_aligned; > > +/** > + * Function typedef of callback to free externally attached buffer. > + */ > +typedef void (*rte_mbuf_extbuf_free_callback_t)(void *addr, void *opaque); > + > +/** > + * Shared data at the end of an external buffer. > + */ > +struct rte_mbuf_ext_shared_info { > + rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */ > + void *fcb_opaque; /**< Free callback argument */ > + rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */ > +}; > + > /**< Maximum number of nb_segs allowed. */ > #define RTE_MBUF_MAX_NB_SEGS UINT16_MAX > > @@ -706,14 +728,34 @@ rte_mbuf_to_baddr(struct rte_mbuf *md) > } > > /** > + * Returns TRUE if given mbuf is cloned by mbuf indirection, or FALSE > + * otherwise. > + * > + * If a mbuf has its data in another mbuf and references it by mbuf > + * indirection, this mbuf can be defined as a cloned mbuf. > + */ > +#define RTE_MBUF_CLONED(mb) ((mb)->ol_flags & IND_ATTACHED_MBUF) > + > +/** > * Returns TRUE if given mbuf is indirect, or FALSE otherwise. > */ > -#define RTE_MBUF_INDIRECT(mb) ((mb)->ol_flags & IND_ATTACHED_MBUF) > +#define RTE_MBUF_INDIRECT(mb) RTE_MBUF_CLONED(mb) > + > +/** > + * Returns TRUE if given mbuf has an external buffer, or FALSE otherwise. > + * > + * External buffer is a user-provided anonymous buffer. > + */ > +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF) > > /** > * Returns TRUE if given mbuf is direct, or FALSE otherwise. > + * > + * If a mbuf embeds its own data after the rte_mbuf structure, this mbuf > + * can be defined as a direct mbuf. > */ > -#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb)) > +#define RTE_MBUF_DIRECT(mb) \ > + (!((mb)->ol_flags & (IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF))) > > /** > * Private data in case of pktmbuf pool. > @@ -839,6 +881,58 @@ rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t > new_value) > > #endif /* RTE_MBUF_REFCNT_ATOMIC */ > > +/** > + * Reads the refcnt of an external buffer. > + * > + * @param shinfo > + * Shared data of the external buffer. > + * @return > + * Reference count number. > + */ > +static inline uint16_t > +rte_mbuf_ext_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo) > +{ > + return (uint16_t)(rte_atomic16_read(&shinfo->refcnt_atomic)); > +} > + > +/** > + * Set refcnt of an external buffer. > + * > + * @param shinfo > + * Shared data of the external buffer. > + * @param new_value > + * Value set > + */ > +static inline void > +rte_mbuf_ext_refcnt_set(struct rte_mbuf_ext_shared_info *shinfo, > + uint16_t new_value) > +{ > + rte_atomic16_set(&shinfo->refcnt_atomic, new_value); > +} > + > +/** > + * Add given value to refcnt of an external buffer and return its new > + * value. > + * > + * @param shinfo > + * Shared data of the external buffer. > + * @param value > + * Value to add/subtract > + * @return > + * Updated value > + */ > +static inline uint16_t > +rte_mbuf_ext_refcnt_update(struct rte_mbuf_ext_shared_info *shinfo, > + int16_t value) > +{ > + if (likely(rte_mbuf_ext_refcnt_read(shinfo) == 1)) { > + rte_mbuf_ext_refcnt_set(shinfo, 1 + value); > + return 1 + value; > + } > + > + return (uint16_t)rte_atomic16_add_return(&shinfo->refcnt_atomic, value); > +} > + > /** Mbuf prefetch */ > #define RTE_MBUF_PREFETCH_TO_FREE(m) do { \ > if ((m) != NULL) \ > @@ -1213,11 +1307,157 @@ static inline int rte_pktmbuf_alloc_bulk(struct > rte_mempool *pool, > } > > /** > + * Initialize shared data at the end of an external buffer before attaching > + * to a mbuf by ``rte_pktmbuf_attach_extbuf()``. This is not a mandatory > + * initialization but a helper function to simply spare a few bytes at the > + * end of the buffer for shared data. If shared data is allocated > + * separately, this should not be called but application has to properly > + * initialize the shared data according to its need. > + * > + * Free callback and its argument is saved and the refcnt is set to 1. > + * > + * @warning > + * buf_len must be adjusted to RTE_PTR_DIFF(shinfo, buf_addr) after this > + * initialization. For example, > + * > + * struct rte_mbuf_ext_shared_info *shinfo = > + * rte_pktmbuf_ext_shinfo_init_helpfer(buf_addr, buf_len, > + * free_cb, fcb_arg); > + * buf_len = RTE_PTR_DIFF(shinfo, buf_addr); > + * rte_pktmbuf_attach_extbuf(m, buf_addr, buf_iova, buf_len, shinfo); > + * > + * @param buf_addr > + * The pointer to the external buffer. > + * @param buf_len > + * The size of the external buffer. buf_len must be larger than the size > + * of ``struct rte_mbuf_ext_shared_info`` and padding for alignment. If > + * not enough, this function will return NULL. > + * @param free_cb > + * Free callback function to call when the external buffer needs to be > + * freed. > + * @param fcb_opaque > + * Argument for the free callback function. > + * > + * @return > + * A pointer to the initialized shared data on success, return NULL > + * otherwise. > + */ > +static inline struct rte_mbuf_ext_shared_info * > +rte_pktmbuf_ext_shinfo_init_helper(void *buf_addr, uint16_t buf_len, > + rte_mbuf_extbuf_free_callback_t free_cb, void *fcb_opaque) > +{ > + struct rte_mbuf_ext_shared_info *shinfo; > + void *buf_end = RTE_PTR_ADD(buf_addr, buf_len); > + > + shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, > + sizeof(*shinfo)), sizeof(uintptr_t)); > + if ((void *)shinfo <= buf_addr) > + return NULL; > + > + rte_mbuf_ext_refcnt_set(shinfo, 1); > + > + shinfo->free_cb = free_cb; > + shinfo->fcb_opaque = fcb_opaque; > + > + /* buf_len must be adjusted to RTE_PTR_DIFF(shinfo, buf_addr) */ > + return shinfo; > +} > + > +/** > + * Attach an external buffer to a mbuf. > + * > + * User-managed anonymous buffer can be attached to an mbuf. When attaching > + * it, corresponding free callback function and its argument should be > + * provided via shinfo. This callback function will be called once all the > + * mbufs are detached from the buffer (refcnt becomes zero). > + * > + * The headroom for the attaching mbuf will be set to zero and this can be > + * properly adjusted after attachment. For example, ``rte_pktmbuf_adj()`` > + * or ``rte_pktmbuf_reset_headroom()`` might be used. > + * > + * More mbufs can be attached to the same external buffer by > + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by > + * this API. > + * > + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or > + * ``rte_pktmbuf_detach()``. > + * > + * Memory for shared data must be provided and user must initialize all of > + * the content properly, escpecially free callback and refcnt. The pointer > + * of shared data will be stored in m->shinfo. > + * ``rte_pktmbuf_ext_shinfo_init_helper`` can help to simply spare a few > + * bytes at the end of buffer for the shared data, store free callback and > + * its argument and set the refcnt to 1. > + * > + * Attaching an external buffer is quite similar to mbuf indirection in > + * replacing buffer addresses and length of a mbuf, but a few differences: > + * - When an indirect mbuf is attached, refcnt of the direct mbuf would be > + * 2 as long as the direct mbuf itself isn't freed after the attachment. > + * In such cases, the buffer area of a direct mbuf must be read-only. But > + * external buffer has its own refcnt and it starts from 1. Unless > + * multiple mbufs are attached to a mbuf having an external buffer, the > + * external buffer is writable. > + * - There's no need to allocate buffer from a mempool. Any buffer can be > + * attached with appropriate free callback and its IO address. > + * - Smaller metadata is required to maintain shared data such as refcnt. > + * > + * @warning > + * @b EXPERIMENTAL: This API may change without prior notice. > + * Once external buffer is enabled by allowing experimental API, > + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer > + * exclusive. A mbuf can be considered direct if it is neither indirect nor > + * having external buffer. > + * > + * @param m > + * The pointer to the mbuf. > + * @param buf_addr > + * The pointer to the external buffer. > + * @param buf_iova > + * IO address of the external buffer. > + * @param buf_len > + * The size of the external buffer. > + * @param shinfo > + * User-provided memory for shared data of the external buffer. > + */ > +static inline void __rte_experimental > +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr, > + rte_iova_t buf_iova, uint16_t buf_len, > + struct rte_mbuf_ext_shared_info *shinfo) > +{ > + /* mbuf should not be read-only */ > + RTE_ASSERT(RTE_MBUF_DIRECT(m) && rte_mbuf_refcnt_read(m) == 1); > + RTE_ASSERT(shinfo->free_cb != NULL); > + > + m->buf_addr = buf_addr; > + m->buf_iova = buf_iova; > + m->buf_len = buf_len; > + > + m->data_len = 0; > + m->data_off = 0; > + > + m->ol_flags |= EXT_ATTACHED_MBUF; > + m->shinfo = shinfo; > +} > + > +/** > + * Detach the external buffer attached to a mbuf, same as > + * ``rte_pktmbuf_detach()`` > + * > + * @param m > + * The mbuf having external buffer. > + */ > +#define rte_pktmbuf_detach_extbuf(m) rte_pktmbuf_detach(m) > + > +/** > * Attach packet mbuf to another packet mbuf. > * > - * After attachment we refer the mbuf we attached as 'indirect', > - * while mbuf we attached to as 'direct'. > - * The direct mbuf's reference counter is incremented. > + * If the mbuf we are attaching to isn't a direct buffer and is attached to > + * an external buffer, the mbuf being attached will be attached to the > + * external buffer instead of mbuf indirection. > + * > + * Otherwise, the mbuf will be indirectly attached. After attachment we > + * refer the mbuf we attached as 'indirect', while mbuf we attached to as > + * 'direct'. The direct mbuf's reference counter is incremented. > * > * Right now, not supported: > * - attachment for already indirect mbuf (e.g. - mi has to be direct). > @@ -1231,19 +1471,20 @@ static inline int rte_pktmbuf_alloc_bulk(struct > rte_mempool *pool, > */ > static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf > *m) > { > - struct rte_mbuf *md; > - > RTE_ASSERT(RTE_MBUF_DIRECT(mi) && > rte_mbuf_refcnt_read(mi) == 1); > > - /* if m is not direct, get the mbuf that embeds the data */ > - if (RTE_MBUF_DIRECT(m)) > - md = m; > - else > - md = rte_mbuf_from_indirect(m); > + if (RTE_MBUF_HAS_EXTBUF(m)) { > + rte_mbuf_ext_refcnt_update(m->shinfo, 1); > + mi->ol_flags = m->ol_flags; > + mi->shinfo = m->shinfo; > + } else { > + /* if m is not direct, get the mbuf that embeds the data */ > + rte_mbuf_refcnt_update(rte_mbuf_from_indirect(m), 1); > + mi->priv_size = m->priv_size; > + mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF; > + } > > - rte_mbuf_refcnt_update(md, 1); > - mi->priv_size = m->priv_size; > mi->buf_iova = m->buf_iova; > mi->buf_addr = m->buf_addr; > mi->buf_len = m->buf_len; > @@ -1259,7 +1500,6 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf > *mi, struct rte_mbuf *m) > mi->next = NULL; > mi->pkt_len = mi->data_len; > mi->nb_segs = 1; > - mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF; > mi->packet_type = m->packet_type; > mi->timestamp = m->timestamp; > > @@ -1268,12 +1508,52 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf > *mi, struct rte_mbuf *m) > } > > /** > - * Detach an indirect packet mbuf. > + * @internal used by rte_pktmbuf_detach(). > * > + * Decrement the reference counter of the external buffer. When the > + * reference counter becomes 0, the buffer is freed by pre-registered > + * callback. > + */ > +static inline void > +__rte_pktmbuf_free_extbuf(struct rte_mbuf *m) > +{ > + RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m)); > + RTE_ASSERT(m->shinfo != NULL); > + > + if (rte_mbuf_ext_refcnt_update(m->shinfo, -1) == 0) > + m->shinfo->free_cb(m->buf_addr, m->shinfo->fcb_opaque); > +} > + > +/** > + * @internal used by rte_pktmbuf_detach(). > + * > + * Decrement the direct mbuf's reference counter. When the reference > + * counter becomes 0, the direct mbuf is freed. > + */ > +static inline void > +__rte_pktmbuf_free_direct(struct rte_mbuf *m) > +{ > + struct rte_mbuf *md; > + > + RTE_ASSERT(RTE_MBUF_INDIRECT(m)); > + > + md = rte_mbuf_from_indirect(m); > + > + if (rte_mbuf_refcnt_update(md, -1) == 0) { > + md->next = NULL; > + md->nb_segs = 1; > + rte_mbuf_refcnt_set(md, 1); > + rte_mbuf_raw_free(md); > + } > +} > + > +/** > + * Detach a packet mbuf from external buffer or direct buffer. > + * > + * - decrement refcnt and free the external/direct buffer if refcnt > + * becomes zero. > * - restore original mbuf address and length values. > * - reset pktmbuf data and data_len to their default values. > - * - decrement the direct mbuf's reference counter. When the > - * reference counter becomes 0, the direct mbuf is freed. > * > * All other fields of the given packet mbuf will be left intact. > * > @@ -1282,10 +1562,14 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf > *mi, struct rte_mbuf *m) > */ > static inline void rte_pktmbuf_detach(struct rte_mbuf *m) > { > - struct rte_mbuf *md = rte_mbuf_from_indirect(m); > struct rte_mempool *mp = m->pool; > uint32_t mbuf_size, buf_len, priv_size; > > + if (RTE_MBUF_HAS_EXTBUF(m)) > + __rte_pktmbuf_free_extbuf(m); > + else > + __rte_pktmbuf_free_direct(m); > + > priv_size = rte_pktmbuf_priv_size(mp); > mbuf_size = sizeof(struct rte_mbuf) + priv_size; > buf_len = rte_pktmbuf_data_room_size(mp); > @@ -1297,13 +1581,6 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf > *m) > rte_pktmbuf_reset_headroom(m); > m->data_len = 0; > m->ol_flags = 0; > - > - if (rte_mbuf_refcnt_update(md, -1) == 0) { > - md->next = NULL; > - md->nb_segs = 1; > - rte_mbuf_refcnt_set(md, 1); > - rte_mbuf_raw_free(md); > - } > } > > /** > @@ -1327,7 +1604,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m) > > if (likely(rte_mbuf_refcnt_read(m) == 1)) { > > - if (RTE_MBUF_INDIRECT(m)) > + if (!RTE_MBUF_DIRECT(m)) > rte_pktmbuf_detach(m); > > if (m->next != NULL) { > @@ -1339,7 +1616,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m) > > } else if (__rte_mbuf_refcnt_update(m, -1) == 0) { > > - if (RTE_MBUF_INDIRECT(m)) > + if (!RTE_MBUF_DIRECT(m)) > rte_pktmbuf_detach(m); > > if (m->next != NULL) { > -- > 2.11.0