The function that moves the head in the single-threaded case can always use the C11 code; there is no performance difference from the GCC intrinsics.
This reduces the exception code to just one function. Signed-off-by: Stephen Hemminger <[email protected]> --- lib/ring/rte_ring_c11_pvt.h | 62 ------------------------------ lib/ring/rte_ring_elem_pvt.h | 74 +++++++++++++++++++++++++++++++++--- lib/ring/rte_ring_gcc_pvt.h | 59 ---------------------------- 3 files changed, 68 insertions(+), 127 deletions(-) diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h index 3258829696..0ba64379fa 100644 --- a/lib/ring/rte_ring_c11_pvt.h +++ b/lib/ring/rte_ring_c11_pvt.h @@ -19,68 +19,6 @@ * For more information please refer to <rte_ring.h>. */ -/** - * @internal This is a helper function that moves the producer/consumer head - * optimized for single threaded case - * - * @param d - * A pointer to the headtail structure with head value to be moved - * @param s - * A pointer to the counter-part headtail structure. Note that this - * function only reads tail value from it - * @param capacity - * Either ring capacity value (for producer), or zero (for consumer) - * @param n - * The number of elements we want to move head value on - * @param behavior - * RTE_RING_QUEUE_FIXED: Move on a fixed number of items - * RTE_RING_QUEUE_VARIABLE: Move on as many items as possible - * @param old_head - * Returns head value as it was before the move - * @param new_head - * Returns the new head value - * @param entries - * Returns the number of ring entries available BEFORE head was moved - * @return - * Actual number of objects the head was moved on - * If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only - */ -static __rte_always_inline unsigned int -__rte_ring_headtail_move_head_st(struct rte_ring_headtail *d, - const struct rte_ring_headtail *s, uint32_t capacity, - unsigned int n, - enum rte_ring_queue_behavior behavior, - uint32_t *old_head, uint32_t *new_head, uint32_t *entries) -{ - uint32_t stail; - - /* Single producer: only this thread writes d->head, - * so a relaxed load is sufficient. 
- */ - *old_head = rte_atomic_load_explicit(&d->head, rte_memory_order_acquire); - - /* Acquire pairs with the consumer's release-store of tail in __rte_ring_update_tail, - * ensuring the consumer's ring-element reads are complete before - * we observe the updated tail. - */ - stail = rte_atomic_load_explicit(&s->tail, rte_memory_order_acquire); - - /* Unsigned subtraction is modulo 2^32, so entries is always in - * [0, capacity) even if old_head > stail. - */ - *entries = capacity + stail - *old_head; - - /* check that we have enough room in ring */ - if (unlikely(n > *entries)) - n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : *entries; - - if (n > 0) { - *new_head = *old_head + n; - rte_atomic_store_explicit(&d->head, *new_head, rte_memory_order_relaxed); - } - - return n; -} /** * @internal This is a helper function that moves the producer/consumer head diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h index 74b5fef771..cd77343f38 100644 --- a/lib/ring/rte_ring_elem_pvt.h +++ b/lib/ring/rte_ring_elem_pvt.h @@ -319,12 +319,74 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val, rte_atomic_store_explicit(&ht->tail, new_val, rte_memory_order_release); } -/* Between load and load. there might be cpu reorder in weak model - * (powerpc/arm). - * There are 2 choices for the users - * 1.use rmb() memory barrier - * 2.use one-direction load_acquire/store_release barrier - * It depends on performance test results. +/** + * @internal This is a helper function that moves the producer/consumer head + * optimized for single threaded case + * + * @param d + * A pointer to the headtail structure with head value to be moved + * @param s + * A pointer to the counter-part headtail structure. 
Note that this + * function only reads tail value from it + * @param capacity + * Either ring capacity value (for producer), or zero (for consumer) + * @param n + * The number of elements we want to move head value on + * @param behavior + * RTE_RING_QUEUE_FIXED: Move on a fixed number of items + * RTE_RING_QUEUE_VARIABLE: Move on as many items as possible + * @param old_head + * Returns head value as it was before the move + * @param new_head + * Returns the new head value + * @param entries + * Returns the number of ring entries available BEFORE head was moved + * @return + * Actual number of objects the head was moved on + * If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only + */ +static __rte_always_inline unsigned int +__rte_ring_headtail_move_head_st(struct rte_ring_headtail *d, + const struct rte_ring_headtail *s, uint32_t capacity, + unsigned int n, + enum rte_ring_queue_behavior behavior, + uint32_t *old_head, uint32_t *new_head, uint32_t *entries) +{ + uint32_t stail; + + /* Single producer: only this thread writes d->head, + * so a relaxed load is sufficient. + */ + *old_head = rte_atomic_load_explicit(&d->head, rte_memory_order_acquire); + + /* Acquire pairs with the consumer's release-store of tail in __rte_ring_update_tail, + * ensuring the consumer's ring-element reads are complete before + * we observe the updated tail. + */ + stail = rte_atomic_load_explicit(&s->tail, rte_memory_order_acquire); + + /* Unsigned subtraction is modulo 2^32, so entries is always in + * [0, capacity) even if old_head > stail. + */ + *entries = capacity + stail - *old_head; + + /* check that we have enough room in ring */ + if (unlikely(n > *entries)) + n = (behavior == RTE_RING_QUEUE_FIXED) ? 
0 : *entries; + + if (n > 0) { + *new_head = *old_head + n; + rte_atomic_store_explicit(&d->head, *new_head, rte_memory_order_relaxed); + } + + return n; +} + +/* + * The function __rte_ring_headtail_move_head_mt has two versions + * based on what is most efficient on a given architecture. + * + * The C11 is preferred but on x86 GCC has 10% performance drop. */ #ifdef RTE_USE_C11_MEM_MODEL #include "rte_ring_c11_pvt.h" diff --git a/lib/ring/rte_ring_gcc_pvt.h b/lib/ring/rte_ring_gcc_pvt.h index 6b14c1c822..ec26fe557a 100644 --- a/lib/ring/rte_ring_gcc_pvt.h +++ b/lib/ring/rte_ring_gcc_pvt.h @@ -90,63 +90,4 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d, return n; } -/** - * @internal This is a helper function that moves the producer/consumer head - * optimized for single threaded case - * - * @param d - * A pointer to the headtail structure with head value to be moved - * @param s - * A pointer to the counter-part headtail structure. Note that this - * function only reads tail value from it - * @param capacity - * Either ring capacity value (for producer), or zero (for consumer) - * @param n - * The number of elements we want to move head value on - * @param behavior - * RTE_RING_QUEUE_FIXED: Move on a fixed number of items - * RTE_RING_QUEUE_VARIABLE: Move on as many items as possible - * @param old_head - * Returns head value as it was before the move - * @param new_head - * Returns the new head value - * @param entries - * Returns the number of ring entries available BEFORE head was moved - * @return - * Actual number of objects the head was moved on - * If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only - */ -static __rte_always_inline unsigned int -__rte_ring_headtail_move_head_st(struct rte_ring_headtail *d, - const struct rte_ring_headtail *s, uint32_t capacity, - unsigned int n, - enum rte_ring_queue_behavior behavior, - uint32_t *old_head, uint32_t *new_head, uint32_t *entries) -{ - *old_head = d->head; - - /* add rmb barrier to 
avoid load/load reorder in weak - * memory model. It is noop on x86 - */ - rte_smp_rmb(); - - /* - * The subtraction is done between two unsigned 32bits value - * (the result is always modulo 32 bits even if we have - * *old_head > s->tail). So 'entries' is always between 0 - * and capacity (which is < size). - */ - *entries = (capacity + s->tail - *old_head); - - /* check that we have enough room in ring */ - if (unlikely(n > *entries)) - n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : *entries; - - if (likely(n > 0)) { - *new_head = *old_head + n; - d->head = *new_head; - } - return n; -} - #endif /* _RTE_RING_GCC_PVT_H_ */ -- 2.53.0

