> This patch replaces use of the deprecated rte_atomic32 code with
> GCC builtin atomic operations.
> 
> Although it would be preferable to use the C11 version on all architectures,
> doing so incurs a measurable performance loss:
> 
> Measured on i9-13900H, two physical cores MP/MC bulk n=128, 10 runs:
>   with C11 builtin:           5.86 cycles/elem
>   with __sync builtin:        5.36 cycles/elem  (-9.4%)
> 
> The C11 __atomic_compare_exchange_n builtin writes the observed value back
> through its 'expected' pointer on failure. On x86 this forces GCC
> to emit extra instructions on the critical path between the CAS
> and the success test.
> 
> __sync_bool_compare_and_swap returns a plain bool with no pointer
> writeback, allowing GCC to emit tighter code.
> 
> Signed-off-by: Stephen Hemminger <[email protected]>
> ---
>  lib/ring/meson.build                          |  2 +-
>  lib/ring/rte_ring_elem_pvt.h                  |  2 +-
>  ..._ring_generic_pvt.h => rte_ring_gcc_pvt.h} | 33 +++++++++++--------
>  3 files changed, 21 insertions(+), 16 deletions(-)
>  rename lib/ring/{rte_ring_generic_pvt.h => rte_ring_gcc_pvt.h} (88%)
> 
> diff --git a/lib/ring/meson.build b/lib/ring/meson.build
> index 21f2c12989..2ba160b178 100644
> --- a/lib/ring/meson.build
> +++ b/lib/ring/meson.build
> @@ -9,7 +9,7 @@ indirect_headers += files (
>          'rte_ring_elem.h',
>          'rte_ring_elem_pvt.h',
>          'rte_ring_c11_pvt.h',
> -        'rte_ring_generic_pvt.h',
> +        'rte_ring_gcc_pvt.h',
>          'rte_ring_hts.h',
>          'rte_ring_hts_elem_pvt.h',
>          'rte_ring_peek.h',
> diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h
> index a0fdec9812..9a0170c4f0 100644
> --- a/lib/ring/rte_ring_elem_pvt.h
> +++ b/lib/ring/rte_ring_elem_pvt.h
> @@ -309,7 +309,7 @@ __rte_ring_dequeue_elems(struct rte_ring *r, uint32_t
> cons_head,
>  #ifdef RTE_USE_C11_MEM_MODEL
>  #include "rte_ring_c11_pvt.h"
>  #else
> -#include "rte_ring_generic_pvt.h"
> +#include "rte_ring_gcc_pvt.h"
>  #endif
> 
>  /**
> diff --git a/lib/ring/rte_ring_generic_pvt.h b/lib/ring/rte_ring_gcc_pvt.h
> similarity index 88%
> rename from lib/ring/rte_ring_generic_pvt.h
> rename to lib/ring/rte_ring_gcc_pvt.h
> index c044b0824f..68ab1355e8 100644
> --- a/lib/ring/rte_ring_generic_pvt.h
> +++ b/lib/ring/rte_ring_gcc_pvt.h
> @@ -7,11 +7,11 @@
>   * Used as BSD-3 Licensed with permission from Kip Macy.
>   */
> 
> -#ifndef _RTE_RING_GENERIC_PVT_H_
> -#define _RTE_RING_GENERIC_PVT_H_
> +#ifndef _RTE_RING_GCC_PVT_H_
> +#define _RTE_RING_GCC_PVT_H_
> 
>  /**
> - * @file rte_ring_generic_pvt.h
> + * @file rte_ring_gcc_pvt.h
>   * It is not recommended to include this file directly,
>   * include <rte_ring.h> instead.
>   * Contains internal helper functions for MP/SP and MC/SC ring modes.
> @@ -25,10 +25,8 @@ static __rte_always_inline void
>  __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
>               uint32_t new_val, uint32_t single, uint32_t enqueue)
>  {
> -     if (enqueue)
> -             rte_smp_wmb();
> -     else
> -             rte_smp_rmb();
> +     RTE_SET_USED(enqueue);
> +
>       /*
>        * If there are other enqueues/dequeues in progress that preceded us,
>        * we need to wait for them to complete
> @@ -37,7 +35,12 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht,
> uint32_t old_val,
>               rte_wait_until_equal_32((volatile uint32_t 
> *)(uintptr_t)&ht->tail,
> old_val,
>                       rte_memory_order_relaxed);
> 
> -     ht->tail = new_val;
> +     /*
> +      * R0: Establishes a synchronizing edge with load-acquire of tail at A1.
> +      * Ensures that this thread's memory effects on the ring elements array
> +      * are observed by a different thread of the other type.
> +      */
> +     __atomic_store_n(&ht->tail, new_val, __ATOMIC_RELEASE);
>  }
> 
>  /**
> @@ -73,7 +76,7 @@ __rte_ring_headtail_move_head_mt(struct
> rte_ring_headtail *d,
>               uint32_t *old_head, uint32_t *new_head, uint32_t *entries)
>  {
>       unsigned int max = n;
> -     int success;
> +     bool success;
> 
>       do {
>               /* Reset n to the initial burst count */
> @@ -81,10 +84,10 @@ __rte_ring_headtail_move_head_mt(struct
> rte_ring_headtail *d,
> 
>               *old_head = d->head;
> 
> -             /* add rmb barrier to avoid load/load reorder in weak
> +             /* add fence to avoid load/load reorder in weak
>                * memory model. It is noop on x86
>                */
> -             rte_smp_rmb();
> +             __atomic_thread_fence(__ATOMIC_ACQUIRE);
> 
>               /*
>                *  The subtraction is done between two unsigned 32bits value
> @@ -103,10 +106,12 @@ __rte_ring_headtail_move_head_mt(struct
> rte_ring_headtail *d,
>                       return 0;
> 
>               *new_head = *old_head + n;
> -             success = rte_atomic32_cmpset(
> +
> +             success = __sync_bool_compare_and_swap(
>                               (uint32_t *)(uintptr_t)&d->head,
>                               *old_head, *new_head);
> -     } while (unlikely(success == 0));
> +     } while (unlikely(!success));
> +
>       return n;
>  }
> 
> @@ -169,4 +174,4 @@ __rte_ring_headtail_move_head_st(struct
> rte_ring_headtail *d,
>       return n;
>  }
> 
> -#endif /* _RTE_RING_GENERIC_PVT_H_ */
> +#endif /* _RTE_RING_GCC_PVT_H_ */
> --

Acked-by: Konstantin Ananyev <[email protected]>
Tested-by: Konstantin Ananyev <[email protected]>

> 2.53.0

Reply via email to