This patch replaces use of the deprecated rte_atomic32 code with
GCC builtin atomic operations.

Although it would be preferable to use C11 version on all architectures,
there is a performance loss if we do it that way:

Measured on i9-13900H, two physical cores MP/MC bulk n=128, 10 runs:
  with C11 builtin:           5.86 cycles/elem
  with __sync builtin:        5.36 cycles/elem  (-9.4%)

The C11 __atomic_compare_exchange_n builtin writes the actual value back
to its expected pointer on failure. On x86 this forces GCC
to emit extra instructions on the critical path between the CAS
and the success-test.

__sync_bool_compare_and_swap returns a plain bool with no pointer
writeback, allowing GCC to emit tighter code.

Signed-off-by: Stephen Hemminger <[email protected]>
---
 lib/ring/meson.build                          |  2 +-
 lib/ring/rte_ring_c11_pvt.h                   |  3 +-
 lib/ring/rte_ring_elem_pvt.h                  |  2 +-
 ..._ring_generic_pvt.h => rte_ring_gcc_pvt.h} | 37 +++++++++++--------
 4 files changed, 24 insertions(+), 20 deletions(-)
 rename lib/ring/{rte_ring_generic_pvt.h => rte_ring_gcc_pvt.h} (87%)

diff --git a/lib/ring/meson.build b/lib/ring/meson.build
index 21f2c12989..2ba160b178 100644
--- a/lib/ring/meson.build
+++ b/lib/ring/meson.build
@@ -9,7 +9,7 @@ indirect_headers += files (
         'rte_ring_elem.h',
         'rte_ring_elem_pvt.h',
         'rte_ring_c11_pvt.h',
-        'rte_ring_generic_pvt.h',
+        'rte_ring_gcc_pvt.h',
         'rte_ring_hts.h',
         'rte_ring_hts_elem_pvt.h',
         'rte_ring_peek.h',
diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h
index 5afc14dec9..8358b0f21f 100644
--- a/lib/ring/rte_ring_c11_pvt.h
+++ b/lib/ring/rte_ring_c11_pvt.h
@@ -43,7 +43,6 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t 
old_val,
         */
        rte_atomic_store_explicit(&ht->tail, new_val, rte_memory_order_release);
 }
-
 /**
  * @internal This is a helper function that moves the producer/consumer head
  *    optimized for single threaded case
@@ -82,7 +81,7 @@ __rte_ring_headtail_move_head_st(struct rte_ring_headtail *d,
        /* Single producer: only this thread writes d->head,
         * so a relaxed load is sufficient.
         */
-       *old_head = rte_atomic_load_explicit(&d->head, 
rte_memory_order_relaxed);
+       *old_head = rte_atomic_load_explicit(&d->head,  
rte_memory_order_acquire);
 
        /* Acquire pairs with the consumer's release-store of tail in 
__rte_ring_update_tail,
         * ensuring the consumer's ring-element reads are complete before
diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h
index a0fdec9812..9a0170c4f0 100644
--- a/lib/ring/rte_ring_elem_pvt.h
+++ b/lib/ring/rte_ring_elem_pvt.h
@@ -309,7 +309,7 @@ __rte_ring_dequeue_elems(struct rte_ring *r, uint32_t 
cons_head,
 #ifdef RTE_USE_C11_MEM_MODEL
 #include "rte_ring_c11_pvt.h"
 #else
-#include "rte_ring_generic_pvt.h"
+#include "rte_ring_gcc_pvt.h"
 #endif
 
 /**
diff --git a/lib/ring/rte_ring_generic_pvt.h b/lib/ring/rte_ring_gcc_pvt.h
similarity index 87%
rename from lib/ring/rte_ring_generic_pvt.h
rename to lib/ring/rte_ring_gcc_pvt.h
index c044b0824f..9033a15647 100644
--- a/lib/ring/rte_ring_generic_pvt.h
+++ b/lib/ring/rte_ring_gcc_pvt.h
@@ -7,11 +7,11 @@
  * Used as BSD-3 Licensed with permission from Kip Macy.
  */
 
-#ifndef _RTE_RING_GENERIC_PVT_H_
-#define _RTE_RING_GENERIC_PVT_H_
+#ifndef _RTE_RING_GCC_PVT_H_
+#define _RTE_RING_GCC_PVT_H_
 
 /**
- * @file rte_ring_generic_pvt.h
+ * @file rte_ring_gcc_pvt.h
  * It is not recommended to include this file directly,
  * include <rte_ring.h> instead.
  * Contains internal helper functions for MP/SP and MC/SC ring modes.
@@ -25,10 +25,8 @@ static __rte_always_inline void
 __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
                uint32_t new_val, uint32_t single, uint32_t enqueue)
 {
-       if (enqueue)
-               rte_smp_wmb();
-       else
-               rte_smp_rmb();
+       RTE_SET_USED(enqueue);
+
        /*
         * If there are other enqueues/dequeues in progress that preceded us,
         * we need to wait for them to complete
@@ -37,7 +35,12 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, 
uint32_t old_val,
                rte_wait_until_equal_32((volatile uint32_t 
*)(uintptr_t)&ht->tail, old_val,
                        rte_memory_order_relaxed);
 
-       ht->tail = new_val;
+       /*
+        * R0: Establishes a synchronizing edge with load-acquire of tail at A1.
+        * Ensures that memory effects by this thread on ring elements array
+        * is observed by a different thread of the other type.
+        */
+       __atomic_store_n(&ht->tail, new_val, __ATOMIC_RELEASE);
 }
 
 /**
@@ -72,8 +75,8 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
                unsigned int n, enum rte_ring_queue_behavior behavior,
                uint32_t *old_head, uint32_t *new_head, uint32_t *entries)
 {
+       bool success;
        unsigned int max = n;
-       int success;
 
        do {
                /* Reset n to the initial burst count */
@@ -81,10 +84,10 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail 
*d,
 
                *old_head = d->head;
 
-               /* add rmb barrier to avoid load/load reorder in weak
+               /* add fence to avoid load/load reorder in weak
                 * memory model. It is noop on x86
                 */
-               rte_smp_rmb();
+               __atomic_thread_fence(__ATOMIC_ACQUIRE);
 
                /*
                 *  The subtraction is done between two unsigned 32bits value
@@ -92,7 +95,7 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
                 * *old_head > s->tail). So 'entries' is always between 0
                 * and capacity (which is < size).
                 */
-               *entries = (capacity + s->tail - *old_head);
+               *entries = capacity + s->tail - *old_head;
 
                /* check that we have enough room in ring */
                if (unlikely(n > *entries))
@@ -100,13 +103,15 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail 
*d,
                                        0 : *entries;
 
                if (n == 0)
-                       return 0;
+                       break;
 
                *new_head = *old_head + n;
-               success = rte_atomic32_cmpset(
+
+               success = __sync_bool_compare_and_swap(
                                (uint32_t *)(uintptr_t)&d->head,
                                *old_head, *new_head);
-       } while (unlikely(success == 0));
+       } while (unlikely(!success));
+
        return n;
 }
 
@@ -169,4 +174,4 @@ __rte_ring_headtail_move_head_st(struct rte_ring_headtail 
*d,
        return n;
 }
 
-#endif /* _RTE_RING_GENERIC_PVT_H_ */
+#endif /* _RTE_RING_GCC_PVT_H_ */
-- 
2.53.0

Reply via email to