On Wed, Oct 25, 2023 at 03:49:50PM -0700, Tyler Retzlaff wrote:
> On Wed, Oct 25, 2023 at 10:06:23AM +0000, Konstantin Ananyev wrote:
> > 
> > 
> > > 
> > > On Tue, Oct 24, 2023 at 09:43:13AM +0100, Konstantin Ananyev wrote:
> > > > 17.10.2023 21:31, Tyler Retzlaff wrote:
> > > > >Replace the use of gcc builtin __atomic_xxx intrinsics with
> > > > >corresponding rte_atomic_xxx optional stdatomic API
> > > > >
> > > > >Signed-off-by: Tyler Retzlaff <roret...@linux.microsoft.com>
> > > > >---
> > > > >  drivers/net/mlx5/mlx5_hws_cnt.h   |  2 +-
> > > > >  lib/ring/rte_ring_c11_pvt.h       | 33 +++++++++++++++++----------------
> > > > >  lib/ring/rte_ring_core.h          | 10 +++++-----
> > > > >  lib/ring/rte_ring_generic_pvt.h   |  3 ++-
> > > > >  lib/ring/rte_ring_hts_elem_pvt.h  | 22 ++++++++++++----------
> > > > >  lib/ring/rte_ring_peek_elem_pvt.h |  6 +++---
> > > > >  lib/ring/rte_ring_rts_elem_pvt.h  | 27 ++++++++++++++-------------
> > > > >  7 files changed, 54 insertions(+), 49 deletions(-)
> > > > >
> > > > >diff --git a/drivers/net/mlx5/mlx5_hws_cnt.h b/drivers/net/mlx5/mlx5_hws_cnt.h
> > > > >index f462665..cc9ac10 100644
> > > > >--- a/drivers/net/mlx5/mlx5_hws_cnt.h
> > > > >+++ b/drivers/net/mlx5/mlx5_hws_cnt.h
> > > > >@@ -394,7 +394,7 @@ struct mlx5_hws_age_param {
> > > > >       __rte_ring_get_elem_addr(r, revert2head, sizeof(cnt_id_t), n,
> > > > >                       &zcd->ptr1, &zcd->n1, &zcd->ptr2);
> > > > >       /* Update tail */
> > > > >-      __atomic_store_n(&r->prod.tail, revert2head, __ATOMIC_RELEASE);
> > > > >+      rte_atomic_store_explicit(&r->prod.tail, revert2head, rte_memory_order_release);
> > > > >       return n;
> > > > >  }
> > > > >diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h
> > > > >index f895950..f8be538 100644
> > > > >--- a/lib/ring/rte_ring_c11_pvt.h
> > > > >+++ b/lib/ring/rte_ring_c11_pvt.h
> > > > >@@ -22,9 +22,10 @@
> > > > >        * we need to wait for them to complete
> > > > >        */
> > > > >       if (!single)
> > > > >-              rte_wait_until_equal_32(&ht->tail, old_val, __ATOMIC_RELAXED);
> > > > >+              rte_wait_until_equal_32((volatile uint32_t *)(uintptr_t)&ht->tail, old_val,
> > > > >+                      rte_memory_order_relaxed);
> > > > >-      __atomic_store_n(&ht->tail, new_val, __ATOMIC_RELEASE);
> > > > >+      rte_atomic_store_explicit(&ht->tail, new_val, rte_memory_order_release);
> > > > >  }
> > > > >  /**
> > > > >@@ -61,19 +62,19 @@
> > > > >       unsigned int max = n;
> > > > >       int success;
> > > > >-      *old_head = __atomic_load_n(&r->prod.head, __ATOMIC_RELAXED);
> > > > >+      *old_head = rte_atomic_load_explicit(&r->prod.head, rte_memory_order_relaxed);
> > > > >       do {
> > > > >               /* Reset n to the initial burst count */
> > > > >               n = max;
> > > > >               /* Ensure the head is read before tail */
> > > > >-              __atomic_thread_fence(__ATOMIC_ACQUIRE);
> > > > >+              __atomic_thread_fence(rte_memory_order_acquire);
> > > > >               /* load-acquire synchronize with store-release of ht->tail
> > > > >                * in update_tail.
> > > > >                */
> > > > >-              cons_tail = __atomic_load_n(&r->cons.tail,
> > > > >-                                      __ATOMIC_ACQUIRE);
> > > > >+              cons_tail = rte_atomic_load_explicit(&r->cons.tail,
> > > > >+                                      rte_memory_order_acquire);
> > > > >               /* The subtraction is done between two unsigned 32bits value
> > > > >                * (the result is always modulo 32 bits even if we have
> > > > >@@ -95,10 +96,10 @@
> > > > >                       r->prod.head = *new_head, success = 1;
> > > > >               else
> > > > >                       /* on failure, *old_head is updated */
> > > > >-                      success = __atomic_compare_exchange_n(&r->prod.head,
> > > > >+                      success = rte_atomic_compare_exchange_strong_explicit(&r->prod.head,
> > > > >                                       old_head, *new_head,
> > > > >-                                      0, __ATOMIC_RELAXED,
> > > > >-                                      __ATOMIC_RELAXED);
> > > > >+                                      rte_memory_order_relaxed,
> > > > >+                                      rte_memory_order_relaxed);
> > > > >       } while (unlikely(success == 0));
> > > > >       return n;
> > > > >  }
> > > > >@@ -137,19 +138,19 @@
> > > > >       int success;
> > > > >       /* move cons.head atomically */
> > > > >-      *old_head = __atomic_load_n(&r->cons.head, __ATOMIC_RELAXED);
> > > > >+      *old_head = rte_atomic_load_explicit(&r->cons.head, rte_memory_order_relaxed);
> > > > >       do {
> > > > >               /* Restore n as it may change every loop */
> > > > >               n = max;
> > > > >               /* Ensure the head is read before tail */
> > > > >-              __atomic_thread_fence(__ATOMIC_ACQUIRE);
> > > > >+              __atomic_thread_fence(rte_memory_order_acquire);
> > > > >               /* this load-acquire synchronize with store-release of ht->tail
> > > > >                * in update_tail.
> > > > >                */
> > > > >-              prod_tail = __atomic_load_n(&r->prod.tail,
> > > > >-                                      __ATOMIC_ACQUIRE);
> > > > >+              prod_tail = rte_atomic_load_explicit(&r->prod.tail,
> > > > >+                                      rte_memory_order_acquire);
> > > > >               /* The subtraction is done between two unsigned 32bits value
> > > > >                * (the result is always modulo 32 bits even if we have
> > > > >@@ -170,10 +171,10 @@
> > > > >                       r->cons.head = *new_head, success = 1;
> > > > >               else
> > > > >                       /* on failure, *old_head will be updated */
> > > > >-                      success = __atomic_compare_exchange_n(&r->cons.head,
> > > > >+                      success = rte_atomic_compare_exchange_strong_explicit(&r->cons.head,
> > > > >                                                       old_head, *new_head,
> > > > >-                                                      0, __ATOMIC_RELAXED,
> > > > >-                                                      __ATOMIC_RELAXED);
> > > > >+                                                      rte_memory_order_relaxed,
> > > > >+                                                      rte_memory_order_relaxed);
> > > > >       } while (unlikely(success == 0));
> > > > >       return n;
> > > > >  }
> > > > >diff --git a/lib/ring/rte_ring_core.h b/lib/ring/rte_ring_core.h
> > > > >index 327fdcf..7a2b577 100644
> > > > >--- a/lib/ring/rte_ring_core.h
> > > > >+++ b/lib/ring/rte_ring_core.h
> > > > >@@ -67,7 +67,7 @@ enum rte_ring_sync_type {
> > > > >   */
> > > > >  struct rte_ring_headtail {
> > > > >       volatile uint32_t head;      /**< prod/consumer head. */
> > > > >-      volatile uint32_t tail;      /**< prod/consumer tail. */
> > > > >+      volatile RTE_ATOMIC(uint32_t) tail;      /**< prod/consumer tail. */
> > > >
> > > > Probably a stupid q:
> > > > why we do need RTE_ATOMIC() around tail only?
> > > > Why head is not affected?
> > > 
> > > good eye. this is a fairly common issue i've seen while doing the
> > > conversion, and some interesting things are showing up.
> > > 
> > > the qualified field has atomic operations performed on it in the
> > > implementation, while the other field does not. that may indicate a
> > > bug in the existing code, or it may be intentional.
> > 
> > Hmm... but as far as I can see, we are doing similar operations on both
> > head and tail.
> > For head it would be: atomic_load(), then either atomic_store() or
> > atomic_cas().
> > For tail it would be: atomic_load(), then atomic_store().
> > Or is that because we missed an atomic_store(&r->prod.head, ..., RELAXED)
> > here:
> > static __rte_always_inline unsigned int
> > __rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp,
> >                 unsigned int n, enum rte_ring_queue_behavior behavior,
> >                 uint32_t *old_head, uint32_t *new_head,
> >                 uint32_t *free_entries)
> > {
> > ....
> > if (is_sp)
> >                         r->prod.head = *new_head, success = 1;
> > 
> > ?
> 
> for this instance you are correct. i need to understand why this builds
> successfully, because it shouldn't. that it doesn't fail is probably not
> harmful, but since this is a public header and the structure is visible,
> it's best to have it carry the correct RTE_ATOMIC(T).
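> 
> to illustrate, i'd expect the fix to look something like this (just a
> sketch, untested, presuming the struct field gets the RTE_ATOMIC
> specifier first):
> 
>     /* hypothetical: make the single-producer head update an explicit
>      * relaxed atomic store instead of a plain volatile assignment */
>     if (is_sp) {
>             rte_atomic_store_explicit(&r->prod.head, *new_head,
>                             rte_memory_order_relaxed);
>             success = 1;
>     }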
> 
> i'll reply back with what i find.

okay, circling back to answer. simply put, we don't seem to have any CI
or convenient way to configure a build with RTE_USE_C11_MEM_MODEL, so
rte_ring_c11_pvt.h doesn't get built and that field is never accessed
through a standard atomic generic function.

this explains why it doesn't fail to build and why the field didn't
stick out as needing to be properly specified, which it should be.

i'll submit a new series that catches the other places this series
missed when building with RTE_USE_C11_MEM_MODEL.
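
for reference, i'd expect the headtail struct to end up shaped roughly
like this (sketch of the follow-up, not the final patch):

    struct rte_ring_headtail {
            /* both fields qualified so the c11 and non-c11 paths agree */
            volatile RTE_ATOMIC(uint32_t) head; /**< prod/consumer head. */
            volatile RTE_ATOMIC(uint32_t) tail; /**< prod/consumer tail. */
            /* remaining fields unchanged */
    };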

thanks

> 
> thanks
> 
> > 
> > > 
> > > case 1. atomics should be used but they aren't.
> > > 
> > > there are fields in structures and variables that are accessed in a
> > > 'mixed' manner: in some instances __atomic_op_xxx is used on them, and
> > > in other instances not. sometimes it is just the initialization path,
> > > which is probably okay; sometimes maybe not...
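> > > 
> > > a contrived sketch of the pattern (field names hypothetical, not code
> > > from the tree):
> > > 
> > >     /* init path, object not yet visible to other threads: plain
> > >      * store, probably fine */
> > >     ht->tail = 0;
> > > 
> > >     /* shared path: atomic store with explicit ordering */
> > >     rte_atomic_store_explicit(&ht->tail, new_val,
> > >                     rte_memory_order_release);
> > > 
> > >     /* elsewhere, same field read plainly: the suspicious 'mixed' case */
> > >     if (ht->tail != old_val)
> > >             return;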
> > > 
> > > case 2. broader scope atomic operation, or we don't care if narrower
> > >         access is atomic.
> > > 
> > > e.g.
> > > union {
> > >    struct {
> > >        uint32_t head;
> > >        RTE_ATOMIC(uint32_t) tail;
> > >    };
> > >    RTE_ATOMIC(uint64_t) combined;
> > > };
> > > 
> > > again, this could be an indication of a missing atomic. often the
> > > operations on the `combined' field consistently use atomics, but one
> > > of the head/tail fields does not. on purpose? maybe, if we are just
> > > doing an == comparison?
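> > > 
> > > i.e. something along these lines (names hypothetical), where taking a
> > > snapshot is what would make the plain == comparison safe:
> > > 
> > >     union head_tail ht_snapshot;
> > > 
> > >     /* one 64-bit load captures head and tail together, atomically */
> > >     ht_snapshot.combined = rte_atomic_load_explicit(&ht->combined,
> > >                     rte_memory_order_acquire);
> > > 
> > >     /* fields of the private copy can be compared without atomics */
> > >     if (ht_snapshot.head == ht_snapshot.tail)
> > >             return 0; /* observed empty at the instant of the snapshot */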
> > > 
> > > my approach in this series prioritized no functional change. as a
> > > result, if any of the above are real bugs they stay real bugs, but i
> > > have not changed the way the variables are accessed. if i were to
> > > change the code and start specifying atomics, there is a risk of
> > > performance regression (for the cases that aren't bugs): once a
> > > variable is specified atomic, any access that does not go through an
> > > atomic generic function with an explicit ordering is code generated
> > > with the strongest ordering, seq_cst.
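> > > 
> > > i.e. (sketch of the concern, not code from the tree):
> > > 
> > >     RTE_ATOMIC(uint32_t) tail;
> > > 
> > >     /* with stdatomics enabled, this plain read of an _Atomic object
> > >      * is implicitly seq_cst, where today it is just a plain load */
> > >     uint32_t t = tail;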
> > > 
> > > there is another case which comes up half a dozen times or so that is
> > > also concerning to me, but i would need the maintainers of the code to
> > > adapt the code to be correct or maybe it is okay...
> > > 
> > > 
> > > case 3. qualification discard .. is the existing code really okay?
> > > 
> > > e.g.
> > > 
> > > atomic_compare_exchange(*object, *expected, desired, ...)
> > > 
> > > the issue is the qualification of the memory aliased by expected. gcc
> > > doesn't complain about, or enforce, discarding that qualification when
> > > the builtin intrinsics are used. the result is that if expected points
> > > at an atomic type, it may be accessed in a non-atomic manner by the
> > > code generated for the atomic operation.
> > > 
> > > again, i have chosen to maintain existing behavior by casting away the
> > > qualification if present on the expected argument.
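> > > 
> > > a minimal sketch of what that looks like (names hypothetical):
> > > 
> > >     RTE_ATOMIC(uint32_t) head;
> > >     RTE_ATOMIC(uint32_t) expected; /* 'expected' is itself atomic */
> > >     uint32_t desired = 1;
> > > 
> > >     /* the cast discards the atomic qualifier; on failure 'expected'
> > >      * is written back non-atomically, matching the old builtin's
> > >      * behavior */
> > >     rte_atomic_compare_exchange_strong_explicit(&head,
> > >                     (uint32_t *)(uintptr_t)&expected, desired,
> > >                     rte_memory_order_relaxed, rte_memory_order_relaxed);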
> > > 
> > > in terms of mutating the source tree, i feel it is best to separate
> > > the conversion to atomic specified/qualified types into this series
> > > and follow up with additional changes that may have
> > > functional/performance impact, if for no other reason than it narrows
> > > where you have to look when something changes. certainly the
> > > conversion to atomics has made these cases far easier to spot in the
> > > code.
> > > 
> > > finally, for most toolchains/targets all of this is pretty moot,
> > > because most of them default to enable_stdatomics=false. if there are
> > > problems, they will most likely manifest only on windows built with
> > > msvc.
> > > 
> > > thoughts?
> > > 
