Hi David,

Thanks for your comments. I have addressed most of them in v10. Please
review it. Some comments inline.

> -----Original Message-----
> From: David Marchand <david.march...@redhat.com>
> Sent: Monday, October 14, 2019 11:44 PM
> To: Phil Yang (Arm Technology China) <phil.y...@arm.com>
> Cc: tho...@monjalon.net; jer...@marvell.com; Gage Eads
> <gage.e...@intel.com>; dev <dev@dpdk.org>; hemant.agra...@nxp.com;
> Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>; Gavin Hu (Arm
> Technology China) <gavin...@arm.com>; nd <n...@arm.com>
> Subject: Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic
> compare exchange
>
> On Wed, Aug 14, 2019 at 10:29 AM Phil Yang <phil.y...@arm.com> wrote:
> >
> > Add 128-bit atomic compare exchange on aarch64.
>
> A bit short, seeing the complexity of the code and the additional
> RTE_ARM_FEATURE_ATOMICS config flag.

Updated in v10.
<snip>

> >
> > +/*------------------------ 128 bit atomic operations -------------------------*/
> > +
> > +#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
> > +#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL || \
> > +        (mo) == __ATOMIC_SEQ_CST)
> > +
> > +#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE : __ATOMIC_RELAXED)
> > +#define __MO_STORE(mo) (__HAS_RLS((mo)) ? __ATOMIC_RELEASE : __ATOMIC_RELAXED)
>
> Those 4 first macros only make sense when LSE is not available (see
> below [1]).
> Besides, they are used only once, why not directly use those
> conditions where needed?

Agree. I removed __MO_LOAD and __MO_STORE in v10 and kept __HAS_ACQ and
__HAS_RLS under the non-LSE condition branch. I think they make the code
easier to read.

>
> >
> > +
> > +#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
> > +#define __ATOMIC128_CAS_OP(cas_op_name, op_string) \
> > +static __rte_noinline rte_int128_t \
> > +cas_op_name(rte_int128_t *dst, rte_int128_t old, \
> > +        rte_int128_t updated) \
> > +{ \
> > +    /* caspX instructions register pair must start from even-numbered
> > +     * register at operand 1.
> > +     * So, specify registers for local variables here.
> > +     */ \
> > +    register uint64_t x0 __asm("x0") = (uint64_t)old.val[0]; \
> > +    register uint64_t x1 __asm("x1") = (uint64_t)old.val[1]; \
> > +    register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0]; \
> > +    register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1]; \
> > +    asm volatile( \
> > +        op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]" \
> > +        : [old0] "+r" (x0), \
> > +          [old1] "+r" (x1) \
> > +        : [upd0] "r" (x2), \
> > +          [upd1] "r" (x3), \
> > +          [dst] "r" (dst) \
> > +        : "memory"); \
> > +    old.val[0] = x0; \
> > +    old.val[1] = x1; \
> > +    return old; \
> > +}
> > +
> > +__ATOMIC128_CAS_OP(__rte_cas_relaxed, "casp")
> > +__ATOMIC128_CAS_OP(__rte_cas_acquire, "caspa")
> > +__ATOMIC128_CAS_OP(__rte_cas_release, "caspl")
> > +__ATOMIC128_CAS_OP(__rte_cas_acq_rel, "caspal")
>
> If LSE is available, we expose __rte_cas_XX (explicitely) *non*
> inlined functions, while without LSE, we expose inlined __rte_ldr_XX
> and __rte_stx_XX functions.
> So we have a first disparity with non-inlined vs inlined functions
> depending on a #ifdef.
> Then, we have a second disparity with two sets of "apis" depending on
> this #ifdef.
>
> And we expose those sets with a rte_ prefix, meaning people will try
> to use them, but those are not part of a public api.
>
> Can't we do without them ? (see below [2] for a proposal with ldr/stx,
> cas should be the same)

No, it doesn't work, because we need to verify the return value at the
end of the loop for these macros.
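To make that concrete, here is a rough sketch of the non-LSE loop shape
(the __cas128_llsc_sketch name is hypothetical; it reuses the patch's
__rte_ldx_/__rte_stx_ helpers with fixed acquire/release orders for
brevity, so it is not the exact v10 code). The point is that the
stxp/stlxp status is what terminates the retry loop:

    static inline int
    __cas128_llsc_sketch(rte_int128_t *dst, rte_int128_t *exp,
            const rte_int128_t *src)
    {
        rte_int128_t expected = *exp;
        rte_int128_t desired = *src;
        rte_int128_t old;
        uint32_t ret = 1;    /* 1 == store-exclusive failed */

        do {
            old = __rte_ldx_acquire(dst);
            if (old.int128 == expected.int128)
                ret = __rte_stx_release(dst, desired);
            else
                /* write back 'old' to validate the exclusive load */
                ret = __rte_stx_release(dst, old);
        } while (ret != 0);

        *exp = old;
        return old.int128 == expected.int128;
    }

Here ret comes from the stxp/stlxp status and has to be checked to
decide whether to retry.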
> >
> > +#else
> > +#define __ATOMIC128_LDX_OP(ldx_op_name, op_string) \
> > +static inline rte_int128_t \
> > +ldx_op_name(const rte_int128_t *src) \
> > +{ \
> > +    rte_int128_t ret; \
> > +    asm volatile( \
> > +        op_string " %0, %1, %2" \
> > +        : "=&r" (ret.val[0]), \
> > +          "=&r" (ret.val[1]) \
> > +        : "Q" (src->val[0]) \
> > +        : "memory"); \
> > +    return ret; \
> > +}
> > +
> > +__ATOMIC128_LDX_OP(__rte_ldx_relaxed, "ldxp")
> > +__ATOMIC128_LDX_OP(__rte_ldx_acquire, "ldaxp")
> > +
> > +#define __ATOMIC128_STX_OP(stx_op_name, op_string) \
> > +static inline uint32_t \
> > +stx_op_name(rte_int128_t *dst, const rte_int128_t src) \
> > +{ \
> > +    uint32_t ret; \
> > +    asm volatile( \
> > +        op_string " %w0, %1, %2, %3" \
> > +        : "=&r" (ret) \
> > +        : "r" (src.val[0]), \
> > +          "r" (src.val[1]), \
> > +          "Q" (dst->val[0]) \
> > +        : "memory"); \
> > +    /* Return 0 on success, 1 on failure */ \
> > +    return ret; \
> > +}
> > +
> > +__ATOMIC128_STX_OP(__rte_stx_relaxed, "stxp")
> > +__ATOMIC128_STX_OP(__rte_stx_release, "stlxp")
> > +#endif
> > +
> > +static inline int __rte_experimental
>
> The __rte_experimental tag comes first.

Updated in v10.

> >
> > +rte_atomic128_cmp_exchange(rte_int128_t *dst,
> > +                           rte_int128_t *exp,
> > +                           const rte_int128_t *src,
> > +                           unsigned int weak,
> > +                           int success,
> > +                           int failure)
> > +{
> > +    /* Always do strong CAS */
> > +    RTE_SET_USED(weak);
> > +    /* Ignore memory ordering for failure, memory order for
> > +     * success must be stronger or equal
> > +     */
> > +    RTE_SET_USED(failure);
> > +    /* Find invalid memory order */
> > +    RTE_ASSERT(success == __ATOMIC_RELAXED
> > +        || success == __ATOMIC_ACQUIRE
> > +        || success == __ATOMIC_RELEASE
> > +        || success == __ATOMIC_ACQ_REL
> > +        || success == __ATOMIC_SEQ_CST);
> > +
> > +#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
> > +    rte_int128_t expected = *exp;
> > +    rte_int128_t desired = *src;
> > +    rte_int128_t old;
> > +
> > +    if (success == __ATOMIC_RELAXED)
> > +        old = __rte_cas_relaxed(dst, expected, desired);
> > +    else if (success == __ATOMIC_ACQUIRE)
> > +        old = __rte_cas_acquire(dst, expected, desired);
> > +    else if (success == __ATOMIC_RELEASE)
> > +        old = __rte_cas_release(dst, expected, desired);
> > +    else
> > +        old = __rte_cas_acq_rel(dst, expected, desired);
> > +#else
>
> 1: the four first macros (on the memory ordering constraints) can be
> moved here then undef'd once unused.
> Or you can just do without them.

Updated in v10.

> >
> > +    int ldx_mo = __MO_LOAD(success);
> > +    int stx_mo = __MO_STORE(success);
> > +    uint32_t ret = 1;
> > +    register rte_int128_t expected = *exp;
> > +    register rte_int128_t desired = *src;
> > +    register rte_int128_t old;
> > +
> > +    /* ldx128 can not guarantee atomic,
> > +     * Must write back src or old to verify atomicity of ldx128;
> > +     */
> > +    do {
> > +        if (ldx_mo == __ATOMIC_RELAXED)
> > +            old = __rte_ldx_relaxed(dst);
> > +        else
> > +            old = __rte_ldx_acquire(dst);
>
> 2: how about using a simple macro that gets passed the op string?
>
> Something like (untested):
>
> #define __READ_128(op_string, src, dst) \
>     asm volatile( \
>         op_string " %0, %1, %2" \
>         : "=&r" (dst.val[0]), \
>           "=&r" (dst.val[1]) \
>         : "Q" (src->val[0]) \
>         : "memory")
>
> Then used like this:
>
>     if (ldx_mo == __ATOMIC_RELAXED)
>         __READ_128("ldxp", dst, old);
>     else
>         __READ_128("ldaxp", dst, old);
>
> #undef __READ_128
>
> > +
> > +        if (likely(old.int128 == expected.int128)) {
> > +            if (stx_mo == __ATOMIC_RELAXED)
> > +                ret = __rte_stx_relaxed(dst, desired);
> > +            else
> > +                ret = __rte_stx_release(dst, desired);
> > +        } else {
> > +            /* In the failure case (since 'weak' is ignored and only
> > +             * weak == 0 is implemented), expected should contain
> > +             * the atomically read value of dst. This means, 'old'
> > +             * needs to be stored back to ensure it was read
> > +             * atomically.
> > +             */
> > +            if (stx_mo == __ATOMIC_RELAXED)
> > +                ret = __rte_stx_relaxed(dst, old);
> > +            else
> > +                ret = __rte_stx_release(dst, old);
>
> And:
>
> #define __STORE_128(op_string, dst, val, ret) \
>     asm volatile( \
>         op_string " %w0, %1, %2, %3" \
>         : "=&r" (ret) \
>         : "r" (val.val[0]), \
>           "r" (val.val[1]), \
>           "Q" (dst->val[0]) \
>         : "memory")
>
> Used like this:
>
>     if (likely(old.int128 == expected.int128)) {
>         if (stx_mo == __ATOMIC_RELAXED)
>             __STORE_128("stxp", dst, desired, ret);
>         else
>             __STORE_128("stlxp", dst, desired, ret);
>     } else {
>         /* In the failure case (since 'weak' is ignored and only
>          * weak == 0 is implemented), expected should contain
>          * the atomically read value of dst. This means, 'old'
>          * needs to be stored back to ensure it was read
>          * atomically.
>          */
>         if (stx_mo == __ATOMIC_RELAXED)
>             __STORE_128("stxp", dst, old, ret);
>         else
>             __STORE_128("stlxp", dst, old, ret);
>     }
>
> #undef __STORE_128
>
> > +        }
> > +    } while (unlikely(ret));
> > +#endif
> > +
> > +    /* Unconditionally updating expected removes
> > +     * an 'if' statement.
> > +     * expected should already be in register if
> > +     * not in the cache.
> > +     */
> > +    *exp = old;
> > +
> > +    return (old.int128 == expected.int128);
> > +}
> > +
> >  #ifdef __cplusplus
> >  }
> >  #endif
> > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > index 1335d92..cfe7067 100644
> > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > @@ -183,18 +183,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
> >
> >  /*------------------------ 128 bit atomic operations -------------------------*/
> >
> > -/**
> > - * 128-bit integer structure.
> > - */
> > -RTE_STD_C11
> > -typedef struct {
> > -    RTE_STD_C11
> > -    union {
> > -        uint64_t val[2];
> > -        __extension__ __int128 int128;
> > -    };
> > -} __rte_aligned(16) rte_int128_t;
> > -
> >  __rte_experimental
> >  static inline int
> >  rte_atomic128_cmp_exchange(rte_int128_t *dst,
> > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
> > index 24ff7dc..e6ab15a 100644
> > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > @@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
> >
> >  /*------------------------ 128 bit atomic operations -------------------------*/
> >
> > +/**
> > + * 128-bit integer structure.
> > + */
> > +RTE_STD_C11
> > +typedef struct {
> > +    RTE_STD_C11
> > +    union {
> > +        uint64_t val[2];
> > +#ifdef RTE_ARCH_64
> > +        __extension__ __int128 int128;
> > +#endif
>
> You hid this field for x86.
> What is the reason?

No, it is not hidden for x86. The RTE_ARCH_64 flag covers 64-bit x86 as
well.

Thanks,
Phil
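P.S. In case it helps the review, a minimal usage sketch of the new API
(illustrative only, not part of the patch; the tagged_ptr_set helper is
hypothetical). It updates a 128-bit {pointer, tag} pair in one CAS so the
tag can catch ABA:

    #include <stdint.h>
    #include <rte_atomic.h>

    static void
    tagged_ptr_set(rte_int128_t *dst, void *new_ptr)
    {
        rte_int128_t old, upd;

        /* A torn (non-atomic) read is fine here: on CAS failure,
         * 'old' is refreshed with the atomically read value of *dst.
         */
        old = *dst;
        do {
            upd.val[0] = (uint64_t)(uintptr_t)new_ptr; /* pointer */
            upd.val[1] = old.val[1] + 1;               /* tag */
        } while (rte_atomic128_cmp_exchange(dst, &old, &upd, 0,
                __ATOMIC_ACQ_REL, __ATOMIC_RELAXED) == 0);
    }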