Hi Konstantin, > -----Original Message----- > From: Gavin Hu (Arm Technology China) > Sent: Thursday, October 24, 2019 12:20 AM > To: Ananyev, Konstantin <konstantin.anan...@intel.com>; dev@dpdk.org > Cc: nd <n...@arm.com>; tho...@monjalon.net; > step...@networkplumber.org; hemant.agra...@nxp.com; > jer...@marvell.com; pbhagavat...@marvell.com; Honnappa Nagarahalli > <honnappa.nagaraha...@arm.com>; Ruifeng Wang (Arm Technology China) > <ruifeng.w...@arm.com>; Phil Yang (Arm Technology China) > <phil.y...@arm.com>; Steve Capper <steve.cap...@arm.com>; nd > <n...@arm.com> > Subject: RE: [dpdk-dev] [PATCH v7 2/7] eal: add the APIs to wait until equal > > Hi Konstantin, > > > -----Original Message----- > > From: Ananyev, Konstantin <konstantin.anan...@intel.com> > > Sent: Friday, October 18, 2019 12:44 AM > > To: Gavin Hu (Arm Technology China) <gavin...@arm.com>; > > dev@dpdk.org > > Cc: nd <n...@arm.com>; tho...@monjalon.net; > > step...@networkplumber.org; hemant.agra...@nxp.com; > > jer...@marvell.com; pbhagavat...@marvell.com; Honnappa Nagarahalli > > <honnappa.nagaraha...@arm.com>; Ruifeng Wang (Arm Technology > China) > > <ruifeng.w...@arm.com>; Phil Yang (Arm Technology China) > > <phil.y...@arm.com>; Steve Capper <steve.cap...@arm.com> > > Subject: RE: [dpdk-dev] [PATCH v7 2/7] eal: add the APIs to wait until equal > > > > > > > > > > Hi Gavin, > > > > > > > > > > > The rte_wait_until_equal_xx APIs abstract the functionality of > > > > 'polling for a memory location to become equal to a given value'. > > > > > > > > Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled > > > > by default. When it is enabled, the above APIs will call WFE instruction > > > > to save CPU cycles and power. > > > > > > > > Signed-off-by: Gavin Hu <gavin...@arm.com> > > > > Reviewed-by: Ruifeng Wang <ruifeng.w...@arm.com> > > > > Reviewed-by: Steve Capper <steve.cap...@arm.com> > > > > Reviewed-by: Ola Liljedahl <ola.liljed...@arm.com> > > > > Reviewed-by: Honnappa Nagarahalli > <honnappa.nagaraha...@arm.com> > > > > Reviewed-by: Phil Yang <phil.y...@arm.com> > > > > Acked-by: Pavan Nikhilesh <pbhagavat...@marvell.com> > > > > --- > > > > config/arm/meson.build | 1 + > > > > config/common_base | 5 + > > > > .../common/include/arch/arm/rte_pause_64.h | 30 ++++++ > > > > lib/librte_eal/common/include/generic/rte_pause.h | 106 > > +++++++++++++++++++++ > > > > 4 files changed, 142 insertions(+) > > > > > > > > diff --git a/config/arm/meson.build b/config/arm/meson.build > > > > index 979018e..b4b4cac 100644 > > > > --- a/config/arm/meson.build > > > > +++ b/config/arm/meson.build > > > > @@ -26,6 +26,7 @@ flags_common_default = [ > > > > ['RTE_LIBRTE_AVP_PMD', false], > > > > > > > > ['RTE_SCHED_VECTOR', false], > > > > + ['RTE_ARM_USE_WFE', false], > > > > ] > > > > > > > > flags_generic = [ > > > > diff --git a/config/common_base b/config/common_base > > > > index 8ef75c2..8861713 100644 > > > > --- a/config/common_base > > > > +++ b/config/common_base > > > > @@ -111,6 +111,11 @@ CONFIG_RTE_MAX_VFIO_CONTAINERS=64 > > > > CONFIG_RTE_MALLOC_DEBUG=n > > > > CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n > > > > CONFIG_RTE_USE_LIBBSD=n > > > > +# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs, > > > > +# calling these APIs put the cores in low power state while waiting > > > > +# for the memory address to become equal to the expected value. > > > > +# This is supported only by aarch64. > > > > +CONFIG_RTE_ARM_USE_WFE=n > > > > > > > > # > > > > # Recognize/ignore the AVX/AVX512 CPU flags for performance/power > > testing. > > > > diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h > > b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h > > > > index 93895d3..dabde17 100644 > > > > --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h > > > > +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h > > > > @@ -1,5 +1,6 @@ > > > > /* SPDX-License-Identifier: BSD-3-Clause > > > > * Copyright(c) 2017 Cavium, Inc > > > > + * Copyright(c) 2019 Arm Limited > > > > */ > > > > > > > > #ifndef _RTE_PAUSE_ARM64_H_ > > > > @@ -17,6 +18,35 @@ static inline void rte_pause(void) > > > > asm volatile("yield" ::: "memory"); > > > > } > > > > > > > > +#ifdef RTE_ARM_USE_WFE > > > > +#define __WAIT_UNTIL_EQUAL(name, asm_op, wide, type) \ > > > > +static __rte_always_inline void \ > > > > +rte_wait_until_equal_##name(volatile type * addr, type expected) \ > > > > +{ \ > > > > + type tmp; \ > > > > + asm volatile( \ > > > > + #asm_op " %" #wide "[tmp], %[addr]\n" \ > > > > + "cmp %" #wide "[tmp], %" #wide "[expected]\n" \ > > > > + "b.eq 2f\n" \ > > > > + "sevl\n" \ > > > > + "1: wfe\n" \ > > > > + #asm_op " %" #wide "[tmp], %[addr]\n" \ > > > > + "cmp %" #wide "[tmp], %" #wide "[expected]\n" \ > > > > + "bne 1b\n" \ > > > > + "2:\n" \ > > > > + : [tmp] "=&r" (tmp) \ > > > > + : [addr] "Q"(*addr), [expected] "r"(expected) \ > > > > + : "cc", "memory"); \ > > > > +} > > > > One more thought: > > Why do you need to write asm code for the whole procedure? > > Why not to do like linux kernel: > > define wfe() and sev() macros and use them inside normal C code? > > > > #define sev() asm volatile("sev" : : : "memory") > > #define wfe() asm volatile("wfe" : : : "memory") > > > > Then: > > rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected, int > > memorder) > > { > > if (__atomic_load_n(addr, memorder) != expected) { > > sev(); > > do { > > wfe(); > > } while ((__atomic_load_n(addr, memorder) != expected); > > } > > } > > > > ? > A really good suggestion, I made corresponding changes to v8 already, but it > missed a armv8 specific feature after internal discussion. > We call wfe to wait/sleep on the 'monitored' address, it will be waken up > upon someone write to the monitor address, so before wfe, we have to call > load-exclusive instruction to 'monitor'. > __atomic_load_n - disassembled to "ldr" does not do so. We have to use > "ldxrh" for relaxed mem ordering and "ldaxrh" for acquire ordering, in > example of 16-bit. > > Let me re-think coming back to the full assembly procedure or implementing > a 'load-exclusive' function. What do you think? > /Gavin Forgot to mention, kernel uses wfe() without preceding load-exclusive instructions because: 1) it replies on the timer, to wake up, i.e. __delay() 2) explicit calling sev to send wake events, for all kinds of locks 3) IPI instructions.
Our patches can't count on these events, due to of lack of these events or performance impact. /Gavin > > > > > +/* Wait for *addr to be updated with expected value */ > > > > +__WAIT_UNTIL_EQUAL(relaxed_16, ldxrh, w, uint16_t) > > > > +__WAIT_UNTIL_EQUAL(acquire_16, ldaxrh, w, uint16_t) > > > > +__WAIT_UNTIL_EQUAL(relaxed_32, ldxr, w, uint32_t) > > > > +__WAIT_UNTIL_EQUAL(acquire_32, ldaxr, w, uint32_t) > > > > +__WAIT_UNTIL_EQUAL(relaxed_64, ldxr, x, uint64_t) > > > > +__WAIT_UNTIL_EQUAL(acquire_64, ldaxr, x, uint64_t) > > > > +#endif > > > > + > > > > #ifdef __cplusplus > > > > } > > > > #endif > > > > diff --git a/lib/librte_eal/common/include/generic/rte_pause.h > > b/lib/librte_eal/common/include/generic/rte_pause.h > > > > index 52bd4db..8906473 100644 > > > > --- a/lib/librte_eal/common/include/generic/rte_pause.h > > > > +++ b/lib/librte_eal/common/include/generic/rte_pause.h > > > > @@ -1,5 +1,6 @@ > > > > /* SPDX-License-Identifier: BSD-3-Clause > > > > * Copyright(c) 2017 Cavium, Inc > > > > + * Copyright(c) 2019 Arm Limited > > > > */ > > > > > > > > #ifndef _RTE_PAUSE_H_ > > > > @@ -12,6 +13,10 @@ > > > > * > > > > */ > > > > > > > > +#include <stdint.h> > > > > +#include <rte_common.h> > > > > +#include <rte_atomic.h> > > > > + > > > > /** > > > > * Pause CPU execution for a short while > > > > * > > > > @@ -20,4 +25,105 @@ > > > > */ > > > > static inline void rte_pause(void); > > > > > > > > +/** > > > > + * Wait for *addr to be updated with a 16-bit expected value, with a > > relaxed > > > > + * memory ordering model meaning the loads around this API can be > > reordered. > > > > + * > > > > + * @param addr > > > > + * A pointer to the memory location. > > > > + * @param expected > > > > + * A 16-bit expected value to be in the memory location. > > > > + */ > > > > +__rte_always_inline > > > > +static void > > > > +rte_wait_until_equal_relaxed_16(volatile uint16_t *addr, uint16_t > > expected); > > > > + > > > > +/** > > > > + * Wait for *addr to be updated with a 32-bit expected value, with a > > relaxed > > > > + * memory ordering model meaning the loads around this API can be > > reordered. > > > > + * > > > > + * @param addr > > > > + * A pointer to the memory location. > > > > + * @param expected > > > > + * A 32-bit expected value to be in the memory location. > > > > + */ > > > > +__rte_always_inline > > > > +static void > > > > +rte_wait_until_equal_relaxed_32(volatile uint32_t *addr, uint32_t > > expected); > > > > + > > > > +/** > > > > + * Wait for *addr to be updated with a 64-bit expected value, with a > > relaxed > > > > + * memory ordering model meaning the loads around this API can be > > reordered. > > > > + * > > > > + * @param addr > > > > + * A pointer to the memory location. > > > > + * @param expected > > > > + * A 64-bit expected value to be in the memory location. > > > > + */ > > > > +__rte_always_inline > > > > +static void > > > > +rte_wait_until_equal_relaxed_64(volatile uint64_t *addr, uint64_t > > expected); > > > > + > > > > +/** > > > > + * Wait for *addr to be updated with a 16-bit expected value, with an > > acquire > > > > + * memory ordering model meaning the loads after this API can't be > > observed > > > > + * before this API. > > > > + * > > > > + * @param addr > > > > + * A pointer to the memory location. > > > > + * @param expected > > > > + * A 16-bit expected value to be in the memory location. > > > > + */ > > > > +__rte_always_inline > > > > +static void > > > > +rte_wait_until_equal_acquire_16(volatile uint16_t *addr, uint16_t > > expected); > > > > + > > > > +/** > > > > + * Wait for *addr to be updated with a 32-bit expected value, with an > > acquire > > > > + * memory ordering model meaning the loads after this API can't be > > observed > > > > + * before this API. > > > > + * > > > > + * @param addr > > > > + * A pointer to the memory location. > > > > + * @param expected > > > > + * A 32-bit expected value to be in the memory location. > > > > + */ > > > > +__rte_always_inline > > > > +static void > > > > +rte_wait_until_equal_acquire_32(volatile uint32_t *addr, uint32_t > > expected); > > > > > > LGTM in general. > > > One stylish thing: wouldn't it be better to have an API like that: > > > rte_wait_until_equal_acquire_X(addr, expected, memory_order) > > > ? > > > > > > I.E. - pass memorder as parameter, not to incorporate it into function > > name? > > > Less functions, plus user can specify order himself. > > > Plus looks similar to C11 atomic instrincts. > > > > > > > > > > + > > > > +/** > > > > + * Wait for *addr to be updated with a 64-bit expected value, with an > > acquire > > > > + * memory ordering model meaning the loads after this API can't be > > observed > > > > + * before this API. > > > > + * > > > > + * @param addr > > > > + * A pointer to the memory location. > > > > + * @param expected > > > > + * A 64-bit expected value to be in the memory location. > > > > + */ > > > > +__rte_always_inline > > > > +static void > > > > +rte_wait_until_equal_acquire_64(volatile uint64_t *addr, uint64_t > > expected); > > > > + > > > > +#if !defined(RTE_ARM_USE_WFE) > > > > +#define __WAIT_UNTIL_EQUAL(op_name, size, type, memorder) \ > > > > +__rte_always_inline \ > > > > +static void \ > > > > +rte_wait_until_equal_##op_name##_##size(volatile type *addr, \ > > > > + type expected) \ > > > > +{ \ > > > > + while (__atomic_load_n(addr, memorder) != expected) \ > > > > + rte_pause(); \ > > > > +} > > > > + > > > > +/* Wait for *addr to be updated with expected value */ > > > > +__WAIT_UNTIL_EQUAL(relaxed, 16, uint16_t, __ATOMIC_RELAXED) > > > > +__WAIT_UNTIL_EQUAL(acquire, 16, uint16_t, __ATOMIC_ACQUIRE) > > > > +__WAIT_UNTIL_EQUAL(relaxed, 32, uint32_t, __ATOMIC_RELAXED) > > > > +__WAIT_UNTIL_EQUAL(acquire, 32, uint32_t, __ATOMIC_ACQUIRE) > > > > +__WAIT_UNTIL_EQUAL(relaxed, 64, uint64_t, __ATOMIC_RELAXED) > > > > +__WAIT_UNTIL_EQUAL(acquire, 64, uint64_t, __ATOMIC_ACQUIRE) > > > > +#endif /* RTE_ARM_USE_WFE */ > > > > + > > > > #endif /* _RTE_PAUSE_H_ */ > > > > -- > > > > 2.7.4