On 2023-03-21 09:31, Ondřej Surý via lttng-dev wrote:
Instead of custom code, use the __atomic_thread_fence() builtin to
implement the cmm_mb(), cmm_rmb(), cmm_wmb(), cmm_smp_mb(),
cmm_smp_rmb(), and cmm_smp_wmb() family of functions on all
architectures, and cmm_read_barrier_depends() on alpha (otherwise it
remains a no-op).

Signed-off-by: Ondřej Surý <ond...@sury.org>
---
  include/urcu/arch/alpha.h   |  6 +++---
  include/urcu/arch/arm.h     | 14 -------------
  include/urcu/arch/generic.h |  6 +++---
  include/urcu/arch/mips.h    |  6 ------
  include/urcu/arch/nios2.h   |  2 --
  include/urcu/arch/ppc.h     | 25 ----------------------
  include/urcu/arch/s390.h    |  2 --
  include/urcu/arch/sparc64.h | 13 ------------
  include/urcu/arch/x86.h     | 42 +++----------------------------------
  9 files changed, 9 insertions(+), 107 deletions(-)

diff --git a/include/urcu/arch/alpha.h b/include/urcu/arch/alpha.h
index dc33e28..61687c7 100644
--- a/include/urcu/arch/alpha.h
+++ b/include/urcu/arch/alpha.h
@@ -29,9 +29,9 @@
  extern "C" {
  #endif
-#define cmm_mb() __asm__ __volatile__ ("mb":::"memory")
-#define cmm_wmb()                      __asm__ __volatile__ ("wmb":::"memory")
-#define cmm_read_barrier_depends()     __asm__ __volatile__ ("mb":::"memory")
+#ifndef cmm_read_barrier_depends
+#define cmm_read_barrier_depends()     __atomic_thread_fence(__ATOMIC_CONSUME)
+#endif


I don't expect an #ifndef in arch-specific code; I would expect the #ifndef to be in the generic code.
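I.e., something along these lines (rough sketch, untested):

  /* include/urcu/arch/generic.h */
  #ifndef cmm_read_barrier_depends
  #define cmm_read_barrier_depends()     __atomic_thread_fence(__ATOMIC_CONSUME)
  #endif

alpha.h can then either rely on that fallback entirely, or keep an
unconditional arch-specific definition (e.g. the mb-based one it has
today) without any guard.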

[...]

diff --git a/include/urcu/arch/generic.h b/include/urcu/arch/generic.h
index be6e41e..2715162 100644
--- a/include/urcu/arch/generic.h
+++ b/include/urcu/arch/generic.h
@@ -44,15 +44,15 @@ extern "C" {
   */
#ifndef cmm_mb
-#define cmm_mb()    __sync_synchronize()
+#define cmm_mb()       __atomic_thread_fence(__ATOMIC_SEQ_CST)
  #endif
#ifndef cmm_rmb
-#define cmm_rmb()      cmm_mb()
+#define cmm_rmb()      __atomic_thread_fence(__ATOMIC_ACQUIRE)
  #endif
#ifndef cmm_wmb
-#define cmm_wmb()      cmm_mb()
+#define cmm_wmb()      __atomic_thread_fence(__ATOMIC_RELEASE)

I don't think rmb/wmb map to ACQUIRE/RELEASE semantics. This is incorrect AFAIU. ACQUIRE/RELEASE are semi-permeable barriers preventing code motion in one direction or the other, whereas rmb/wmb are barriers that only affect code motion of either loads or stores (but in both directions).

In the generic case, rmb/wmb could map to __atomic_thread_fence(__ATOMIC_SEQ_CST).
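E.g. (sketch only):

  #ifndef cmm_rmb
  #define cmm_rmb()      __atomic_thread_fence(__ATOMIC_SEQ_CST)
  #endif

  #ifndef cmm_wmb
  #define cmm_wmb()      __atomic_thread_fence(__ATOMIC_SEQ_CST)
  #endif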


  #endif
#define cmm_mc() cmm_barrier()
[...]
diff --git a/include/urcu/arch/ppc.h b/include/urcu/arch/ppc.h
index 791529e..618f79c 100644
--- a/include/urcu/arch/ppc.h
+++ b/include/urcu/arch/ppc.h
@@ -34,31 +34,6 @@ extern "C" {
  /* Include size of POWER5+ L3 cache lines: 256 bytes */
  #define CAA_CACHE_LINE_SIZE   256
-#ifdef __NO_LWSYNC__
-#define LWSYNC_OPCODE  "sync\n"
-#else
-#define LWSYNC_OPCODE  "lwsync\n"
-#endif
-
-/*
- * Use sync for all cmm_mb/rmb/wmb barriers because lwsync does not
- * preserve ordering of cacheable vs. non-cacheable accesses, so it
- * should not be used to order with respect to MMIO operations.  An
- * eieio+lwsync pair is also not enough for cmm_rmb, because it will
- * order cacheable and non-cacheable memory operations separately---i.e.
- * not the latter against the former.
- */
-#define cmm_mb()         __asm__ __volatile__ ("sync":::"memory")

I agree that we will want to use the generic implementation for smp_mb.

-
-/*
- * lwsync orders loads in cacheable memory with respect to other loads,
- * and stores in cacheable memory with respect to other stores.
- * Therefore, use it for barriers ordering accesses to cacheable memory
- * only.
- */
-#define cmm_smp_rmb()    __asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
-#define cmm_smp_wmb()    __asm__ __volatile__ (LWSYNC_OPCODE:::"memory")

I suspect that using the generic implementation will be slower than lwsync. I am tempted to keep a custom implementation for rmb/wmb on ppc. We could have a build mode specific to TSAN which overrides those to use smp_mb instead.
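Rough sketch (CONFIG_RCU_TSAN below is only a placeholder name for
whatever that build mode ends up being called):

  /* include/urcu/arch/ppc.h */
  #ifndef CONFIG_RCU_TSAN
  #define cmm_smp_rmb()    __asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
  #define cmm_smp_wmb()    __asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
  #endif

In the TSAN build mode the lwsync definitions would simply be left out,
so the generic __atomic_thread_fence(__ATOMIC_SEQ_CST) fallback (i.e.
smp_mb semantics) applies.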

-
  #define mftbl()                                               \
        __extension__                                   \
        ({                                              \
[...]
diff --git a/include/urcu/arch/sparc64.h b/include/urcu/arch/sparc64.h
index 1ff40f5..b4e25ca 100644
--- a/include/urcu/arch/sparc64.h
+++ b/include/urcu/arch/sparc64.h
@@ -40,19 +40,6 @@ extern "C" {
  #define CAA_CACHE_LINE_SIZE   256
-/*
- * Inspired from the Linux kernel. Workaround Spitfire bug #51.
- */
-#define membar_safe(type)                      \
-__asm__ __volatile__("ba,pt %%xcc, 1f\n\t"   \
-                    "membar " type "\n"    \
-                    "1:\n"                   \
-                    : : : "memory")
-
-#define cmm_mb()       membar_safe("#LoadLoad | #LoadStore | #StoreStore | #StoreLoad")
-#define cmm_rmb()      membar_safe("#LoadLoad")
-#define cmm_wmb()      membar_safe("#StoreStore")

Same comment as for ppc.

-
  #ifdef __cplusplus
  }
  #endif
diff --git a/include/urcu/arch/x86.h b/include/urcu/arch/x86.h
index 744f9f9..af4487d 100644
--- a/include/urcu/arch/x86.h
+++ b/include/urcu/arch/x86.h
@@ -46,44 +46,8 @@ extern "C" {
  /* For backwards compat */
  #define CONFIG_RCU_HAVE_FENCE 1
-#define cmm_mb() __asm__ __volatile__ ("mfence":::"memory")
-
-/*
- * Define cmm_rmb/cmm_wmb to "strict" barriers that may be needed when
- * using SSE or working with I/O areas.  cmm_smp_rmb/cmm_smp_wmb are
- * only compiler barriers, which is enough for general use.
- */
-#define cmm_rmb()     __asm__ __volatile__ ("lfence":::"memory")
-#define cmm_wmb()     __asm__ __volatile__ ("sfence"::: "memory")
-#define cmm_smp_rmb() cmm_barrier()
-#define cmm_smp_wmb() cmm_barrier()

Relying on the generic barrier for rmb and wmb would slow things down on x86; we may want to do as I suggest for ppc.
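Same placeholder knob as suggested for ppc, keeping the removed
definitions otherwise (sketch):

  /* include/urcu/arch/x86.h */
  #ifndef CONFIG_RCU_TSAN
  #define cmm_rmb()        __asm__ __volatile__ ("lfence":::"memory")
  #define cmm_wmb()        __asm__ __volatile__ ("sfence":::"memory")
  #define cmm_smp_rmb()    cmm_barrier()
  #define cmm_smp_wmb()    cmm_barrier()
  #endif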

-
-#else
-
-/*
- * We leave smp_rmb/smp_wmb as full barriers for processors that do not have
- * fence instructions.
- *
- * An empty cmm_smp_rmb() may not be enough on old PentiumPro multiprocessor
- * systems, due to an erratum.  The Linux kernel says that "Even distro
- * kernels should think twice before enabling this", but for now let's
- * be conservative and leave the full barrier on 32-bit processors.  Also,
- * IDT WinChip supports weak store ordering, and the kernel may enable it
- * under our feet; cmm_smp_wmb() ceases to be a nop for these processors.
- */
-#if (CAA_BITS_PER_LONG == 32)
-#define cmm_mb()    __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
-#define cmm_rmb()    __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
-#define cmm_wmb()    __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
-#else
-#define cmm_mb()    __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
-#define cmm_rmb()    __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
-#define cmm_wmb()    __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
-#endif

Removing this drops support for older i686 and for URCU_ARCH_K1OM (Xeon Phi). Do we intend to remove that support?


  #endif
-#define caa_cpu_relax() __asm__ __volatile__ ("rep; nop" : : : "memory")
-
  #define HAS_CAA_GET_CYCLES
#define rdtscll(val) \
@@ -98,10 +62,10 @@ typedef uint64_t caa_cycles_t;
static inline caa_cycles_t caa_get_cycles(void)
  {
-        caa_cycles_t ret = 0;
+       caa_cycles_t ret = 0;
- rdtscll(ret);
-        return ret;
+       rdtscll(ret);
+       return ret;
  }

This whitespace-to-tabs cleanup should be moved to its own patch.

Thanks,

Mathieu

/*

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com

_______________________________________________
lttng-dev mailing list
lttng-dev@lists.lttng.org
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev
