Hi all,

The atomic_loaddi expander on arm has some issues and benefits from a rewrite
so that it properly performs double-word atomic loads at the various
architecture levels.

Consider the code:
----------------------
#include <stdatomic.h>

atomic_ullong foo;

int glob;

int main(void) {
    atomic_load_explicit(&foo, memory_order_acquire);
    return glob;
}
---------------------

Compiled with -O2 -march=armv7-a -std=c11 this gives:
        movw    r3, #:lower16:glob
        movt    r3, #:upper16:glob
        dmb     ish
        movw    r2, #:lower16:foo
        movt    r2, #:upper16:foo
        ldrexd  r0, r1, [r2]
        ldr     r0, [r3]
        bx      lr

For the acquire memory model the barrier should be after the ldrexd, not before.
The same code is generated when compiling with -march=armv7ve. However, on
such systems we can get away with a single LDRD. Issue C.c of the ARM
Architecture Reference Manual for ARMv7-A and ARMv7-R states in section A3.5.3:
"In an implementation that includes the Large Physical Address Extension, LDRD
and STRD accesses to 64-bit aligned locations are 64-bit single-copy atomic".
We still need the barrier after the LDRD to enforce the acquire ordering 
semantics.

For ARMv8-A we can do even better and use the load-acquire exclusive doubleword
instruction LDAEXD, with no need for a barrier afterwards.

I've discussed the required sequences with some kernel folk and had a read 
through:
https://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
and this is the patch I've come up with.

This patch handles all three of the above cases by rewriting the atomic_loaddi
expander. For the code above compiled with -march=armv7-a we now generate:
        movw    r3, #:lower16:foo
        movt    r3, #:upper16:foo
        ldrexd  r0, r1, [r3]
        movw    r3, #:lower16:glob
        movt    r3, #:upper16:glob
        dmb     ish
        ldr     r0, [r3]
        bx      lr

For -march=armv7ve:
        movw    r3, #:lower16:foo
        movt    r3, #:upper16:foo
        ldrd    r2, r3, [r3]
        movw    r3, #:lower16:glob
        movt    r3, #:upper16:glob
        dmb     ish
        ldr     r0, [r3]
        bx      lr

and for -march=armv8-a:
        movw    r3, #:lower16:foo
        movt    r3, #:upper16:foo
        ldaexd  r2, r3, [r3]
        movw    r3, #:lower16:glob
        movt    r3, #:upper16:glob
        ldr     r0, [r3]
        bx      lr

For the relaxed memory model the armv7ve and armv8-a sequences can be relaxed
further to a single LDRD instruction, without any barriers, as sketched below.
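
For illustration, the relaxed-model sequence we would then expect for the code
above on armv7ve or armv8-a is just the plain LDRD with no barrier (a sketch
only, matching what the new relaxed-model tests check; exact register
allocation may differ):
        movw    r3, #:lower16:foo
        movt    r3, #:upper16:foo
        ldrd    r2, r3, [r3]      @ single-copy atomic on LPAE/ARMv8-A targets
        movw    r3, #:lower16:glob
        movt    r3, #:upper16:glob
        ldr     r0, [r3]          @ no dmb for memory_order_relaxed
        bx      lr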

Bootstrapped and tested on arm-none-linux-gnueabihf.

Ok for trunk?

Thanks,
Kyrill

P.S. The backport to the previous branches will look a bit different because the
ARM_FSET_HAS_CPU1 machinery in arm.h was introduced for GCC 6. I'll prepare a 
backport
separately if this is accepted.

2016-02-19  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    PR target/69875
    * config/arm/arm.h (TARGET_HAVE_LPAE): Define.
    * config/arm/unspecs.md (VUNSPEC_LDRD_ATOMIC): New value.
    * config/arm/sync.md (arm_atomic_loaddi2_ldrd): New pattern.
    (atomic_loaddi_1): Delete.
    (atomic_loaddi): Rewrite expander using the above changes.

2016-02-19  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    PR target/69875
    * gcc.target/arm/atomic_loaddi_acquire.x: New file.
    * gcc.target/arm/atomic_loaddi_relaxed.x: Likewise.
    * gcc.target/arm/atomic_loaddi_seq_cst.x: Likewise.
    * gcc.target/arm/atomic_loaddi_1.c: New test.
    * gcc.target/arm/atomic_loaddi_2.c: Likewise.
    * gcc.target/arm/atomic_loaddi_3.c: Likewise.
    * gcc.target/arm/atomic_loaddi_4.c: Likewise.
    * gcc.target/arm/atomic_loaddi_5.c: Likewise.
    * gcc.target/arm/atomic_loaddi_6.c: Likewise.
    * gcc.target/arm/atomic_loaddi_7.c: Likewise.
    * gcc.target/arm/atomic_loaddi_8.c: Likewise.
    * gcc.target/arm/atomic_loaddi_9.c: Likewise.
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index ca7c0e1e6c49e106cd871769dbf45e7b16861ea4..4a0e6aff8ec22c2c597684265cbbd10982bb81b7 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -253,6 +253,10 @@ extern void (*arm_lang_output_object_attributes_hook)(void);
 /* Nonzero if this chip supports ldrex and strex */
 #define TARGET_HAVE_LDREX        ((arm_arch6 && TARGET_ARM) || arm_arch7)
 
+/* Nonzero if this chip supports LPAE.  */
+#define TARGET_HAVE_LPAE						\
+  (arm_arch7 && ARM_FSET_HAS_CPU1 (insn_flags, FL_FOR_ARCH7VE))
+
 /* Nonzero if this chip supports ldrex{bh} and strex{bh}.  */
 #define TARGET_HAVE_LDREXBH ((arm_arch6k && TARGET_ARM) || arm_arch7)
 
diff --git a/gcc/config/arm/sync.md b/gcc/config/arm/sync.md
index 8158f53025400045569533a1e8c6583025d490c8..abcfbcb1eacaabc597c9fde475c1b56624fb5a59 100644
--- a/gcc/config/arm/sync.md
+++ b/gcc/config/arm/sync.md
@@ -96,32 +96,62 @@ (define_insn "atomic_store<mode>"
   [(set_attr "predicable" "yes")
    (set_attr "predicable_short_it" "no")])
 
-;; Note that ldrd and vldr are *not* guaranteed to be single-copy atomic,
-;; even for a 64-bit aligned address.  Instead we use a ldrexd unparied
-;; with a store.
+;; An LDRD instruction usable by the atomic_loaddi expander on LPAE targets
+
+(define_insn "arm_atomic_loaddi2_ldrd"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec_volatile:DI
+	  [(match_operand:DI 1 "arm_sync_memory_operand" "Q")]
+	    VUNSPEC_LDRD_ATOMIC))]
+  "ARM_DOUBLEWORD_ALIGN && TARGET_HAVE_LPAE"
+  "ldrd%?\t%0, %H0, %C1"
+  [(set_attr "predicable" "yes")
+   (set_attr "predicable_short_it" "no")])
+
+;; There are three ways to expand this depending on the architecture
+;; features available.  As for the barriers, a load needs a barrier
+;; after it on all non-relaxed memory models except when the load
+;; has acquire semantics (for ARMv8-A).
+
 (define_expand "atomic_loaddi"
   [(match_operand:DI 0 "s_register_operand")		;; val out
    (match_operand:DI 1 "mem_noofs_operand")		;; memory
    (match_operand:SI 2 "const_int_operand")]		;; model
-  "TARGET_HAVE_LDREXD && ARM_DOUBLEWORD_ALIGN"
+  "(TARGET_HAVE_LDREXD || TARGET_HAVE_LPAE || TARGET_HAVE_LDACQ)
+   && ARM_DOUBLEWORD_ALIGN"
 {
-  enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
-  expand_mem_thread_fence (model);
-  emit_insn (gen_atomic_loaddi_1 (operands[0], operands[1]));
-  if (is_mm_seq_cst (model))
+  memmodel model = memmodel_from_int (INTVAL (operands[2]));
+
+  /* For ARMv8-A we can use an LDAEXD to atomically load two 32-bit registers
+     when acquire or stronger semantics are needed.  When the relaxed model is
+     used this can be relaxed to a normal LDRD.  */
+  if (TARGET_HAVE_LDACQ)
+    {
+      if (is_mm_relaxed (model))
+	emit_insn (gen_arm_atomic_loaddi2_ldrd (operands[0], operands[1]));
+      else
+	emit_insn (gen_arm_load_acquire_exclusivedi (operands[0], operands[1]));
+
+      DONE;
+    }
+
+  /* On LPAE targets LDRD and STRD accesses to 64-bit aligned
+     locations are 64-bit single-copy atomic.  We still need barriers in the
+     appropriate places to implement the ordering constraints.  */
+  if (TARGET_HAVE_LPAE)
+    emit_insn (gen_arm_atomic_loaddi2_ldrd (operands[0], operands[1]));
+  else
+    emit_insn (gen_arm_load_exclusivedi (operands[0], operands[1]));
+
+
+  /* All non-relaxed models need a barrier after the load when load-acquire
+     instructions are not available.  */
+  if (!is_mm_relaxed (model))
     expand_mem_thread_fence (model);
+
   DONE;
 })
 
-(define_insn "atomic_loaddi_1"
-  [(set (match_operand:DI 0 "s_register_operand" "=r")
-	(unspec:DI [(match_operand:DI 1 "mem_noofs_operand" "Ua")]
-		   UNSPEC_LL))]
-  "TARGET_HAVE_LDREXD && ARM_DOUBLEWORD_ALIGN"
-  "ldrexd%?\t%0, %H0, %C1"
-  [(set_attr "predicable" "yes")
-   (set_attr "predicable_short_it" "no")])
-
 (define_expand "atomic_compare_and_swap<mode>"
   [(match_operand:SI 0 "s_register_operand" "")		;; bool out
    (match_operand:QHSD 1 "s_register_operand" "")	;; val out
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index f9254640274bea111c813563a5aa00fcc739737d..22e170fbbf967efd7f445c9f06d894a7b32b5528 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -138,6 +138,7 @@ (define_c_enum "unspecv" [
   VUNSPEC_ATOMIC_XCHG	; Represent an atomic exchange.
   VUNSPEC_ATOMIC_OP	; Represent an atomic operation.
   VUNSPEC_LL		; Represent a load-register-exclusive.
+  VUNSPEC_LDRD_ATOMIC	; Represent an LDRD used as an atomic DImode load.
   VUNSPEC_SC		; Represent a store-register-exclusive.
   VUNSPEC_LAX		; Represent a load-register-acquire-exclusive.
   VUNSPEC_SLX		; Represent a store-register-release-exclusive.
diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_1.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..4f39971a336378be7cafabea9fdc16c3b94c3d3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c11 -O" } */
+/* { dg-require-effective-target arm_arch_v7a_ok } */
+/* { dg-add-options arm_arch_v7a } */
+
+#include "atomic_loaddi_acquire.x"
+
+/* { dg-final { scan-assembler-times "ldrexd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "dmb\tish" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_2.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..0b18f03e09d1d25daf5c8b2fd460538bedf43dfb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_2.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c11 -O" } */
+/* { dg-require-effective-target arm_arch_v7ve_ok } */
+/* { dg-add-options arm_arch_v7ve } */
+
+#include "atomic_loaddi_acquire.x"
+
+/* { dg-final { scan-assembler-times "ldrd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "dmb\tish" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_3.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..080a9362abb334c764224620153deb2013ddcccb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_3.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c11 -O" } */
+/* { dg-require-effective-target arm_arch_v8a_ok } */
+/* { dg-add-options arm_arch_v8a } */
+
+#include "atomic_loaddi_acquire.x"
+
+/* { dg-final { scan-assembler-times "ldaexd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-not "dmb\tish" } } */
diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_4.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..8f94ba61b4d9bd99abdf5a12cd8c0bbed3f7ba18
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_4.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c11 -O" } */
+/* { dg-require-effective-target arm_arch_v7a_ok } */
+/* { dg-add-options arm_arch_v7a } */
+
+#include "atomic_loaddi_relaxed.x"
+
+/* { dg-final { scan-assembler-times "ldrexd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-not "dmb\tish" } } */
diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_5.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_5.c
new file mode 100644
index 0000000000000000000000000000000000000000..39502c7e25db62cb66660b7ced6bda530431c843
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_5.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c11 -O" } */
+/* { dg-require-effective-target arm_arch_v7ve_ok } */
+/* { dg-add-options arm_arch_v7ve } */
+
+#include "atomic_loaddi_relaxed.x"
+
+/* { dg-final { scan-assembler-times "ldrd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-not "dmb\tish" } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_6.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_6.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa62d5a103929658718a40f7b469b2fb4e327886
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_6.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c11 -O" } */
+/* { dg-require-effective-target arm_arch_v8a_ok } */
+/* { dg-add-options arm_arch_v8a } */
+
+#include "atomic_loaddi_relaxed.x"
+
+/* { dg-final { scan-assembler-times "ldrd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-not "dmb\tish" } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_7.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_7.c
new file mode 100644
index 0000000000000000000000000000000000000000..6743663f1e6831d3fc441e80fcfe3d761987970b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_7.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c11 -O" } */
+/* { dg-require-effective-target arm_arch_v7a_ok } */
+/* { dg-add-options arm_arch_v7a } */
+
+#include "atomic_loaddi_seq_cst.x"
+
+/* { dg-final { scan-assembler-times "ldrexd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "dmb\tish" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_8.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_8.c
new file mode 100644
index 0000000000000000000000000000000000000000..f7bd3e5a2b515d91418f5f4a7a8db00336a8b594
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_8.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c11 -O" } */
+/* { dg-require-effective-target arm_arch_v7ve_ok } */
+/* { dg-add-options arm_arch_v7ve } */
+
+#include "atomic_loaddi_seq_cst.x"
+
+/* { dg-final { scan-assembler-times "ldrd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "dmb\tish" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_9.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_9.c
new file mode 100644
index 0000000000000000000000000000000000000000..68b293409ff99965f9f95b92a093da7732f66299
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_9.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c11 -O" } */
+/* { dg-require-effective-target arm_arch_v8a_ok } */
+/* { dg-add-options arm_arch_v8a } */
+
+#include "atomic_loaddi_seq_cst.x"
+
+/* { dg-final { scan-assembler-times "ldaexd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */
+/* { dg-final { scan-assembler-not "dmb\tish" } } */
diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_acquire.x b/gcc/testsuite/gcc.target/arm/atomic_loaddi_acquire.x
new file mode 100644
index 0000000000000000000000000000000000000000..28997ef565be7640531808579e5930b61c6b50e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_acquire.x
@@ -0,0 +1,11 @@
+#include <stdatomic.h>
+
+atomic_ullong foo;
+int glob;
+
+int
+main (void)
+{
+  atomic_load_explicit (&foo, memory_order_acquire);
+  return glob;
+}
diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_relaxed.x b/gcc/testsuite/gcc.target/arm/atomic_loaddi_relaxed.x
new file mode 100644
index 0000000000000000000000000000000000000000..701b3c42c09b6bcd0add779c29321674921a95e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_relaxed.x
@@ -0,0 +1,11 @@
+#include <stdatomic.h>
+
+atomic_ullong foo;
+int glob;
+
+int
+main (void)
+{
+  atomic_load_explicit (&foo, memory_order_relaxed);
+  return glob;
+}
diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_seq_cst.x b/gcc/testsuite/gcc.target/arm/atomic_loaddi_seq_cst.x
new file mode 100644
index 0000000000000000000000000000000000000000..32e78da67e8b67f8a3bdd80b4957f55d9f413669
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_seq_cst.x
@@ -0,0 +1,11 @@
+#include <stdatomic.h>
+
+atomic_ullong foo;
+int glob;
+
+int
+main (void)
+{
+  atomic_load_explicit (&foo, memory_order_seq_cst);
+  return glob;
+}
