Hi all, The atomic_loaddi expander on arm has some issues and can benefit from a rewrite to properly perform double-word atomic loads on various architecture levels.
Consider the code: ---------------------- #include <stdatomic.h> atomic_ullong foo; int glob; int main(void) { atomic_load_explicit(&foo, memory_order_acquire); return glob; } --------------------- Compiled with -O2 -march=armv7-a -std=c11 this gives: movw r3, #:lower16:glob movt r3, #:upper16:glob dmb ish movw r2, #:lower16:foo movt r2, #:upper16:foo ldrexd r0, r1, [r2] ldr r0, [r3] bx lr For the acquire memory model the barrier should be after the ldrexd, not before. The same code is generated when compiled with -march=armv7ve. However, we can get away with a single LDRD on such systems. In issue C.c of The ARM Architecture Reference Manual for ARMv7-A and ARMv7-R recommends at chapter A3.5.3: "In an implementation that includes the Large Physical Address Extension, LDRD and STRD accesses to 64-bit aligned locations are 64-bit single-copy atomic". We still need the barrier after the LDRD to enforce the acquire ordering semantics. For ARMv8-A we can do even better and use the load double-word acquire instruction: LDAEXD, with no need for a barrier afterwards. I've discussed the required sequences with some kernel folk and had a read through: https://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and this is the patch I've come up with. This patch handles all three of the above cases by rewriting the atomic_loaddi expander. With this patch for the above code with -march=armv7-a we would now generate: movw r3, #:lower16:foo movt r3, #:upper16:foo ldrexd r0, r1, [r3] movw r3, #:lower16:glob movt r3, #:upper16:glob dmb ish ldr r0, [r3] bx lr For -march=armv7ve: movw r3, #:lower16:foo movt r3, #:upper16:foo ldrd r2, r3, [r3] movw r3, #:lower16:glob movt r3, #:upper16:glob dmb ish ldr r0, [r3] bx lr and for -march=armv8-a: movw r3, #:lower16:foo movt r3, #:upper16:foo ldaexd r2, r3, [r3] movw r3, #:lower16:glob movt r3, #:upper16:glob ldr r0, [r3] bx lr For the relaxed memory model the armv7ve and armv8-a can be relaxed to a single LDRD instruction, without any barriers. Bootstrapped and tested on arm-none-linux-gnueabihf. Ok for trunk? Thanks, Kyrill P.S. The backport to the previous branches will look a bit different because the ARM_FSET_HAS_CPU1 machinery in arm.h was introduced for GCC 6. I'll prepare a backport separately if this is accepted. 2016-02-19 Kyrylo Tkachov <kyrylo.tkac...@arm.com> PR target/69875 * config/arm/arm.h (TARGET_HAVE_LPAE): Define. * config/arm/unspecs.md (VUNSPEC_LDRD_ATOMIC): New value. * config/arm/sync.md (arm_atomic_loaddi2_ldrd): New pattern. (atomic_loaddi_1): Delete. (atomic_loaddi): Rewrite expander using the above changes. 2016-02-19 Kyrylo Tkachov <kyrylo.tkac...@arm.com> PR target/69875 * gcc.target/arm/atomic_loaddi_acquire.x: New file. * gcc.target/arm/atomic_loaddi_relaxed.x: Likewise. * gcc.target/arm/atomic_loaddi_seq_cst.x: Likewise. * gcc.target/arm/atomic_loaddi_1.c: New test. * gcc.target/arm/atomic_loaddi_2.c: Likewise. * gcc.target/arm/atomic_loaddi_3.c: Likewise. * gcc.target/arm/atomic_loaddi_4.c: Likewise. * gcc.target/arm/atomic_loaddi_5.c: Likewise. * gcc.target/arm/atomic_loaddi_6.c: Likewise. * gcc.target/arm/atomic_loaddi_7.c: Likewise. * gcc.target/arm/atomic_loaddi_8.c: Likewise. * gcc.target/arm/atomic_loaddi_9.c: Likewise.
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index ca7c0e1e6c49e106cd871769dbf45e7b16861ea4..4a0e6aff8ec22c2c597684265cbbd10982bb81b7 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -253,6 +253,10 @@ extern void (*arm_lang_output_object_attributes_hook)(void); /* Nonzero if this chip supports ldrex and strex */ #define TARGET_HAVE_LDREX ((arm_arch6 && TARGET_ARM) || arm_arch7) +/* Nonzero if this chip supports LPAE. */ +#define TARGET_HAVE_LPAE \ + (arm_arch7 && ARM_FSET_HAS_CPU1 (insn_flags, FL_FOR_ARCH7VE)) + /* Nonzero if this chip supports ldrex{bh} and strex{bh}. */ #define TARGET_HAVE_LDREXBH ((arm_arch6k && TARGET_ARM) || arm_arch7) diff --git a/gcc/config/arm/sync.md b/gcc/config/arm/sync.md index 8158f53025400045569533a1e8c6583025d490c8..abcfbcb1eacaabc597c9fde475c1b56624fb5a59 100644 --- a/gcc/config/arm/sync.md +++ b/gcc/config/arm/sync.md @@ -96,32 +96,62 @@ (define_insn "atomic_store<mode>" [(set_attr "predicable" "yes") (set_attr "predicable_short_it" "no")]) -;; Note that ldrd and vldr are *not* guaranteed to be single-copy atomic, -;; even for a 64-bit aligned address. Instead we use a ldrexd unparied -;; with a store. +;; An LDRD instruction usable by the atomic_loaddi expander on LPAE targets + +(define_insn "arm_atomic_loaddi2_ldrd" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec_volatile:DI + [(match_operand:DI 1 "arm_sync_memory_operand" "Q")] + VUNSPEC_LDRD_ATOMIC))] + "ARM_DOUBLEWORD_ALIGN && TARGET_HAVE_LPAE" + "ldrd%?\t%0, %H0, %C1" + [(set_attr "predicable" "yes") + (set_attr "predicable_short_it" "no")]) + +;; There are three ways to expand this depending on the architecture +;; features available. As for the barriers, a load needs a barrier +;; after it on all non-relaxed memory models except when the load +;; has acquire semantics (for ARMv8-A). + (define_expand "atomic_loaddi" [(match_operand:DI 0 "s_register_operand") ;; val out (match_operand:DI 1 "mem_noofs_operand") ;; memory (match_operand:SI 2 "const_int_operand")] ;; model - "TARGET_HAVE_LDREXD && ARM_DOUBLEWORD_ALIGN" + "(TARGET_HAVE_LDREXD || TARGET_HAVE_LPAE || TARGET_HAVE_LDACQ) + && ARM_DOUBLEWORD_ALIGN" { - enum memmodel model = memmodel_from_int (INTVAL (operands[2])); - expand_mem_thread_fence (model); - emit_insn (gen_atomic_loaddi_1 (operands[0], operands[1])); - if (is_mm_seq_cst (model)) + memmodel model = memmodel_from_int (INTVAL (operands[2])); + + /* For ARMv8-A we can use an LDAEXD to atomically load two 32-bit registers + when acquire or stronger semantics are needed. When the relaxed model is + used this can be relaxed to a normal LDRD. */ + if (TARGET_HAVE_LDACQ) + { + if (is_mm_relaxed (model)) + emit_insn (gen_arm_atomic_loaddi2_ldrd (operands[0], operands[1])); + else + emit_insn (gen_arm_load_acquire_exclusivedi (operands[0], operands[1])); + + DONE; + } + + /* On LPAE targets LDRD and STRD accesses to 64-bit aligned + locations are 64-bit single-copy atomic. We still need barriers in the + appropriate places to implement the ordering constraints. */ + if (TARGET_HAVE_LPAE) + emit_insn (gen_arm_atomic_loaddi2_ldrd (operands[0], operands[1])); + else + emit_insn (gen_arm_load_exclusivedi (operands[0], operands[1])); + + + /* All non-relaxed models need a barrier after the load when load-acquire + instructions are not available. */ + if (!is_mm_relaxed (model)) expand_mem_thread_fence (model); + DONE; }) -(define_insn "atomic_loaddi_1" - [(set (match_operand:DI 0 "s_register_operand" "=r") - (unspec:DI [(match_operand:DI 1 "mem_noofs_operand" "Ua")] - UNSPEC_LL))] - "TARGET_HAVE_LDREXD && ARM_DOUBLEWORD_ALIGN" - "ldrexd%?\t%0, %H0, %C1" - [(set_attr "predicable" "yes") - (set_attr "predicable_short_it" "no")]) - (define_expand "atomic_compare_and_swap<mode>" [(match_operand:SI 0 "s_register_operand" "") ;; bool out (match_operand:QHSD 1 "s_register_operand" "") ;; val out diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index f9254640274bea111c813563a5aa00fcc739737d..22e170fbbf967efd7f445c9f06d894a7b32b5528 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -138,6 +138,7 @@ (define_c_enum "unspecv" [ VUNSPEC_ATOMIC_XCHG ; Represent an atomic exchange. VUNSPEC_ATOMIC_OP ; Represent an atomic operation. VUNSPEC_LL ; Represent a load-register-exclusive. + VUNSPEC_LDRD_ATOMIC ; Represent an LDRD used as an atomic DImode load. VUNSPEC_SC ; Represent a store-register-exclusive. VUNSPEC_LAX ; Represent a load-register-acquire-exclusive. VUNSPEC_SLX ; Represent a store-register-release-exclusive. diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_1.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_1.c new file mode 100644 index 0000000000000000000000000000000000000000..4f39971a336378be7cafabea9fdc16c3b94c3d3b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_1.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c11 -O" } */ +/* { dg-require-effective-target arm_arch_v7a_ok } */ +/* { dg-add-options arm_arch_v7a } */ + +#include "atomic_loaddi_acquire.x" + +/* { dg-final { scan-assembler-times "ldrexd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */ +/* { dg-final { scan-assembler-times "dmb\tish" 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_2.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_2.c new file mode 100644 index 0000000000000000000000000000000000000000..0b18f03e09d1d25daf5c8b2fd460538bedf43dfb --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_2.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c11 -O" } */ +/* { dg-require-effective-target arm_arch_v7ve_ok } */ +/* { dg-add-options arm_arch_v7ve } */ + +#include "atomic_loaddi_acquire.x" + +/* { dg-final { scan-assembler-times "ldrd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */ +/* { dg-final { scan-assembler-times "dmb\tish" 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_3.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_3.c new file mode 100644 index 0000000000000000000000000000000000000000..080a9362abb334c764224620153deb2013ddcccb --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_3.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c11 -O" } */ +/* { dg-require-effective-target arm_arch_v8a_ok } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "atomic_loaddi_acquire.x" + +/* { dg-final { scan-assembler-times "ldaexd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */ +/* { dg-final { scan-assembler-not "dmb\tish" } } */ diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_4.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_4.c new file mode 100644 index 0000000000000000000000000000000000000000..8f94ba61b4d9bd99abdf5a12cd8c0bbed3f7ba18 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_4.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c11 -O" } */ +/* { dg-require-effective-target arm_arch_v7a_ok } */ +/* { dg-add-options arm_arch_v7a } */ + +#include "atomic_loaddi_relaxed.x" + +/* { dg-final { scan-assembler-times "ldrexd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */ +/* { dg-final { scan-assembler-not "dmb\tish" } } */ diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_5.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_5.c new file mode 100644 index 0000000000000000000000000000000000000000..39502c7e25db62cb66660b7ced6bda530431c843 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_5.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c11 -O" } */ +/* { dg-require-effective-target arm_arch_v7ve_ok } */ +/* { dg-add-options arm_arch_v7ve } */ + +#include "atomic_loaddi_relaxed.x" + +/* { dg-final { scan-assembler-times "ldrd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */ +/* { dg-final { scan-assembler-not "dmb\tish" } } */ \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_6.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_6.c new file mode 100644 index 0000000000000000000000000000000000000000..aa62d5a103929658718a40f7b469b2fb4e327886 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_6.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c11 -O" } */ +/* { dg-require-effective-target arm_arch_v8a_ok } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "atomic_loaddi_relaxed.x" + +/* { dg-final { scan-assembler-times "ldrd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */ +/* { dg-final { scan-assembler-not "dmb\tish" } } */ \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_7.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_7.c new file mode 100644 index 0000000000000000000000000000000000000000..6743663f1e6831d3fc441e80fcfe3d761987970b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_7.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c11 -O" } */ +/* { dg-require-effective-target arm_arch_v7a_ok } */ +/* { dg-add-options arm_arch_v7a } */ + +#include "atomic_loaddi_seq_cst.x" + +/* { dg-final { scan-assembler-times "ldrexd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */ +/* { dg-final { scan-assembler-times "dmb\tish" 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_8.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_8.c new file mode 100644 index 0000000000000000000000000000000000000000..f7bd3e5a2b515d91418f5f4a7a8db00336a8b594 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_8.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c11 -O" } */ +/* { dg-require-effective-target arm_arch_v7ve_ok } */ +/* { dg-add-options arm_arch_v7ve } */ + +#include "atomic_loaddi_seq_cst.x" + +/* { dg-final { scan-assembler-times "ldrd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */ +/* { dg-final { scan-assembler-times "dmb\tish" 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_9.c b/gcc/testsuite/gcc.target/arm/atomic_loaddi_9.c new file mode 100644 index 0000000000000000000000000000000000000000..68b293409ff99965f9f95b92a093da7732f66299 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_9.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c11 -O" } */ +/* { dg-require-effective-target arm_arch_v8a_ok } */ +/* { dg-add-options arm_arch_v8a } */ + +#include "atomic_loaddi_seq_cst.x" + +/* { dg-final { scan-assembler-times "ldaexd\tr\[0-9\]+, r\[0-9\]+, \\\[r\[0-9\]+\\\]" 1 } } */ +/* { dg-final { scan-assembler-not "dmb\tish" } } */ diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_acquire.x b/gcc/testsuite/gcc.target/arm/atomic_loaddi_acquire.x new file mode 100644 index 0000000000000000000000000000000000000000..28997ef565be7640531808579e5930b61c6b50e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_acquire.x @@ -0,0 +1,11 @@ +#include <stdatomic.h> + +atomic_ullong foo; +int glob; + +int +main (void) +{ + atomic_load_explicit (&foo, memory_order_acquire); + return glob; +} diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_relaxed.x b/gcc/testsuite/gcc.target/arm/atomic_loaddi_relaxed.x new file mode 100644 index 0000000000000000000000000000000000000000..701b3c42c09b6bcd0add779c29321674921a95e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_relaxed.x @@ -0,0 +1,11 @@ +#include <stdatomic.h> + +atomic_ullong foo; +int glob; + +int +main (void) +{ + atomic_load_explicit (&foo, memory_order_relaxed); + return glob; +} diff --git a/gcc/testsuite/gcc.target/arm/atomic_loaddi_seq_cst.x b/gcc/testsuite/gcc.target/arm/atomic_loaddi_seq_cst.x new file mode 100644 index 0000000000000000000000000000000000000000..32e78da67e8b67f8a3bdd80b4957f55d9f413669 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/atomic_loaddi_seq_cst.x @@ -0,0 +1,11 @@ +#include <stdatomic.h> + +atomic_ullong foo; +int glob; + +int +main (void) +{ + atomic_load_explicit (&foo, memory_order_seq_cst); + return glob; +}