Hi Wilco, Thanks for your review. Please find attached the patch amended following your recommendations. The number of new functions for _sync is reduced by 3x. I tested the patch on Graviton2 aarch64-linux. I also checked by hand that the outline functions in libgcc look similar to what GCC produces for the inline version.
Thanks, Sebastian ________________________________________ From: Wilco Dijkstra <wilco.dijks...@arm.com> Sent: Tuesday, April 19, 2022 7:51 AM To: Pop, Sebastian; gcc-patches@gcc.gnu.org Cc: Kyrylo Tkachov Subject: RE: [EXTERNAL] [AArch64] PR105162: emit barrier for __sync and __atomic builtins on CPUs without LSE CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe. Hi Sebastian, > Wilco pointed out in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105162#c7 > that > "Only __sync needs the extra full barrier, but __atomic does not." > The attached patch does that by adding out-of-line functions for > MEMMODEL_SYNC_*. > Those new functions contain a barrier on the path without LSE instructions. Yes, adding _sync versions of the outline functions is the correct approach. However there is no need to have separate _acq/_rel/_seq variants for every function since all but one are _seq. Also we should ensure we generate the same sequence as the inlined versions so that they are consistent. This means ensuring the LDXR macro ignores the 'A' for the _sync variants and the swp function switches to acquire semantics. Cheers, Wilco
From 4e90111e507a611d3daa0f71fc17c6d5cc3e203e Mon Sep 17 00:00:00 2001 From: Sebastian Pop <s...@amazon.com> Date: Mon, 18 Apr 2022 15:13:20 +0000 Subject: [PATCH] [AArch64] add barriers to ool __sync builtins --- gcc/config/aarch64/aarch64-protos.h | 2 +- gcc/config/aarch64/aarch64.cc | 12 ++++-- .../gcc.target/aarch64/sync-comp-swap-ool.c | 6 +++ .../gcc.target/aarch64/sync-op-acquire-ool.c | 6 +++ .../gcc.target/aarch64/sync-op-full-ool.c | 9 ++++ .../gcc.target/aarch64/target_attr_20.c | 2 +- .../gcc.target/aarch64/target_attr_21.c | 2 +- libgcc/config/aarch64/lse.S | 42 +++++++++++++++++-- libgcc/config/aarch64/t-lse | 8 ++-- 9 files changed, 75 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sync-comp-swap-ool.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sync-op-acquire-ool.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sync-op-full-ool.c diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 46bade28ed6..3ad5e77a1af 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -1051,7 +1051,7 @@ bool aarch64_high_bits_all_ones_p (HOST_WIDE_INT); struct atomic_ool_names { - const char *str[5][4]; + const char *str[5][5]; }; rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 18f80499079..3ad11e84aae 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -22670,14 +22670,14 @@ aarch64_emit_unlikely_jump (rtx insn) add_reg_br_prob_note (jump, profile_probability::very_unlikely ()); } -/* We store the names of the various atomic helpers in a 5x4 array. +/* We store the names of the various atomic helpers in a 5x5 array. Return the libcall function given MODE, MODEL and NAMES. */ rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, const atomic_ool_names *names) { - memmodel model = memmodel_base (INTVAL (model_rtx)); + memmodel model = memmodel_from_int (INTVAL (model_rtx)); int mode_idx, model_idx; switch (mode) @@ -22717,6 +22717,11 @@ aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, case MEMMODEL_SEQ_CST: model_idx = 3; break; + case MEMMODEL_SYNC_ACQUIRE: + case MEMMODEL_SYNC_RELEASE: + case MEMMODEL_SYNC_SEQ_CST: + model_idx = 4; + break; default: gcc_unreachable (); } @@ -22729,7 +22734,8 @@ aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, { "__aarch64_" #B #N "_relax", \ "__aarch64_" #B #N "_acq", \ "__aarch64_" #B #N "_rel", \ - "__aarch64_" #B #N "_acq_rel" } + "__aarch64_" #B #N "_acq_rel", \ + "__aarch64_" #B #N "_sync" } #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \ { NULL, NULL, NULL, NULL } diff --git a/gcc/testsuite/gcc.target/aarch64/sync-comp-swap-ool.c b/gcc/testsuite/gcc.target/aarch64/sync-comp-swap-ool.c new file mode 100644 index 00000000000..372f4aa8746 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sync-comp-swap-ool.c @@ -0,0 +1,6 @@ +/* { dg-do compile } */ +/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf -moutline-atomics" } */ + +#include "sync-comp-swap.x" + +/* { dg-final { scan-assembler-times "bl.*__aarch64_cas4_sync" 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sync-op-acquire-ool.c b/gcc/testsuite/gcc.target/aarch64/sync-op-acquire-ool.c new file mode 100644 index 00000000000..95d9c56b5e1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sync-op-acquire-ool.c @@ -0,0 +1,6 @@ +/* { dg-do compile } */ +/* { dg-options "-march=armv8-a+nolse -O2 -moutline-atomics" } */ + +#include "sync-op-acquire.x" + +/* { dg-final { scan-assembler-times "bl.*__aarch64_swp4_sync" 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sync-op-full-ool.c b/gcc/testsuite/gcc.target/aarch64/sync-op-full-ool.c new file mode 100644 index 00000000000..2f3881d9755 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sync-op-full-ool.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-march=armv8-a+nolse -O2 -moutline-atomics" } */ + +#include "sync-op-full.x" + +/* { dg-final { scan-assembler-times "bl.*__aarch64_ldadd4_sync" 1 } } */ +/* { dg-final { scan-assembler-times "bl.*__aarch64_ldclr4_sync" 1 } } */ +/* { dg-final { scan-assembler-times "bl.*__aarch64_ldeor4_sync" 1 } } */ +/* { dg-final { scan-assembler-times "bl.*__aarch64_ldset4_sync" 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/target_attr_20.c b/gcc/testsuite/gcc.target/aarch64/target_attr_20.c index 509fb039e84..abf9f5cf230 100644 --- a/gcc/testsuite/gcc.target/aarch64/target_attr_20.c +++ b/gcc/testsuite/gcc.target/aarch64/target_attr_20.c @@ -24,4 +24,4 @@ bar (void) } } -/* { dg-final { scan-assembler-not "bl.*__aarch64_cas2_acq_rel" } } */ +/* { dg-final { scan-assembler-not "bl.*__aarch64_cas2" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/target_attr_21.c b/gcc/testsuite/gcc.target/aarch64/target_attr_21.c index acace4c8f2a..02d9577c3d4 100644 --- a/gcc/testsuite/gcc.target/aarch64/target_attr_21.c +++ b/gcc/testsuite/gcc.target/aarch64/target_attr_21.c @@ -24,4 +24,4 @@ bar (void) } } -/* { dg-final { scan-assembler-times "bl.*__aarch64_cas2_acq_rel" 1 } } */ +/* { dg-final { scan-assembler-times "bl.*__aarch64_cas2" 1 } } */ diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S index c353ec2215b..9c29cf08b59 100644 --- a/libgcc/config/aarch64/lse.S +++ b/libgcc/config/aarch64/lse.S @@ -87,24 +87,44 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see # define L # define M 0x000000 # define N 0x000000 +# define BARRIER #elif MODEL == 2 # define SUFF _acq # define A a # define L # define M 0x400000 # define N 0x800000 +# define BARRIER #elif MODEL == 3 # define SUFF _rel # define A # define L l # define M 0x008000 # define N 0x400000 +# define BARRIER #elif MODEL == 4 # define SUFF _acq_rel # define A a # define L l # define M 0x408000 # define N 0xc00000 +# define BARRIER +#elif MODEL == 5 +# define SUFF _sync +#ifdef L_swp +/* swp has _acq semantics. */ +# define A a +# define L +# define M 0x400000 +# define N 0x800000 +#else +/* All other _sync functions have _seq semantics. */ +# define A a +# define L l +# define M 0x408000 +# define N 0xc00000 +#endif +# define BARRIER dmb ish #else # error #endif @@ -127,7 +147,12 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #endif #define NAME(BASE) glue4(__aarch64_, BASE, SIZE, SUFF) -#define LDXR glue4(ld, A, xr, S) +#if MODEL == 5 +/* Drop A for _sync functions. */ +# define LDXR glue3(ld, xr, S) +#else +# define LDXR glue4(ld, A, xr, S) +#endif #define STXR glue4(st, L, xr, S) /* Temporary registers used. Other than these, only the return value @@ -183,10 +208,16 @@ STARTFN NAME(cas) bne 1f STXR w(tmp1), s(1), [x2] cbnz w(tmp1), 0b -1: ret +1: BARRIER + ret #else -#define LDXP glue3(ld, A, xp) +#if MODEL == 5 +/* Drop A for _sync functions. */ +# define LDXP glue2(ld, xp) +#else +# define LDXP glue3(ld, A, xp) +#endif #define STXP glue3(st, L, xp) #ifdef HAVE_AS_LSE # define CASP glue3(casp, A, L) x0, x1, x2, x3, [x4] @@ -205,7 +236,8 @@ STARTFN NAME(cas) bne 1f STXP w(tmp2), x2, x3, [x4] cbnz w(tmp2), 0b -1: ret +1: BARRIER + ret #endif @@ -229,6 +261,7 @@ STARTFN NAME(swp) 0: LDXR s(0), [x1] STXR w(tmp1), s(tmp0), [x1] cbnz w(tmp1), 0b + BARRIER ret ENDFN NAME(swp) @@ -273,6 +306,7 @@ STARTFN NAME(LDNM) OP s(tmp1), s(0), s(tmp0) STXR w(tmp2), s(tmp1), [x1] cbnz w(tmp2), 0b + BARRIER ret ENDFN NAME(LDNM) diff --git a/libgcc/config/aarch64/t-lse b/libgcc/config/aarch64/t-lse index 790cada3315..624daf7eddf 100644 --- a/libgcc/config/aarch64/t-lse +++ b/libgcc/config/aarch64/t-lse @@ -18,13 +18,13 @@ # along with GCC; see the file COPYING3. If not see # <http://www.gnu.org/licenses/>. -# Compare-and-swap has 5 sizes and 4 memory models. +# Compare-and-swap has 5 sizes and 5 memory models. S0 := $(foreach s, 1 2 4 8 16, $(addsuffix _$(s), cas)) -O0 := $(foreach m, 1 2 3 4, $(addsuffix _$(m)$(objext), $(S0))) +O0 := $(foreach m, 1 2 3 4 5, $(addsuffix _$(m)$(objext), $(S0))) -# Swap, Load-and-operate have 4 sizes and 4 memory models +# Swap, Load-and-operate have 4 sizes and 5 memory models S1 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), swp ldadd ldclr ldeor ldset)) -O1 := $(foreach m, 1 2 3 4, $(addsuffix _$(m)$(objext), $(S1))) +O1 := $(foreach m, 1 2 3 4 5, $(addsuffix _$(m)$(objext), $(S1))) LSE_OBJS := $(O0) $(O1) -- 2.25.1