Not my specialist subject, but here goes anyway:

Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> ping
>
> From: Wilco Dijkstra
> Sent: 02 June 2023 18:28
> To: GCC Patches <gcc-patches@gcc.gnu.org>
> Cc: Richard Sandiford <richard.sandif...@arm.com>; Kyrylo Tkachov <kyrylo.tkac...@arm.com>
> Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]
>
>
> Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
> existing binaries, gives better performance than locking atomics and is what
> most users expect.
Please add a justification for why it's backwards compatible, rather than
just stating that it's so.

> Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not supported.
> This results in an implicit store which is invisible to software as long as the given
> address is writeable (which will be true when using atomics in actual code).

Thanks for adding this.  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95722
suggests that it's still an open question whether this is a correct thing to
do, but from Joseph's comment it sounds like he isn't sure whether atomic
loads from read-only data are valid.  Linus's comment in
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70490 suggests that a reasonable
compromise might be to use a storing implementation but not advertise that it
is lock-free.
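To make the read-only case concrete, here is a minimal sketch (mine, not
something from the patch, and the names are made up):

  /* Hypothetical example of the situation PR95722 worries about: a const
     object with a constant initialiser may be placed in .rodata.  */
  #include <stdint.h>

  static const __int128 magic = 42;

  __int128
  read_magic (void)
  {
    /* Without LSE2 this becomes a call to libatomic's __atomic_load_16.
       If that is implemented with an LDXP/STXP loop, the load stores back
       the value it has just read, and so faults on a read-only mapping
       even though the source contains no store.  */
    return __atomic_load_n (&magic, __ATOMIC_RELAXED);
  }

The open question above is essentially whether code like this is required
to work.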
Also, the comment above libat_is_lock_free says:

  /* Note that this can return that a size/alignment is not lock-free even if
     all the operations that we use to implement the respective accesses provide
     lock-free forward progress as specified in C++14:  Users likely expect
     "lock-free" to also mean "fast", which is why we do not return true if, for
     example, we implement loads with this size/alignment using a CAS.  */

We don't use a CAS for the fallbacks, but like you say, we do use a
load/store exclusive loop.  So did you consider not doing this:

> +/* State we have lock-free 128-bit atomics.  */
> +#undef FAST_ATOMIC_LDST_16
> +#define FAST_ATOMIC_LDST_16 1

?
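For reference, what a user actually observes is something like the following
sketch (again mine, not from the patch); as far as I can see,
FAST_ATOMIC_LDST_16 is part of what determines the answer:

  /* Hypothetical user code asking whether 16-byte atomics are lock-free.  */
  #include <stdatomic.h>
  #include <stdbool.h>

  bool
  int128_is_lock_free (void)
  {
    _Atomic __int128 dummy;
    /* When the compiler cannot answer this statically, it is resolved at
       run time by libatomic (libat_is_lock_free).  The comment quoted above
       is about what this should report when loads are implemented with an
       LDXP/STXP loop: lock-free in the forward-progress sense, but arguably
       not "fast".  */
    return atomic_is_lock_free (&dummy);
  }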
Otherwise it looks reasonable to me, for whatever that's worth, but:

> A simple test on an old Cortex-A72 showed 2.7x speedup of 128-bit atomics.
>
> Passes regress, OK for commit?
>
> libatomic/
>         PR target/110061
>         config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
>         config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
>         State we have lock-free atomics.
>
> ---
>
> diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
> index 05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf 100644
> --- a/libatomic/config/linux/aarch64/atomic_16.S
> +++ b/libatomic/config/linux/aarch64/atomic_16.S
> @@ -22,6 +22,21 @@
>     <http://www.gnu.org/licenses/>.  */
>
>
> +/* AArch64 128-bit lock-free atomic implementation.
> +
> +   128-bit atomics are now lock-free for all AArch64 architecture versions.
> +   This is backwards compatible with existing binaries and gives better
> +   performance than locking atomics.
> +
> +   128-bit atomic loads use a exclusive loop if LSE2 is not supported.
> +   This results in an implicit store which is invisible to software as long
> +   as the given address is writeable.  Since all other atomics have explicit
> +   writes, this will be true when using atomics in actual code.
> +
> +   The libat_<op>_16 entry points are ARMv8.0.
> +   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
> +
> +
>  .arch armv8-a+lse
>
>  #define ENTRY(name)     \
> @@ -37,6 +52,10 @@ name:     \
>          .cfi_endproc;   \
>          .size name, .-name;
>
> +#define ALIAS(alias,name)       \
> +        .global alias;          \
> +        .set alias, name;
> +
>  #define res0 x0
>  #define res1 x1
>  #define in0  x2
> @@ -70,6 +89,24 @@ name:     \
>  #define SEQ_CST 5
>
>
> +ENTRY (libat_load_16)
> +        mov     x5, x0
> +        cbnz    w1, 2f
> +
> +        /* RELAXED.  */
> +1:      ldxp    res0, res1, [x5]
> +        stxp    w4, res0, res1, [x5]
> +        cbnz    w4, 1b
> +        ret
> +
> +        /* ACQUIRE/CONSUME/SEQ_CST.  */
> +2:      ldaxp   res0, res1, [x5]
> +        stxp    w4, res0, res1, [x5]
> +        cbnz    w4, 2b
> +        ret
> +END (libat_load_16)
> +
> +
>  ENTRY (libat_load_16_i1)
>          cbnz    w1, 1f
>
> @@ -93,6 +130,23 @@ ENTRY (libat_load_16_i1)
>  END (libat_load_16_i1)
>
>
> +ENTRY (libat_store_16)
> +        cbnz    w4, 2f
> +
> +        /* RELAXED.  */
> +1:      ldxp    xzr, tmp0, [x0]
> +        stxp    w4, in0, in1, [x0]
> +        cbnz    w4, 1b
> +        ret
> +
> +        /* RELEASE/SEQ_CST.  */
> +2:      ldxp    xzr, tmp0, [x0]
> +        stlxp   w4, in0, in1, [x0]
> +        cbnz    w4, 2b
> +        ret
> +END (libat_store_16)
> +
> +
>  ENTRY (libat_store_16_i1)
>          cbnz    w4, 1f
>
> @@ -101,14 +155,14 @@ ENTRY (libat_store_16_i1)
>          ret
>
>          /* RELEASE/SEQ_CST.  */
> -1:      ldaxp   xzr, tmp0, [x0]
> +1:      ldxp    xzr, tmp0, [x0]
>          stlxp   w4, in0, in1, [x0]
>          cbnz    w4, 1b
>          ret
>  END (libat_store_16_i1)
>
>
> -ENTRY (libat_exchange_16_i1)
> +ENTRY (libat_exchange_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -126,22 +180,55 @@ ENTRY (libat_exchange_16_i1)
>          stxp    w4, in0, in1, [x5]
>          cbnz    w4, 3b
>          ret
> -4:
> -        cmp     w4, RELEASE
> -        b.ne    6f
>
> -        /* RELEASE.  */
> -5:      ldxp    res0, res1, [x5]
> +        /* RELEASE/ACQ_REL/SEQ_CST.  */
> +4:      ldaxp   res0, res1, [x5]
>          stlxp   w4, in0, in1, [x5]
> -        cbnz    w4, 5b
> +        cbnz    w4, 4b
>          ret
> +END (libat_exchange_16)

Please explain (here and in the commit message) why you're adding acquire
semantics to the RELEASE case.
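To restate the question with a hypothetical caller (the function is made up;
only the mapping to libat_exchange_16 comes from the patch):

  #include <stdint.h>

  __int128
  publish (__int128 *slot, __int128 value)
  {
    /* A release exchange: the C/C++ model only requires release semantics
       for the store side; the read of the old value need not be an acquire.
       On baseline ARMv8.0 this now lands in libat_exchange_16 above, whose
       RELEASE case is implemented with LDAXP+STLXP and therefore gets
       acquire semantics as well.  */
    return __atomic_exchange_n (slot, value, __ATOMIC_RELEASE);
  }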
Thanks,
Richard

>
> -        /* ACQ_REL/SEQ_CST.  */
> -6:      ldaxp   res0, res1, [x5]
> -        stlxp   w4, in0, in1, [x5]
> -        cbnz    w4, 6b
> +
> +ENTRY (libat_compare_exchange_16)
> +        ldp     exp0, exp1, [x1]
> +        cbz     w4, 3f
> +        cmp     w4, RELEASE
> +        b.hs    4f
> +
> +        /* ACQUIRE/CONSUME.  */
> +1:      ldaxp   tmp0, tmp1, [x0]
> +        cmp     tmp0, exp0
> +        ccmp    tmp1, exp1, 0, eq
> +        bne     2f
> +        stxp    w4, in0, in1, [x0]
> +        cbnz    w4, 1b
> +        mov     x0, 1
>          ret
> -END (libat_exchange_16_i1)
> +
> +2:      stp     tmp0, tmp1, [x1]
> +        mov     x0, 0
> +        ret
> +
> +        /* RELAXED.  */
> +3:      ldxp    tmp0, tmp1, [x0]
> +        cmp     tmp0, exp0
> +        ccmp    tmp1, exp1, 0, eq
> +        bne     2b
> +        stxp    w4, in0, in1, [x0]
> +        cbnz    w4, 3b
> +        mov     x0, 1
> +        ret
> +
> +        /* RELEASE/ACQ_REL/SEQ_CST.  */
> +4:      ldaxp   tmp0, tmp1, [x0]
> +        cmp     tmp0, exp0
> +        ccmp    tmp1, exp1, 0, eq
> +        bne     2b
> +        stlxp   w4, in0, in1, [x0]
> +        cbnz    w4, 4b
> +        mov     x0, 1
> +        ret
> +END (libat_compare_exchange_16)
>
>
>  ENTRY (libat_compare_exchange_16_i1)
> @@ -180,7 +267,7 @@ ENTRY (libat_compare_exchange_16_i1)
>  END (libat_compare_exchange_16_i1)
>
>
> -ENTRY (libat_fetch_add_16_i1)
> +ENTRY (libat_fetch_add_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -199,10 +286,10 @@ ENTRY (libat_fetch_add_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_add_16_i1)
> +END (libat_fetch_add_16)
>
>
> -ENTRY (libat_add_fetch_16_i1)
> +ENTRY (libat_add_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -221,10 +308,10 @@ ENTRY (libat_add_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_add_fetch_16_i1)
> +END (libat_add_fetch_16)
>
>
> -ENTRY (libat_fetch_sub_16_i1)
> +ENTRY (libat_fetch_sub_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -243,10 +330,10 @@ ENTRY (libat_fetch_sub_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_sub_16_i1)
> +END (libat_fetch_sub_16)
>
>
> -ENTRY (libat_sub_fetch_16_i1)
> +ENTRY (libat_sub_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -265,10 +352,10 @@ ENTRY (libat_sub_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_sub_fetch_16_i1)
> +END (libat_sub_fetch_16)
>
>
> -ENTRY (libat_fetch_or_16_i1)
> +ENTRY (libat_fetch_or_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -287,10 +374,10 @@ ENTRY (libat_fetch_or_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_or_16_i1)
> +END (libat_fetch_or_16)
>
>
> -ENTRY (libat_or_fetch_16_i1)
> +ENTRY (libat_or_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -309,10 +396,10 @@ ENTRY (libat_or_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_or_fetch_16_i1)
> +END (libat_or_fetch_16)
>
>
> -ENTRY (libat_fetch_and_16_i1)
> +ENTRY (libat_fetch_and_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -331,10 +418,10 @@ ENTRY (libat_fetch_and_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_and_16_i1)
> +END (libat_fetch_and_16)
>
>
> -ENTRY (libat_and_fetch_16_i1)
> +ENTRY (libat_and_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -353,10 +440,10 @@ ENTRY (libat_and_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_and_fetch_16_i1)
> +END (libat_and_fetch_16)
>
>
> -ENTRY (libat_fetch_xor_16_i1)
> +ENTRY (libat_fetch_xor_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -375,10 +462,10 @@ ENTRY (libat_fetch_xor_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_xor_16_i1)
> +END (libat_fetch_xor_16)
>
>
> -ENTRY (libat_xor_fetch_16_i1)
> +ENTRY (libat_xor_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -397,10 +484,10 @@ ENTRY (libat_xor_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_xor_fetch_16_i1)
> +END (libat_xor_fetch_16)
>
>
> -ENTRY (libat_fetch_nand_16_i1)
> +ENTRY (libat_fetch_nand_16)
>          mov     x5, x0
>          mvn     in0, in0
>          mvn     in1, in1
> @@ -421,10 +508,10 @@ ENTRY (libat_fetch_nand_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_nand_16_i1)
> +END (libat_fetch_nand_16)
>
>
> -ENTRY (libat_nand_fetch_16_i1)
> +ENTRY (libat_nand_fetch_16)
>          mov     x5, x0
>          mvn     in0, in0
>          mvn     in1, in1
> @@ -445,21 +532,38 @@ ENTRY (libat_nand_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_nand_fetch_16_i1)
> +END (libat_nand_fetch_16)
>
>
> -ENTRY (libat_test_and_set_16_i1)
> -        mov     w2, 1
> -        cbnz    w1, 2f
> -
> -        /* RELAXED.  */
> -        swpb    w0, w2, [x0]
> -        ret
> +/* __atomic_test_and_set is always inlined, so this entry is unused and
> +   only required for completeness.  */
> +ENTRY (libat_test_and_set_16)
>
> -        /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
> -2:      swpalb  w0, w2, [x0]
> +        /* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
> +        mov     x5, x0
> +1:      ldaxrb  w0, [x5]
> +        stlxrb  w4, w2, [x5]
> +        cbnz    w4, 1b
>          ret
> -END (libat_test_and_set_16_i1)
> +END (libat_test_and_set_16)
> +
> +
> +/* Alias entry points which are the same in baseline and LSE2.  */
> +
> +ALIAS (libat_exchange_16_i1, libat_exchange_16)
> +ALIAS (libat_fetch_add_16_i1, libat_fetch_add_16)
> +ALIAS (libat_add_fetch_16_i1, libat_add_fetch_16)
> +ALIAS (libat_fetch_sub_16_i1, libat_fetch_sub_16)
> +ALIAS (libat_sub_fetch_16_i1, libat_sub_fetch_16)
> +ALIAS (libat_fetch_or_16_i1, libat_fetch_or_16)
> +ALIAS (libat_or_fetch_16_i1, libat_or_fetch_16)
> +ALIAS (libat_fetch_and_16_i1, libat_fetch_and_16)
> +ALIAS (libat_and_fetch_16_i1, libat_and_fetch_16)
> +ALIAS (libat_fetch_xor_16_i1, libat_fetch_xor_16)
> +ALIAS (libat_xor_fetch_16_i1, libat_xor_fetch_16)
> +ALIAS (libat_fetch_nand_16_i1, libat_fetch_nand_16)
> +ALIAS (libat_nand_fetch_16_i1, libat_nand_fetch_16)
> +ALIAS (libat_test_and_set_16_i1, libat_test_and_set_16)
>
>
>  /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code.  */
> diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
> index bea26825b4f75bb8ff348ab4b5fc45f4a5bd561e..851c78c01cd643318aaa52929ce4550266238b79 100644
> --- a/libatomic/config/linux/aarch64/host-config.h
> +++ b/libatomic/config/linux/aarch64/host-config.h
> @@ -35,10 +35,19 @@
>  #endif
>  #define IFUNC_NCOND(N)  (1)
>
> -#if N == 16 && IFUNC_ALT != 0
> +#endif  /* HAVE_IFUNC */
> +
> +/* All 128-bit atomic functions are defined in aarch64/atomic_16.S.  */
> +#if N == 16
>  # define DONE 1
>  #endif
>
> -#endif  /* HAVE_IFUNC */
> +/* State we have lock-free 128-bit atomics.  */
> +#undef FAST_ATOMIC_LDST_16
> +#define FAST_ATOMIC_LDST_16 1
> +#undef MAYBE_HAVE_ATOMIC_CAS_16
> +#define MAYBE_HAVE_ATOMIC_CAS_16 1
> +#undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
> +#define MAYBE_HAVE_ATOMIC_EXCHANGE_16 1
>
>  #include_next <host-config.h>
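For completeness, here is a sketch of the kind of user code the patch affects
(hypothetical names; the behavioural claim is just the patch description
restated, as I understand the status quo):

  #include <stdbool.h>
  #include <stdint.h>

  /* A 16-byte value updated atomically, e.g. a pointer plus a generation
     counter.  Aligned to 16 bytes so the LDXP/STXP sequences can be used.  */
  struct slot { uint64_t ptr; uint64_t gen; } __attribute__ ((aligned (16)));

  bool
  try_update (struct slot *s, struct slot *expected, struct slot desired)
  {
    /* Previously this took libatomic's locking fallback on baseline ARMv8.0;
       with the patch it resolves to libat_compare_exchange_16 (or the _i1
       variant when LSE2 is available) and is lock-free.  */
    return __atomic_compare_exchange (s, expected, &desired, false,
                                      __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
  }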