Hi Richard,

>> Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible
>> with existing binaries, gives better performance than locking atomics and
>> is what most users expect.
>
> Please add a justification for why it's backwards compatible, rather
> than just stating that it's so.
This isn't any different from the LSE2 support, which also switches some CPUs
to lock-free implementations - this is basically switching the rest.  It
trivially follows from the fact that GCC always calls libatomic for these, so
you switch all 128-bit atomics in a process at once.  I'll add that to the
description.  Note the compatibility story is even better than this: we are
also compatible with LLVM and with future GCC versions which may inline these
sequences.

> Thanks for adding this.  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95722
> suggests that it's still an open question whether this is a correct thing
> to do, but it sounds from Joseph's comment that he isn't sure whether
> atomic loads from read-only data are valid.

Yes, it's not useful to do an atomic read of a read-only value...  It should
be feasible to mark atomic types as mutable to force them to .data (see e.g.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108659 and
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109553).

> Linus's comment in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70490
> suggests that a reasonable compromise might be to use a storing
> implementation but not advertise that it is lock-free.  Also,
> the comment above libat_is_lock_free says:
>
> /* Note that this can return that a size/alignment is not lock-free even if
>    all the operations that we use to implement the respective accesses provide
>    lock-free forward progress as specified in C++14: Users likely expect
>    "lock-free" to also mean "fast", which is why we do not return true if, for
>    example, we implement loads with this size/alignment using a CAS.  */

I don't believe lying about being lock-free like that is a good idea.  When
you use a faster lock-free implementation, you want to tell users about it
(so they aren't forced to use nasty inline assembler hacks, for example).

> We don't use a CAS for the fallbacks, but like you say, we do use a
> load/store exclusive loop.  So did you consider not doing this:
>
> +/* State we have lock-free 128-bit atomics.  */
> +#undef FAST_ATOMIC_LDST_16
> +#define FAST_ATOMIC_LDST_16	1

That would result in __atomic_is_lock_free incorrectly returning false.
Note that __atomic_always_lock_free remains false for 128-bit since there is
no inlining in the compiler, but __atomic_is_lock_free should be true.
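To make that distinction concrete, a quick test along these lines (hypothetical
example, not part of the patch) shows what user code would observe once the
128-bit entry points are lock-free:

#include <stdio.h>

/* Sketch only: __atomic_always_lock_free is a compile-time query and stays
   false for 16 bytes because the compiler still emits libatomic calls,
   while __atomic_is_lock_free asks libatomic at run time and should now
   report true.  */
int
main (void)
{
  __int128 x = 0;
  printf ("always_lock_free: %d\n",
	  (int) __atomic_always_lock_free (sizeof (x), 0));
  printf ("is_lock_free: %d\n",
	  (int) __atomic_is_lock_free (sizeof (x), &x));
  return 0;
}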
> -	/* RELEASE.  */
> -5:	ldxp	res0, res1, [x5]
> +	/* RELEASE/ACQ_REL/SEQ_CST.  */
> +4:	ldaxp	res0, res1, [x5]
> 	stlxp	w4, in0, in1, [x5]
> -	cbnz	w4, 5b
> +	cbnz	w4, 4b
> 	ret
> +END (libat_exchange_16)
>
> Please explain (here and in the commit message) why you're adding
> acquire semantics to the RELEASE case.

That merges the RELEASE case with the ACQ_REL/SEQ_CST cases to keep the code
short and simple, like much of the rest of the code.  I've added a note to
the commit message.

Cheers,
Wilco


Here is v2 - this also incorporates the PR111404 fix to compare-exchange:


Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible
with existing binaries (as for these GCC always calls into libatomic, so all
128-bit atomic uses in a process are switched), gives better performance than
locking atomics and is what most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not
supported.  This results in an implicit store which is invisible to software
as long as the given address is writeable (which will be true when using
atomics in actual code).

Passes regress, OK for commit?

libatomic/
	* config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
	(libat_exchange_16): Merge RELEASE and ACQ_REL/SEQ_CST cases.
	* config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
	State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index 05439ce394b9653c9bcb582761ff7aaa7c8f9643..a099037179b3f1210145baea02a9d43418629813 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,22 @@
    <http://www.gnu.org/licenses/>.  */
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries (as we swap all uses
+   of 128-bit atomics via an ifunc) and gives better performance than locking
+   atomics.
+
+   128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   writes, this will be true when using atomics in actual code.
+
+   The libat_<op>_16 entry points are ARMv8.0.
+   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
+
+
 	.arch	armv8-a+lse
 
 #define ENTRY(name)	\
@@ -37,6 +53,10 @@ name:	\
 	.cfi_endproc;	\
 	.size name, .-name;
 
+#define ALIAS(alias,name)	\
+	.global alias;		\
+	.set alias, name;
+
 #define res0 x0
 #define res1 x1
 #define in0  x2
@@ -70,6 +90,24 @@
 #define SEQ_CST 5
 
 
+ENTRY (libat_load_16)
+	mov	x5, x0
+	cbnz	w1, 2f
+
+	/* RELAXED.  */
+1:	ldxp	res0, res1, [x5]
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
+	ret
+
+	/* ACQUIRE/CONSUME/SEQ_CST.  */
+2:	ldaxp	res0, res1, [x5]
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_load_16)
+
+
 ENTRY (libat_load_16_i1)
 	cbnz	w1, 1f
 
@@ -93,6 +131,23 @@ ENTRY (libat_load_16_i1)
 END (libat_load_16_i1)
 
 
+ENTRY (libat_store_16)
+	cbnz	w4, 2f
+
+	/* RELAXED.  */
+1:	ldxp	xzr, tmp0, [x0]
+	stxp	w4, in0, in1, [x0]
+	cbnz	w4, 1b
+	ret
+
+	/* RELEASE/SEQ_CST.  */
+2:	ldxp	xzr, tmp0, [x0]
+	stlxp	w4, in0, in1, [x0]
+	cbnz	w4, 2b
+	ret
+END (libat_store_16)
+
+
 ENTRY (libat_store_16_i1)
 	cbnz	w4, 1f
 
@@ -101,14 +156,14 @@ ENTRY (libat_store_16_i1)
 	ret
 
 	/* RELEASE/SEQ_CST.  */
-1:	ldaxp	xzr, tmp0, [x0]
+1:	ldxp	xzr, tmp0, [x0]
 	stlxp	w4, in0, in1, [x0]
 	cbnz	w4, 1b
 	ret
 END (libat_store_16_i1)
 
 
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -126,22 +181,60 @@ ENTRY (libat_exchange_16_i1)
 	stxp	w4, in0, in1, [x5]
 	cbnz	w4, 3b
 	ret
-4:
-	cmp	w4, RELEASE
-	b.ne	6f
 
-	/* RELEASE.  */
-5:	ldxp	res0, res1, [x5]
+	/* RELEASE/ACQ_REL/SEQ_CST.  */
+4:	ldaxp	res0, res1, [x5]
 	stlxp	w4, in0, in1, [x5]
-	cbnz	w4, 5b
+	cbnz	w4, 4b
 	ret
+END (libat_exchange_16)
 
-	/* ACQ_REL/SEQ_CST.  */
-6:	ldaxp	res0, res1, [x5]
-	stlxp	w4, in0, in1, [x5]
-	cbnz	w4, 6b
+
+ENTRY (libat_compare_exchange_16)
+	ldp	exp0, exp1, [x1]
+	cbz	w4, 3f
+	cmp	w4, RELEASE
+	b.hs	5f
+
+	/* ACQUIRE/CONSUME.  */
+1:	ldaxp	tmp0, tmp1, [x0]
+	cmp	tmp0, exp0
+	ccmp	tmp1, exp1, 0, eq
+	csel	tmp0, in0, tmp0, eq
+	csel	tmp1, in1, tmp1, eq
+	stxp	w4, tmp0, tmp1, [x0]
+	cbnz	w4, 1b
+	beq	2f
+	stp	tmp0, tmp1, [x1]
+2:	cset	x0, eq
+	ret
+
+	/* RELAXED.  */
+3:	ldxp	tmp0, tmp1, [x0]
+	cmp	tmp0, exp0
+	ccmp	tmp1, exp1, 0, eq
+	csel	tmp0, in0, tmp0, eq
+	csel	tmp1, in1, tmp1, eq
+	stxp	w4, tmp0, tmp1, [x0]
+	cbnz	w4, 3b
+	beq	4f
+	stp	tmp0, tmp1, [x1]
+4:	cset	x0, eq
+	ret
+
+	/* RELEASE/ACQ_REL/SEQ_CST.  */
+5:	ldaxp	tmp0, tmp1, [x0]
+	cmp	tmp0, exp0
+	ccmp	tmp1, exp1, 0, eq
+	csel	tmp0, in0, tmp0, eq
+	csel	tmp1, in1, tmp1, eq
+	stlxp	w4, tmp0, tmp1, [x0]
+	cbnz	w4, 5b
+	beq	6f
+	stp	tmp0, tmp1, [x1]
+6:	cset	x0, eq
 	ret
-END (libat_exchange_16_i1)
+END (libat_compare_exchange_16)
 
 
 ENTRY (libat_compare_exchange_16_i1)
@@ -180,7 +273,7 @@ ENTRY (libat_compare_exchange_16_i1)
 END (libat_compare_exchange_16_i1)
 
 
-ENTRY (libat_fetch_add_16_i1)
+ENTRY (libat_fetch_add_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -199,10 +292,10 @@ ENTRY (libat_fetch_add_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_add_16_i1)
+END (libat_fetch_add_16)
 
 
-ENTRY (libat_add_fetch_16_i1)
+ENTRY (libat_add_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -221,10 +314,10 @@ ENTRY (libat_add_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_add_fetch_16_i1)
+END (libat_add_fetch_16)
 
 
-ENTRY (libat_fetch_sub_16_i1)
+ENTRY (libat_fetch_sub_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -243,10 +336,10 @@ ENTRY (libat_fetch_sub_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_sub_16_i1)
+END (libat_fetch_sub_16)
 
 
-ENTRY (libat_sub_fetch_16_i1)
+ENTRY (libat_sub_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -265,10 +358,10 @@ ENTRY (libat_sub_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_sub_fetch_16_i1)
+END (libat_sub_fetch_16)
 
 
-ENTRY (libat_fetch_or_16_i1)
+ENTRY (libat_fetch_or_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -287,10 +380,10 @@ ENTRY (libat_fetch_or_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_or_16_i1)
+END (libat_fetch_or_16)
 
 
-ENTRY (libat_or_fetch_16_i1)
+ENTRY (libat_or_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -309,10 +402,10 @@ ENTRY (libat_or_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_or_fetch_16_i1)
+END (libat_or_fetch_16)
 
 
-ENTRY (libat_fetch_and_16_i1)
+ENTRY (libat_fetch_and_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -331,10 +424,10 @@ ENTRY (libat_fetch_and_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_and_16_i1)
+END (libat_fetch_and_16)
 
 
-ENTRY (libat_and_fetch_16_i1)
+ENTRY (libat_and_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -353,10 +446,10 @@ ENTRY (libat_and_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_and_fetch_16_i1)
+END (libat_and_fetch_16)
 
 
-ENTRY (libat_fetch_xor_16_i1)
+ENTRY (libat_fetch_xor_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -375,10 +468,10 @@ ENTRY (libat_fetch_xor_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_xor_16_i1)
+END (libat_fetch_xor_16)
 
 
-ENTRY (libat_xor_fetch_16_i1)
+ENTRY (libat_xor_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -397,10 +490,10 @@ ENTRY (libat_xor_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_xor_fetch_16_i1)
+END (libat_xor_fetch_16)
 
 
-ENTRY (libat_fetch_nand_16_i1)
+ENTRY (libat_fetch_nand_16)
 	mov	x5, x0
 	mvn	in0, in0
 	mvn	in1, in1
@@ -421,10 +514,10 @@ ENTRY (libat_fetch_nand_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_nand_16_i1)
+END (libat_fetch_nand_16)
 
 
-ENTRY (libat_nand_fetch_16_i1)
+ENTRY (libat_nand_fetch_16)
 	mov	x5, x0
 	mvn	in0, in0
 	mvn	in1, in1
@@ -445,21 +538,38 @@ ENTRY (libat_nand_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_nand_fetch_16_i1)
+END (libat_nand_fetch_16)
 
 
-ENTRY (libat_test_and_set_16_i1)
-	mov	w2, 1
-	cbnz	w1, 2f
-
-	/* RELAXED.  */
-	swpb	w0, w2, [x0]
-	ret
+/* __atomic_test_and_set is always inlined, so this entry is unused and
+   only required for completeness.  */
+ENTRY (libat_test_and_set_16)
 
-	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-2:	swpalb	w0, w2, [x0]
+	/* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+	mov	x5, x0
+1:	ldaxrb	w0, [x5]
+	stlxrb	w4, w2, [x5]
+	cbnz	w4, 1b
 	ret
-END (libat_test_and_set_16_i1)
+END (libat_test_and_set_16)
+
+
+/* Alias entry points which are the same in baseline and LSE2.  */
+
+ALIAS (libat_exchange_16_i1, libat_exchange_16)
+ALIAS (libat_fetch_add_16_i1, libat_fetch_add_16)
+ALIAS (libat_add_fetch_16_i1, libat_add_fetch_16)
+ALIAS (libat_fetch_sub_16_i1, libat_fetch_sub_16)
+ALIAS (libat_sub_fetch_16_i1, libat_sub_fetch_16)
+ALIAS (libat_fetch_or_16_i1, libat_fetch_or_16)
+ALIAS (libat_or_fetch_16_i1, libat_or_fetch_16)
+ALIAS (libat_fetch_and_16_i1, libat_fetch_and_16)
+ALIAS (libat_and_fetch_16_i1, libat_and_fetch_16)
+ALIAS (libat_fetch_xor_16_i1, libat_fetch_xor_16)
+ALIAS (libat_xor_fetch_16_i1, libat_xor_fetch_16)
+ALIAS (libat_fetch_nand_16_i1, libat_fetch_nand_16)
+ALIAS (libat_nand_fetch_16_i1, libat_nand_fetch_16)
+ALIAS (libat_test_and_set_16_i1, libat_test_and_set_16)
 
 
 /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code.  */
diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
index 9747accd88f5881d0496f9f8104149e74bbbe268..265b6ebd82fd7cb7fd36554d4da82ef54b3b99b5 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -35,11 +35,20 @@
 #endif
 #define IFUNC_NCOND(N)	(1)
 
-#if N == 16 && IFUNC_ALT != 0
+#endif /* HAVE_IFUNC */
+
+/* All 128-bit atomic functions are defined in aarch64/atomic_16.S.  */
+#if N == 16
 # define DONE 1
 #endif
 
-#endif /* HAVE_IFUNC */
+/* State we have lock-free 128-bit atomics.  */
+#undef FAST_ATOMIC_LDST_16
+#define FAST_ATOMIC_LDST_16	1
+#undef MAYBE_HAVE_ATOMIC_CAS_16
+#define MAYBE_HAVE_ATOMIC_CAS_16	1
+#undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
+#define MAYBE_HAVE_ATOMIC_EXCHANGE_16	1
 
 #ifdef HWCAP_USCAT
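P.S. To illustrate the read-only corner case discussed above (hypothetical
example, not part of the patch): without LSE2 the 128-bit load fallback uses
an ldxp/stxp loop, so the implicit store would fault if the object ever ended
up in a read-only mapping.

#include <stdio.h>

/* Sketch only: if ro_val is placed in .rodata, the stxp inside the
   exclusive-loop implementation of __atomic_load_16 would fault.  Forcing
   such objects into .data (cf. PR108659, PR109553) avoids this; in practice
   atomics on read-only data are not useful anyway.  */
static const __int128 ro_val = 42;

int
main (void)
{
  __int128 tmp;
  __atomic_load (&ro_val, &tmp, __ATOMIC_RELAXED);
  printf ("%d\n", (int) tmp);
  return 0;
}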