[PATCH 1/8] [RFC] Define new floating point builtin fetch_add functions

2024-09-19 Thread mmalcomson
From: Matthew Malcomson 

This commit just defines the new names -- it does not yet implement
them.  Keeping this as its own commit because the naming is one
decision, and this records what that decision was and why:

Adding new floating point builtins for each floating point type that
is defined in the general code *except* f128x (which would have a size
greater than 16 bytes -- larger than the largest integral atomic
operation we currently support).

We have to base our naming on floating point *types* rather than sizes
since different types can have the same size and the operations need
to be distinguished based on type.  N.b. one could make size-suffixed
builtins that are still overloaded based on types but I thought that
this was the cleaner approach.
(The actual requirement is distinction based on mode; this is how I
choose which internal function to use in a later patch.  I believe that
defining the functions in terms of types and internally mapping to modes
is a sensible split between user interface and internal implementation.)

N.b. in order to choose whether these operations are available in
something like libstdc++ I use a check like
`__has_builtin(__atomic_fetch_add_fp)`.  This happens to be the
builtin implementing the relevant operation on doubles, but it
also seems like a nice name to check.
- This would require that all compiler implementations have floating
  point atomics for all floating point types they support available at
  the same time.  I don't expect this is much of a problem but invite
  dissent.
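
As a minimal sketch, the check on the libstdc++ side would look
something like the below (HAVE_FP_ATOMIC_FETCH_ADD is a hypothetical
macro name, purely for illustration):

  #if __has_builtin(__atomic_fetch_add_fp)
  /* Assume *all* the floating point fetch_add/fetch_sub/add_fetch/
     sub_fetch builtins are available, not just the double one.  */
  # define HAVE_FP_ATOMIC_FETCH_ADD 1
  #endif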

N.b. I used the below type suffixes (following what seems like the
existing convention for builtins):
  - float  -> f
  - double -> 
  - long double -> l
  - _FloatN -> fN   (for N <- (16, 32, 64, 128))
  - _FloatNx -> fNx (for N <- (32, 64))
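
E.g. combined with the `_fp` marker used for these builtins, the
resolved fetch_add names come out as (matching the suffix list in the
docs patch later in this series):
  __atomic_fetch_add_fpf     (float)
  __atomic_fetch_add_fp      (double)
  __atomic_fetch_add_fpl     (long double)
  __atomic_fetch_add_fpf16   (_Float16)
  __atomic_fetch_add_fpf32x  (_Float32x)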

Richi suggested on IRC doing this expansion generally for all these
builtins, following Cxy _Atomic semantics.
Since C hasn't specified any fetch_ semantics for floating point
types, C++ has only specified `atomic<>::fetch_{add,sub}`, and the
operations other than these are all bitwise operations (which don't
map well to floating point), I believe I have followed that
suggestion by implementing all fetch_{add,sub}/{add,sub}_fetch
operations.

I have not implemented anything for the __sync_* builtins on the
belief that these are legacy and new code should use the __atomic_*
builtins.  Happy to adjust if that is a bad choice.

Only the new function types were needed for most cases.
The Fortran frontend does not use `builtin-types.def`, so the Fortran
`types.def` needed updating to include the floating point built-in
types in the `enum builtin_type` local to `gfc_init_builtin_functions`.
- N.b. these types are already available in the fortran
  frontend (being defined by `build_common_tree_nodes`), it's just
  that they were not available for sync-builtins.def functions until
  this commit.

--
N.b. for this RFC I've not checked that any other frontends can access
these builtins.  Even for the Fortran frontend I've only adjusted things
enough to ensure everything builds.

Signed-off-by: Matthew Malcomson 
---
 gcc/builtin-types.def | 20 ++
 gcc/fortran/types.def | 48 +++
 gcc/sync-builtins.def | 40 
 3 files changed, 108 insertions(+)

diff --git a/gcc/builtin-types.def b/gcc/builtin-types.def
index c97d6bad1de..97ccd945b55 100644
--- a/gcc/builtin-types.def
+++ b/gcc/builtin-types.def
@@ -802,6 +802,26 @@ DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I2_INT, BT_VOID, BT_VOLATILE_PTR, BT_I2, BT_INT)
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I4_INT, BT_VOID, BT_VOLATILE_PTR, BT_I4, BT_INT)
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I8_INT, BT_VOID, BT_VOLATILE_PTR, BT_I8, BT_INT)
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I16_INT, BT_VOID, BT_VOLATILE_PTR, BT_I16, BT_INT)
+DEF_FUNCTION_TYPE_3 (BT_FN_FLOAT_VPTR_FLOAT_INT, BT_FLOAT, BT_VOLATILE_PTR,
+                     BT_FLOAT, BT_INT)
+DEF_FUNCTION_TYPE_3 (BT_FN_DOUBLE_VPTR_DOUBLE_INT, BT_DOUBLE, BT_VOLATILE_PTR,
+                     BT_DOUBLE, BT_INT)
+DEF_FUNCTION_TYPE_3 (BT_FN_LONGDOUBLE_VPTR_LONGDOUBLE_INT, BT_LONGDOUBLE,
+                     BT_VOLATILE_PTR, BT_LONGDOUBLE, BT_INT)
+DEF_FUNCTION_TYPE_3 (BT_FN_BFLOAT16_VPTR_BFLOAT16_INT, BT_BFLOAT16,
+                     BT_VOLATILE_PTR, BT_BFLOAT16, BT_INT)
+DEF_FUNCTION_TYPE_3 (BT_FN_FLOAT16_VPTR_FLOAT16_INT, BT_FLOAT16,
+                     BT_VOLATILE_PTR, BT_FLOAT16, BT_INT)
+DEF_FUNCTION_TYPE_3 (BT_FN_FLOAT32_VPTR_FLOAT32_INT, BT_FLOAT32,
+                     BT_VOLATILE_PTR, BT_FLOAT32, BT_INT)
+DEF_FUNCTION_TYPE_3 (BT_FN_FLOAT64_VPTR_FLOAT64_INT, BT_FLOAT64,
+                     BT_VOLATILE_PTR, BT_FLOAT64, BT_INT)
+DEF_FUNCTION_TYPE_3 (BT_FN_FLOAT128_VPTR_FLOAT128_INT, BT_FLOAT128,
+                     BT_VOLATILE_PTR, BT_FLOAT128, BT_INT)
+DEF_FUNCTION_TYPE_3 (BT_FN_FLOAT32X_VPTR_FLOAT32X_INT, BT_FLOAT32X,
+                     BT_VOLATILE_PTR, BT_FLOAT32X, BT_INT)
+DEF_FUNCTION_TYPE_3 (BT_FN_FLOAT64X_VPTR_FLOAT64X_INT, BT_FLOAT64X,
+                     BT_VOLATILE_PTR, BT_FLOAT64X, BT_INT)

[PATCH 0/8] [RFC] Introduce floating point fetch_add builtins

2024-09-19 Thread mmalcomson
From: Matthew Malcomson 

Hello, this is an RFC for adding an atomic floating point fetch_add builtin
(and variants) to GCC.  The atomic fetch_add operation is defined to work
on the base floating point types in the C++20 standard chapter 31.7.3, and
extended to work for all cv-unqualified floating point types in C++23
chapter 33.5.7.4.

Honestly not sure who to Cc, please do point me to someone else if that's
better.

This is nowhere near complete (for one thing even the tests I've added
don't fully pass), but I think I have a complete enough idea that it's
worth checking if this is something that could be agreed on.

As it stands no target except the nvptx backend would natively support
these operations.
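
As a minimal sketch, the user-facing operation these builtins enable
(the overloaded builtin takes the same memory order argument as the
integral versions):

  double counter = 0.0;
  /* Atomically: tmp = counter; counter = tmp + 1.5; result is tmp.  */
  double old = __atomic_fetch_add (&counter, 1.5, __ATOMIC_SEQ_CST);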

Main questions that I'm looking to resolve with this RFC:
1) Would GCC be OK accepting this implementation even though no backend
   would be implementing these yet?
   - AIUI only the nvptx backend could theoretically implement this.
   - Even without a backend implementing it natively, the ability to use
     this in code (especially libstdc++) enables other compilers to
     generate better code for GPUs using standard C++.
2) Would libstdc++ be OK relying on `__has_builtin(__atomic_fetch_add_fp)`
   (i.e. a check on the resolved builtin rather than the more user-facing
   one) in order to determine whether floating point atomic fetch_add is
   available?
   - N.b. this builtin is actually the builtin working on the "double"
     type; one would have to rely on any compiler implementing that
     particular resolved builtin to also implement the other floating point
     atomic fetch_add builtins that they would want to support in libstdc++
     `atomic<[floating_point_type]>::fetch_add`.

More specific questions about the choice of which builtins to implement and
whether the types are OK:
1) Is it OK to not implement the `__sync_*` versions?
   These are deprecated, and the `__atomic_*` versions exist to match
   the C/C++ atomic operations (which are a large part of the reason for
   the new floating point operations).
2) Is it OK to not implement all the `fetch_*` operations?
   None of the bitwise operations are specified for C++ and bitwise
   operations are (AIUI) rarely used on floating point values.
3) Wanting to be able to farm out to libatomic means that we need
   constant names for the specialised functions.
   - This led to the naming convention based on floating point type.
   - That naming convention *could* be updated to include the special
     backend floating point types if needed.  I have not done this,
     mostly because I thought it would not add much, though I have not
     looked into it very closely.
4) Wanting to name the functions based on floating point type rather
   than size means that the mapping from the type passed to the
   overloaded version to the specialised builtin is less direct than for
   the integral versions.
   - Ended up with a hard-coded table in the source to check this.
   - Would very much like a better approach, though I'm not certain what
     that would be.
   - Will eventually at least share the hard-coded tables (which isn't
     happening yet because this is at RFC level).
5) Are there any other types that I should use?
   Similarly, are there any types that I'm trying to use that I shouldn't?
   I *believe* I've implemented all the types that make sense and are
   general builtin types.  I could easily have missed some (e.g. I left
   `_Float128x` alone because AIUI it's larger than 128 bits, which means
   we don't have any other atomic operations on such data), and could
   also easily be misunderstanding the mention in the C++ standards of
   "extended" types (which I believe means the `_Float*` and `bfloat16`
   types).
6) Anything special about floating point maths that I'm tripping up on?
   (Especially around the CAS loop, where we expand the operation
   directly and sometimes convert a PLUS into a MINUS of a negative
   value ...).
   I don't know of anything myself, but am also a bit wary of floating
   point maths.
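
   For reference, a sketch of the shape of the CAS loop fallback under
   discussion (written in terms of the existing generic builtins; fenv
   handling omitted):

     float
     fetch_add_via_cas (float *p, float v)
     {
       float old = __atomic_load_n (p, __ATOMIC_RELAXED);
       /* On failure __atomic_compare_exchange_n updates OLD with the
          value currently in *P, so recompute and retry.  */
       while (!__atomic_compare_exchange_n (p, &old, old + v, /*weak=*/true,
                                            __ATOMIC_SEQ_CST,
                                            __ATOMIC_RELAXED))
         ;
       return old;  /* the value seen before our addition */
     }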

N.b. I know that there's a reasonable amount of work left in:
1) Ensuring that all the places which use atomic types are updated
   (e.g. asan), 
2) Ensuring that all frontends can use these to the level that they could
   use the integral atomics.
3) Ensuring the major backends can still compile libatomic (I had to do
   something in AArch64 libatomic backend to make it compile, seems like
   there might be more to do for others).


Matthew Malcomson (8):
  Define new floating point builtin fetch_add functions
  Add FP types for atomic builtin overload resolution
  Tie the new atomic builtins to the backend
  Have libatomic working as first draft
  Use new builtins in libstdc++
  First attempt at testsuite
  Mention floating point atomic fetch_add etc in docs
  Add demo implementation of one of the operations

 gcc/builtin-types.def |   20 +
 gcc/builtins.cc   |  153 ++-
 gcc/c-family/c-c

[PATCH 7/8] [RFC] Mention floating point atomic fetch_add etc in docs

2024-09-19 Thread mmalcomson
From: Matthew Malcomson 

Signed-off-by: Matthew Malcomson 
---
 gcc/doc/extend.texi | 12 
 1 file changed, 12 insertions(+)

diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 66c99ef7a66..a3e3e7da5d6 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -13501,6 +13501,18 @@ the same format with the addition of a @samp{size_t} parameter inserted
 as the first parameter indicating the size of the object being pointed to.
 All objects must be the same size.
 
+Moreover, the @samp{__atomic_fetch_add}, @samp{__atomic_fetch_sub},
+@samp{__atomic_add_fetch} and @samp{__atomic_sub_fetch} builtins can all
+accept floating point types of @code{float}, @code{double}, @code{long double},
+@code{bfloat16}, @code{_Float16}, @code{_Float32}, @code{_Float64},
+@code{_Float128}, @code{_Float32x} and @code{_Float64x}.  These use a lock-free
+built-in function if the size of the floating point type makes that possible
+and otherwise leave an external call to be resolved at run time.  This external
+call is of the same format but specialised to the given floating point type.
+The specialised versions of these functions are denoted by one of the
+suffixes @code{_fpf}, @code{_fp}, @code{_fpl}, @code{_fpf16b}, @code{_fpf16},
+@code{_fpf32}, @code{_fpf64}, @code{_fpf128}, @code{_fpf32x}, @code{_fpf64x}.
+
 There are 6 different memory orders that can be specified.  These map
 to the C++11 memory orders with the same names, see the C++11 standard
 or the @uref{https://gcc.gnu.org/wiki/Atomic/GCCMM/AtomicSync,GCC wiki
-- 
2.43.0



[PATCH 3/8] [RFC] Tie the new atomic builtins to the backend

2024-09-19 Thread mmalcomson
From: Matthew Malcomson 

Things implemented in this patch:
1) Update the optabs definitions to include floating point versions of
   atomic fetch_add variants.
2) When expanding into a CAS loop in RTL because the floating point
   optab is not implemented, there are now two different modes.  One is
   the integral mode in which the atomic CAS (and load) should be
   performed, and one is the floating point mode in which the operation
   should be performed.
   - Extra handling of modes etc in `expand_atomic_fetch_op`.

Things to highlight to any reviewer:
1) Needed another mapping from builtin to mode.
   This is *almost* shared code between this and the frontend.  Looks
   like this could be shared if I put some effort into it.
2) I do not always expand into the modify before version, but also use
   the modify after version when unable to inline.
   - From looking at the dates on different parts of the code, it seems
 that this used to be needed before libatomic was added as a target
 library.  Since libatomic currently implements both the fetch_
 and _fetch versions I don't believe it's needed any more.
3) I use `extract_bit_field` to convert between representations when
   expanding as a fallback (because the fallback CAS loop loads into an
   integral register and we want to reinterpret that as a floating point
   value before the intermediate operation).
   - Not sure if there's a better way I don't know about.
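
   In C terms, what is needed between the integral atomic load and the
   floating point operation is a bit-cast rather than a value
   conversion.  A sketch for a 4-byte float (`extract_bit_field` is the
   RTL-level analogue of the memcpy here):

     unsigned int bits = 0;  /* result of the integral atomic load */
     float f;
     __builtin_memcpy (&f, &bits, sizeof (f));  /* reinterpret the bits */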

Other than that, everything seems to follow mostly straightforwardly
from what is already done.

Signed-off-by: Matthew Malcomson 
---
 gcc/builtins.cc | 153 +---
 gcc/optabs.cc   | 101 
 gcc/optabs.def  |   6 +-
 gcc/optabs.h|   2 +-
 4 files changed, 241 insertions(+), 21 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 0b902896ddd..0ffd7d0b211 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -6394,6 +6394,46 @@ get_builtin_sync_mode (int fcode_diff)
   return int_mode_for_size (BITS_PER_UNIT << fcode_diff, 0).require ();
 }
 
+/* Reconstitute the machine modes relevant for this builtin operation from the
+   builtin difference from the _N version of a fetch_add atomic.
+
+   Only works for floating point atomic builtins.
+   FCODE_DIFF should be fcode - base, where base is the FOO_N code for the
+   group of builtins.  N.b. this is a different base to that used by
+   `get_builtin_sync_mode` because that matches the builtin enum offset used in
+   c-common.cc to find the builtin enum from a given MODE.
+
+   TODO Really do need to figure out a bit neater code here.  Should not be
+   inlining the mapping from type to offset in two different places.  */
+static inline machine_mode
+get_builtin_fp_sync_mode (int fcode_diff, machine_mode *mode)
+{
+  struct type_to_offset { tree type; size_t offset; };
+  static const struct type_to_offset fp_type_mappings[] = {
+    { float_type_node, 6 },
+    { double_type_node, 7 },
+    { long_double_type_node, 8 },
+    { bfloat16_type_node ? bfloat16_type_node : error_mark_node, 9 },
+    { float16_type_node ? float16_type_node : error_mark_node, 10 },
+    { float32_type_node ? float32_type_node : error_mark_node, 11 },
+    { float64_type_node ? float64_type_node : error_mark_node, 12 },
+    { float128_type_node ? float128_type_node : error_mark_node, 13 },
+    { float32x_type_node ? float32x_type_node : error_mark_node, 14 },
+    { float64x_type_node ? float64x_type_node : error_mark_node, 15 }
+  };
+  gcc_assert (fcode_diff <= 15 && fcode_diff >= 6);
+  for (size_t i = 0;
+       i < sizeof (fp_type_mappings) / sizeof (fp_type_mappings[0]); i++)
+    {
+      if ((size_t) fcode_diff == fp_type_mappings[i].offset)
+        {
+          *mode = TYPE_MODE (fp_type_mappings[i].type);
+          return int_mode_for_size (GET_MODE_SIZE (*mode) * BITS_PER_UNIT, 0)
+            .require ();
+        }
+    }
+  gcc_unreachable ();
+}
+
 /* Expand the memory expression LOC and return the appropriate memory operand
for the builtin_sync operations.  */
 
@@ -6886,9 +6926,10 @@ expand_builtin_atomic_store (machine_mode mode, tree exp)
resolved to an instruction sequence.  */
 
 static rtx
-expand_builtin_atomic_fetch_op (machine_mode mode, tree exp, rtx target,
+expand_builtin_atomic_fetch_op (machine_mode expand_mode, tree exp, rtx target,
enum rtx_code code, bool fetch_after,
-   bool ignore, enum built_in_function ext_call)
+   bool ignore, enum built_in_function ext_call,
+   machine_mode load_mode = VOIDmode)
 {
   rtx val, mem, ret;
   enum memmodel model;
@@ -6898,13 +6939,13 @@ expand_builtin_atomic_fetch_op (machine_mode mode, tree exp, rtx target,
   model = get_memmodel (CALL_EXPR_ARG (exp, 2));
 
   /* Expand the operands.  */
-  mem = get_builtin_sync_mem (CALL_EXPR_ARG (exp, 0), mode);
-  val = expand_expr_force_mode (CALL_EXPR_ARG (exp, 1), mode);

[PATCH 8/8] [RFC] Add demo implementation of one of the operations

2024-09-19 Thread mmalcomson
From: Matthew Malcomson 

Do demo implementation in AArch64 since that's the backend I'm most
familiar with.

Nothing much else to say -- nice to see that the demo implementation
seems to work as expected (being used for fetch_add, add_fetch and
sub_fetch even though it's only defined for fetch_sub).

Demo implementation ensures that I can run some execution tests.

The demo is added behind a flag in order to be able to run the testsuite
in different variants (with the flag and without), ensuring that the
functionality works both when this optab is implemented and via the
fallbacks (also checking the two different fallbacks of either calling
libatomic or inlining a CAS loop).

In order to run with both this and the fallback implementation I use the
following flag in RUNTESTFLAGS:
--target_board='unix {unix/-mtesting-fp-atomics}'

Signed-off-by: Matthew Malcomson 
---
 gcc/config/aarch64/aarch64.h   |  2 ++
 gcc/config/aarch64/aarch64.opt |  5 +
 gcc/config/aarch64/atomics.md  | 15 +++
 3 files changed, 22 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index fac1882bcb3..c2f37545cd7 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -119,6 +119,8 @@
of LSE instructions.  */
 #define TARGET_OUTLINE_ATOMICS (aarch64_flag_outline_atomics)
 
+#define TARGET_TESTING_FP_ATOMICS (aarch64_flag_testing_fp_atomics)
+
 /* Align definitions of arrays, unions and structures so that
initializations and copies can be made more efficient.  This is not
ABI-changing, so it only affects places where we can see the
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 6356c419399..ed031258575 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -332,6 +332,11 @@ moutline-atomics
 Target Var(aarch64_flag_outline_atomics) Init(2) Save
 Generate local calls to out-of-line atomic operations.
 
+mtesting-fp-atomics
+Target Var(aarch64_flag_testing_fp_atomics) Init(0) Save
+Use the demonstration implementation of atomic_fetch_sub_<mode> for
+floating point modes.
+
 -param=aarch64-vect-compare-costs=
 Target Joined UInteger Var(aarch64_vect_compare_costs) Init(1) IntegerRange(0, 
1) Param
 When vectorizing, consider using multiple different approaches and use
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 32a0a723732..ee8fbcd6c58 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -368,6 +368,21 @@
 ;; However we also implement the acquire memory barrier with DMB LD,
 ;; and so the ST is not blocked by the barrier.
 
+(define_insn "atomic_fetch_sub"
+  [(set (match_operand:GPF 0 "register_operand" "=&w")
+(match_operand:GPF 1 "aarch64_sync_memory_operand" "+Q"))
+(set (match_dup 1)
+(unspec_volatile:GPF
+[(minus:GPF (match_dup 1)
+   (match_operand:GPF 2 "register_operand" "w"))
+ (match_operand:SI 3 "const_int_operand")]
+ UNSPECV_ATOMIC_LDOP_PLUS))
+(clobber (match_scratch:GPF 4 "=w"))]
+"TARGET_TESTING_FP_ATOMICS"
+"// Here's your sandwich.\;ldr %0, %1\;fsub %4, %0, %2\;str 
%4, %1\;// END"
+)
+
+
 (define_insn "aarch64_atomic__lse"
   [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "+Q")
(unspec_volatile:ALLI
-- 
2.43.0



[PATCH 2/8] [RFC] Add FP types for atomic builtin overload resolution

2024-09-19 Thread mmalcomson
From: Matthew Malcomson 

Have a bit of an ugly mapping from floating point type to the builtin
using that type.  Would like to find some code-sharing between this, the
function (in a later patch in this series) that finds the relevant mode
from a given builtin, and the general sync-builtins.def file.
As yet don't have a nice way to do that, but haven't looked that hard.

Other than that, seems we can cleanly emit the functions that we need.

N.b. we match which function to use based on the MODE of the type for
two reasons:
1) Can't match directly on type as otherwise `typedef float x` would
   mean that `x` could no longer be used with that intrinsic.
2) MODE (i.e. the type's ABI) is the thing that we need to distinguish
   between when deciding which fundamental operation needs to be
   applied.
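
A small example of the first point (hypothetical user code):

  typedef float real;  /* same mode as float */
  real r = 0.0f;
  /* Must still resolve to the float specialisation despite the typedef.  */
  __atomic_fetch_add (&r, 1.0f, __ATOMIC_SEQ_CST);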

Signed-off-by: Matthew Malcomson 
---
 gcc/c-family/c-common.cc | 88 
 1 file changed, 70 insertions(+), 18 deletions(-)

diff --git a/gcc/c-family/c-common.cc b/gcc/c-family/c-common.cc
index e7e371fd26f..c0a2b136d67 100644
--- a/gcc/c-family/c-common.cc
+++ b/gcc/c-family/c-common.cc
@@ -7360,13 +7360,15 @@ speculation_safe_value_resolve_return (tree first_param, tree result)
 
 static int
 sync_resolve_size (tree function, vec<tree, va_gc> *params, bool fetch,
-		   bool orig_format)
+		   bool orig_format,
+		   int *fp_specialisation_offset)
 {
   /* Type of the argument.  */
   tree argtype;
   /* Type the argument points to.  */
   tree type;
   int size;
+  bool valid_float = false;
 
   if (vec_safe_is_empty (params))
 {
@@ -7385,7 +7387,8 @@ sync_resolve_size (tree function, vec<tree, va_gc> *params, bool fetch,
 goto incompatible;
 
   type = TREE_TYPE (type);
-  if (!INTEGRAL_TYPE_P (type) && !POINTER_TYPE_P (type))
+  valid_float = fp_specialisation_offset && fetch && SCALAR_FLOAT_TYPE_P (type);
+  if (!INTEGRAL_TYPE_P (type) && !POINTER_TYPE_P (type) && !valid_float)
 goto incompatible;
 
   if (!COMPLETE_TYPE_P (type))
@@ -7402,6 +7405,40 @@ sync_resolve_size (tree function, vec<tree, va_gc> *params, bool fetch,
   && !targetm.scalar_mode_supported_p (TImode))
 return -1;
 
+  if (valid_float)
+    {
+      tree fp_type = type;
+      /* TODO Want a better reverse-mapping between an argument type and
+         the builtin enum.  */
+      struct type_to_offset { tree type; size_t offset; };
+      static const struct type_to_offset fp_type_mappings[] = {
+        { float_type_node, 6 },
+        { double_type_node, 7 },
+        { long_double_type_node, 8 },
+        { bfloat16_type_node ? bfloat16_type_node : error_mark_node, 9 },
+        { float16_type_node ? float16_type_node : error_mark_node, 10 },
+        { float32_type_node ? float32_type_node : error_mark_node, 11 },
+        { float64_type_node ? float64_type_node : error_mark_node, 12 },
+        { float128_type_node ? float128_type_node : error_mark_node, 13 },
+        { float32x_type_node ? float32x_type_node : error_mark_node, 14 },
+        { float64x_type_node ? float64x_type_node : error_mark_node, 15 }
+      };
+      size_t offset = 0;
+      for (size_t i = 0;
+           i < sizeof (fp_type_mappings) / sizeof (fp_type_mappings[0]);
+           ++i)
+        {
+          if (TYPE_MODE (fp_type) == TYPE_MODE (fp_type_mappings[i].type))
+            {
+              offset = fp_type_mappings[i].offset;
+              break;
+            }
+        }
+      if (offset == 0)
+        goto incompatible;
+      *fp_specialisation_offset = offset;
+      return -1;
+    }
+
   if (size == 1 || size == 2 || size == 4 || size == 8 || size == 16)
 return size;
 
@@ -7462,9 +7499,10 @@ sync_resolve_params (location_t loc, tree orig_function, tree function,
 arguments (e.g. EXPECTED argument of __atomic_compare_exchange_n),
 bool arguments (e.g. WEAK argument) or signed int arguments (memmodel
 kinds).  */
-  if (TREE_CODE (arg_type) == INTEGER_TYPE && TYPE_UNSIGNED (arg_type))
+  if ((TREE_CODE (arg_type) == INTEGER_TYPE && TYPE_UNSIGNED (arg_type))
+  || SCALAR_FLOAT_TYPE_P (arg_type))
{
- /* Ideally for the first conversion we'd use convert_for_assignment
+ /* Ideally) for the first conversion we'd use convert_for_assignment
 so that we get warnings for anything that doesn't match the pointer
 type.  This isn't portable across the C and C++ front ends atm.  */
  val = (*params)[parmnum];
@@ -8256,7 +8294,6 @@ atomic_bitint_fetch_using_cas_loop (location_t loc,
 NULL_TREE);
 }
 
-
 /* Some builtin functions are placeholders for other expressions.  This
function should be called immediately after parsing the call expression
before surrounding code has committed to the type of the expression.
@@ -8277,6 +8314,9 @@ resolve_overloaded_builtin (location_t loc, tree function,
  and so must be rejected.  */
   bool fetch_op = true;
   bool orig_format = true;
+  /* Is this fun

[PATCH 5/8] [RFC] Use new builtins in libstdc++

2024-09-19 Thread mmalcomson
From: Matthew Malcomson 

Points to question here are:
1) Whether checking for this particular internal builtin is OK (this one
   happens to be the one implementing the operation for a `double`, we
   would have to rely on the approach that if anyone implements this
   operation for a `double` they implement it for all the floating point
   types that their C++ frontend and libstdc++ handle).
2) Whether the `#if` bit should be somewhere else instead of put in the
   `__fetch_add_flt` function.  I put it there because that's where it
   seemed natural, but am not familiar enough with libstdc++ to be
   confident in that decision.

We still need the CAS loop fallback for any compiler that doesn't
implement this builtin, and hence will still need some extra choice to
be made for floating point types.  Once all compilers we care about
implement this we can remove this special handling and merge the
floating point and integral operations into the same template.

Signed-off-by: Matthew Malcomson 
---
 libstdc++-v3/include/bits/atomic_base.h | 16 
 1 file changed, 16 insertions(+)

diff --git a/libstdc++-v3/include/bits/atomic_base.h b/libstdc++-v3/include/bits/atomic_base.h
index 1c2367b39b6..d3b1a022db2 100644
--- a/libstdc++-v3/include/bits/atomic_base.h
+++ b/libstdc++-v3/include/bits/atomic_base.h
@@ -1217,30 +1217,41 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   _Tp
   __fetch_add_flt(_Tp* __ptr, _Val<_Tp> __i, memory_order __m) noexcept
   {
+#if __has_builtin(__atomic_fetch_add_fp)
+   return __atomic_fetch_add(__ptr, __i, int(__m));
+#else
_Val<_Tp> __oldval = load(__ptr, memory_order_relaxed);
_Val<_Tp> __newval = __oldval + __i;
while (!compare_exchange_weak(__ptr, __oldval, __newval, __m,
  memory_order_relaxed))
  __newval = __oldval + __i;
return __oldval;
+#endif
   }
 
 template
   _Tp
   __fetch_sub_flt(_Tp* __ptr, _Val<_Tp> __i, memory_order __m) noexcept
   {
+#if __has_builtin(__atomic_fetch_sub_fp)
+   return __atomic_fetch_sub(__ptr, __i, int(__m));
+#else
_Val<_Tp> __oldval = load(__ptr, memory_order_relaxed);
_Val<_Tp> __newval = __oldval - __i;
while (!compare_exchange_weak(__ptr, __oldval, __newval, __m,
  memory_order_relaxed))
  __newval = __oldval - __i;
return __oldval;
+#endif
   }
 
 template
   _Tp
   __add_fetch_flt(_Tp* __ptr, _Val<_Tp> __i) noexcept
   {
+#if __has_builtin(__atomic_add_fetch_fp)
+   return __atomic_add_fetch(__ptr, __i, __ATOMIC_SEQ_CST);
+#else
_Val<_Tp> __oldval = load(__ptr, memory_order_relaxed);
_Val<_Tp> __newval = __oldval + __i;
while (!compare_exchange_weak(__ptr, __oldval, __newval,
@@ -1248,12 +1259,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  memory_order_relaxed))
  __newval = __oldval + __i;
return __newval;
+#endif
   }
 
 template
   _Tp
   __sub_fetch_flt(_Tp* __ptr, _Val<_Tp> __i) noexcept
   {
+#if __has_builtin(__atomic_sub_fetch_fp)
+   return __atomic_sub_fetch(__ptr, __i, __ATOMIC_SEQ_CST);
+#else
_Val<_Tp> __oldval = load(__ptr, memory_order_relaxed);
_Val<_Tp> __newval = __oldval - __i;
while (!compare_exchange_weak(__ptr, __oldval, __newval,
@@ -1261,6 +1276,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  memory_order_relaxed))
  __newval = __oldval - __i;
return __newval;
+#endif
   }
   } // namespace __atomic_impl
 
-- 
2.43.0



[PATCH 4/8] [RFC] Have libatomic working as first draft

2024-09-19 Thread mmalcomson
From: Matthew Malcomson 

As it stands there are still a few things that could be improved:
1) Need to find the exact version of automake to use.  I'm using
   automake 1.15.1 from https://ftp.gnu.org/gnu/automake/ but the
   header is claiming I'm using automake 1.15.
2) The internal naming is all a little "not right" for floating
   point.  E.g. the SIZE() macro is no longer adding an integer SIZE
   suffix to something but instead adding a suffix representing a type.
   Not sure whether the churn to fix this is worth it -- will ask
   upstream.
3) Have not implemented the word-size compare and swap loop fallback.
   This is because the implementation uses a mask, and the mask is not
   always the same for any given architecture.  Hence the existing
   approach in code would not work for all floating point types.
   - I would appreciate some feedback about whether this is OK to not
 implement.  Seems reasonable to me.
4) In the existing test for the availability of an atomic fetch
   operation there are two things that I do not know the reason for,
   and hence didn't add to the check for atomic floating point
   fetch_{add,sub}.
   I just wanted to highlight this in case I missed something.
   1) I only put the `x` variable into a register with an `asm`
      statement.  To be honest I don't know why anything need be put
      into a register, but I didn't put the floating point value into a
      register because I didn't know of a standard GCC floating point
      register constraint that works across all architectures.
      - Is there any need for this `asm` line (I copied it from the
        existing libatomic configure code without understanding it)?
      - Is there any need for the constant addition to be applied?
   2) I used a cast of a 1.0 floating point literal as the "addition"
      for all floating point types in the configury check.
      - Is there something subtle I'm missing about this?
        (I *think* it's fine, but this felt like a place where I could
        trip up without knowing.)
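
   For reference, the essence of the new configure check is whether
   something like the following links without libatomic (a sketch of
   the conftest body, not the literal configury; the `asm` mirrors the
   existing integral checks):

     static double x;
     int
     main (void)
     {
       double *p = &x;
       asm ("" : "+r" (p));  /* hide the address from the optimisers */
       __atomic_fetch_add (p, (double) 1.0, __ATOMIC_RELAXED);
       return 0;
     }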

Description of things done in this commit:

We implement the new floating point builtins around fetch_add.  This is
mostly a configure/makefile change.  The main overview of the changes is
that we create a new list of suffixes (_fpf, _fp, _fpl, _fp16b, _fp16,
_fp32, _fp64, _fp128, _fp32x, _fp64x) and re-compile fadd_n.c and
fsub_n.c for these suffixes.
The existing machinery for checking whether a given atomic builtin is
implemented is extended to check for these same suffixes on the atomic
builtins.  The existing machinery for generating atomic fetch_
implementations using a given suffix and general patterns is also
re-used (n.b. with the exception that the implementation based on
a compare and exchange of a word is not implemented because the
pre-processor does not know the size of the floating point types).

The AArch64 backend is updated slightly.  It didn't build because it
assumed there was some IFUNC for all operations implemented (and didn't
have any IFUNC for the new floating point operations).

The new functions are advertised as LIBATOMIC_1.3 in the linker map for
the dynamic library.

Signed-off-by: Matthew Malcomson 
---
 libatomic/Makefile.am|6 +-
 libatomic/Makefile.in|   12 +-
 libatomic/acinclude.m4   |   49 +
 libatomic/auto-config.h.in   |   84 +-
 libatomic/config/linux/aarch64/host-config.h |2 +
 libatomic/configure  | 1153 +-
 libatomic/configure.ac   |4 +
 libatomic/fadd_n.c   |   23 +
 libatomic/fop_n.c|5 +-
 libatomic/fsub_n.c   |   23 +
 libatomic/libatomic.map  |   44 +
 libatomic/libatomic_i.h  |   58 +
 libatomic/testsuite/Makefile.in  |1 +
 13 files changed, 1392 insertions(+), 72 deletions(-)

diff --git a/libatomic/Makefile.am b/libatomic/Makefile.am
index efadd9dcd48..ec24f1da86b 100644
--- a/libatomic/Makefile.am
+++ b/libatomic/Makefile.am
@@ -110,6 +110,7 @@ IFUNC_OPT   = $(word $(PAT_S),$(IFUNC_OPTIONS))
 M_SIZE = -DN=$(PAT_N)
 M_IFUNC= $(if $(PAT_S),$(IFUNC_DEF) $(IFUNC_OPT))
 M_FILE = $(PAT_BASE)_n.c
+M_FLOATING  = $(if $(findstring $(PAT_N),$(FPSUFFIXES)),-DFLOATING)
 
 # The lack of explicit dependency on the source file means that VPATH cannot
 # work properly.  Instead, perform this operation by hand.  First, collect a
all_c_files   := $(foreach dir,$(search_path),$(wildcard $(dir)/*.c))
 M_SRC  = $(firstword $(filter %/$(M_FILE), $(all_c_files)))
 
 %_.lo: Makefile
-   $(LTCOMPILE) $(M_DEPS) $(M_SIZE) $(M_IFUNC) -c -o $@ $(M_SRC)
+	$(LTCOMPILE) $(M_DEPS) $(M_SIZE) $(M_FLOATING) $(M_IFUNC) -c -o $@ $(M_SRC)
 
 ## Include all of the sizes in the "normal" set of c

[PATCH 04/11] builtins: Add FP types for atomic builtin overload resolution

2024-11-14 Thread mmalcomson
From: Matthew Malcomson 

N.b. we match which function to use based on the MODE of the type for
two reasons:
1) Can't match directly on type as otherwise `typedef float x` would
   mean that `x` could no longer be used with that intrinsic.
2) MODE (i.e. the type's ABI) is the thing that we need to distinguish
   between when deciding which fundamental operation needs to be
   applied.

Use a wrapper in builtins.cc to perform the check; it lives in
builtins.cc in order to share it with code in a later patch in this
series.  The static array has to live inside a function so that it is
initialised later than the `global_trees`.

N.b. if there are any floating point types that are available but not
as arithmetic types, or simply not available, that gets handled by the
sync_resolve_size check against available types.

gcc/ChangeLog:

* builtins.cc (struct type_to_repr_builtin): New.
(fp_type_mappings): Hard coded mappings between type and example
builtin from which offset can be determined.
(get_builtin_fp_offset): New.
* builtins.h (get_builtin_fp_offset): New.

gcc/c-family/ChangeLog:

* c-common.cc (sync_resolve_size): Determine offset to floating
point builtin and return that when we have a floating point
type.
(sync_resolve_params): Handle casting to floating point types.
(resolve_overloaded_builtin): Resolve floating point fetch_add
and fetch_sub.

Signed-off-by: Matthew Malcomson 
---
 gcc/builtins.cc  | 57 
 gcc/builtins.h   |  1 +
 gcc/c-family/c-common.cc | 52 +++-
 3 files changed, 97 insertions(+), 13 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 0d90c2ad5f8..480d38db058 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -6394,6 +6394,63 @@ get_builtin_sync_mode (int fcode_diff)
   return int_mode_for_size (BITS_PER_UNIT << fcode_diff, 0).require ();
 }
 
+/* Mappings between floating point types and builtin offsets.
+   fp_type_mappings gives a mapping between types and a representative builtin.
+   Can convert from that representative builtin to an offset from base builtin
+   by subtracting BUILT_IN_ATOMIC_FETCH_ADD_N.
+
+   Only works for floating point atomic builtins.  */
+struct type_to_repr_builtin
+{
+  tree type;
+  enum built_in_function example;
+};
+static size_t
+fp_type_mappings (const struct type_to_repr_builtin **x)
+{
+  static const struct type_to_repr_builtin fp_type_mappings[]
+= {{float_type_node, BUILT_IN_ATOMIC_FETCH_ADD_FPF},
+   {double_type_node, BUILT_IN_ATOMIC_FETCH_ADD_FP},
+   {long_double_type_node, BUILT_IN_ATOMIC_FETCH_ADD_FPL},
+   {bfloat16_type_node ? bfloat16_type_node : error_mark_node,
+   BUILT_IN_ATOMIC_FETCH_ADD_FPF16B},
+   {float16_type_node ? float16_type_node : error_mark_node,
+   BUILT_IN_ATOMIC_FETCH_ADD_FPF16},
+   {float32_type_node ? float32_type_node : error_mark_node,
+   BUILT_IN_ATOMIC_FETCH_ADD_FPF32},
+   {float64_type_node ? float64_type_node : error_mark_node,
+   BUILT_IN_ATOMIC_FETCH_ADD_FPF64},
+   {float128_type_node ? float128_type_node : error_mark_node,
+   BUILT_IN_ATOMIC_FETCH_ADD_FPF128},
+   {float32x_type_node ? float32x_type_node : error_mark_node,
+   BUILT_IN_ATOMIC_FETCH_ADD_FPF32X},
+   {float64x_type_node ? float64x_type_node : error_mark_node,
+   BUILT_IN_ATOMIC_FETCH_ADD_FPF64X}};
+  *x = fp_type_mappings;
+  return sizeof (fp_type_mappings) / sizeof (fp_type_mappings[0]);
+}
+
+static inline size_t
+get_builtin_fp_offset (struct type_to_repr_builtin x)
+{
+  return x.example - BUILT_IN_ATOMIC_FETCH_ADD_N;
+}
+
+int
+get_builtin_fp_offset (tree type)
+{
+  const struct type_to_repr_builtin *tto_p;
+  size_t len = fp_type_mappings (&tto_p);
+  for (size_t i = 0; i < len; ++i)
+{
+  struct type_to_repr_builtin tto = tto_p[i];
+  if (tto.type != error_mark_node
+ && TYPE_MODE (tto.type) == TYPE_MODE (type))
+   return get_builtin_fp_offset (tto);
+}
+  return 0;
+}
+
 /* Expand the memory expression LOC and return the appropriate memory operand
for the builtin_sync operations.  */
 
diff --git a/gcc/builtins.h b/gcc/builtins.h
index 8d93f75a9a4..7ac9981442d 100644
--- a/gcc/builtins.h
+++ b/gcc/builtins.h
@@ -130,6 +130,7 @@ extern tree std_fn_abi_va_list (tree);
 extern tree std_canonical_va_list_type (tree);
 extern void std_expand_builtin_va_start (tree, rtx);
 extern void expand_builtin_trap (void);
+extern int get_builtin_fp_offset (tree);
 extern void expand_ifn_atomic_bit_test_and (gcall *);
 extern void expand_ifn_atomic_compare_exchange (gcall *);
 extern void expand_ifn_atomic_op_fetch_cmp_0 (gcall *);
diff --git a/gcc/c-family/c-common.cc b/gcc/c-family/c-common.cc
index 156d20dfd5d..5dc7fc10db3 100644
--- a/gcc/c-family/c-common.cc
+++ b/gcc/c-family/c-common.cc
@@ -52,6 +52,7 @@ along with GCC; see the file COPYING3.  I

[PATCH 00/11] Add FP overloads for __atomic_fetch_add etc

2024-11-14 Thread mmalcomson
From: Matthew Malcomson 

Hello,

This is the revision of the RFC posted here:
https://gcc.gnu.org/pipermail/gcc-patches/2024-September/663355.html

This patchset introduces floating point versions of atomic fetch_add,
fetch_sub, add_fetch and sub_fetch.  Instructions for performing these
operations have been directly available in GPU hardware for a while, and
are now starting to get added to CPU ISAs with instructions like the
AArch64 LDFADD.  Clang has allowed floating point types to be used with
these builtins for a while now (https://reviews.llvm.org/D71726).

Introducing these new overloads to this builtin allows users to directly
specify the operation needed and hence allows the compiler to provide
optimised output if possible.

There is additional motivation to use such floating point type atomic
operations in libstdc++ so that other compilers can use libstdc++ to
generate optimal code for their own targets (e.g. NVC++ can use
libstdc++ atomic<>::fetch_add to generate optimal code for GPUs
when using the `-stdpar` argument).  A patch implementing that has been
sent to the libstdc++ mailing list here:
https://gcc.gnu.org/pipermail/libstdc++/2024-October/059865.html

Use of such builtins in libstdc++ doesn't technically require
introducing the builtins to GCC -- just ensuring that SFINAE techniques
can be used with overloaded builtins (as is done in this patch:
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665982.html).
That said, it is still worthwhile to add these builtins -- especially now
that a primary target will be adding these operations.
N.b. I include an update to that libstdc++ patch in this patch series
because I noticed the ChangeLog in the cover letter had spaces instead
of tabs.

In the original RFC I suggested that libstdc++ use preprocessor checks
along the lines of `__has_builtin(__atomic_fetch_add_fp)` to determine
whether to use the new builtins or a CAS loop written inline.  After asking
the clang community I found out that such resolved versions of the atomic
builtins are not exposed by clang to the user.  They suggested the use of
SFINAE to determine if a given type works with the `__atomic_fetch_add`
builtin.  That's why I have posted the SFINAE patch above and chosen this
approach.
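
A sketch of the SFINAE-style detection (assuming the patch above, which
makes the overloaded builtin usable in SFINAE contexts; the trait name
is illustrative only):

  #include <type_traits>

  template<typename _Tp, typename = void>
    struct __atomic_fetch_addable : std::false_type { };

  template<typename _Tp>
    struct __atomic_fetch_addable<_Tp,
      std::void_t<decltype(__atomic_fetch_add ((_Tp *) nullptr, _Tp (), 0))>>
    : std::true_type { };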

--
As standard with the existing atomic builtins, we add the same functions
in libatomic, allowing a fallback for when a given target has not
implemented these operations directly in hardware.  In order to use
these functions we need to have a naming scheme that encodes the type --
we use a suffix of _fp to denote that this operation is on a floating
point type, and a further empty suffix to denote a double, 'f' to denote
a float, and similar.  The scheme for the second part of the suffix
taken from the existing builtins that have different versions for
different floating point types -- e.g.  __builtin_acosh,
__builtin_acoshf, __builtin_acoshl, etc.

In order to add floating point functions to libatomic we updated the
makefile machinery to use names more descriptive of the new setup (where
the SIZE of the datatype can no longer be used to distinguish all
operations from each other).  Moreover we add a CAS loop implementation
in fop_n.c that handles floating point exception information and handles
casting between floating point and integral types when switching between
applying the operation and using CAS to attempt to store.

--
As Joseph Myers pointed out in response to my RFC, when performing
floating point operations in a CAS loop there is the floating point
exception information to take care of.  In order to take care of this
information I use the existing `atomic_assign_expand_fenv` target hook
to generate code that checks this information.
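
A sketch of the shape this gives the CAS loop (using the standard
<fenv.h> names for clarity; the actual emitted GENERIC comes from the
hook rather than literal libc calls):

  #include <fenv.h>

  static double
  fetch_add_with_fenv (double *p, double v)
  {
    fenv_t env;
    double old, sum;
    feholdexcept (&env);  /* save environment and clear flags */
    old = __atomic_load_n (p, __ATOMIC_RELAXED);
    do
      {
        feclearexcept (FE_ALL_EXCEPT);  /* drop flags from failed tries */
        sum = old + v;
      }
    while (!__atomic_compare_exchange_n (p, &old, sum, true,
                                         __ATOMIC_SEQ_CST,
                                         __ATOMIC_RELAXED));
    feupdateenv (&env);  /* restore env and re-raise the final flags */
    return old;
  }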

Partly due to the fact that this hook emits GENERIC code, and partly
due to the language-specific semantics of floating point exceptions,
this means we now decide whether to emit a CAS loop in the frontend
(during overload resolution).  The frontend decides to only use the
underlying builtin if the backend has an optab defined that can
implement it directly.

--
Now that the expansion to a CAS loop is performed in overloaded builtin
resolution, this means that if the user were to directly use a resolved
version (e.g. `__atomic_fetch_add_fp` for a double) that would not
expand into a CAS loop inline.  Instead (assuming the optab is not
implemented for this target) it would pass through and end up using the
libatomic fallback.

This is not ideal, but I believe the complexity of adding another clause
for this expansion to a CAS loop is not worth the benefit of handling a
CAS loop expansion for this specific case (partly on the assumption that
users would rarely specify the resolved version and partly on the belief
that these resolved versions are not actually part of the user-facing
interface -- since they're not documented in the manual and don't seem
to be used 

[PATCH 08/11] c: c++: flag to disable fetch_op handling fenv exceptions

2024-11-14 Thread mmalcomson
From: Matthew Malcomson 

N.b. including docs maintainers to ask about whether this flag should be
documented in invoke.texi -- I suspect that would be determined partly
based on the feedback I get on this patch about whether it should be
something mostly for the testsuite and otherwise internally derived
based on other factors or whether it should be something available for a
user to adjust directly.

-- >8 --- 8< ---
As it stands the floating point exception handling needs to include
libatomic.  I plan to include libatomic by default when compiling as per
PR/81358.  However even that won't help when a target has no libatomic.

Bare metal cross compilers often get built without libatomic.  Rather
than disable this feature for bare metal targets, we provide a fallback
through a command line argument that allows disabling the emission of
floating point exception handling code in calls to __atomic_fetch_add
and similar.
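
A hypothetical invocation for such a bare metal target (the triplet is
purely illustrative):

  arm-none-eabi-gcc -std=c23 -fno-atomic-fp-fetchop-exceptions test.c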

This is also useful for the testsuite, since it enables more testing of
this feature on targets which have not built libatomic.

N.b. I would appreciate any feedback about how one should handle such a
situation when working with C11 _Atomic types.  They have the same
problem that they require libatomic and sometimes libatomic is not
available.  Is this just something that will stay as a known limitation
for certain platforms?  Is there something in the works to make it more
viable?

Similarly I would open the question up to others about whether it seems
sensible to turn this flag on by default when the compiler is configured
with `--disable-libatomic`.  I'm not confident on that since I don't
know how often people disable building libatomic and instead test
against a system libatomic.

gcc/c-family/ChangeLog:

* c-common.cc (atomic_alttyped_fetch_using_cas_loop): Avoid
emitting floating point exception handling code if new flag
given.
* c.opt: Introduce new -fatomic-fp-fetchop-exceptions flag.

gcc/testsuite/ChangeLog:

* gcc.dg/atomic-op-fp.c: Use new flag to avoid requiring
libatomic when libatomic is not available.
* gcc.dg/atomic-op-fpf.c: Likewise.
* gcc.dg/atomic-op-fpf128.c: Likewise.
* gcc.dg/atomic-op-fpf16.c: Likewise.
* gcc.dg/atomic-op-fpf16b.c: Likewise.
* gcc.dg/atomic-op-fpf32.c: Likewise.
* gcc.dg/atomic-op-fpf32x.c: Likewise.
* gcc.dg/atomic-op-fpf64.c: Likewise.
* gcc.dg/atomic-op-fpf64x.c: Likewise.
* gcc.dg/atomic-op-fpl.c: Likewise.
* gcc.dg/atomic-op-fp-fenv-flag.c: New test.

Signed-off-by: Matthew Malcomson 
---
 gcc/c-family/c-common.cc  |  2 +-
 gcc/c-family/c.opt|  6 +++
 gcc/testsuite/gcc.dg/atomic-op-fp-fenv-flag.c | 44 +++
 gcc/testsuite/gcc.dg/atomic-op-fp.c   |  1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf.c  |  1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf128.c   |  1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf16.c|  1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf16b.c   |  1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf32.c|  1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf32x.c   |  1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf64.c|  1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf64x.c   |  1 +
 gcc/testsuite/gcc.dg/atomic-op-fpl.c  |  1 +
 13 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/atomic-op-fp-fenv-flag.c

diff --git a/gcc/c-family/c-common.cc b/gcc/c-family/c-common.cc
index 17bdcbfedc3..50c177da4cd 100644
--- a/gcc/c-family/c-common.cc
+++ b/gcc/c-family/c-common.cc
@@ -8409,7 +8409,7 @@ atomic_alttyped_fetch_using_cas_loop (location_t loc,
   bool need_fenv
 = (flag_trapping_math && SCALAR_FLOAT_TYPE_P (nonatomic_lhs_type));
   tree hold_call = NULL_TREE, clear_call = NULL_TREE, update_call = NULL_TREE;
-  if (need_fenv)
+  if (need_fenv && flag_atomic_fp_fetchop_exceptions)
 targetm.atomic_assign_expand_fenv (&hold_call, &clear_call, &update_call);
   if (hold_call)
 add_stmt (hold_call);
diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
index 47d6f083ffe..6fc66a3bfe7 100644
--- a/gcc/c-family/c.opt
+++ b/gcc/c-family/c.opt
@@ -1682,6 +1682,12 @@ fasm
 C ObjC C++ ObjC++ Var(flag_no_asm, 0)
 Recognize the \"asm\" keyword.
 
+fatomic-fp-fetchop-exceptions
+C C++ Var(flag_atomic_fp_fetchop_exceptions) Init(1)
+Choose whether __atomic_fetch_add, __atomic_fetch_sub, __atomic_add_fetch, and
+__atomic_sub_fetch on floating point types handle floating point exceptions.
+This requires linking against libatomic.
+
 ; Define extra predefined macros for use in libgcc.
 fbuilding-libgcc
 C ObjC C++ ObjC++ Undocumented Var(flag_building_libgcc)
diff --git a/gcc/testsuite/gcc.dg/atomic-op-fp-fenv-flag.c 
b/gcc/testsuite/gcc.dg/atomic-op-fp-fenv-flag.c
new file mode 100644
index 000..1bb0ea4a26b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/atomic-op-fp-fenv-flag.c
@@ -0,0 +1,44 @@
+/

[PATCH 06/11] builtins: optab: Tie the new atomic builtins to the backend

2024-11-14 Thread mmalcomson
From: Matthew Malcomson 

Not sure who to Cc for this.  Honestly just guessing a bit here.  Please
do redirect me if anyone knows of a better set of people to ask.

-- >8 --- 8< ---
Update the optabs definitions to include floating point versions of
atomic fetch_add variants.

Things to highlight to any reviewer:
1) I do not always expand into the modify before version, but also use
   the modify after version when unable to inline.
   - From looking at the dates on different parts of the code, it seems
 that this used to be needed before libatomic was added as a target
 library.  Since libatomic currently implements both the fetch_
 and _fetch versions I don't believe it's needed any more.

2) We do not expand into a CAS loop in the backend (contrary to fetch_op
   operations on other types).
   At the same time add an assertion that we never hit the clause for
   using a CAS loop at expand time if we have a floating point mode.

   The reason for this is that in the expansion of a CAS loop for
   floating point operations, one needs to handle what to do with
   floating point exception information.  Exactly what is necessary may
   depend on the frontend.  As it stands the only frontend that handles
   floating point fetch_op operations is the C/C++ frontend.  These only
   emit the direct atomic function if there is a floating point
   implementation of any of the fetch_add family of optabs.  If there is
   one of this family available then the expand pass should be able to
   avoid a CAS loop, if there is none of this family of optabs available
   then the frontend would have emitted its own CAS loop accounting for
   floating point exception information.

   If floating point fetch_op operations are added to any other frontend
   that frontend will need to handle the floating point exception
   behaviour for their language.  If there is essentially nothing to
   do then it may be worth adjusting the expand pass to allow expanding
   a CAS loop in a floating point mode while adding a hook into the
   frontends that queries "should the expand pass be replacing a
   floating point fetch_op with a CAS loop".

   N.b. the current requirement does mean that if there is an optab for
   a floating point fetch_add then the optab must be able to take any
   floating point value.
   This is because the frontend checks whether optabs exist, and based
   on that decides whether to expand as a CAS loop or not.

gcc/ChangeLog:

* builtins.cc (get_builtin_fp_sync_mode): New.
(expand_builtin): Handle new floating point fetch_add/fetch_sub
and add_fetch/sub_fetch.
* optabs.cc (expand_atomic_fetch_op): Add assertion that we
never use compare and swap on a floating point type.
* optabs.def: Add floating point type optabs for existing
fetch_add/fetch_sub/add_fetch/sub_fetch operations.

Signed-off-by: Matthew Malcomson 
---
 gcc/builtins.cc | 95 +
 gcc/optabs.cc   | 19 ++
 gcc/optabs.def  |  6 +++-
 3 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 3585c5c9af9..7807f6d7c90 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -6436,6 +6436,19 @@ get_builtin_fp_offset (struct type_to_repr_builtin x)
   return x.example - BUILT_IN_ATOMIC_FETCH_ADD_N;
 }
 
+static inline machine_mode
+get_builtin_fp_sync_mode (int fcode_diff)
+{
+  const struct type_to_repr_builtin *tto_p;
+  size_t len = fp_type_mappings (&tto_p);
+  for (size_t i = 0; i < len; i++)
+{
+  if ((size_t) fcode_diff == get_builtin_fp_offset (tto_p[i]))
+   return TYPE_MODE (tto_p[i].type);
+}
+  gcc_unreachable ();
+}
+
 int
 get_builtin_fp_offset (tree type)
 {
@@ -8921,6 +8934,88 @@ expand_builtin (tree exp, rtx target, rtx subtarget, 
machine_mode mode,
return target;
   break;
 
+case BUILT_IN_ATOMIC_FETCH_ADD_FPF:
+case BUILT_IN_ATOMIC_FETCH_ADD_FP:
+case BUILT_IN_ATOMIC_FETCH_ADD_FPL:
+case BUILT_IN_ATOMIC_FETCH_ADD_FPF16B:
+case BUILT_IN_ATOMIC_FETCH_ADD_FPF16:
+case BUILT_IN_ATOMIC_FETCH_ADD_FPF32:
+case BUILT_IN_ATOMIC_FETCH_ADD_FPF64:
+case BUILT_IN_ATOMIC_FETCH_ADD_FPF128:
+case BUILT_IN_ATOMIC_FETCH_ADD_FPF32X:
+case BUILT_IN_ATOMIC_FETCH_ADD_FPF64X:
+  mode = get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_FETCH_ADD_N);
+  target = expand_builtin_atomic_fetch_op (mode, exp, target, PLUS, false,
+  ignore, BUILT_IN_NONE);
+  if (target)
+   return target;
+  break;
+
+case BUILT_IN_ATOMIC_ADD_FETCH_FPF:
+case BUILT_IN_ATOMIC_ADD_FETCH_FP:
+case BUILT_IN_ATOMIC_ADD_FETCH_FPL:
+case BUILT_IN_ATOMIC_ADD_FETCH_FPF16B:
+case BUILT_IN_ATOMIC_ADD_FETCH_FPF16:
+case BUILT_IN_ATOMIC_ADD_FETCH_FPF32:
+case BUILT_IN_ATOMIC_ADD_FETCH_FPF64:
+case BUILT_IN_ATOMIC_ADD_FETCH_FPF128:
+case BUILT_IN_A

[PATCH 01/11] libatomic: Split concept of SUFFIX and SIZE in libatomic

2024-11-14 Thread mmalcomson
From: Matthew Malcomson 

Cc'ing in build machinery maintainers who can hopefully check the
validity of those changes.

N.b. would appreciate information on who to Cc in for the libatomic
parts of this patch.  I don't see anyone in the MAINTAINERS file against
libatomic.

-- >8 --- 8< ---
Many functions in libatomic are defined multiple times with different
suffixes from 1, 2, 4, 8, 16.  These also correspond to the size in
bytes of the types that we are working on.  The macro N is set to
these different sizes when compiling files multiple times for
different sizes.

In a later patch in this same patchset, I introduce functions handling
floating point types to libatomic.  The sizes of these floating point
types overlap with the sizes of the integer types already used.

In order to handle that we need a distinction between the SUFFIX that
a function handling a given type will have, and the SIZE of the data
type that it will use.

This distinction will allow things like defining
atomic_compare_exchange_n for a given size, while still being able to
distinguish between types elsewhere.

Alongside this naming switch we introduce the UINTTYPE macro, which is
currently identical to UTYPE.  The only difference is that this is
defined in terms of SIZE while UTYPE is now defined in terms of SUFFIX.
This keeps a logical separation between an unsigned integer of the
relevant size, and the type that we are working with.  This distinction
is useful for the introduction of floating point typed functions in a
later patch.

This patch converts the naming scheme so that the Makefile determines
the SUFF parameter on the command line while libatomic_i.h defines N
as SUFF.  Hence codegen should not change at all, while the semantics
are made distinct.
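
To illustrate the distinction with the exported names (a sketch; the
integral name is the existing libatomic scheme, the float one is the
scheme added later in this series):

  libat_fetch_add_4    /* SUFF = 4:   4-byte integer */
  libat_fetch_add_fpf  /* SUFF = fpf: float, also 4 bytes */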

We also go through all known backends to convert SIZE() to
SUFFIX() accordingly.

Testing:
 - Ensured that object files in libatomic build directory do not change
   (once the .debug_str and .comment sections are removed) after
   applying this patch.
   Did that for:
   - s390x-unknown-linux-gnu (using crosstool-ng)
   - aarch64 linux (native bootstrap)
   - arm-none-linux-gnueabihf (armv7, using cross compiler)
   - x86_64 linux (native bootstrap)
   - i686-ubuntu16.04-linux-gnu (using crosstool-ng)

libatomic/ChangeLog:

* Makefile.am: Rename PAT_N -> PAT_SUFF.
Rename M_SIZE -> M_SUFFIX
* Makefile.in: Regenerate.
* cas_n.c: Replace uses of SIZE with SUFFIX.
* config/linux/arm/host-config.h (atomic_compare_exchange_n):
Use UINTTYPE instead of UTYPE.
* config/s390/cas_n.c: Replace uses of N with SUFF, and replace
uses of SIZE with SUFFIX.
* config/s390/exch_n.c: Likewise.
* config/s390/load_n.c: Likewise.
* config/s390/store_n.c: Likewise.
* config/x86/host-config.h (atomic_compare_exchange_n): Use
UINTTYPE instead of UTYPE.
(atomic_load_n): Likewise.
(atomic_store_n): Likewise.
* exch_n.c: Replace uses of SIZE with SUFFIX.
* fadd_n.c: Likewise.
* fop_n.c: Likewise.
* libatomic_i.h: Define N in terms of SUFF (rather than have it
passed on command line).
(SUFFIX): New.
(PTR): Define in terms of SUFF.
(ITYPE): Likewise.
(UTYPE): Likewise.
(UINTTYPE): New.
(DECLARE_ALL_SIZED): Rename to
(DECLARE_ALL_SUFFIXED): this.
(DECLARE_ALL_SIZED_): Rename to
(DECLARE_ALL_SUFFIXED_): this.
* load_n.c: Replace uses of SIZE with SUFFIX.
* store_n.c: Likewise.
* tas_n.c: Likewise.

Signed-off-by: Matthew Malcomson 
---
 libatomic/Makefile.am| 10 ++--
 libatomic/Makefile.in|  6 +-
 libatomic/cas_n.c|  8 +--
 libatomic/config/linux/arm/host-config.h |  2 +-
 libatomic/config/s390/cas_n.c|  6 +-
 libatomic/config/s390/exch_n.c   |  4 +-
 libatomic/config/s390/load_n.c   |  4 +-
 libatomic/config/s390/store_n.c  |  4 +-
 libatomic/config/x86/host-config.h   | 14 ++---
 libatomic/exch_n.c   | 12 ++--
 libatomic/fadd_n.c   |  2 +-
 libatomic/fop_n.c| 22 
 libatomic/libatomic_i.h  | 72 +---
 libatomic/load_n.c   | 12 ++--
 libatomic/store_n.c  | 12 ++--
 libatomic/tas_n.c| 12 ++--
 16 files changed, 104 insertions(+), 98 deletions(-)

diff --git a/libatomic/Makefile.am b/libatomic/Makefile.am
index efadd9dcd48..0f2122822d6 100644
--- a/libatomic/Makefile.am
+++ b/libatomic/Makefile.am
@@ -86,9 +86,9 @@ libatomic_la_DEPENDENCIES = $(libatomic_la_LIBADD) 
$(libatomic_version_dep)
 ## dependencies do the job just as well:
 -include $(wildcard $(DEPDIR)/*.Ppo)
 
-## Naming pattern: base_n_i_.lo
+## Naming pattern: base_s_i_.lo
 ##
-## N   size of data
+

[PATCH 07/11] testsuite: Add tests for fp resolutions of __atomic_fetch_add

2024-11-14 Thread mmalcomson
From: Matthew Malcomson 

N.b. including testsuite maintainers hoping for review of the new
effective targets I've added.

-- >8 --- 8< ---
Tests we include are:
- A set of tests checking the observed arithmetic behaviour of the
  new builtins.
- A test ensuring the floating point exception handling of the new
  builtins for each of the standard floating point types.
- A test checking that we error for unknown types in a similar manner to
  before.

--
Name effective targets as sync_double_runtime (and similar for each
different floating point type).  This not only corresponds slightly
better to the underlying effective targets that we use (e.g.
sync_long_long_runtime), but it also shows the user of the effective
target that this requires the ability to run executables (which is
required at least in part due to the way in which this effective target
is checked).

Ensure we include `dg-add-options` in those testcases which have such an
option.  This allows them to run without error on as many targets as
possible.

We generate checks for whether each floating point type is atomic.
These generated checks simply map the floating point type to the
effective target of the integral type of the same size.  The assumption
is that if a target cannot perform integral atomic operations on a
given size then it also cannot perform atomic operations on floating
point types of that size.  This holds for the current mechanism by
which GCC implements things like atomic loads.

N.b. the actual effective target TCL procedures are inlined instead of
generated.  While this is more code it also means that there is a
standard string to `grep` for that will find these effective targets
when someone finds an effective target in a testcase and wants
to see what that effective target is actually checking.

Ensure we include a comment describing why it's OK to require the
ability to run an executable to pass these effective targets.

N.b. in order to make this effective target work with C++ as well as C
we use checks on `__cplusplus`, and use ${tool}_load rather than
gcc_load in order to ensure this effective target works in the g++
testsuite.

--
Do not link in libatomic for atomic-op-fp* testcases

As it stands, libatomic can not easily be linked in for these testcases
in this directory.  It requires a bit of setup that is handled in
gcc.dg/atomic/atomic.exp.  This is not performed in the gcc.dg/dg.exp
test runner.

In a following patchset we hope to include libatomic by default, and
with that the testsuite should be updated.  N.b. at this point in the
patchset these tests will always fail (due to needing a later patch that
introduces a flag to avoid requiring libatomic).

--
Require libatomic for atomic-op-fp-fenv.c

Clearly needs libatomic (partly due to the use of __atomic_feraiseexcept
I introduce, partly due to __atomic_store implementations required on
targets which don't have `long double` atomic loads/stores).

Since it clearly needs libatomic, put this in the gcc.dg/atomic
directory which always includes libatomic and only runs its tests if
libatomic is available.

--
Directives added spell out the following:
1) This test is a run test (i.e. it should be executed and the execution
   should succeed).
2) We want to specify -std=c23 for the floating point types introduced
   in that standard.
3) If libatomic is not available, this target needs to support atomic
   operations on this data size.
4) This target always needs to support this floating point type.

This allows running the test when libatomic is not available but the
target supports atomic operations directly on this data size, and also
running the test via libatomic when the target does not support atomic
operations directly on the given data size.
That works when combined with a later patch that makes libatomic
available to these testcases.
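
Concretely, for the _Float128 test the directive block spelling out
points 1-4 looks roughly like this (using the effective-target and
option names from this series):

  /* { dg-do run } */
  /* { dg-additional-options "-std=c23" } */
  /* { dg-require-effective-target sync_float128_runtime { target { ! libatomic_available } } } */
  /* { dg-require-effective-target float128_runtime } */
  /* { dg-add-options float128 } */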

--
We also include a testcase checking that we still emit the correct
errors, e.g. when bfloat16 is not available on the target, or when it is
available but not as an arithmetic type.

gcc/testsuite/ChangeLog:

* lib/target-supports.exp
(check_effective_target_bfloat16_storage_only): New.
(check_effective_target_sync_float_runtime): New.
(check_effective_target_sync_double_runtime): New.
(check_effective_target_sync_long_double_runtime): New.
(check_effective_target_sync_bfloat16_runtime): New.
(check_effective_target_sync_float16_runtime): New.
(check_effective_target_sync_float32_runtime): New.
(check_effective_target_sync_float64_runtime): New.
(check_effective_target_sync_float128_runtime): New.
(check_effective_target_sync_float32x_runtime): New.
(check_effective_target_sync_float64x_runtime): New.
* gcc.dg/atomic-op-fp-errs.

[PATCH 03/11] c: c++: Define new floating point builtin fetch_add functions

2024-11-14 Thread mmalcomson
From: Matthew Malcomson 

Not sure who to Cc for this.  Honestly just guessing a bit here.  Please
do redirect me if anyone knows of a better set of people to ask.

-- >8 --- 8< ---
This commit just defines the new names -- as yet don't implement them.
Saving this commit because this is one decision, and recording
what the decision was and why:

Adding new floating point builtins for each floating point type that
is defined in the general code *except* f128x (which would have a size
greater than 16 bytes -- the largest integral atomic operation we
currently support).

We have to base our naming on floating point *types* rather than sizes
since different types can have the same size and the operations need
to be distinguished based on type.  N.b. one could make size-suffixed
builtins that are still overloaded based on types but I thought that
this was the cleaner approach.
(Actual requirement is distinction based on mode, this is how I choose
which internal function to use in a later patch.  I believe that
defining the function in terms of types and internally mapping to modes
is a sensible split between user interface and internal implementation).

Have checked with clang developers that they're happy with those names.
https://discourse.llvm.org/t/atomic-floating-point-operations-and-libstdc/81461

N.b. in order to choose whether these operations are available or not
in something like libstdc++ we use SFINAE on the type.  This is already
available in clang the below link has the patch where I add this ability
into GCC:
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/664999.html

N.b. I used the below type suffixes (following what seems like the
existing convention for builtins):
  - float  -> f
  - double -> (no suffix)
  - long double -> l
  - _FloatN -> fN   (for N <- (16, 32, 64, 128))
  - _FloatNx -> fNx (for N <- (32, 64))
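
For concreteness, the resulting declarations look roughly like the below
(a sketch inferred from the suffix table above and the
BT_FN_*_VPTR_*_INT types in the ChangeLog; treat the exact prototypes as
illustrative):

  float __atomic_fetch_add_fpf (volatile void *ptr, float val, int memorder);
  double __atomic_fetch_add_fp (volatile void *ptr, double val, int memorder);
  long double __atomic_fetch_add_fpl (volatile void *ptr, long double val, int memorder);
  _Float16 __atomic_fetch_add_fpf16 (volatile void *ptr, _Float16 val, int memorder);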

Richi suggested doing this expansion generally for all these builtins
following Cxy _Atomic semantics on IRC.
Since C hasn't specified any fetch_ semantics for floating point
types, C++ has only specified `atomic<>::fetch_{add,sub}`, and the
operations other than these are all bitwise operations (which don't
map well to floating point), I believe I have followed that
suggestion by implementing all fetch_{sub,add}/{add,sub}_fetch
operations.

I have not implemented anything for the __sync_* builtins on the
belief that these are legacy and new code should use the __atomic_*
builtins.  Happy to adjust if that is a bad choice.

Only the new function types were needed for most cases.
The Fortran frontend does not use `builtin-types.def` but does include
`sync-builtins.def`.  Since the new definitions in `sync-builtins.def`
use new enums describing their type, the fortran `types.def` file needed
to be updated to avoid a build error.
Some of the floating point types that these new functions use may not be
available depending on the target.
`builtin-types.def` defines the associated type enum to
`error_mark_node` when unavailable.  This cannot be done with
`types.def` since the fortran frontend has no handling of
`error_mark_node` being the type defined with these macros.  Similarly
the fortran frontend cannot handle functions defined in
`sync-builtins.def` with `error_mark_node` as a type (while other
frontends can).

The new functions will not automatically be exposed in fortran by simply
defining them.  If/when they are exposed they will have to be exposed
with knowledge of the floating point semantics of Fortran in order to
correctly handle floating point exceptions when these builtins are
expanded as a CAS loop.  I.e. the current definitions in
`sync-builtins.def` are essentially dead code from the gfortran user's
perspective.

Since there is no functionality to maintain here, I have introduced a
new macro DEF_DUMMY_FUNCTION_TYPE in the fortran types.def which defines
a type to match BT_FN_VOID_INT.  That is used as the type of each of the
new specialist floating point atomic functions.
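
A minimal sketch of the shape this takes (hypothetical -- the exact
expansion in the patch may differ; DEF_FUNCTION_TYPE_1 and the BT_*
names follow the existing conventions in these files):

  /* fortran/types.def: alias each specialist FP atomic type to the
     always-available BT_FN_VOID_INT signature.  */
  #define DEF_DUMMY_FUNCTION_TYPE(ENUM) \
    DEF_FUNCTION_TYPE_1 (ENUM, BT_VOID, BT_INT)

  DEF_DUMMY_FUNCTION_TYPE (BT_FN_FLOAT_VPTR_FLOAT_INT)
  DEF_DUMMY_FUNCTION_TYPE (BT_FN_DOUBLE_VPTR_DOUBLE_INT)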

gcc/ChangeLog:

* builtin-types.def (BT_FN_FLOAT_VPTR_FLOAT_INT): New type.
(BT_FN_DOUBLE_VPTR_DOUBLE_INT): New type.
(BT_FN_LONGDOUBLE_VPTR_LONGDOUBLE_INT): New type.
(BT_FN_BFLOAT16_VPTR_BFLOAT16_INT): New type.
(BT_FN_FLOAT16_VPTR_FLOAT16_INT): New type.
(BT_FN_FLOAT32_VPTR_FLOAT32_INT): New type.
(BT_FN_FLOAT64_VPTR_FLOAT64_INT): New type.
(BT_FN_FLOAT128_VPTR_FLOAT128_INT): New type.
(BT_FN_FLOAT32X_VPTR_FLOAT32X_INT): New type.
(BT_FN_FLOAT64X_VPTR_FLOAT64X_INT): New type.
* sync-builtins.def (DEF_SYNC_FLOATN_NX_BUILTINS): New.
(DEF_SYNC_FLOAT_BUILTINS): New.
(ADD_FETCH_TYPE): New.
(BUILT_IN_ATOMIC_ADD_FETCH): New.
(SUB_FETCH_TYPE): New.
(BUILT_IN_ATOMIC_SUB_FETCH): New.
(FETCH_ADD_TYPE): New.
(BUILT_IN_ATOMIC_FETCH_ADD): New.
(FETCH_SUB_TYPE): New.
(BUILT_IN_ATOMIC_FETCH_SUB): New.

gcc/fortran/ChangeLog:

 

[PATCH 05/11] c: c++: Expand into CAS loop in frontend

2024-11-14 Thread mmalcomson
From: Matthew Malcomson 

Use a wrapper in builtins.cc to check whether the CAS loop expansion is
required.  N.b. if there are any floating point types that are available
but not as arithmetic types, or simply not available, that should be
handled by the sync_resolve_size check against available types.  We add
an assertion on this requirement in the CAS loop expansion.

Note that when deciding whether to emit a CAS loop the C/C++ frontends
check whether *any* FP fetch_add related optabs are available on this
target.  If we instead checked whether the specific operation being
expanded is available for the given mode, we would miss the possible
transformation of fetch_add into fetch_sub with a negated value, and
similarly the transformation of fetch_add into add_fetch with an
adjustment of the result value.

These transformations allow the target backend to only specify one
pattern and have the compiler use it for a set of options.  They are
performed in the expand phase.
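
As a sketch of why checking for *any* optab matters: the two calls below
are equivalent for IEEE types, so a backend providing only one of the
two patterns can still serve both requests.

  double d;

  void
  f (void)
  {
    /* Adding V is the same as subtracting -V; the expander may rewrite
       one form into the other depending on which pattern exists.  */
    __atomic_fetch_add (&d, 1.5, __ATOMIC_RELAXED);
    __atomic_fetch_sub (&d, -1.5, __ATOMIC_RELAXED);
  }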

We avoid expanding to a CAS loop when -fno-inline-atomics is passed and
in that case emit a call to the resolved version of
`__atomic_fetch_add_fp*` or similar.

--
The new floating point atomic builtins take floating point operands.
Those operands could be EXCESS_PRECISION_EXPR tree nodes.

In our CAS loop expansion for fetch_add/fetch_sub we convert the value
being added/subtracted from the atomic data into the relevant type for
this function before using it.  We also fully fold this expression for
simplification at this point before emitting the remaining code.

This existing code does not handle EXCESS_PRECISION_EXPRs since
c_fully_fold cannot handle such codes except as the outermost
expression.
Since this expansion is semantically at the point of a function
boundary (an argument being passed to a builtin), we want to expand a
CAS loop on the semantic value.  Hence in this CAS loop expansion,
when the value being added/subtracted is an EXCESS_PRECISION_EXPR we
adjust the semantic value in that expression to the semantic value of
the argument and pass that into `c_fully_fold`.
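
For illustration, a case that can produce such an operand (assuming
-fexcess-precision=standard on a target like 32-bit x86 where float
arithmetic is evaluated in greater precision):

  float total;

  float
  bump (float a, float b)
  {
    /* `a + b` may reach the frontend wrapped in an
       EXCESS_PRECISION_EXPR; as a builtin argument it must be narrowed
       to its semantic float value before the CAS loop is emitted.  */
    return __atomic_fetch_add (&total, a + b, __ATOMIC_SEQ_CST);
  }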

--
libstdc++ atomic class requires padding to be cleared to satisfy
the standard (P0528).  When implementing fetch_add as a builtin, we need
to maintain this property.

For most floating point types this is not a problem, but long double on
x86 does have to watch out for this.

When the builtin expands to a CAS loop, it's pretty awkward to implement
the padding clear in libstdc++.  We would have to essentially perform
the same CAS loop again in order to clear padding.

Hence it seems most reasonable to clear padding in the builtin expansion
itself.  That does mean that all code performing a fetch_add on a
long double on x86 would have this extra operation to clear padding
whether needed or not.  I believe that seems reasonable.
For other types this expands into a NOP very early on.
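
Conceptually the expansion does the equivalent of the following to the
new value before it is stored (a sketch; __builtin_clear_padding is the
existing builtin which expands to nothing for padding-free types):

  #include <string.h>

  void
  store_canonical (long double *dst, long double v)
  {
    long double tmp = v;
    /* x86-64 long double has 80 value bits in 16 bytes of storage; zero
       the 6 padding bytes so a bytewise compare-exchange behaves.  */
    __builtin_clear_padding (&tmp);
    memcpy (dst, &tmp, sizeof tmp);
  }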

--
The SFINAE handling in the CAS loop needed some updating for floating
point types.  In this area we need to:
1) Remove the assertion that we never expand a CAS loop in the frontend
   when `complain` is unset.  This assertion was there because _BitInt
   is not available in C++ and the CAS loop was previously only used for
   _BitInt, so it was never reached from the C++ frontend.
   Now that we sometimes expand the CAS loop for floating point types we
   can indeed do this in the C++ frontend.  Hence this assertion no
   longer holds.
2) Thread the `complain` parameter through the CAS loop expansion in the
   C/C++ frontend.
3) Account for the new possibility that `convert` would raise an error
   (when attempting to convert from a pointer to a floating point type).
   This was done with an extra check in both `sync_resolve_params` and
   `atomic_alttyped_fetch_using_cas_loop` in a similar way to other
   checks on types in functions called by `resolve_overloaded_builtin`
   because threading the `complain` argument through the `convert`
   function interface proved to be too invasive (I do not know of an
   interface that performs the conversion while fully handling SFINAE
   contexts, e.g. `ocp_convert` uses `convert_to_real_maybe_fold`).
4) Update tests to check that we provide the relevant too few/many
   arguments error accordingly when __atomic_add_fetch is incorrectly
   called.  Also check that this error is not emitted when in an SFINAE
   context.
   Similar for incorrectly passing a pointer to the floating point
   fetch_add function.
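
A hypothetical C++ illustration of the SFINAE context the above is about
(fetch_add_ok is an invented name): substitution should fail quietly,
selecting the fallback overload, rather than emit a hard error when the
builtin rejects the operand type.

  template <typename T>
  auto fetch_add_ok (T *p, T v)
      -> decltype (__atomic_fetch_add (p, v, __ATOMIC_SEQ_CST), true)
  { return true; }

  bool fetch_add_ok (...) { return false; }

  // fetch_add_ok ((double *) 0, 0.0) is true; for a class type the
  // builtin rejects the operands, substitution fails silently, and the
  // fallback returns false.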

gcc/ChangeLog:

* builtins.cc (atomic_fp_fetch_add_implemented): New.
* builtins.h (atomic_fp_fetch_add_implemented): New decl.

gcc/c-family/ChangeLog:

* c-common.cc  (sync_resolve_params): Check for cast from
pointer to floating point type and error accordingly.
(atomic_bitint_fetch_using_cas_loop): Renamed to ...
(atomic_alttyped_fetch_using_cas_loop): ... this, and updated to
handle floating point types.  Also add new argument `complain`
indicating whether this function should emit

[PATCH 09/11] doc: Mention floating point atomic fetch_add etc in docs

2024-11-14 Thread mmalcomson
From: Matthew Malcomson 

Do not mention suffixed versions since integral suffixed versions are
not mentioned in existing docs.

gcc/ChangeLog:

* doc/extend.texi: Document ability to use floating point atomic
fetch_add/fetch_sub/add_fetch/sub_fetch builtins.

Signed-off-by: Matthew Malcomson 
---
 gcc/doc/extend.texi | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index c566474074d..f235ca6e60a 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -13704,6 +13704,16 @@ the same format with the addition of a @samp{size_t} parameter inserted
 as the first parameter indicating the size of the object being pointed to.
 All objects must be the same size.
 
+Moreover, the @samp{__atomic_fetch_add}, @samp{__atomic_fetch_sub},
+@samp{__atomic_add_fetch} and @samp{__atomic_sub_fetch} builtins can accept
+floating point types of @code{float}, @code{double}, @code{long double}.  They
+can handle @code{bfloat16}, @code{_Float16}, @code{_Float32}, @code{_Float64},
+@code{_Float128}, @code{_Float32x} and @code{_Float64x} types if these are
+supported by the architecture.  These functions use a lock-free built-in
+function if the size of the floating point type makes that possible and
+otherwise leave an external call to be resolved at run time.  This external
+call is of the same format but specialised to the given floating point type.
+
 There are 6 different memory orders that can be specified.  These map
 to the C++11 memory orders with the same names, see the C++11 standard
 or the @uref{https://gcc.gnu.org/wiki/Atomic/GCCMM/AtomicSync,GCC wiki
-- 
2.43.0



[PATCH 10/11] [Not For Commit] Add demo implementation of one of the operations

2024-11-14 Thread mmalcomson
From: Matthew Malcomson 

Do demo implementation in AArch64 since that's the backend I'm most
familiar with.

Nothing much else to say -- nice to see that the demo implementation
seems to work as expected (being used for fetch_add, add_fetch and
sub_fetch even though it's only defined for fetch_sub).

Demo implementation ensures that I can run some execution tests with the
alternate codepath.

The demo is added behind a flag in order to be able to run the testsuite
in different variants (with the flag and without).  This ensures the
functionality works both with the fallback and when this optab is
implemented (also checked with the two different fallbacks of either
using libatomic or inlining a CAS loop).

In order to run with this and the fallback implementation I use the
following flag in RUNTESTFLAGS:
--target_board='unix {unix/-mtesting-fp-atomics}'

Signed-off-by: Matthew Malcomson 
---
 gcc/config/aarch64/aarch64.h   |  2 ++
 gcc/config/aarch64/aarch64.opt |  5 +
 gcc/config/aarch64/atomics.md  | 15 +++
 3 files changed, 22 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index f07b2c49f0d..16e30972aa9 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -121,6 +121,8 @@
of LSE instructions.  */
 #define TARGET_OUTLINE_ATOMICS (aarch64_flag_outline_atomics)
 
+#define TARGET_TESTING_FP_ATOMICS (aarch64_flag_testing_fp_atomics)
+
 /* Align definitions of arrays, unions and structures so that
initializations and copies can be made more efficient.  This is not
ABI-changing, so it only affects places where we can see the
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index c2c9965b062..d02d6f11839 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -338,6 +338,11 @@ moutline-atomics
 Target Var(aarch64_flag_outline_atomics) Init(2) Save
 Generate local calls to out-of-line atomic operations.
 
+mtesting-fp-atomics
+Target Var(aarch64_flag_testing_fp_atomics) Init(0) Save
+Use the demonstration implementation of atomic_fetch_sub_<mode> for
+floating point modes.
+
 -param=aarch64-vect-compare-costs=
Target Joined UInteger Var(aarch64_vect_compare_costs) Init(1) IntegerRange(0, 1) Param
 When vectorizing, consider using multiple different approaches and use
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 32a0a723732..ee8fbcd6c58 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -368,6 +368,21 @@
 ;; However we also implement the acquire memory barrier with DMB LD,
 ;; and so the ST is not blocked by the barrier.
 
+(define_insn "atomic_fetch_sub"
+  [(set (match_operand:GPF 0 "register_operand" "=&w")
+(match_operand:GPF 1 "aarch64_sync_memory_operand" "+Q"))
+(set (match_dup 1)
+(unspec_volatile:GPF
+[(minus:GPF (match_dup 1)
+   (match_operand:GPF 2 "register_operand" "w"))
+ (match_operand:SI 3 "const_int_operand")]
+ UNSPECV_ATOMIC_LDOP_PLUS))
+(clobber (match_scratch:GPF 4 "=w"))]
+"TARGET_TESTING_FP_ATOMICS"
+"// Here's your sandwich.\;ldr %0, %1\;fsub %4, %0, %2\;str 
%4, %1\;// END"
+)
+
+
 (define_insn "aarch64_atomic__lse"
   [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "+Q")
(unspec_volatile:ALLI
-- 
2.43.0



[PATCH 11/11] [Not For Commit] Link libatomic into tests when available

2024-11-14 Thread mmalcomson
From: Matthew Malcomson 

This is not to go upstream; it was used for testing.
This patch set is supposed to be upstreamed in combination with a
patchset automatically linking in libatomic when compiling.

The two patchsets are being worked on in parallel.  Some of the tests in
this series are failing due to needing libatomic.  Manually adding the
link against libatomic and running the testsuite natively (with `make
install` having been run) lets me check that these tests should pass
once libatomic is linked in by default.

This commit is here for that use-case if anyone wanted to test it.  Not
for commit.

Signed-off-by: Matthew Malcomson 
---
 gcc/testsuite/gcc.dg/atomic-op-fp.c   | 1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf.c  | 1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf128.c   | 1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf16.c| 1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf16b.c   | 1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf32.c| 1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf32x.c   | 1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf64.c| 1 +
 gcc/testsuite/gcc.dg/atomic-op-fpf64x.c   | 1 +
 gcc/testsuite/gcc.dg/atomic-op-fpl.c  | 1 +
 libstdc++-v3/testsuite/29_atomics/atomic_float/1.cc   | 1 +
 libstdc++-v3/testsuite/29_atomics/atomic_ref/float.cc | 1 +
 12 files changed, 12 insertions(+)

diff --git a/gcc/testsuite/gcc.dg/atomic-op-fp.c b/gcc/testsuite/gcc.dg/atomic-op-fp.c
index 9947bb91f96..10675f9ff36 100644
--- a/gcc/testsuite/gcc.dg/atomic-op-fp.c
+++ b/gcc/testsuite/gcc.dg/atomic-op-fp.c
@@ -3,6 +3,7 @@
 /* { dg-do run } */
 /* Can use fallback if libatomic is available, otherwise need hardware support.  */
 /* { dg-require-effective-target sync_double_runtime { target { ! libatomic_available } } } */
+/* { dg-additional-options "-latomic" { target libatomic_available } } */
 /* { dg-additional-options "-fno-atomic-fp-fetchop-exceptions" { target { ! libatomic_available } } } */
 
 /* Test the execution of the __atomic_*OP builtin routines for a double.  */
diff --git a/gcc/testsuite/gcc.dg/atomic-op-fpf.c b/gcc/testsuite/gcc.dg/atomic-op-fpf.c
index 2a88d0a00fd..44960cfb33f 100644
--- a/gcc/testsuite/gcc.dg/atomic-op-fpf.c
+++ b/gcc/testsuite/gcc.dg/atomic-op-fpf.c
@@ -3,6 +3,7 @@
 /* { dg-do run } */
 /* Can use fallback if libatomic is available, otherwise need hardware support.  */
 /* { dg-require-effective-target sync_float_runtime { target { ! libatomic_available } } } */
+/* { dg-additional-options "-latomic" { target libatomic_available } } */
 /* { dg-additional-options "-fno-atomic-fp-fetchop-exceptions" { target { ! libatomic_available } } } */
 
 /* Test the execution of the __atomic_*OP builtin routines for a float.  */
diff --git a/gcc/testsuite/gcc.dg/atomic-op-fpf128.c b/gcc/testsuite/gcc.dg/atomic-op-fpf128.c
index abf5a1855da..4225c2a407f 100644
--- a/gcc/testsuite/gcc.dg/atomic-op-fpf128.c
+++ b/gcc/testsuite/gcc.dg/atomic-op-fpf128.c
@@ -4,6 +4,7 @@
 /* { dg-additional-options "-std=c23" } */
 /* Can use fallback if libatomic is available, otherwise need hardware support.  */
 /* { dg-require-effective-target sync_float128_runtime { target { ! libatomic_available } } } */
+/* { dg-additional-options "-latomic" { target libatomic_available } } */
 /* { dg-additional-options "-fno-atomic-fp-fetchop-exceptions" { target { ! libatomic_available } } } */
 /* { dg-require-effective-target float128_runtime } */
 /* { dg-add-options float128 } */
diff --git a/gcc/testsuite/gcc.dg/atomic-op-fpf16.c b/gcc/testsuite/gcc.dg/atomic-op-fpf16.c
index 375fdce8c9f..053a157ac48 100644
--- a/gcc/testsuite/gcc.dg/atomic-op-fpf16.c
+++ b/gcc/testsuite/gcc.dg/atomic-op-fpf16.c
@@ -4,6 +4,7 @@
 /* { dg-additional-options "-std=c23" } */
 /* Can use fallback if libatomic is available, otherwise need hardware support.  */
 /* { dg-require-effective-target sync_float16_runtime { target { ! libatomic_available } } } */
+/* { dg-additional-options "-latomic" { target libatomic_available } } */
 /* { dg-additional-options "-fno-atomic-fp-fetchop-exceptions" { target { ! libatomic_available } } } */
 /* { dg-require-effective-target float16_runtime } */
 /* { dg-add-options float16 } */
diff --git a/gcc/testsuite/gcc.dg/atomic-op-fpf16b.c b/gcc/testsuite/gcc.dg/atomic-op-fpf16b.c
index 13dfd9b0ac2..5ab81a8773e 100644
--- a/gcc/testsuite/gcc.dg/atomic-op-fpf16b.c
+++ b/gcc/testsuite/gcc.dg/atomic-op-fpf16b.c
@@ -4,6 +4,7 @@
 /* { dg-additional-options "-std=c23" } */
 /* Can use fallback if libatomic is available, otherwise need hardware support.  */
 /* { dg-require-effective-target sync_bfloat16_runtime { target { ! libatomic_available } } } */
+/* { dg-additional-options "-latomic" { target libatomic_available } } */
 /* { dg-additional-options "-fno-atomic-fp-fetchop-exceptions" { target { ! libatomic_available } } } */
 /* { dg-require-effective-target bfloat16_runti

[PATCH] clang-format BraceWrapping.AfterCaseLabel to true

2024-12-04 Thread mmalcomson
From: Matthew Malcomson 

This setting seems to better match the indentation that is used in GCC.

Adds an extra level of indentation after braces in a case statement.
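
For reference, the layout being matched (GCC's usual style for a braced
case body):

  void
  example (int code)
  {
    switch (code)
      {
      case 0:
        {
          /* With AfterCaseLabel: true the brace after `case 0:` is
             wrapped onto its own line and the body gains one more
             indentation level.  */
          int tmp = code + 1;
          (void) tmp;
          break;
        }
      default:
        break;
      }
  }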

Only manual testing done on the switch statements in
c-common.cc:resolve_overloaded_builtin and
alias.cc:record_component_aliases.

Ok for trunk?

contrib/ChangeLog:

* clang-format: Set BraceWrapping.AfterCaseLabel.

Signed-off-by: Matthew Malcomson 
---
 contrib/clang-format | 1 +
 1 file changed, 1 insertion(+)

diff --git a/contrib/clang-format b/contrib/clang-format
index f4f708648eb..95f1455c14d 100644
--- a/contrib/clang-format
+++ b/contrib/clang-format
@@ -32,6 +32,7 @@ BinPackArguments: true
 BinPackParameters: true
 BraceWrapping:
   AfterClass: true
+  AfterCaseLabel: true
   AfterControlStatement: true
   AfterEnum: true
   AfterFunction: true
-- 
2.43.0



[PATCH] clang-format AlwaysBreakAfterReturnType to TopLevelDefinitions

2024-12-09 Thread mmalcomson
From: Matthew Malcomson 

The previous value of TopLevel meant that the function name of
declarations would also be on a new line.  This does not match the
current formatting of headers.

Manual testing done on c-common.h.

Also set BraceWrapping.BeforeWhile to true to match the formatting
specified for do/while loops in GNU coding standards.
https://www.gnu.org/prep/standards/standards.html#Formatting
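
For reference, the do/while shape from that document which BeforeWhile
reproduces (the `while` is broken onto its own line after the closing
brace):

  int
  drain (int x)
  {
    do
      {
        x = x / 2;
      }
    while (x > 0);
    return x;
  }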

Ok for trunk?

contrib/ChangeLog:

* clang-format: AlwaysBreakAfterReturnType set to
TopLevelDefinitions and BraceWrapping.BeforeWhile set to true.

Signed-off-by: Matthew Malcomson 
---
 contrib/clang-format | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/contrib/clang-format b/contrib/clang-format
index 95f1455c14d..4ed50ab6b26 100644
--- a/contrib/clang-format
+++ b/contrib/clang-format
@@ -27,7 +27,7 @@
 ---
 Language: Cpp
 AccessModifierOffset: -2
-AlwaysBreakAfterReturnType: TopLevel
+AlwaysBreakAfterReturnType: TopLevelDefinitions
 BinPackArguments: true
 BinPackParameters: true
 BraceWrapping:
@@ -42,6 +42,7 @@ BraceWrapping:
   AfterUnion: true
   BeforeCatch: true
   BeforeElse: true
+  BeforeWhile: true
   IndentBraces: true
   SplitEmptyFunction: false
 BreakBeforeBinaryOperators: All
-- 
2.43.0



[PATCH] gcc: testsuite: Fix builtin-speculation-overloads[14].C testism

2025-02-12 Thread mmalcomson
From: Matthew Malcomson 

I've posted the patch on the relevant Bugzilla, but am also sending it
to the mailing list.  If I should have only done one please do mention.

- 8< --- >8 
When making warnings trigger a failure in template substitution I
could not find any way to trigger the warning about builtin speculation
not being available on the given target.

Turns out I misread the code -- this warning happens when the
speculation_barrier pattern is not defined.

Here we add an effective target to represent
"__builtin_speculation_safe_value is available on this target" and use
that to adjust our test on SFINAE behaviour accordingly.
N.b. this means that we get extra testing -- not just that things work
on targets which support __builtin_speculation_safe_value, but also that
the behaviour works on targets which don't support it.
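
For context, a minimal use of the builtin (the wrapper function is just
an illustration): it returns its first argument, and on targets defining
the speculation_barrier pattern it also prevents the value being used
under misspeculation; on other targets GCC warns that it cannot inhibit
speculation, which is the warning referred to above.

  int
  array_lookup (const int *arr, unsigned idx, unsigned bound)
  {
    if (idx < bound)
      /* If the bound check is misspeculated, capable targets force the
         index to 0 (the default failval) here.  */
      return arr[__builtin_speculation_safe_value (idx)];
    return 0;
  }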

Tested with AArch64 native, AArch64 cross compiler, and RISC-V cross
compiler (just running the tests that I've changed).

Points of interest for any reviewer:

In the new `check_known_compiler_messages_nocache` procedure I use some
procedures from `prune.exp`.  This works for the use I need in
the g++ testsuite since g++.exp imports prune.exp and g++-dg.exp
includes gcc-dg.exp which does the initialisation of prune_notes
(needed for this procedure).
- Would it be preferred to include a `load_lib prune.exp` statement at
  the top of `target-supports.exp` in order to use this procedure?
- What about the handling of `initialize_prune_notes` which must have
  been called before calling `prune_gcc_output`?
- I believe it's sensible to not use `gcc-dg-prune` which wraps
  `prune_gcc_output` since I don't believe the wrapping functionality
  useful here -- wanted to highlight that decision for review.

Ok for trunk?

gcc/testsuite/ChangeLog:

PR/117991
* g++.dg/template/builtin-speculation-overloads.def: SUCCESS
argument in SPECULATION_ASSERTS now uses a macro `true_def`
instead of the literal `true` for arguments which should work
with `__builtin_speculation_safe_value`.
* g++.dg/template/builtin-speculation-overloads1.C: Define
`true_def` macro on command line to compiler according to the
effective target representing that
`__builtin_speculation_safe_value` does something on this
target.
* g++.dg/template/builtin-speculation-overloads4.C: Likewise.
* lib/target-supports.exp
(check_known_compiler_messages_nocache): New.
(check_effective_target_speculation_barrier_defined): New.

Signed-off-by: Matthew Malcomson 
---
 .../builtin-speculation-overloads.def |  9 ++-
 .../template/builtin-speculation-overloads1.C |  2 +
 .../template/builtin-speculation-overloads4.C |  2 +
 gcc/testsuite/lib/target-supports.exp | 62 +++
 4 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/g++.dg/template/builtin-speculation-overloads.def b/gcc/testsuite/g++.dg/template/builtin-speculation-overloads.def
index 39d9b748d52..ada13e6f77c 100644
--- a/gcc/testsuite/g++.dg/template/builtin-speculation-overloads.def
+++ b/gcc/testsuite/g++.dg/template/builtin-speculation-overloads.def
@@ -15,14 +15,17 @@ class X{};
 class Large { public: int arr[10]; };
 class Incomplete;
 
+/* Using `true_def` in order to account for the fact that if this target
+ * doesn't support __builtin_speculation_safe_value at all everything fails to
+ * substitute.  */
 #define SPECULATION_ASSERTS \
-  MAKE_SPECULATION_ASSERT (int, true) \
+  MAKE_SPECULATION_ASSERT (int, true_def) \
   MAKE_SPECULATION_ASSERT (float, false) \
   MAKE_SPECULATION_ASSERT (X, false) \
   MAKE_SPECULATION_ASSERT (Large, false) \
   MAKE_SPECULATION_ASSERT (Incomplete, false) \
-  MAKE_SPECULATION_ASSERT (int *, true) \
-  MAKE_SPECULATION_ASSERT (long, true)
+  MAKE_SPECULATION_ASSERT (int *, true_def) \
+  MAKE_SPECULATION_ASSERT (long, true_def)
 
 int main() {
 SPECULATION_ASSERTS
diff --git a/gcc/testsuite/g++.dg/template/builtin-speculation-overloads1.C b/gcc/testsuite/g++.dg/template/builtin-speculation-overloads1.C
index bc8f1083a99..4c50d4aa6f5 100644
--- a/gcc/testsuite/g++.dg/template/builtin-speculation-overloads1.C
+++ b/gcc/testsuite/g++.dg/template/builtin-speculation-overloads1.C
@@ -1,5 +1,7 @@
 /* Check that overloaded builtins can be used in templates with SFINAE.  */
 // { dg-do compile { target c++17 } }
+// { dg-additional-options "-Dtrue_def=true" { target 
speculation_barrier_defined } }
+// { dg-additional-options "-Dtrue_def=false" { target { ! 
speculation_barrier_defined } }

[PATCH] libstdc++: editorconfig: Adjust wildcard patterns

2024-12-06 Thread mmalcomson
From: Matthew Malcomson 

According to the editorconfig file format description, a match against
one of multiple different strings is described with those different
strings separated by commas and within curly braces.  E.g.
[{x,y}.txt]

https://editorconfig.org/, under "Wildcard Patterns".

The current libstdc++-v3/.editorconfig file has a few places where we
match against similar globs by using strings separated by commas but
without the curly braces.  E.g.
[*.h,*.cc]

This doesn't take effect in neovim or emacs (as far as I can tell); I
haven't looked into other editors.
I would expect that following the standard syntax described in the
documentation would satisfy more editors.  Hence this patch suggests
following that standard by using something like:
[*.{h,cc}]

libstdc++-v3/ChangeLog:

* .editorconfig: Adjust globbing style to standard syntax.

Signed-off-by: Matthew Malcomson 
---
 libstdc++-v3/.editorconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/.editorconfig b/libstdc++-v3/.editorconfig
index 88107cedda2..c95e4e26f7a 100644
--- a/libstdc++-v3/.editorconfig
+++ b/libstdc++-v3/.editorconfig
@@ -5,14 +5,14 @@ root = true
 end_of_line = lf
 insert_final_newline = true
 
-[*.h,*.cc]
+[*.{h,cc}]
 charset = utf-8
 indent_style = tab
 indent_size = 2
 tab_width = 8
 trim_trailing_whitespace = true
 
-[Makefile*,ChangeLog*]
+[{Makefile,ChangeLog}*]
 indent_style = tab
 indent_size = 8
 trim_trailing_whitespace = true
-- 
2.43.0



[PATCH] testsuite: libitm: Adjust how libitm.c++ passes link flags

2025-01-02 Thread mmalcomson
From: Matthew Malcomson 

For the `gcc` and `g++` tools we often pass -B/path/to/object/dir in via
`TEST_ALWAYS_FLAGS` (see e.g. asan.exp where this is set).
In libitm.c++/c++.exp we pass that -B flag via the `tool_flags` argument
to `dg-runtest`.

Passing as the `tool_flags` argument means that these flags get added to
the name of the test.  This means that if one were to compare the
testsuite results between runs made in different build directories
libitm.c++ gives a reasonable amount of NA->PASS and PASS->NA
differences even though the same tests passed each time.

This patch follows the example set in other parts of the testsuite like
asan_init and passes the -B arguments to the compiler via a global
variable called `TEST_ALWAYS_FLAGS`.  For this DejaGNU "tool" we had to
newly initialise that variable in libitm_init and add a check against
that variable in libitm_target_compile.  I thought about adding the
relevant flags we need for C++ into `ALWAYS_CFLAGS` but decided against
it since the name didn't match what we would be using it for.

Testing done to bootstrap & regtest on AArch64.  Manually observed that
the testsuite diff between two different build directories no longer
exists.

N.b. since I pass the new flags in the `ldflags` option of the DejaGNU
options while the previous code always passed this -B flag, the compile
test throwdown.C no longer gets compiled with this -B flag.  I believe
that is not a problem.

libitm/ChangeLog:

* testsuite/libitm.c++/c++.exp: Use TEST_ALWAYS_FLAGS instead of
passing arguments to dg-runtest.
* testsuite/lib/libitm.exp (libitm_init): Initialise
TEST_ALWAYS_FLAGS.
(libitm_target_compile): Take flags from TEST_ALWAYS_FLAGS.

Signed-off-by: Matthew Malcomson 
---
 libitm/testsuite/lib/libitm.exp | 8 
 libitm/testsuite/libitm.c++/c++.exp | 7 ---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/libitm/testsuite/lib/libitm.exp b/libitm/testsuite/lib/libitm.exp
index ac390d6d0dd..42a5aac4b0b 100644
--- a/libitm/testsuite/lib/libitm.exp
+++ b/libitm/testsuite/lib/libitm.exp
@@ -77,6 +77,7 @@ proc libitm_init { args } {
 global blddir
 global gluefile wrap_flags
 global ALWAYS_CFLAGS
+global TEST_ALWAYS_FLAGS
 global CFLAGS
 global TOOL_EXECUTABLE TOOL_OPTIONS
 global GCC_UNDER_TEST
@@ -145,6 +146,9 @@ proc libitm_init { args } {
}
 }
 
+# This is set in order to give libitm.c++/c++.exp a nicely named flag to set
+# when adding C++ options.
+set TEST_ALWAYS_FLAGS ""
 set ALWAYS_CFLAGS ""
 if { $blddir != "" } {
lappend ALWAYS_CFLAGS "additional_flags=-B${blddir}/"
@@ -191,6 +195,7 @@ proc libitm_target_compile { source dest type options } {
 global libitm_compile_options
 global gluefile wrap_flags
 global ALWAYS_CFLAGS
+global TEST_ALWAYS_FLAGS
 global GCC_UNDER_TEST
 global lang_test_file
 global lang_library_path
@@ -217,6 +222,9 @@ proc libitm_target_compile { source dest type options } {
 if [info exists ALWAYS_CFLAGS] {
set options [concat "$ALWAYS_CFLAGS" $options]
 }
+if [info exists TEST_ALWAYS_FLAGS] {
+   set options [concat "$TEST_ALWAYS_FLAGS" $options]
+}
 
 set options [dg-additional-files-options $options $source $dest $type]
 
diff --git a/libitm/testsuite/libitm.c++/c++.exp b/libitm/testsuite/libitm.c++/c++.exp
index ab278f2cb33..d501e7e8170 100644
--- a/libitm/testsuite/libitm.c++/c++.exp
+++ b/libitm/testsuite/libitm.c++/c++.exp
@@ -56,10 +56,10 @@ if { $lang_test_file_found } {
 # Gather a list of all tests.
 set tests [lsort [glob -nocomplain $srcdir/$subdir/*.C]]
 
-set stdcxxadder ""
+set saved_TEST_ALWAYS_FLAGS $TEST_ALWAYS_FLAGS
 if { $blddir != "" } {
	set ld_library_path "$always_ld_library_path:${blddir}/${lang_library_path}"
-	set stdcxxadder "-B ${blddir}/${lang_library_path}"
+	set TEST_ALWAYS_FLAGS "$TEST_ALWAYS_FLAGS ldflags=-B${blddir}/${lang_library_path}"
 } else {
set ld_library_path "$always_ld_library_path"
 }
@@ -74,7 +74,8 @@ if { $lang_test_file_found } {
 }
 
 # Main loop.
-dg-runtest $tests $stdcxxadder $libstdcxx_includes
+dg-runtest $tests "" $libstdcxx_includes
+set TEST_ALWAYS_FLAGS $saved_TEST_ALWAYS_FLAGS
 }
 
 # All done.
-- 
2.43.0