From 2a2c3e3683aaf3041524df166fc6f8cf20895a0b Mon Sep 17 00:00:00 2001
From: Soumya AR <soumyaa@nvidia.com>
Date: Mon, 30 Jun 2025 12:17:30 -0700
Subject: [PATCH] aarch64: Enable selective LDAPUR generation for cores with
 RCPC2

This patch adds the ability to fold the address computation into the addressing
mode for LDAPR instructions using LDAPUR when RCPC2 is available.

LDAPUR emission is controlled by the tune flag enable_ldapur, to enable it on a
per-core basis. Earlier, the following code:

uint64_t
foo (std::atomic<uint64_t> *x)
{
  return x[1].load(std::memory_order_acquire);
}

would generate:

foo(std::atomic<unsigned long>*):
	add     x0, x0, 8
	ldapr   x0, [x0]
	ret

but now generates:

foo(std::atomic<unsigned long>*):
	ldapur  x0, [x0, 8]
	ret

The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Soumya AR <soumyaa@nvidia.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION):
	Add the enable_ldapur flag to conwtrol LDAPUR emission.
	* config/aarch64/aarch64.h (TARGET_ENABLE_LDAPUR): Use new flag.
	* config/aarch64/aarch64.md (any): Add ldapur_enable attribute.
	* config/aarch64/atomics.md: (aarch64_atomic_load<mode>_rcpc): Modify
	to emit LDAPUR for cores with RCPC2 when enable_ldapur is set.
	(*aarch64_atomic_load<ALLX:mode>_rcpc_zext): Likewise.
	(*aarch64_atomic_load<ALLX:mode>_rcpc_sext): Modified to emit LDAPURS
	for addressing with offsets.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/ldapur.c: New test.
---
 gcc/config/aarch64/aarch64-tuning-flags.def |  2 +
 gcc/config/aarch64/aarch64.h                |  5 ++
 gcc/config/aarch64/aarch64.md               | 11 +++-
 gcc/config/aarch64/atomics.md               | 22 +++++---
 gcc/testsuite/gcc.target/aarch64/ldapur.c   | 61 +++++++++++++++++++++
 5 files changed, 92 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldapur.c

diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index f2c916e9d77..5bf54165306 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -44,6 +44,8 @@ AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
 
 AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA)
 
+AARCH64_EXTRA_TUNING_OPTION ("enable_ldapur", ENABLE_LDAPUR)
+
 /* Enable is the target prefers to use a fresh register for predicate outputs
    rather than re-use an input predicate register.  */
 AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index e8bd8c73c12..08ad8350fbc 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -490,6 +490,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
   (bool (aarch64_tune_params.extra_tuning_flags \
 	 & AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE))
 
+/* Enable folding address computation into LDAPUR when RCPC2 is available.  */
+#define TARGET_ENABLE_LDAPUR (TARGET_RCPC2 \
+			      && (aarch64_tune_params.extra_tuning_flags \
+				  & AARCH64_EXTRA_TUNE_ENABLE_LDAPUR))
+
 /* Combinatorial tests.  */
 
 #define TARGET_SVE2_OR_SME2 \
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index e11e13033d2..76dc0e82c2a 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -482,6 +482,8 @@
 ;; clobber for SVE predicates.
 (define_attr "pred_clobber" "any,no,yes" (const_string "any"))
 
+(define_attr "ldapur_enable" "any,no,yes" (const_string "any"))
+
 ;; [For compatibility with Arm in pipeline models]
 ;; Attribute that specifies whether or not the instruction touches fp
 ;; registers.
@@ -506,7 +508,14 @@
 	  (eq_attr "pred_clobber" "yes")
 	  (match_test "TARGET_SVE_PRED_CLOBBER"))
 	(eq_attr "pred_clobber" "any"))
-
+      (ior
+	(and
+	  (eq_attr "ldapur_enable" "no")
+	  (match_test "!TARGET_ENABLE_LDAPUR"))
+	(and
+	  (eq_attr "ldapur_enable" "yes")
+	  (match_test "TARGET_ENABLE_LDAPUR"))
+	(eq_attr "ldapur_enable" "any"))
       (ior
 	(eq_attr "arch" "any")
 
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 36b0dbd1f57..d20bc082135 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -679,13 +679,16 @@
 )
 
 (define_insn "aarch64_atomic_load<mode>_rcpc"
-  [(set (match_operand:ALLI 0 "register_operand" "=r")
+  [(set (match_operand:ALLI 0 "register_operand")
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 1 "aarch64_sync_memory_operand" "Q")
+      [(match_operand:ALLI 1 "aarch64_rcpc_memory_operand")
        (match_operand:SI 2 "const_int_operand")]			;; model
       UNSPECV_LDAP))]
   "TARGET_RCPC"
-  "ldapr<atomic_sfx>\t%<w>0, %1"
+  {@ [ cons: =0 , 1   ; attrs: ldapur_enable ]
+     [ r       , Q   ; no                    ] ldapr<atomic_sfx>\t%<w>0, %1
+     [ r       , Ust ; yes                   ] ldapur<atomic_sfx>\t%<w>0, %1
+  }
 )
 
 (define_insn "aarch64_atomic_load<mode>"
@@ -705,21 +708,24 @@
 )
 
 (define_insn "*aarch64_atomic_load<ALLX:mode>_rcpc_zext"
-  [(set (match_operand:SD_HSDI 0 "register_operand" "=r")
+  [(set (match_operand:SD_HSDI 0 "register_operand")
     (zero_extend:SD_HSDI
       (unspec_volatile:ALLX
-        [(match_operand:ALLX 1 "aarch64_sync_memory_operand" "Q")
+        [(match_operand:ALLX 1 "aarch64_rcpc_memory_operand")
          (match_operand:SI 2 "const_int_operand")]			;; model
        UNSPECV_LDAP)))]
   "TARGET_RCPC && (<SD_HSDI:sizen> > <ALLX:sizen>)"
-  "ldapr<ALLX:atomic_sfx>\t%w0, %1"
+  {@ [ cons: =0 , 1   ; attrs: ldapur_enable ]
+     [ r       , Q   ; no                    ] ldapr<ALLX:atomic_sfx>\t%w0, %1
+     [ r       , Ust ; yes                   ] ldapur<ALLX:atomic_sfx>\t%w0, %1
+  }
 )
 
 (define_insn "*aarch64_atomic_load<ALLX:mode>_rcpc_sext"
-  [(set (match_operand:GPI  0 "register_operand" "=r")
+  [(set (match_operand:GPI  0 "register_operand" "=r,r")
     (sign_extend:GPI
       (unspec_volatile:ALLX
-        [(match_operand:ALLX 1 "aarch64_sync_memory_operand" "Q")
+        [(match_operand:ALLX 1 "aarch64_rcpc_memory_operand" "=Q,Ust")
          (match_operand:SI 2 "const_int_operand")]			;; model
        UNSPECV_LDAP)))]
   "TARGET_RCPC2 && (<GPI:sizen> > <ALLX:sizen>)"
diff --git a/gcc/testsuite/gcc.target/aarch64/ldapur.c b/gcc/testsuite/gcc.target/aarch64/ldapur.c
new file mode 100644
index 00000000000..7bac2312384
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldapur.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -std=c99 -moverride=tune=enable_ldapur" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdatomic.h>
+#include <stdint.h>
+
+#pragma GCC target "arch=armv8.4-a"
+
+atomic_ullong u64;
+atomic_uint u32;
+atomic_ushort u16;
+atomic_uchar u8[2]; /* Force an offset for u8 */
+
+#define TEST(name, ldsize, rettype)				\
+rettype								\
+test_##name (void)						\
+{								\
+  return atomic_load_explicit (&ldsize, memory_order_acquire);	\
+}								\
+
+
+/*
+** test_u8_u64:
+**	...
+**	ldapurb	w[0-9]+, \[x[0-9]+, [0-9]+\]
+**	ret
+*/
+TEST(u8_u64, u8[1], uint64_t)
+
+/*
+** test_u16_u64:
+**	...
+**	ldapurh	w[0-9]+, \[x[0-9]+, [0-9]+\]
+**	ret
+*/
+TEST(u16_u64, u16, uint64_t)
+
+/*
+**test_u32_u64:
+**	...
+**	ldapur	w[0-9]+, \[x[0-9]+, [0-9]+\]
+**	ret
+*/
+TEST(u32_u64, u32, uint64_t)
+
+/*
+**test_u8_u32:
+**	...
+**	ldapurb	w[0-9]+, \[x[0-9]+, [0-9]+\]
+**	ret
+*/
+TEST(u8_u32, u8[1], uint32_t)
+
+/*
+**test_u16_u32:
+**	...
+**	ldapurh	w[0-9]+, \[x[0-9]+, [0-9]+\]
+**	ret
+*/
+TEST(u16_u32, u16, uint32_t)
\ No newline at end of file
-- 
2.44.0

