[PATCH] Adding RBIT gcc builtin for ARM

2019-06-19 Thread Ayan Shafqat
The attached patch adds __builtin_arm_rbit, which generates the RBIT
instruction for ARM targets.


Please let me know if you have any questions or comments, or commit this
patch for me, as I do not have write access to SVN.


Thanks
Ayan

commit a692b5b4965840babbdaf5e2b9b1feb1995d351d
Author: Ayan Shafqat 
Date:   Mon Jun 17 21:46:54 2019 -0400

Implementing RBIT builtin as described in ACLE doc

ARM's RBIT instruction is used to reverse the bit order
of a word. This is present in ARMv6T2 and above in both
ARM and Thumb modes. This is also specified as an intrinsic
function in ACLE documentation.

This commit implements the GCC builtin for ARM target for
RBIT instruction, __builtin_arm_rbit. Also, this implements
the intrinsic functions as stated in ARM ACLE documentation,
which are listed below:

uint32_t __rbit(uint32_t x);
unsigned long __rbitl(unsigned long x);
uint64_t __rbitll(uint64_t x);

Note: __rbitll is implemented as two calls to __rbit. I know
this is not how it's done in AArch64, but this is what I can
do for now.

diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index ae582172ab9..83dcb7b411c 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -11568,6 +11568,13 @@
   [(set_attr "predicable" "yes")
(set_attr "type" "clz")])

+(define_insn "rbit"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+   (unspec:SI [(match_operand:SI 1 "s_register_operand" "r")] 
UNSPEC_RBIT))]
+  "TARGET_32BIT && arm_arch_thumb2"
+  "rbit%?\\t%0, %1"
+  [(set_attr "predicable" "yes")])
+
 (define_insn "rbitsi2"
   [(set (match_operand:SI 0 "s_register_operand" "=r")
(unspec:SI [(match_operand:SI 1 "s_register_operand" "r")] 
UNSPEC_RBIT))]
diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h
index 2c7acc698ea..ce1b102444b 100644
--- a/gcc/config/arm/arm_acle.h
+++ b/gcc/config/arm/arm_acle.h
@@ -168,6 +168,29 @@ __arm_mrrc2 (const unsigned int __coproc, const 
unsigned int __opc1,

 {
   return __builtin_arm_mrrc2 (__coproc, __opc1,  __CRm);
 }
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+__rbit(uint32_t __op1)
+{
+  return __builtin_arm_rbit(__op1);
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+__rbitll(uint64_t __op1)
+{
+  return (((uint64_t)__rbit(__op1)) << 32U) | __rbit(__op1 >> 32U);
+}
+
+__extension__ static __inline unsigned long __attribute__ 
((__always_inline__))

+__rbitl(unsigned long __op1)
+{
+#if __SIZEOF_LONG__ == 4
+  return __rbit(__op1);
+#else
+  return __rbitll(__op1);
+#endif
+}
+
 #endif /* __ARM_ARCH >= 6.  */
 #endif /* __ARM_ARCH >= 6 ||  defined (__ARM_ARCH_5TE__).  */
 #endif /*  __ARM_ARCH >= 5.  */
diff --git a/gcc/config/arm/arm_acle_builtins.def 
b/gcc/config/arm/arm_acle_builtins.def

index b2438d66da2..ecb3be491fc 100644
--- a/gcc/config/arm/arm_acle_builtins.def
+++ b/gcc/config/arm/arm_acle_builtins.def
@@ -24,6 +24,7 @@ VAR1 (UBINOP, crc32w, si)
 VAR1 (UBINOP, crc32cb, si)
 VAR1 (UBINOP, crc32ch, si)
 VAR1 (UBINOP, crc32cw, si)
+VAR1 (UBINOP, rbit, si)
 VAR1 (CDP, cdp, void)
 VAR1 (CDP, cdp2, void)
 VAR1 (LDC, ldc, void)
diff --git a/gcc/testsuite/gcc.target/arm/acle/rbit.c 
b/gcc/testsuite/gcc.target/arm/acle/rbit.c

new file mode 100644
index 000..7803dd33615
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/acle/rbit.c
@@ -0,0 +1,18 @@
+/* Test the rbit ACLE intrinsic.  */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_crc_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_crc } */
+
+#include "arm_acle.h"
+
+void test_rbit (void)
+{
+  uint32_t out_uint32_t;
+  uint32_t arg0_uint32_t;
+
+  out_uint32_t = __rbit (arg0_uint32_t);
+}
+
+/* { dg-final { scan-assembler-times "rbit\t...?, ...?\n" 2 } } */


[PATCH] ARM ACLE: add inline definitions for __fma and __fmaf in aarch64 and aarch32 headers

2025-03-04 Thread Ayan Shafqat
Hi GCC team,

This patch introduces inline definitions for the __fma and __fmaf
functions in the ARM ACLE headers for both aarch64 and arm targets. The
new implementations use the built-in functions (__builtin_fma and
__builtin_fmaf) to ensure proper inlining and adherence to the ARM ACLE
requirements[1].

Changes include:
  - In gcc/config/aarch64/arm_acle.h:
  Added inline definitions for __fma and __fmaf.
  - In gcc/config/arm/arm_acle.h:
  Added inline definitions for __fma and __fmaf.

These changes have been tested locally, and I have verified that they
integrate smoothly with the existing ARM backend configurations. Please
let me know if you have any questions or need further modifications.

Thanks,
Ayan

[1] ARM ACLE Document: 
https://arm-software.github.io/acle/main/acle.html#fused-multiply-accumulate-fma

Signed-off-by: Ayan Shafqat 

diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 7976c117daf..d9e2401ea9f 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -129,6 +129,20 @@ __jcvt (double __a)
 
 #pragma GCC pop_options
 
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fma (double __x, double __y, double __z)
+{
+  return __builtin_fma (__x, __y, __z);
+}
+
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fmaf (float __x, float __y, float __z)
+{
+  return __builtin_fmaf (__x, __y, __z);
+}
+
 #pragma GCC push_options
 #pragma GCC target ("+nothing+frintts")
 __extension__ extern __inline float
diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h
index c6c03fdce27..256710a2c31 100644
--- a/gcc/config/arm/arm_acle.h
+++ b/gcc/config/arm/arm_acle.h
@@ -829,6 +829,20 @@ __crc32cd (uint32_t __a, uint64_t __b)
 #endif /* __ARM_FEATURE_CRC32  */
 #pragma GCC pop_options
 
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fma (double __x, double __y, double __z)
+{
+  return __builtin_fma (__x, __y, __z);
+}
+
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fmaf (float __x, float __y, float __z)
+{
+  return __builtin_fmaf (__x, __y, __z);
+}
+
 #ifdef __cplusplus
 }
 #endif



[PATCH 1/2] aarch64: Add FMA and FMAF intrinsics and tests

2025-03-09 Thread Ayan Shafqat
This patch introduces inline definitions for the __fma and __fmaf
functions in arm_acle.h for AArch64 targets. These definitions rely on
__builtin_fma and __builtin_fmaf to ensure proper inlining and to meet
the ACLE requirements [1].

The patch has been tested locally using a crosstool-NG sysroot for
AArch64, confirming that the generated code uses the expected fused
multiply-accumulate instructions (fmadd).

[1] 
https://arm-software.github.io/acle/main/acle.html#fused-multiply-accumulate-fma

Signed-off-by: Ayan Shafqat 

---
 gcc/config/aarch64/arm_acle.h   | 14 ++
 .../gcc.target/aarch64/acle/acle_fma.c  | 17 +
 2 files changed, 31 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c

diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 7976c117daf..d9e2401ea9f 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -129,6 +129,20 @@ __jcvt (double __a)
 
 #pragma GCC pop_options
 
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fma (double __x, double __y, double __z)
+{
+  return __builtin_fma (__x, __y, __z);
+}
+
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fmaf (float __x, float __y, float __z)
+{
+  return __builtin_fmaf (__x, __y, __z);
+}
+
 #pragma GCC push_options
 #pragma GCC target ("+nothing+frintts")
 __extension__ extern __inline float
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c 
b/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c
new file mode 100644
index 000..9363a75b593
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include "arm_acle.h"
+
+double test_acle_fma (double x, double y, double z)
+{
+  return __fma (x, y, z);
+}
+
+float test_acle_fmaf (float x, float y, float z)
+{
+  return __fmaf (x, y, z);
+}
+
+/* { dg-final { scan-assembler-times "fmadd\td\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmadd\ts\[0-9\]" 1 } } */
-- 
2.43.0



[PATCH 2/2] arm: Add FMA and FMAF intrinsics with corresponding tests

2025-03-09 Thread Ayan Shafqat
This patch introduces inline definitions for the __fma and __fmaf
functions in arm_acle.h for arm targets. These definitions rely on
__builtin_fma and __builtin_fmaf to ensure proper inlining and to meet
the ACLE requirements [1].

The patch has been tested locally using a crosstool-NG sysroot for
arm-cortexa9_neon-linux-gnueabihf, confirming that the generated code
uses the expected fused multiply-accumulate instructions:

vfma.f32 for single precision
vfma.f64 for double precision

Signed-off-by: Ayan Shafqat 

[1] 
https://arm-software.github.io/acle/main/acle.html#fused-multiply-accumulate-fma

---
 gcc/config/arm/arm_acle.h| 18 ++
 gcc/testsuite/gcc.target/arm/acle/acle_fma.c | 17 +
 2 files changed, 35 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/arm/acle/acle_fma.c

diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h
index c6c03fdce27..14c28f11b9c 100644
--- a/gcc/config/arm/arm_acle.h
+++ b/gcc/config/arm/arm_acle.h
@@ -829,6 +829,24 @@ __crc32cd (uint32_t __a, uint64_t __b)
 #endif /* __ARM_FEATURE_CRC32  */
 #pragma GCC pop_options
 
+#ifdef __ARM_FEATURE_FMA
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fma (double __x, double __y, double __z)
+{
+  return __builtin_fma (__x, __y, __z);
+}
+#endif
+
+#ifdef __ARM_FEATURE_FMA
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fmaf (float __x, float __y, float __z)
+{
+  return __builtin_fmaf (__x, __y, __z);
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/testsuite/gcc.target/arm/acle/acle_fma.c 
b/gcc/testsuite/gcc.target/arm/acle/acle_fma.c
new file mode 100644
index 000..4177ac81f07
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/acle/acle_fma.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv7-a -mfpu=neon-vfpv4 -mfloat-abi=hard" } */
+
+#include "arm_acle.h"
+
+double test_acle_fma (double x, double y, double z)
+{
+  return __fma (x, y, z);
+}
+
+float test_acle_fmaf (float x, float y, float z)
+{
+  return __fmaf (x, y, z);
+}
+
+/* { dg-final { scan-assembler-times "vfma.f64\td\[0-9\]," 1 } } */
+/* { dg-final { scan-assembler-times "vfma.f32\ts\[0-9\]" 1 } } */
-- 
2.43.0



Re: [PATCH 1/2] aarch64: Add FMA and FMAF intrinsics and tests

2025-03-11 Thread Ayan Shafqat
Hello Kyrylo,

On Tue, Mar 11, 2025 at 08:55:46AM +0000, Kyrylo Tkachov wrote:
> This looks ok to me.
> GCC is currently in a regression fixing stage so normally such a change would 
> wait until stage 1 reopens.
> But this looks like a pretty safe change so I’m not against taking it now.
> Do you need someone to commit this for you?

Thank you very much for reviewing this patch! I do not have commit
access to GCC, so I would greatly appreciate it if you can commit this
patch on my behalf. Please let me know if you need anything else.

Best regards,
Ayan


[PATCH 1/3] AArch64: Use BUILTIN_VHSDF_HSDF for vector and scalar sqrt builtins

2025-03-11 Thread Ayan Shafqat
This patch changes the `sqrt` builtin definition from `BUILTIN_VHSDF_DF`
to `BUILTIN_VHSDF_HSDF` in `aarch64-simd-builtins.def`, ensuring the
builtin covers half, single, and double precision variants. The redundant
`VAR1 (UNOP, sqrt, 2, FP, hf)` lines are removed, as they are no longer
needed now that `BUILTIN_VHSDF_HSDF` handles those cases.

Signed-off-by: Ayan Shafqat 
Signed-off-by: Andrew Pinski 
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def 
b/gcc/config/aarch64/aarch64-simd-builtins.def
index 6cc45b18a72..685bf0dc408 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -57,7 +57,7 @@
   VAR1 (BINOPP, pmull, 0, DEFAULT, v8qi)
   VAR1 (BINOPP, pmull_hi, 0, DEFAULT, v16qi)
   BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP)
-  BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP)
+  BUILTIN_VHSDF_HSDF (UNOP, sqrt, 2, FP)
   BUILTIN_VDQ_I (BINOP, addp, 0, DEFAULT)
   BUILTIN_VDQ_I (BINOPU, addp, 0, DEFAULT)
   BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, DEFAULT)
@@ -848,9 +848,6 @@
   BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0, FP)
   BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, FP)
 
-  /* Implemented by sqrt2.  */
-  VAR1 (UNOP, sqrt, 2, FP, hf)
-
   /* Implemented by hf2.  */
   VAR1 (UNOP, floatdi, 2, FP, hf)
   VAR1 (UNOP, floatsi, 2, FP, hf)
-- 
2.43.0



[PATCH 2/3] AArch64: Add __sqrt and __sqrtf intrinsics to arm_acle.h

2025-03-11 Thread Ayan Shafqat
This patch introduces two new inline functions, __sqrt and __sqrtf, in
arm_acle.h for AArch64 targets. These functions wrap the new builtins
__builtin_aarch64_sqrtdf and __builtin_aarch64_sqrtsf, respectively,
providing direct access to hardware instructions without relying on the
standard math library or optimization levels.

Signed-off-by: Ayan Shafqat 
---
 gcc/config/aarch64/arm_acle.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 7976c117daf..d972a4e7e7e 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -118,6 +118,20 @@ __revl (unsigned long __value)
 return __rev (__value);
 }
 
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__sqrt(double  __x)
+{
+return __builtin_aarch64_sqrtdf (__x);
+}
+
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__sqrtf(float __x)
+{
+return __builtin_aarch64_sqrtsf (__x);
+}
+
 #pragma GCC push_options
 #pragma GCC target ("+nothing+jscvt")
 __extension__ extern __inline int32_t
-- 
2.43.0



[PATCH 3/3] AArch64: Add tests for __sqrt and __sqrtf intrinsic

2025-03-11 Thread Ayan Shafqat
This patch introduces acle_sqrt.c in the AArch64 testsuite, verifying
that the new __sqrt and __sqrtf intrinsics emit the expected fsqrt
instructions for double and float arguments.

Coverage for new intrinsics ensures that __sqrt and __sqrtf are
correctly expanded to hardware instructions and do not fall back to
library calls, regardless of optimization levels.

Signed-off-by: Ayan Shafqat 
---
 .../gcc.target/aarch64/acle/acle_sqrt.c | 17 +
 1 file changed, 17 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c

diff --git a/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c 
b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c
new file mode 100644
index 000..1e3ed9eaa6d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include "arm_acle.h"
+
+double test_acle_sqrt (double x)
+{
+  return __sqrt (x);
+}
+
+float test_acle_sqrtf (float x)
+{
+  return __sqrtf (x);
+}
+
+/* { dg-final { scan-assembler-times "fsqrt\td\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "fsqrt\ts\[0-9\]" 1 } } */
-- 
2.43.0



[PATCH 1/2] arm: Add support for NEON vsqrt builtins (hf, sf, df)

2025-03-13 Thread Ayan Shafqat
Introduce support for a new set of NEON square-root intrinsics for half,
single, and double precision.

modified:   gcc/config/arm/arm-builtins.cc
1. Define the df_UP macro to map to E_DFmode.
2. Add CODE_FOR_neon_vsqrtsf and CODE_FOR_neon_vsqrtdf constants that
   reference the underlying VFP sqrt RTL patterns (sqrtsf2 and sqrtdf2).

modified:   gcc/config/arm/arm_vfp_builtins.def
1. Replace the single-mode entry for vsqrt with a unified VAR3 entry
   that supports hf, sf, and df modes.

These modifications enable the use of __builtin_neon_vsqrt{hf,sf,df} in user
code and ensure the correct mode is selected for each precision variant.

Signed-off-by: Ayan Shafqat 
Signed-off-by: Andrew Pinski 
---
 gcc/config/arm/arm-builtins.cc  | 3 +++
 gcc/config/arm/arm_vfp_builtins.def | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc
index c56ab5db985..acc86c7e8a1 100644
--- a/gcc/config/arm/arm-builtins.cc
+++ b/gcc/config/arm/arm-builtins.cc
@@ -694,6 +694,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define hi_UPE_HImode
 #define void_UP E_VOIDmode
 #define sf_UP   E_SFmode
+#define df_UP E_DFmode
 #define UP(X) X##_UP
 
 typedef struct {
@@ -710,6 +711,8 @@ constexpr insn_code CODE_FOR_neon_usdotv8qi = 
CODE_FOR_neon_usdotv2siv8qi;
 constexpr insn_code CODE_FOR_neon_sdotv16qi = CODE_FOR_neon_sdotv4siv16qi;
 constexpr insn_code CODE_FOR_neon_udotv16qi = CODE_FOR_neon_udotv4siv16qi;
 constexpr insn_code CODE_FOR_neon_usdotv16qi = CODE_FOR_neon_usdotv4siv16qi;
+constexpr insn_code CODE_FOR_neon_vsqrtsf  = CODE_FOR_sqrtsf2;
+constexpr insn_code CODE_FOR_neon_vsqrtdf  = CODE_FOR_sqrtdf2;
 
 #define CF(N,X) CODE_FOR_neon_##N##X
 
diff --git a/gcc/config/arm/arm_vfp_builtins.def 
b/gcc/config/arm/arm_vfp_builtins.def
index 1fbf71e728e..8cafd72b565 100644
--- a/gcc/config/arm/arm_vfp_builtins.def
+++ b/gcc/config/arm/arm_vfp_builtins.def
@@ -40,7 +40,7 @@ VAR1 (UNOP, vrndm, hf)
 VAR1 (UNOP, vrndn, hf)
 VAR1 (UNOP, vrndp, hf)
 VAR1 (UNOP, vrndx, hf)
-VAR1 (UNOP, vsqrt, hf)
+VAR3 (UNOP, vsqrt, hf, sf, df)
 
 VAR2 (BINOP, vcvths_n, hf, si)
 VAR2 (BINOP, vcvthu_n, hf, si)
-- 
2.43.0



[PATCH v2 1/2] Aarch64: Add FMA and FMAF intrinsic and corresponding tests

2025-03-13 Thread Ayan Shafqat
This patch introduces inline definitions for the __fma and __fmaf
functions in arm_acle.h for Aarch64 targets. These definitions rely on
__builtin_fma and __builtin_fmaf to ensure proper inlining and to meet
the ACLE requirements [1].

The patch has been tested locally using a crosstool-NG sysroot for
Aarch64, confirming that the generated code uses the expected fused
multiply-accumulate instructions (fmadd).

gcc/ChangeLog:

* config/aarch64/arm_acle.h (__fma): New function.
(__fmaf): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/acle/acle_fma.c: New test.
---
 gcc/config/aarch64/arm_acle.h   | 14 ++
 .../gcc.target/aarch64/acle/acle_fma.c  | 17 +
 2 files changed, 31 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c

diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 7976c117daf..d9e2401ea9f 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -129,6 +129,20 @@ __jcvt (double __a)
 
 #pragma GCC pop_options
 
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fma (double __x, double __y, double __z)
+{
+  return __builtin_fma (__x, __y, __z);
+}
+
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fmaf (float __x, float __y, float __z)
+{
+  return __builtin_fmaf (__x, __y, __z);
+}
+
 #pragma GCC push_options
 #pragma GCC target ("+nothing+frintts")
 __extension__ extern __inline float
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c 
b/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c
new file mode 100644
index 000..9363a75b593
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include "arm_acle.h"
+
+double test_acle_fma (double x, double y, double z)
+{
+  return __fma (x, y, z);
+}
+
+float test_acle_fmaf (float x, float y, float z)
+{
+  return __fmaf (x, y, z);
+}
+
+/* { dg-final { scan-assembler-times "fmadd\td\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmadd\ts\[0-9\]" 1 } } */
-- 
2.43.0



[PATCH v2 2/2] arm: Add FMA and FMAF intrinsics with corresponding tests

2025-03-13 Thread Ayan Shafqat
This patch introduces inline definitions for the __fma and __fmaf
functions in arm_acle.h for arm targets. These definitions rely on
__builtin_fma and __builtin_fmaf to ensure proper inlining and to meet
the ACLE requirements [1].

The patch has been tested locally using a crosstool-NG sysroot for
arm-cortexa9_neon-linux-gnueabihf, confirming that the generated code
uses the expected fused multiply-accumulate instructions:

vfma.f32 for single precision
vfma.f64 for double precision

[1] 
https://arm-software.github.io/acle/main/acle.html#fused-multiply-accumulate-fma

gcc/ChangeLog:

* config/arm/arm_acle.h (__fma): New function.
(__fmaf): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/arm/acle/acle_fma.c: New test.
---
 gcc/config/arm/arm_acle.h| 18 ++
 gcc/testsuite/gcc.target/arm/acle/acle_fma.c | 17 +
 2 files changed, 35 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/arm/acle/acle_fma.c

diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h
index c6c03fdce27..14c28f11b9c 100644
--- a/gcc/config/arm/arm_acle.h
+++ b/gcc/config/arm/arm_acle.h
@@ -829,6 +829,24 @@ __crc32cd (uint32_t __a, uint64_t __b)
 #endif /* __ARM_FEATURE_CRC32  */
 #pragma GCC pop_options
 
+#ifdef __ARM_FEATURE_FMA
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fma (double __x, double __y, double __z)
+{
+  return __builtin_fma (__x, __y, __z);
+}
+#endif
+
+#ifdef __ARM_FEATURE_FMA
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fmaf (float __x, float __y, float __z)
+{
+  return __builtin_fmaf (__x, __y, __z);
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/testsuite/gcc.target/arm/acle/acle_fma.c 
b/gcc/testsuite/gcc.target/arm/acle/acle_fma.c
new file mode 100644
index 000..4177ac81f07
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/acle/acle_fma.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv7-a -mfpu=neon-vfpv4 -mfloat-abi=hard" } */
+
+#include "arm_acle.h"
+
+double test_acle_fma (double x, double y, double z)
+{
+  return __fma (x, y, z);
+}
+
+float test_acle_fmaf (float x, float y, float z)
+{
+  return __fmaf (x, y, z);
+}
+
+/* { dg-final { scan-assembler-times "vfma.f64\td\[0-9\]," 1 } } */
+/* { dg-final { scan-assembler-times "vfma.f32\ts\[0-9\]" 1 } } */
-- 
2.43.0



[PATCH v2 1/3] Aarch64: Use BUILTIN_VHSDF_HSDF for vector and scalar sqrt builtins

2025-03-13 Thread Ayan Shafqat
This patch changes the `sqrt` builtin definition from `BUILTIN_VHSDF_DF`
to `BUILTIN_VHSDF_HSDF` in `aarch64-simd-builtins.def`, ensuring the
builtin covers half, single, and double precision variants. The redundant
`VAR1 (UNOP, sqrt, 2, FP, hf)` lines are removed, as they are no longer
needed now that `BUILTIN_VHSDF_HSDF` handles those cases.

gcc/ChangeLog:

* config/aarch64/aarch64-simd-builtins.def: Change
BUILTIN_VHSDF_DF to BUILTIN_VHSDF_HSDF

Signed-off-by: Ayan Shafqat 
Signed-off-by: Andrew Pinski 
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def 
b/gcc/config/aarch64/aarch64-simd-builtins.def
index 6cc45b18a72..685bf0dc408 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -57,7 +57,7 @@
   VAR1 (BINOPP, pmull, 0, DEFAULT, v8qi)
   VAR1 (BINOPP, pmull_hi, 0, DEFAULT, v16qi)
   BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP)
-  BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP)
+  BUILTIN_VHSDF_HSDF (UNOP, sqrt, 2, FP)
   BUILTIN_VDQ_I (BINOP, addp, 0, DEFAULT)
   BUILTIN_VDQ_I (BINOPU, addp, 0, DEFAULT)
   BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, DEFAULT)
@@ -848,9 +848,6 @@
   BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0, FP)
   BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, FP)
 
-  /* Implemented by sqrt2.  */
-  VAR1 (UNOP, sqrt, 2, FP, hf)
-
   /* Implemented by hf2.  */
   VAR1 (UNOP, floatdi, 2, FP, hf)
   VAR1 (UNOP, floatsi, 2, FP, hf)
-- 
2.43.0



[PATCH v2 2/3] Aarch64: Add __sqrt and __sqrtf intrinsics to arm_acle.h

2025-03-13 Thread Ayan Shafqat
This patch introduces two new inline functions, __sqrt and __sqrtf, in
arm_acle.h for Aarch64 targets. These functions wrap the new builtins
__builtin_aarch64_sqrtdf and __builtin_aarch64_sqrtsf, respectively,
providing direct access to hardware instructions without relying on the
standard math library or optimization levels.

gcc/ChangeLog:

* config/aarch64/arm_acle.h (__sqrt, __sqrtf): New functions.

Signed-off-by: Ayan Shafqat 
---
 gcc/config/aarch64/arm_acle.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 7976c117daf..d972a4e7e7e 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -118,6 +118,20 @@ __revl (unsigned long __value)
 return __rev (__value);
 }
 
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__sqrt(double  __x)
+{
+return __builtin_aarch64_sqrtdf (__x);
+}
+
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__sqrtf(float __x)
+{
+return __builtin_aarch64_sqrtsf (__x);
+}
+
 #pragma GCC push_options
 #pragma GCC target ("+nothing+jscvt")
 __extension__ extern __inline int32_t
-- 
2.43.0



[PATCH v2 3/3] Aarch64: Add tests for __sqrt and __sqrtf intrinsic

2025-03-13 Thread Ayan Shafqat
This patch introduces acle_sqrt.c in the AArch64 testsuite, verifying
that the new __sqrt and __sqrtf intrinsics emit the expected fsqrt
instructions for double and float arguments.

Coverage for new intrinsics ensures that __sqrt and __sqrtf are
correctly expanded to hardware instructions and do not fall back to
library calls, regardless of optimization levels.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/acle/acle_sqrt.c: New test.

Signed-off-by: Ayan Shafqat 
---
 .../gcc.target/aarch64/acle/acle_sqrt.c | 17 +
 1 file changed, 17 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c

diff --git a/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c 
b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c
new file mode 100644
index 000..1e3ed9eaa6d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include "arm_acle.h"
+
+double test_acle_sqrt (double x)
+{
+  return __sqrt (x);
+}
+
+float test_acle_sqrtf (float x)
+{
+  return __sqrtf (x);
+}
+
+/* { dg-final { scan-assembler-times "fsqrt\td\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "fsqrt\ts\[0-9\]" 1 } } */
-- 
2.43.0



[PATCH 2/3] Aarch64: Add __sqrt and __sqrtf intrinsics to arm_acle.h

2025-03-13 Thread Ayan Shafqat
This patch introduces two new inline functions, __sqrt and __sqrtf, in
arm_acle.h for Aarch64 targets. These functions wrap the new builtins
__builtin_aarch64_sqrtdf and __builtin_aarch64_sqrtsf, respectively,
providing direct access to hardware instructions without relying on the
standard math library or optimization levels.

gcc/ChangeLog:

* config/aarch64/arm_acle.h (__sqrt, __sqrtf): New functions.

Signed-off-by: Ayan Shafqat 
---
 gcc/config/aarch64/arm_acle.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 7976c117daf..d972a4e7e7e 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -118,6 +118,20 @@ __revl (unsigned long __value)
 return __rev (__value);
 }
 
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__sqrt(double  __x)
+{
+return __builtin_aarch64_sqrtdf (__x);
+}
+
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__sqrtf(float __x)
+{
+return __builtin_aarch64_sqrtsf (__x);
+}
+
 #pragma GCC push_options
 #pragma GCC target ("+nothing+jscvt")
 __extension__ extern __inline int32_t
-- 
2.43.0



[PATCH 2/2] arm: Add ACLE sqrt intrinsic using NEON vsqrt builtins

2025-03-14 Thread Ayan Shafqat
Add inline implementations of the ACLE __sqrt() and __sqrtf() functions
in arm_acle.h. These functions, defined when __ARM_FP is available[1],
forward the square-root operation to the corresponding NEON builtins:

* __sqrt() calls __builtin_neon_vsqrtdf for double precision.
* __sqrtf() calls __builtin_neon_vsqrtsf for single precision.

Additionally, a new testsuite file (acle_sqrt.c) is introduced to verify
the generated assembly contains the proper vsqrt instructions (vsqrt.f64
and vsqrt.f32) when compiling ACLE sqrt intrinsics.

modified:   gcc/config/arm/arm_acle.h
  * Add inline definitions for __sqrt() and __sqrtf().

new file:   gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c
  * Create tests to compile and check for vsqrt.f64 and vsqrt.f32 instructions.

[1] https://developer.arm.com/documentation/101028/0012/3--C-language-extensions

Signed-off-by: Ayan Shafqat 
---
 gcc/ChangeLog | 16 
 gcc/config/arm/arm_acle.h | 18 ++
 gcc/testsuite/ChangeLog   |  6 ++
 gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c | 17 +
 4 files changed, 57 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 704146d97aa..25b1905fa77 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,19 @@
+2025-03-13  Ayan Shafqat 
+
+   * config/arm/arm_acle.h (__sqrt, __sqrtf):
+ Add inline definitions for __sqrt() and __sqrtf() (guarded by 
__ARM_FP)
+ that forward to the NEON square-root builtins (__builtin_neon_vsqrtdf
+ and __builtin_neon_vsqrtsf), enabling ACLE sqrt intrinsics.
+
+   * config/arm/arm-builtins.cc (df_UP):
+ Define df_UP to map to E_DFmode and add CODE_FOR_neon_vsqrtsf and
+ CODE_FOR_neon_vsqrtdf to support the new vsqrt builtins.
+
+   * config/arm/arm_vfp_builtins.def (VAR1, VAR3):
+ Replace the single-mode vsqrt entry (VAR1) with a VAR3 entry that 
supports
+ hf, sf, and df modes.
+
+
 2025-03-12  Jeff Law  
 
Revert:
diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h
index c6c03fdce27..c5f8d35c7d5 100644
--- a/gcc/config/arm/arm_acle.h
+++ b/gcc/config/arm/arm_acle.h
@@ -829,6 +829,24 @@ __crc32cd (uint32_t __a, uint64_t __b)
 #endif /* __ARM_FEATURE_CRC32  */
 #pragma GCC pop_options
 
+#ifdef __ARM_FP
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__sqrt(double  __x)
+{
+  return __builtin_neon_vsqrtdf (__x);
+}
+#endif
+
+#ifdef __ARM_FP
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__sqrtf(float __x)
+{
+  return __builtin_neon_vsqrtsf (__x);
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 95a405651c6..a03b78f9fba 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2025-03-13  Ayan Shafqat 
+
+   * gcc.target/arm/acle/acle_sqrt.c: New test to verify that the ACLE
+ __sqrt() and __sqrtf() intrinsics are correctly lowered to the 
expected
+ vsqrt.f64 and vsqrt.f32 instructions.
+
 2025-03-11  Jakub Jelinek  
 
PR c/117178
diff --git a/gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c 
b/gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c
new file mode 100644
index 000..f95e3476c4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include "arm_acle.h"
+
+double test_acle_sqrt (double x)
+{
+  return __sqrt (x);
+}
+
+float test_acle_sqrtf (float x)
+{
+  return __sqrtf (x);
+}
+
+/* { dg-final { scan-assembler-times "vsqrt.f64\td\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "vsqrt.f32\ts\[0-9\]" 1 } } */
-- 
2.43.0



[PATCH v2 2/2] arm: Add ACLE sqrt intrinsic using NEON vsqrt builtins

2025-03-16 Thread Ayan Shafqat
Add inline implementations of the ACLE __sqrt() and __sqrtf() functions
in arm_acle.h. These functions, defined when __ARM_FP is available[1],
forward the square-root operation to the corresponding NEON builtins:

* __sqrt() calls __builtin_neon_vsqrtdf for double precision.
* __sqrtf() calls __builtin_neon_vsqrtsf for single precision.

Additionally, a new testsuite file (acle_sqrt.c) is introduced to verify
the generated assembly contains the proper vsqrt instructions (vsqrt.f64
and vsqrt.f32) when compiling ACLE sqrt intrinsics.

[1] https://developer.arm.com/documentation/101028/0012/3--C-language-extensions

gcc/ChangeLog:

* config/arm/arm_acle.h (__sqrt, __sqrtf): New functions.

gcc/testsuite/ChangeLog:

* gcc.target/arm/acle/acle_sqrt.c: New test.

Signed-off-by: Ayan Shafqat 
---
 gcc/config/arm/arm_acle.h | 18 ++
 gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c | 19 +++
 2 files changed, 37 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c

diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h
index c6c03fdce27..c00a5dbce69 100644
--- a/gcc/config/arm/arm_acle.h
+++ b/gcc/config/arm/arm_acle.h
@@ -829,6 +829,24 @@ __crc32cd (uint32_t __a, uint64_t __b)
 #endif /* __ARM_FEATURE_CRC32  */
 #pragma GCC pop_options
 
+#ifdef __ARM_FP
+__extension__ static __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__sqrt (double __x)
+{
+  return __builtin_neon_vsqrtdf (__x);
+}
+#endif
+
+#ifdef __ARM_FP
+__extension__ static __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__sqrtf (float __x)
+{
+  return __builtin_neon_vsqrtsf (__x);
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c 
b/gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c
new file mode 100644
index 00000000000..bf2ff1ffa8b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include "arm_acle.h"
+
+double
+test_acle_sqrt (double x)
+{
+  return __sqrt (x);
+}
+
+float
+test_acle_sqrtf (float x)
+{
+  return __sqrtf (x);
+}
+
+/* { dg-final { scan-assembler-times "vsqrt.f64\td\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "vsqrt.f32\ts\[0-9\]" 1 } } */
-- 
2.43.0



[PATCH v2 1/2] arm: Add support for NEON vsqrt builtins (hf, sf, df)

2025-03-16 Thread Ayan Shafqat
Introduce support for a new set of NEON square-root intrinsics for half,
single, and double precision.

modified:   gcc/config/arm/arm-builtins.cc
1. Define the df_UP macro to map to E_DFmode.
2. Add CODE_FOR_neon_vsqrtsf and CODE_FOR_neon_vsqrtdf constants that
   reference the underlying VFP sqrt RTL patterns (sqrtsf2 and sqrtdf2).

modified:   gcc/config/arm/arm_vfp_builtins.def
1. Replace the single-mode entry for vsqrt with a unified VAR3 entry
   that supports hf, sf, and df modes.

These modifications enable the use of __builtin_neon_vsqrt{hf,sf,df} in user
code and ensure the correct mode is selected for each precision variant.

Signed-off-by: Ayan Shafqat 
Signed-off-by: Andrew Pinski 

gcc/ChangeLog:

* config/arm/arm-builtins.cc (df_UP): New macro.

* config/arm/arm_vfp_builtins.def (VAR1, VAR3): Change VAR1 to
VAR3, for HF, SF, and DF modes.
---
 gcc/config/arm/arm-builtins.cc  | 3 +++
 gcc/config/arm/arm_vfp_builtins.def | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc
index c56ab5db985..acc86c7e8a1 100644
--- a/gcc/config/arm/arm-builtins.cc
+++ b/gcc/config/arm/arm-builtins.cc
@@ -694,6 +694,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define hi_UP    E_HImode
 #define void_UP E_VOIDmode
 #define sf_UP   E_SFmode
+#define df_UP E_DFmode
 #define UP(X) X##_UP
 
 typedef struct {
@@ -710,6 +711,8 @@ constexpr insn_code CODE_FOR_neon_usdotv8qi = 
CODE_FOR_neon_usdotv2siv8qi;
 constexpr insn_code CODE_FOR_neon_sdotv16qi = CODE_FOR_neon_sdotv4siv16qi;
 constexpr insn_code CODE_FOR_neon_udotv16qi = CODE_FOR_neon_udotv4siv16qi;
 constexpr insn_code CODE_FOR_neon_usdotv16qi = CODE_FOR_neon_usdotv4siv16qi;
+constexpr insn_code CODE_FOR_neon_vsqrtsf  = CODE_FOR_sqrtsf2;
+constexpr insn_code CODE_FOR_neon_vsqrtdf  = CODE_FOR_sqrtdf2;
 
 #define CF(N,X) CODE_FOR_neon_##N##X
 
diff --git a/gcc/config/arm/arm_vfp_builtins.def 
b/gcc/config/arm/arm_vfp_builtins.def
index 1fbf71e728e..8cafd72b565 100644
--- a/gcc/config/arm/arm_vfp_builtins.def
+++ b/gcc/config/arm/arm_vfp_builtins.def
@@ -40,7 +40,7 @@ VAR1 (UNOP, vrndm, hf)
 VAR1 (UNOP, vrndn, hf)
 VAR1 (UNOP, vrndp, hf)
 VAR1 (UNOP, vrndx, hf)
-VAR1 (UNOP, vsqrt, hf)
+VAR3 (UNOP, vsqrt, hf, sf, df)
 
 VAR2 (BINOP, vcvths_n, hf, si)
 VAR2 (BINOP, vcvthu_n, hf, si)
-- 
2.43.0



Re: [PATCH 2/3] Aarch64: Add __sqrt and __sqrtf intrinsics to arm_acle.h

2025-03-16 Thread Ayan Shafqat
Hi Jakub:

Thank you very much for the review feedback. I have addressed the
feedback in v2 of the patch [1]. See additional replies below.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2025-March/677754.html

On Thu, Mar 13, 2025 at 10:28:52PM +0100, Jakub Jelinek wrote:
> On Thu, Mar 13, 2025 at 05:23:00PM -0400, Ayan Shafqat wrote:
> > This patch introduces two new inline functions, __sqrt and __sqrtf, in
> > +__sqrt(double  __x)

> Just formatting nits, there should be space in between the function name
> and ( and only one space between double and __x.

Thanks for catching this.

> Also, it is unclear why it uses __extension__ (but admittedly it is used
> elsewhere in the header).

That is a question to the ARM maintainers. I have followed the
convention in the rest of the file. If that needs changes, let
me know.

> 
> > +{
> > +return __builtin_aarch64_sqrtdf (__x);
> 
> Just two space indentation rather than 4 spaces.

Got it! Thanks for pointing this out.

> 
> > +}
> > +
> > +__extension__ extern __inline float
> > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > +__sqrtf(float __x)
> 
> See above
> 
> > +{
> > +return __builtin_aarch64_sqrtsf (__x);
> 
> Ditto
> 
>   Jakub
> 

Thank you again for the review. Let me know if you spot anything else.

Best regards,
Ayan


Re: [PATCH v2 3/3] Aarch64: Add tests for __sqrt and __sqrtf intrinsic

2025-03-16 Thread Ayan Shafqat
Hello Jakub:

Thank you very much for your feedback. See additional replies below.

On Thu, Mar 13, 2025 at 10:31:44PM +0100, Jakub Jelinek wrote:
> On Thu, Mar 13, 2025 at 05:25:26PM -0400, Ayan Shafqat wrote:
> > gcc/testsuite/ChangeLog:
> > 
> > * gcc.target/aarch64/acle/acle_sqrt.c: New test.
> > 
> > Signed-off-by: Ayan Shafqat 
> 
> Tests should be in the same patch as the code they are testing,
> not committed separately.

Yes, thanks for pointing this out. I agree that tests should be in the
same commit that introduces the new feature. I have merged the two
changes into one in the new version. Please see the newer version of
this patch:

https://gcc.gnu.org/pipermail/gcc-patches/2025-March/677754.html

> 
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +#include "arm_acle.h"
> > +
> > +double test_acle_sqrt (double x)
> 
> The normal GNU formatting is
> double
> test_acle_sqrt (double x)
> (i.e. function name at the start of line, so one can grep for it).
> 
> > +{
> > +  return __sqrt (x);
> > +}
> > +
> > +float test_acle_sqrtf (float x)
> 
> Ditto.
> 
>   Jakub
> 

Thanks, I have addressed it in the v2 of this patch. Let me know if
there are any other issues.

Best,
Ayan



[PATCH v2 2/2] Aarch64: Add __sqrt and __sqrtf intrinsics and corresponding tests

2025-03-16 Thread Ayan Shafqat
This patch introduces two new inline functions, __sqrt and __sqrtf, in
arm_acle.h for AArch64 targets. These functions wrap the new builtins
__builtin_aarch64_sqrtdf and __builtin_aarch64_sqrtsf, respectively,
providing direct access to hardware instructions without relying on the
standard math library or optimization levels.

This patch also introduces acle_sqrt.c in the AArch64 testsuite,
verifying that the new __sqrt and __sqrtf intrinsics emit the expected
fsqrt instructions for double and float arguments.

Coverage for new intrinsics ensures that __sqrt and __sqrtf are
correctly expanded to hardware instructions and do not fall back to
library calls, regardless of optimization levels.

gcc/ChangeLog:

* config/aarch64/arm_acle.h (__sqrt, __sqrtf): New functions.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/acle/acle_sqrt.c: New test.

Signed-off-by: Ayan Shafqat 
---
 gcc/config/aarch64/arm_acle.h | 14 ++
 .../gcc.target/aarch64/acle/acle_sqrt.c   | 19 +++
 2 files changed, 33 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c

diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 7976c117daf..2900e934239 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -118,6 +118,20 @@ __revl (unsigned long __value)
 return __rev (__value);
 }
 
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__sqrt (double __x)
+{
+  return __builtin_aarch64_sqrtdf (__x);
+}
+
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__sqrtf (float __x)
+{
+  return __builtin_aarch64_sqrtsf (__x);
+}
+
 #pragma GCC push_options
 #pragma GCC target ("+nothing+jscvt")
 __extension__ extern __inline int32_t
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c 
b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c
new file mode 100644
index 00000000000..482351fa7e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include "arm_acle.h"
+
+double
+test_acle_sqrt (double x)
+{
+  return __sqrt (x);
+}
+
+float
+test_acle_sqrtf (float x)
+{
+  return __sqrtf (x);
+}
+
+/* { dg-final { scan-assembler-times "fsqrt\td\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "fsqrt\ts\[0-9\]" 1 } } */
-- 
2.43.0



[PATCH v2 1/2] Aarch64: Use BUILTIN_VHSDF_HSDF for vector and scalar sqrt builtins

2025-03-16 Thread Ayan Shafqat
This patch changes the `sqrt` builtin definition from `BUILTIN_VHSDF_DF`
to `BUILTIN_VHSDF_HSDF` in `aarch64-simd-builtins.def`, ensuring the
builtin covers half, single, and double precision variants. The redundant
`VAR1 (UNOP, sqrt, 2, FP, hf)` lines are removed, as they are no longer
needed now that `BUILTIN_VHSDF_HSDF` handles those cases.

gcc/ChangeLog:

* config/aarch64/aarch64-simd-builtins.def: Change
BUILTIN_VHSDF_DF to BUILTIN_VHSDF_HSDF.

Signed-off-by: Ayan Shafqat 
Signed-off-by: Andrew Pinski 
---
 gcc/config/aarch64/aarch64-simd-builtins.def | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def 
b/gcc/config/aarch64/aarch64-simd-builtins.def
index 6cc45b18a72..685bf0dc408 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -57,7 +57,7 @@
   VAR1 (BINOPP, pmull, 0, DEFAULT, v8qi)
   VAR1 (BINOPP, pmull_hi, 0, DEFAULT, v16qi)
   BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP)
-  BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP)
+  BUILTIN_VHSDF_HSDF (UNOP, sqrt, 2, FP)
   BUILTIN_VDQ_I (BINOP, addp, 0, DEFAULT)
   BUILTIN_VDQ_I (BINOPU, addp, 0, DEFAULT)
   BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, DEFAULT)
@@ -848,9 +848,6 @@
   BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0, FP)
   BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, FP)
 
-  /* Implemented by sqrt2.  */
-  VAR1 (UNOP, sqrt, 2, FP, hf)
-
   /* Implemented by hf2.  */
   VAR1 (UNOP, floatdi, 2, FP, hf)
   VAR1 (UNOP, floatsi, 2, FP, hf)
-- 
2.43.0



[PATCH v3 2/2] arm: Add FMA and FMAF intrinsics with corresponding tests

2025-03-16 Thread Ayan Shafqat
This patch introduces inline definitions for the __fma and __fmaf
functions in arm_acle.h for arm targets. These definitions rely on
__builtin_fma and __builtin_fmaf to ensure proper inlining and to meet
the ACLE requirements [1].

The patch has been tested locally using a crosstool-NG sysroot for
arm-cortexa9_neon-linux-gnueabihf, confirming that the generated code
uses the expected fused multiply-accumulate instructions:

vfma.f32 for single precision
vfma.f64 for double precision

[1] https://arm-software.github.io/acle/main/acle.html#fused-multiply-accumulate-fma

gcc/ChangeLog:

* config/arm/arm_acle.h (__fma, __fmaf): New functions.

gcc/testsuite/ChangeLog:

* gcc.target/arm/acle/acle_fma.c: New test.
---
 gcc/config/arm/arm_acle.h| 19 +++
 gcc/testsuite/gcc.target/arm/acle/acle_fma.c | 19 +++
 2 files changed, 38 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/arm/acle/acle_fma.c

diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h
index c6c03fdce27..02cb67d1516 100644
--- a/gcc/config/arm/arm_acle.h
+++ b/gcc/config/arm/arm_acle.h
@@ -829,6 +829,25 @@ __crc32cd (uint32_t __a, uint64_t __b)
 #endif /* __ARM_FEATURE_CRC32  */
 #pragma GCC pop_options
 
+#pragma GCC push_options
+#pragma GCC target("fpu=neon-vfpv4")
+
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fma (double __x, double __y, double __z)
+{
+  return __builtin_fma (__x, __y, __z);
+}
+
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fmaf (float __x, float __y, float __z)
+{
+  return __builtin_fmaf (__x, __y, __z);
+}
+
+#pragma GCC pop_options
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/testsuite/gcc.target/arm/acle/acle_fma.c 
b/gcc/testsuite/gcc.target/arm/acle/acle_fma.c
new file mode 100644
index 00000000000..cba4f48929d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/acle/acle_fma.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv7-a -mfpu=neon-vfpv4 -mfloat-abi=hard" } */
+
+#include "arm_acle.h"
+
+double
+test_acle_fma (double x, double y, double z)
+{
+  return __fma (x, y, z);
+}
+
+float
+test_acle_fmaf (float x, float y, float z)
+{
+  return __fmaf (x, y, z);
+}
+
+/* { dg-final { scan-assembler-times "vfma.f64\td\[0-9\]," 1 } } */
+/* { dg-final { scan-assembler-times "vfma.f32\ts\[0-9\]" 1 } } */
-- 
2.43.0



Re: [PATCH 1/2] aarch64: Add FMA and FMAF intrinsics and tests

2025-03-15 Thread Ayan Shafqat
On Thu, Mar 13, 2025 at 08:31:24AM +0000, Kyrylo Tkachov wrote:
>
> I forgot during the review, but a patch needs a ChangeLog entry.
> Could you provide one please to add to the commit log?
>

I have submitted the patch again in the mailing list:

https://gcc.gnu.org/pipermail/gcc-patches/2025-March/677588.html

Let me know if you need anything else.

Thanks in advance,
Ayan


[PATCH v3 1/2] Aarch64: Add FMA and FMAF intrinsic and corresponding tests

2025-03-17 Thread Ayan Shafqat
This patch introduces inline definitions for the __fma and __fmaf
functions in arm_acle.h for AArch64 targets. These definitions rely on
__builtin_fma and __builtin_fmaf to ensure proper inlining and to meet
the ACLE requirements [1].

The patch has been tested locally using a crosstool-NG sysroot for
AArch64, confirming that the generated code uses the expected fused
multiply-accumulate instructions (fmadd).

[1] https://arm-software.github.io/acle/main/acle.html#fused-multiply-accumulate-fma

gcc/ChangeLog:

* config/aarch64/arm_acle.h (__fma, __fmaf): New functions.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/acle/acle_fma.c: New test.
---
 gcc/config/aarch64/arm_acle.h | 14 ++
 .../gcc.target/aarch64/acle/acle_fma.c| 19 +++
 2 files changed, 33 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c

diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 7976c117daf..d9e2401ea9f 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -129,6 +129,20 @@ __jcvt (double __a)
 
 #pragma GCC pop_options
 
+__extension__ extern __inline double
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fma (double __x, double __y, double __z)
+{
+  return __builtin_fma (__x, __y, __z);
+}
+
+__extension__ extern __inline float
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__fmaf (float __x, float __y, float __z)
+{
+  return __builtin_fmaf (__x, __y, __z);
+}
+
 #pragma GCC push_options
 #pragma GCC target ("+nothing+frintts")
 __extension__ extern __inline float
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c 
b/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c
new file mode 100644
index 00000000000..d7986caba31
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include "arm_acle.h"
+
+double
+test_acle_fma (double x, double y, double z)
+{
+  return __fma (x, y, z);
+}
+
+float
+test_acle_fmaf (float x, float y, float z)
+{
+  return __fmaf (x, y, z);
+}
+
+/* { dg-final { scan-assembler-times "fmadd\td\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmadd\ts\[0-9\]" 1 } } */
-- 
2.43.0