[PATCH] Adding RBIT gcc builtin for ARM
The attached patch contains __builtin_arm_rbit which generates RBIT instruction for ARM targets. Please let me know if you have any questions or comments, or commit this patch for me as I do not have write access to SVN. Thanks Ayan commit a692b5b4965840babbdaf5e2b9b1feb1995d351d Author: Ayan Shafqat Date: Mon Jun 17 21:46:54 2019 -0400 Implementing RBIT builtin as described in ACLE doc ARM's RBIT instruction is used to reverse the bit order of a word. This is present in ARMv6 and above in both ARM and Thumb modes. This is also specified as an intrinsic function in ACLE documentation. This commit implements the GCC builtin for ARM target for RBIT instruction, __builtin_arm_rbit. Also, this implements the intrinsic functions as stated in ARM ACLE documentation, which are listed below: uint32_t __rbit(uint32_t x); unsigned long __rbitl(unsigned long x); uint64_t __rbitll(uint64_t x); Note: __rbitll is implemented as two calls to __rbit. I know this is not how it's done in AArch64, but this is what I can do for now. 
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index ae582172ab9..83dcb7b411c 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -11568,6 +11568,13 @@ [(set_attr "predicable" "yes") (set_attr "type" "clz")]) +(define_insn "rbit" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (unspec:SI [(match_operand:SI 1 "s_register_operand" "r")] UNSPEC_RBIT))] + "TARGET_32BIT && arm_arch_thumb2" + "rbit%?\\t%0, %1" + [(set_attr "predicable" "yes")]) + (define_insn "rbitsi2" [(set (match_operand:SI 0 "s_register_operand" "=r") (unspec:SI [(match_operand:SI 1 "s_register_operand" "r")] UNSPEC_RBIT))] diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h index 2c7acc698ea..ce1b102444b 100644 --- a/gcc/config/arm/arm_acle.h +++ b/gcc/config/arm/arm_acle.h @@ -168,6 +168,29 @@ __arm_mrrc2 (const unsigned int __coproc, const unsigned int __opc1, { return __builtin_arm_mrrc2 (__coproc, __opc1, __CRm); } + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +__rbit(uint32_t __op1) +{ + return __builtin_arm_rbit(__op1); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__rbitll(uint64_t __op1) +{ + return (((uint64_t)__rbit(__op1)) << 32U) | __rbit(__op1 >> 32U); +} + +__extension__ static __inline unsigned long __attribute__ ((__always_inline__)) +__rbitl(unsigned long __op1) +{ +#if __SIZEOF_LONG__ == 4 + return __rbit(__op1); +#else + return __rbitll(__op1); +#endif +} + #endif /* __ARM_ARCH >= 6. */ #endif /* __ARM_ARCH >= 6 || defined (__ARM_ARCH_5TE__). */ #endif /* __ARM_ARCH >= 5. 
*/ diff --git a/gcc/config/arm/arm_acle_builtins.def b/gcc/config/arm/arm_acle_builtins.def index b2438d66da2..ecb3be491fc 100644 --- a/gcc/config/arm/arm_acle_builtins.def +++ b/gcc/config/arm/arm_acle_builtins.def @@ -24,6 +24,7 @@ VAR1 (UBINOP, crc32w, si) VAR1 (UBINOP, crc32cb, si) VAR1 (UBINOP, crc32ch, si) VAR1 (UBINOP, crc32cw, si) +VAR1 (UBINOP, rbit, si) VAR1 (CDP, cdp, void) VAR1 (CDP, cdp2, void) VAR1 (LDC, ldc, void) diff --git a/gcc/testsuite/gcc.target/arm/acle/rbit.c b/gcc/testsuite/gcc.target/arm/acle/rbit.c new file mode 100644 index 000..7803dd33615 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/acle/rbit.c @@ -0,0 +1,18 @@ +/* Test the crc32d ACLE intrinsic. */ + +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_crc_ok } */ +/* { dg-options "-save-temps -O0" } */ +/* { dg-add-options arm_crc } */ + +#include "arm_acle.h" + +void test_rbit (void) +{ + uint32_t out_uint32_t; + uint32_t arg0_uint32_t; + + out_uint32_t = __rbit (arg0_uint32_t); +} + +/* { dg-final { scan-assembler-times "rbit\t...?, ...?\n" 2 } } */
[PATCH] ARM ACLE: add inline definitions for __fma and __fmaf in aarch64 and aarch32 headers
Hi GCC team, This patch introduces inline definitions for the __fma and __fmaf functions in the ARM ACLE headers for both aarch64 and arm targets. The new implementations use the built-in functions (__builtin_fma and __builtin_fmaf) to ensure proper inlining and adherence to the ARM ACLE requirements[1]. Changes include: - In gcc/config/aarch64/arm_acle.h: Added inline definitions for __fma and __fmaf. - In gcc/config/arm/arm_acle.h: Added inline definitions for __fma and __fmaf. These changes have been tested locally, and I have verified that they integrate smoothly with the existing ARM backend configurations. Please let me know if you have any questions or need further modifications. Thanks, Ayan [1] ARM ACLE Document: https://arm-software.github.io/acle/main/acle.html#fused-multiply-accumulate-fma Signed-off-by: Ayan Shafqat diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index 7976c117daf..d9e2401ea9f 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -129,6 +129,20 @@ __jcvt (double __a) #pragma GCC pop_options +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fma (double __x, double __y, double __z) +{ + return __builtin_fma (__x, __y, __z); +} + +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fmaf (float __x, float __y, float __z) +{ + return __builtin_fmaf (__x, __y, __z); +} + #pragma GCC push_options #pragma GCC target ("+nothing+frintts") __extension__ extern __inline float diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h index c6c03fdce27..256710a2c31 100644 --- a/gcc/config/arm/arm_acle.h +++ b/gcc/config/arm/arm_acle.h @@ -829,6 +829,20 @@ __crc32cd (uint32_t __a, uint64_t __b) #endif /* __ARM_FEATURE_CRC32 */ #pragma GCC pop_options +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fma (double 
__x, double __y, double __z) +{ + return __builtin_fma (__x, __y, __z); +} + +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fmaf (float __x, float __y, float __z) +{ + return __builtin_fmaf (__x, __y, __z); +} + #ifdef __cplusplus } #endif
[PATCH 1/2] aarch64: Add FMA and FMAF intrinsics and tests
This patch introduces inline definitions for the __fma and __fmaf functions in arm_acle.h for AArch64 targets. These definitions rely on __builtin_fma and __builtin_fmaf to ensure proper inlining and to meet the ACLE requirements [1]. The patch has been tested locally using a crosstool-NG sysroot for AArch64, confirming that the generated code uses the expected fused multiply-accumulate instructions (fmadd). [1] https://arm-software.github.io/acle/main/acle.html#fused-multiply-accumulate-fma Signed-off-by: Ayan Shafqat --- gcc/config/aarch64/arm_acle.h | 14 ++ .../gcc.target/aarch64/acle/acle_fma.c | 17 + 2 files changed, 31 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index 7976c117daf..d9e2401ea9f 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -129,6 +129,20 @@ __jcvt (double __a) #pragma GCC pop_options +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fma (double __x, double __y, double __z) +{ + return __builtin_fma (__x, __y, __z); +} + +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fmaf (float __x, float __y, float __z) +{ + return __builtin_fmaf (__x, __y, __z); +} + #pragma GCC push_options #pragma GCC target ("+nothing+frintts") __extension__ extern __inline float diff --git a/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c b/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c new file mode 100644 index 000..9363a75b593 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_acle.h" + +double test_acle_fma (double x, double y, double z) +{ + return __fma (x, y, z); +} + +float test_acle_fmaf (float x, float y, float z) +{ + return __fmaf (x, y, z); +} + +/* { dg-final { scan-assembler-times 
"fmadd\td\[0-9\]" 1 } } */ +/* { dg-final { scan-assembler-times "fmadd\ts\[0-9\]" 1 } } */ -- 2.43.0
[PATCH 2/2] arm: Add FMA and FMAF intrinsics with corresponding tests
This patch introduces inline definitions for the __fma and __fmaf functions in arm_acle.h for arm targets. These definitions rely on __builtin_fma and __builtin_fmaf to ensure proper inlining and to meet the ACLE requirements [1]. The patch has been tested locally using a crosstool-NG sysroot for arm-cortexa9_neon-linux-gnueabihf, confirming that the generated code uses the expected fused multiply-accumulate instructions: vfma.f32 for single precision vfma.f64 for double precision Signed-off-by: Ayan Shafqat [1] https://arm-software.github.io/acle/main/acle.html#fused-multiply-accumulate-fma --- gcc/config/arm/arm_acle.h| 18 ++ gcc/testsuite/gcc.target/arm/acle/acle_fma.c | 17 + 2 files changed, 35 insertions(+) create mode 100644 gcc/testsuite/gcc.target/arm/acle/acle_fma.c diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h index c6c03fdce27..14c28f11b9c 100644 --- a/gcc/config/arm/arm_acle.h +++ b/gcc/config/arm/arm_acle.h @@ -829,6 +829,24 @@ __crc32cd (uint32_t __a, uint64_t __b) #endif /* __ARM_FEATURE_CRC32 */ #pragma GCC pop_options +#ifdef __ARM_FEATURE_FMA +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fma (double __x, double __y, double __z) +{ + return __builtin_fma (__x, __y, __z); +} +#endif + +#ifdef __ARM_FEATURE_FMA +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fmaf (float __x, float __y, float __z) +{ + return __builtin_fmaf (__x, __y, __z); +} +#endif + #ifdef __cplusplus } #endif diff --git a/gcc/testsuite/gcc.target/arm/acle/acle_fma.c b/gcc/testsuite/gcc.target/arm/acle/acle_fma.c new file mode 100644 index 000..4177ac81f07 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/acle/acle_fma.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=armv7-a -mfpu=neon-vfpv4 -mfloat-abi=hard" } */ + +#include "arm_acle.h" + +double test_acle_fma (double x, double y, double z) +{ + return __fma (x, 
y, z); +} + +float test_acle_fmaf (float x, float y, float z) +{ + return __fmaf (x, y, z); +} + +/* { dg-final { scan-assembler-times "vfma.f64\td\[0-9\]," 1 } } */ +/* { dg-final { scan-assembler-times "vfma.f32\ts\[0-9\]" 1 } } */ -- 2.43.0
Re: [PATCH 1/2] aarch64: Add FMA and FMAF intrinsics and tests
Hello Kyrylo, On Tue, Mar 11, 2025 at 08:55:46AM +0000, Kyrylo Tkachov wrote: > This looks ok to me. > GCC is currently in a regression fixing stage so normally such a change would > wait until stage 1 reopens. > But this looks like a pretty safe change so I’m not against taking it now. > Do you need someone to commit this for you? Thank you very much for reviewing this patch! I do not have commit access to GCC, so I would greatly appreciate it if you can commit this patch on my behalf. Please let me know if you need anything else. Best regards, Ayan
[PATCH 1/3] AArch64: Use BUILTIN_VHSDF_HSDF for vector and scalar sqrt builtins
This patch changes the `sqrt` builtin definition from `BUILTIN_VHSDF_DF` to `BUILTIN_VHSDF_HSDF` in `aarch64-simd-builtins.def`, ensuring the builtin covers half, single, and double precision variants. The redundant `VAR1 (UNOP, sqrt, 2, FP, hf)` lines are removed, as they are no longer needed now that `BUILTIN_VHSDF_HSDF` handles those cases. Signed-off-by: Ayan Shafqat Signed-off-by: Andrew Pinski --- gcc/config/aarch64/aarch64-simd-builtins.def | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 6cc45b18a72..685bf0dc408 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -57,7 +57,7 @@ VAR1 (BINOPP, pmull, 0, DEFAULT, v8qi) VAR1 (BINOPP, pmull_hi, 0, DEFAULT, v16qi) BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP) - BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP) + BUILTIN_VHSDF_HSDF (UNOP, sqrt, 2, FP) BUILTIN_VDQ_I (BINOP, addp, 0, DEFAULT) BUILTIN_VDQ_I (BINOPU, addp, 0, DEFAULT) BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, DEFAULT) @@ -848,9 +848,6 @@ BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0, FP) BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, FP) - /* Implemented by sqrt2. */ - VAR1 (UNOP, sqrt, 2, FP, hf) - /* Implemented by hf2. */ VAR1 (UNOP, floatdi, 2, FP, hf) VAR1 (UNOP, floatsi, 2, FP, hf) -- 2.43.0
[PATCH 2/3] AArch64: Add __sqrt and __sqrtf intrinsics to arm_acle.h
This patch introduces two new inline functions, __sqrt and __sqrtf, in arm_acle.h for AArch64 targets. These functions wrap the new builtins __builtin_aarch64_sqrtdf and __builtin_aarch64_sqrtsf, respectively, providing direct access to hardware instructions without relying on the standard math library or optimization levels. Signed-off-by: Ayan Shafqat --- gcc/config/aarch64/arm_acle.h | 14 ++ 1 file changed, 14 insertions(+) diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index 7976c117daf..d972a4e7e7e 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -118,6 +118,20 @@ __revl (unsigned long __value) return __rev (__value); } +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__sqrt(double __x) +{ +return __builtin_aarch64_sqrtdf (__x); +} + +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__sqrtf(float __x) +{ +return __builtin_aarch64_sqrtsf (__x); +} + #pragma GCC push_options #pragma GCC target ("+nothing+jscvt") __extension__ extern __inline int32_t -- 2.43.0
[PATCH 3/3] AArch64: Add tests for __sqrt and __sqrtf intrinsics
This patch introduces acle_sqrt.c in the AArch64 testsuite, verifying that the new __sqrt and __sqrtf intrinsics emit the expected fsqrt instructions for double and float arguments. Coverage for new intrinsics ensures that __sqrt and __sqrtf are correctly expanded to hardware instructions and do not fall back to library calls, regardless of optimization levels. Signed-off-by: Ayan Shafqat --- .../gcc.target/aarch64/acle/acle_sqrt.c | 17 + 1 file changed, 17 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c diff --git a/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c new file mode 100644 index 000..1e3ed9eaa6d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_acle.h" + +double test_acle_sqrt (double x) +{ + return __sqrt (x); +} + +float test_acle_sqrtf (float x) +{ + return __sqrtf (x); +} + +/* { dg-final { scan-assembler-times "fsqrt\td\[0-9\]" 1 } } */ +/* { dg-final { scan-assembler-times "fsqrt\ts\[0-9\]" 1 } } */ -- 2.43.0
[PATCH 1/2] arm: Add support for NEON vsqrt builtins (hf, sf, df)
Introduce support for a new set of NEON square-root intrinsics for half, single, and double precision. modified: gcc/config/arm/arm-builtins.cc 1. Define the df_UP macro to map to E_DFmode. 2. Add CODE_FOR_neon_vsqrtsf and CODE_FOR_neon_vsqrtdf constants that reference the underlying VFP sqrt RTL patterns (sqrtsf2 and sqrtdf2). modified: gcc/config/arm/arm_vfp_builtins.def 1. Replace the single-mode entry for vsqrt with a unified VAR3 entry that supports hf, sf, and df modes. These modifications enable the use of __builtin_neon_vsqrt{hf,sf,df} in user code and ensure the correct mode is selected for each precision variant. Signed-off-by: Ayan Shafqat Signed-off-by: Andrew Pinski --- gcc/config/arm/arm-builtins.cc | 3 +++ gcc/config/arm/arm_vfp_builtins.def | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index c56ab5db985..acc86c7e8a1 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -694,6 +694,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define hi_UP E_HImode #define void_UP E_VOIDmode #define sf_UP E_SFmode +#define df_UP E_DFmode #define UP(X) X##_UP typedef struct { @@ -710,6 +711,8 @@ constexpr insn_code CODE_FOR_neon_usdotv8qi = CODE_FOR_neon_usdotv2siv8qi; constexpr insn_code CODE_FOR_neon_sdotv16qi = CODE_FOR_neon_sdotv4siv16qi; constexpr insn_code CODE_FOR_neon_udotv16qi = CODE_FOR_neon_udotv4siv16qi; constexpr insn_code CODE_FOR_neon_usdotv16qi = CODE_FOR_neon_usdotv4siv16qi; +constexpr insn_code CODE_FOR_neon_vsqrtsf = CODE_FOR_sqrtsf2; +constexpr insn_code CODE_FOR_neon_vsqrtdf = CODE_FOR_sqrtdf2; #define CF(N,X) CODE_FOR_neon_##N##X diff --git a/gcc/config/arm/arm_vfp_builtins.def b/gcc/config/arm/arm_vfp_builtins.def index 1fbf71e728e..8cafd72b565 100644 --- a/gcc/config/arm/arm_vfp_builtins.def +++ b/gcc/config/arm/arm_vfp_builtins.def @@ -40,7 +40,7 @@ VAR1 (UNOP, vrndm, hf) VAR1 (UNOP, vrndn, hf) VAR1 (UNOP, vrndp, hf) VAR1 
(UNOP, vrndx, hf) -VAR1 (UNOP, vsqrt, hf) +VAR3 (UNOP, vsqrt, hf, sf, df) VAR2 (BINOP, vcvths_n, hf, si) VAR2 (BINOP, vcvthu_n, hf, si) -- 2.43.0
[PATCH v2 1/2] Aarch64: Add FMA and FMAF intrinsics and corresponding tests
This patch introduces inline definitions for the __fma and __fmaf functions in arm_acle.h for Aarch64 targets. These definitions rely on __builtin_fma and __builtin_fmaf to ensure proper inlining and to meet the ACLE requirements [1]. The patch has been tested locally using a crosstool-NG sysroot for Aarch64, confirming that the generated code uses the expected fused multiply-accumulate instructions (fmadd). gcc/ChangeLog: * config/aarch64/arm_acle.h: (__fma): New Function (__fmaf): New Function gcc/testsuite/ChangeLog: * gcc.target/aarch64/acle/acle_fma.c: New test. --- gcc/config/aarch64/arm_acle.h | 14 ++ .../gcc.target/aarch64/acle/acle_fma.c | 17 + 2 files changed, 31 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index 7976c117daf..d9e2401ea9f 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -129,6 +129,20 @@ __jcvt (double __a) #pragma GCC pop_options +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fma (double __x, double __y, double __z) +{ + return __builtin_fma (__x, __y, __z); +} + +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fmaf (float __x, float __y, float __z) +{ + return __builtin_fmaf (__x, __y, __z); +} + #pragma GCC push_options #pragma GCC target ("+nothing+frintts") __extension__ extern __inline float diff --git a/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c b/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c new file mode 100644 index 000..9363a75b593 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_acle.h" + +double test_acle_fma (double x, double y, double z) +{ + return __fma (x, y, z); +} + +float test_acle_fmaf (float x, float y, float z) +{ + return __fmaf (x, y, 
z); +} + +/* { dg-final { scan-assembler-times "fmadd\td\[0-9\]" 1 } } */ +/* { dg-final { scan-assembler-times "fmadd\ts\[0-9\]" 1 } } */ -- 2.43.0
[PATCH v2 2/2] arm: Add FMA and FMAF intrinsics with corresponding tests
This patch introduces inline definitions for the __fma and __fmaf functions in arm_acle.h for arm targets. These definitions rely on __builtin_fma and __builtin_fmaf to ensure proper inlining and to meet the ACLE requirements [1]. The patch has been tested locally using a crosstool-NG sysroot for arm-cortexa9_neon-linux-gnueabihf, confirming that the generated code uses the expected fused multiply-accumulate instructions: vfma.f32 for single precision vfma.f64 for double precision [1] https://arm-software.github.io/acle/main/acle.html#fused-multiply-accumulate-fma gcc/ChangeLog: * config/arm/arm_acle.h (__attribute__): (__fma): New Function (__fmaf): New Function gcc/testsuite/ChangeLog: * gcc.target/arm/acle/acle_fma.c: New test. --- gcc/config/arm/arm_acle.h| 18 ++ gcc/testsuite/gcc.target/arm/acle/acle_fma.c | 17 + 2 files changed, 35 insertions(+) create mode 100644 gcc/testsuite/gcc.target/arm/acle/acle_fma.c diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h index c6c03fdce27..14c28f11b9c 100644 --- a/gcc/config/arm/arm_acle.h +++ b/gcc/config/arm/arm_acle.h @@ -829,6 +829,24 @@ __crc32cd (uint32_t __a, uint64_t __b) #endif /* __ARM_FEATURE_CRC32 */ #pragma GCC pop_options +#ifdef __ARM_FEATURE_FMA +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fma (double __x, double __y, double __z) +{ + return __builtin_fma (__x, __y, __z); +} +#endif + +#ifdef __ARM_FEATURE_FMA +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fmaf (float __x, float __y, float __z) +{ + return __builtin_fmaf (__x, __y, __z); +} +#endif + #ifdef __cplusplus } #endif diff --git a/gcc/testsuite/gcc.target/arm/acle/acle_fma.c b/gcc/testsuite/gcc.target/arm/acle/acle_fma.c new file mode 100644 index 000..4177ac81f07 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/acle/acle_fma.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=armv7-a 
-mfpu=neon-vfpv4 -mfloat-abi=hard" } */ + +#include "arm_acle.h" + +double test_acle_fma (double x, double y, double z) +{ + return __fma (x, y, z); +} + +float test_acle_fmaf (float x, float y, float z) +{ + return __fmaf (x, y, z); +} + +/* { dg-final { scan-assembler-times "vfma.f64\td\[0-9\]," 1 } } */ +/* { dg-final { scan-assembler-times "vfma.f32\ts\[0-9\]" 1 } } */ -- 2.43.0
[PATCH v2 1/3] Aarch64: Use BUILTIN_VHSDF_HSDF for vector and scalar sqrt builtins
This patch changes the `sqrt` builtin definition from `BUILTIN_VHSDF_DF` to `BUILTIN_VHSDF_HSDF` in `aarch64-simd-builtins.def`, ensuring the builtin covers half, single, and double precision variants. The redundant `VAR1 (UNOP, sqrt, 2, FP, hf)` lines are removed, as they are no longer needed now that `BUILTIN_VHSDF_HSDF` handles those cases. gcc/ChangeLog: * config/aarch64/aarch64-simd-builtins.def: Change BUILTIN_VHSDF_DF to BUILTIN_VHSDF_HSDF Signed-off-by: Ayan Shafqat Signed-off-by: Andrew Pinski --- gcc/config/aarch64/aarch64-simd-builtins.def | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 6cc45b18a72..685bf0dc408 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -57,7 +57,7 @@ VAR1 (BINOPP, pmull, 0, DEFAULT, v8qi) VAR1 (BINOPP, pmull_hi, 0, DEFAULT, v16qi) BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP) - BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP) + BUILTIN_VHSDF_HSDF (UNOP, sqrt, 2, FP) BUILTIN_VDQ_I (BINOP, addp, 0, DEFAULT) BUILTIN_VDQ_I (BINOPU, addp, 0, DEFAULT) BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, DEFAULT) @@ -848,9 +848,6 @@ BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0, FP) BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, FP) - /* Implemented by sqrt2. */ - VAR1 (UNOP, sqrt, 2, FP, hf) - /* Implemented by hf2. */ VAR1 (UNOP, floatdi, 2, FP, hf) VAR1 (UNOP, floatsi, 2, FP, hf) -- 2.43.0
[PATCH v2 2/3] Aarch64: Add __sqrt and __sqrtf intrinsics to arm_acle.h
This patch introduces two new inline functions, __sqrt and __sqrtf, in arm_acle.h for Aarch64 targets. These functions wrap the new builtins __builtin_aarch64_sqrtdf and __builtin_aarch64_sqrtsf, respectively, providing direct access to hardware instructions without relying on the standard math library or optimization levels. gcc/ChangeLog: * config/aarch64/arm_acle.h (__sqrt, __sqrtf): New function. Signed-off-by: Ayan Shafqat --- gcc/config/aarch64/arm_acle.h | 14 ++ 1 file changed, 14 insertions(+) diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index 7976c117daf..d972a4e7e7e 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -118,6 +118,20 @@ __revl (unsigned long __value) return __rev (__value); } +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__sqrt(double __x) +{ +return __builtin_aarch64_sqrtdf (__x); +} + +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__sqrtf(float __x) +{ +return __builtin_aarch64_sqrtsf (__x); +} + #pragma GCC push_options #pragma GCC target ("+nothing+jscvt") __extension__ extern __inline int32_t -- 2.43.0
[PATCH v2 3/3] Aarch64: Add tests for __sqrt and __sqrtf intrinsics
This patch introduces acle_sqrt.c in the AArch64 testsuite, verifying that the new __sqrt and __sqrtf intrinsics emit the expected fsqrt instructions for double and float arguments. Coverage for new intrinsics ensures that __sqrt and __sqrtf are correctly expanded to hardware instructions and do not fall back to library calls, regardless of optimization levels. gcc/testsuite/ChangeLog: * gcc.target/aarch64/acle/acle_sqrt.c: New test. Signed-off-by: Ayan Shafqat --- .../gcc.target/aarch64/acle/acle_sqrt.c | 17 + 1 file changed, 17 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c diff --git a/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c new file mode 100644 index 000..1e3ed9eaa6d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_acle.h" + +double test_acle_sqrt (double x) +{ + return __sqrt (x); +} + +float test_acle_sqrtf (float x) +{ + return __sqrtf (x); +} + +/* { dg-final { scan-assembler-times "fsqrt\td\[0-9\]" 1 } } */ +/* { dg-final { scan-assembler-times "fsqrt\ts\[0-9\]" 1 } } */ -- 2.43.0
[PATCH 2/3] Aarch64: Add __sqrt and __sqrtf intrinsics to arm_acle.h
This patch introduces two new inline functions, __sqrt and __sqrtf, in arm_acle.h for Aarch64 targets. These functions wrap the new builtins __builtin_aarch64_sqrtdf and __builtin_aarch64_sqrtsf, respectively, providing direct access to hardware instructions without relying on the standard math library or optimization levels. gcc/ChangeLog: * config/aarch64/arm_acle.h (__sqrt, __sqrtf): New function. Signed-off-by: Ayan Shafqat --- gcc/config/aarch64/arm_acle.h | 14 ++ 1 file changed, 14 insertions(+) diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index 7976c117daf..d972a4e7e7e 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -118,6 +118,20 @@ __revl (unsigned long __value) return __rev (__value); } +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__sqrt(double __x) +{ +return __builtin_aarch64_sqrtdf (__x); +} + +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__sqrtf(float __x) +{ +return __builtin_aarch64_sqrtsf (__x); +} + #pragma GCC push_options #pragma GCC target ("+nothing+jscvt") __extension__ extern __inline int32_t -- 2.43.0
[PATCH 2/2] arm: Add ACLE sqrt intrinsic using NEON vsqrt builtins
Add inline implementations of the ACLE __sqrt() and __sqrtf() functions in arm_acle.h. These functions, defined when __ARM_FP is available[1], forward the square-root operation to the corresponding NEON builtins: * __sqrt() calls __builtin_neon_vsqrtdf for double precision. * __sqrtf() calls __builtin_neon_vsqrtsf for single precision. Additionally, a new testsuite file (acle_sqrt.c) is introduced to verify the generated assembly contains the proper vsqrt instructions (vsqrt.f64 and vsqrt.f32) when compiling ACLE sqrt intrinsics. modified: gcc/config/arm/arm_acle.h * Add inline definitions for __sqrt() and __sqrtf(). new file: gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c * Create tests to compile and check for vsqrt.f64 and vsqrt.f32 instructions. [1] https://developer.arm.com/documentation/101028/0012/3--C-language-extensions Signed-off-by: Ayan Shafqat --- gcc/ChangeLog | 16 gcc/config/arm/arm_acle.h | 18 ++ gcc/testsuite/ChangeLog | 6 ++ gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c | 17 + 4 files changed, 57 insertions(+) create mode 100644 gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 704146d97aa..25b1905fa77 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,19 @@ +2025-03-13 Ayan Shafqat + + * config/arm/arm_acle.h (__sqrt, __sqrtf): + Add inline definitions for __sqrt() and __sqrtf() (guarded by __ARM_FP) + that forward to the NEON square-root builtins (__builtin_neon_vsqrtdf + and __builtin_neon_vsqrtsf), enabling ACLE sqrt intrinsics. + + * config/arm/arm-builtins.cc (df_UP): + Define df_UP to map to E_DFmode and add CODE_FOR_neon_vsqrtsf and + CODE_FOR_neon_vsqrtdf to support the new vsqrt builtins. + + * config/arm/arm_vfp_builtins.def (VAR1, VAR3): + Replace the single-mode vsqrt entry (VAR1) with a VAR3 entry that supports + hf, sf, and df modes. 
+ + 2025-03-12 Jeff Law Revert: diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h index c6c03fdce27..c5f8d35c7d5 100644 --- a/gcc/config/arm/arm_acle.h +++ b/gcc/config/arm/arm_acle.h @@ -829,6 +829,24 @@ __crc32cd (uint32_t __a, uint64_t __b) #endif /* __ARM_FEATURE_CRC32 */ #pragma GCC pop_options +#ifdef __ARM_FP +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__sqrt(double __x) +{ + return __builtin_neon_vsqrtdf (__x); +} +#endif + +#ifdef __ARM_FP +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__sqrtf(float __x) +{ + return __builtin_neon_vsqrtsf (__x); +} +#endif + #ifdef __cplusplus } #endif diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 95a405651c6..a03b78f9fba 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2025-03-13 Ayan Shafqat + + * gcc.target/arm/acle/acle_sqrt.c: New test to verify that the ACLE + __sqrt() and __sqrtf() intrinsics are correctly lowered to the expected + vsqrt.f64 and vsqrt.f32 instructions. + 2025-03-11 Jakub Jelinek PR c/117178 diff --git a/gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c b/gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c new file mode 100644 index 000..f95e3476c4d --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_acle.h" + +double test_acle_sqrt (double x) +{ + return __sqrt (x); +} + +float test_acle_sqrtf (float x) +{ + return __sqrtf (x); +} + +/* { dg-final { scan-assembler-times "vsqrt.f64\td\[0-9\]" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrt.f32\ts\[0-9\]" 1 } } */ -- 2.43.0
[PATCH v2 2/2] arm: Add ACLE sqrt intrinsic using NEON vsqrt builtins
Add inline implementations of the ACLE __sqrt() and __sqrtf() functions in arm_acle.h. These functions, defined when __ARM_FP is available[1], forward the square-root operation to the corresponding NEON builtins: * __sqrt() calls __builtin_neon_vsqrtdf for double precision. * __sqrtf() calls __builtin_neon_vsqrtsf for single precision. Additionally, a new testsuite file (acle_sqrt.c) is introduced to verify the generated assembly contains the proper vsqrt instructions (vsqrt.f64 and vsqrt.f32) when compiling ACLE sqrt intrinsics. [1] https://developer.arm.com/documentation/101028/0012/3--C-language-extensions gcc/ChangeLog: * config/arm/arm_acle.h (__sqrt, __sqrtf): New functions gcc/testsuite/ChangeLog: * gcc.target/arm/acle/acle_sqrt.c: New test. Signed-off-by: Ayan Shafqat --- gcc/config/arm/arm_acle.h | 18 ++ gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c | 19 +++ 2 files changed, 37 insertions(+) create mode 100644 gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h index c6c03fdce27..c00a5dbce69 100644 --- a/gcc/config/arm/arm_acle.h +++ b/gcc/config/arm/arm_acle.h @@ -829,6 +829,24 @@ __crc32cd (uint32_t __a, uint64_t __b) #endif /* __ARM_FEATURE_CRC32 */ #pragma GCC pop_options +#ifdef __ARM_FP +__extension__ static __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__sqrt (double __x) +{ + return __builtin_neon_vsqrtdf (__x); +} +#endif + +#ifdef __ARM_FP +__extension__ static __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__sqrtf (float __x) +{ + return __builtin_neon_vsqrtsf (__x); +} +#endif + #ifdef __cplusplus } #endif diff --git a/gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c b/gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c new file mode 100644 index 000..bf2ff1ffa8b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/acle/acle_sqrt.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include 
"arm_acle.h" + +double +test_acle_sqrt (double x) +{ + return __sqrt (x); +} + +float +test_acle_sqrtf (float x) +{ + return __sqrtf (x); +} + +/* { dg-final { scan-assembler-times "vsqrt.f64\td\[0-9\]" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrt.f32\ts\[0-9\]" 1 } } */ -- 2.43.0
[PATCH v2 1/2] arm: Add support for NEON vsqrt builtins (hf, sf, df)
Introduce support for a new set of NEON square-root intrinsics for half, single, and double precision. modified: gcc/config/arm/arm-builtins.cc 1. Define the df_UP macro to map to E_DFmode. 2. Add CODE_FOR_neon_vsqrtsf and CODE_FOR_neon_vsqrtdf constants that reference the underlying VFP sqrt RTL patterns (sqrtsf2 and sqrtdf2). modified: gcc/config/arm/arm_vfp_builtins.def 1. Replace the single-mode entry for vsqrt with a unified VAR3 entry that supports hf, sf, and df modes. These modifications enable the use of __builtin_neon_vsqrt{hf,sf,df} in user code and ensure the correct mode is selected for each precision variant. Signed-off-by: Ayan Shafqat Signed-off-by: Andrew Pinski gcc/ChangeLog: * config/arm/arm-builtins.cc (df_UP): New macro. * config/arm/arm_vfp_builtins.def (VAR1, VAR3): Change VAR1 to VAR3, for HF, SF, and DF modes. --- gcc/config/arm/arm-builtins.cc | 3 +++ gcc/config/arm/arm_vfp_builtins.def | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index c56ab5db985..acc86c7e8a1 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -694,6 +694,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define hi_UP E_HImode #define void_UP E_VOIDmode #define sf_UP E_SFmode +#define df_UP E_DFmode #define UP(X) X##_UP typedef struct { @@ -710,6 +711,8 @@ constexpr insn_code CODE_FOR_neon_usdotv8qi = CODE_FOR_neon_usdotv2siv8qi; constexpr insn_code CODE_FOR_neon_sdotv16qi = CODE_FOR_neon_sdotv4siv16qi; constexpr insn_code CODE_FOR_neon_udotv16qi = CODE_FOR_neon_udotv4siv16qi; constexpr insn_code CODE_FOR_neon_usdotv16qi = CODE_FOR_neon_usdotv4siv16qi; +constexpr insn_code CODE_FOR_neon_vsqrtsf = CODE_FOR_sqrtsf2; +constexpr insn_code CODE_FOR_neon_vsqrtdf = CODE_FOR_sqrtdf2; #define CF(N,X) CODE_FOR_neon_##N##X diff --git a/gcc/config/arm/arm_vfp_builtins.def b/gcc/config/arm/arm_vfp_builtins.def index 1fbf71e728e..8cafd72b565 100644 --- 
a/gcc/config/arm/arm_vfp_builtins.def +++ b/gcc/config/arm/arm_vfp_builtins.def @@ -40,7 +40,7 @@ VAR1 (UNOP, vrndm, hf) VAR1 (UNOP, vrndn, hf) VAR1 (UNOP, vrndp, hf) VAR1 (UNOP, vrndx, hf) -VAR1 (UNOP, vsqrt, hf) +VAR3 (UNOP, vsqrt, hf, sf, df) VAR2 (BINOP, vcvths_n, hf, si) VAR2 (BINOP, vcvthu_n, hf, si) -- 2.43.0
Re: [PATCH 2/3] Aarch64: Add __sqrt and __sqrtf intrinsics to arm_acle.h
Hi Jakub: Thank you very much for the review feedback. I have addressed the feedback in v2 of the patch [1]. See additional replies below. [1] https://gcc.gnu.org/pipermail/gcc-patches/2025-March/677754.html On Thu, Mar 13, 2025 at 10:28:52PM +0100, Jakub Jelinek wrote: > On Thu, Mar 13, 2025 at 05:23:00PM -0400, Ayan Shafqat wrote: > > This patch introduces two new inline functions, __sqrt and __sqrtf, in > > +__sqrt(double __x) > Just formatting nits, there should be a space in between the function name > and ( and only one space between double and __x. Thanks for catching this. > Also, it is unclear why it uses __extension__ (but admittedly it is used > elsewhere in the header). That is a question for the ARM maintainers. I have followed the convention in the rest of the file. If that needs changes, let me know. > > > +{ > > +return __builtin_aarch64_sqrtdf (__x); > > Just two space indentation rather than 4 spaces. Got it! Thanks for pointing this out. > > > +} > > + > > +__extension__ extern __inline float > > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > > +__sqrtf(float __x) > > See above > > > +{ > > +return __builtin_aarch64_sqrtsf (__x); > > Ditto > > Jakub > Thank you again for the review. Let me know if you spot anything else. Best regards, Ayan
Re: [PATCH v2 3/3] Aarch64: Add tests for __sqrt and __sqrtf intrinsic
Hello Jakub: Thank you very much for your feedback. See additional replies below. On Thu, Mar 13, 2025 at 10:31:44PM +0100, Jakub Jelinek wrote: > On Thu, Mar 13, 2025 at 05:25:26PM -0400, Ayan Shafqat wrote: > > gcc/testsuite/ChangeLog: > > > > * gcc.target/aarch64/acle/acle_sqrt.c: New test. > > > > Signed-off-by: Ayan Shafqat > > Tests should be in the same patch as the code they are testing, > not committed separately. Yes, thanks for pointing this out. I agree that tests should be in the same commit that introduces the new feature. I have merged the two changes into one in the new version. Please see the newer version of this patch: https://gcc.gnu.org/pipermail/gcc-patches/2025-March/677754.html > > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c > > @@ -0,0 +1,17 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2" } */ > > + > > +#include "arm_acle.h" > > + > > +double test_acle_sqrt (double x) > > The normal GNU formatting is > double > test_acle_sqrt (double x) > (i.e. function name at the start of line, so one can grep for it). > > > +{ > > + return __sqrt (x); > > +} > > + > > +float test_acle_sqrtf (float x) > > Ditto. > > Jakub > Thanks, I have addressed it in the v2 of this patch. Let me know if there are any other issues. Best, Ayan
[PATCH v2 2/2] Aarch64: Add __sqrt and __sqrtf intrinsics and corresponding tests
This patch introduces two new inline functions, __sqrt and __sqrtf, in arm_acle.h for Aarch64 targets. These functions wrap the new builtins __builtin_aarch64_sqrtdf and __builtin_aarch64_sqrtsf, respectively, providing direct access to hardware instructions without relying on the standard math library or optimization levels. This patch also introduces acle_sqrt.c in the AArch64 testsuite, verifying that the new __sqrt and __sqrtf intrinsics emit the expected fsqrt instructions for double and float arguments. Coverage for new intrinsics ensures that __sqrt and __sqrtf are correctly expanded to hardware instructions and do not fall back to library calls, regardless of optimization levels. gcc/ChangeLog: * config/aarch64/arm_acle.h (__sqrt, __sqrtf): New functions. gcc/testsuite/ChangeLog: * gcc.target/aarch64/acle/acle_sqrt.c: New test. Signed-off-by: Ayan Shafqat --- gcc/config/aarch64/arm_acle.h | 14 ++ .../gcc.target/aarch64/acle/acle_sqrt.c | 19 +++ 2 files changed, 33 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index 7976c117daf..2900e934239 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -118,6 +118,20 @@ __revl (unsigned long __value) return __rev (__value); } +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__sqrt (double __x) +{ + return __builtin_aarch64_sqrtdf (__x); +} + +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__sqrtf (float __x) +{ + return __builtin_aarch64_sqrtsf (__x); +} + #pragma GCC push_options #pragma GCC target ("+nothing+jscvt") __extension__ extern __inline int32_t diff --git a/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c new file mode 100644 index 000..482351fa7e6 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/acle/acle_sqrt.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_acle.h" + +double +test_acle_sqrt (double x) +{ + return __sqrt (x); +} + +float +test_acle_sqrtf (float x) +{ + return __sqrtf (x); +} + +/* { dg-final { scan-assembler-times "fsqrt\td\[0-9\]" 1 } } */ +/* { dg-final { scan-assembler-times "fsqrt\ts\[0-9\]" 1 } } */ -- 2.43.0
[PATCH v2 1/2] Aarch64: Use BUILTIN_VHSDF_HSDF for vector and scalar sqrt builtins
This patch changes the `sqrt` builtin definition from `BUILTIN_VHSDF_DF` to `BUILTIN_VHSDF_HSDF` in `aarch64-simd-builtins.def`, ensuring the builtin covers half, single, and double precision variants. The redundant `VAR1 (UNOP, sqrt, 2, FP, hf)` lines are removed, as they are no longer needed now that `BUILTIN_VHSDF_HSDF` handles those cases. gcc/ChangeLog: * config/aarch64/aarch64-simd-builtins.def: Change BUILTIN_VHSDF_DF to BUILTIN_VHSDF_HSDF Signed-off-by: Ayan Shafqat Signed-off-by: Andrew Pinski --- gcc/config/aarch64/aarch64-simd-builtins.def | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 6cc45b18a72..685bf0dc408 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -57,7 +57,7 @@ VAR1 (BINOPP, pmull, 0, DEFAULT, v8qi) VAR1 (BINOPP, pmull_hi, 0, DEFAULT, v16qi) BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0, FP) - BUILTIN_VHSDF_DF (UNOP, sqrt, 2, FP) + BUILTIN_VHSDF_HSDF (UNOP, sqrt, 2, FP) BUILTIN_VDQ_I (BINOP, addp, 0, DEFAULT) BUILTIN_VDQ_I (BINOPU, addp, 0, DEFAULT) BUILTIN_VDQ_BHSI (UNOP, clrsb, 2, DEFAULT) @@ -848,9 +848,6 @@ BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0, FP) BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, FP) - /* Implemented by sqrt2. */ - VAR1 (UNOP, sqrt, 2, FP, hf) - /* Implemented by hf2. */ VAR1 (UNOP, floatdi, 2, FP, hf) VAR1 (UNOP, floatsi, 2, FP, hf) -- 2.43.0
[PATCH v3 2/2] arm: Add FMA and FMAF intrinsics with corresponding tests
This patch introduces inline definitions for the __fma and __fmaf functions in arm_acle.h for arm targets. These definitions rely on __builtin_fma and __builtin_fmaf to ensure proper inlining and to meet the ACLE requirements [1]. The patch has been tested locally using a crosstool-NG sysroot for arm-cortexa9_neon-linux-gnueabihf, confirming that the generated code uses the expected fused multiply-accumulate instructions: vfma.f32 for single precision and vfma.f64 for double precision. [1] https://arm-software.github.io/acle/main/acle.html#fused-multiply-accumulate-fma gcc/ChangeLog: * config/arm/arm_acle.h (__fma, __fmaf): New functions. gcc/testsuite/ChangeLog: * gcc.target/arm/acle/acle_fma.c: New test. --- gcc/config/arm/arm_acle.h| 19 +++ gcc/testsuite/gcc.target/arm/acle/acle_fma.c | 19 +++ 2 files changed, 38 insertions(+) create mode 100644 gcc/testsuite/gcc.target/arm/acle/acle_fma.c diff --git a/gcc/config/arm/arm_acle.h b/gcc/config/arm/arm_acle.h index c6c03fdce27..02cb67d1516 100644 --- a/gcc/config/arm/arm_acle.h +++ b/gcc/config/arm/arm_acle.h @@ -829,6 +829,25 @@ __crc32cd (uint32_t __a, uint64_t __b) #endif /* __ARM_FEATURE_CRC32 */ #pragma GCC pop_options +#pragma GCC push_options +#pragma GCC target("fpu=neon-vfpv4") + +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fma (double __x, double __y, double __z) +{ + return __builtin_fma (__x, __y, __z); +} + +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fmaf (float __x, float __y, float __z) +{ + return __builtin_fmaf (__x, __y, __z); +} + +#pragma GCC pop_options + #ifdef __cplusplus } #endif diff --git a/gcc/testsuite/gcc.target/arm/acle/acle_fma.c b/gcc/testsuite/gcc.target/arm/acle/acle_fma.c new file mode 100644 index 000..cba4f48929d --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/acle/acle_fma.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=armv7-a 
-mfpu=neon-vfpv4 -mfloat-abi=hard" } */ + +#include "arm_acle.h" + +double +test_acle_fma (double x, double y, double z) +{ + return __fma (x, y, z); +} + +float +test_acle_fmaf (float x, float y, float z) +{ + return __fmaf (x, y, z); +} + +/* { dg-final { scan-assembler-times "vfma.f64\td\[0-9\]," 1 } } */ +/* { dg-final { scan-assembler-times "vfma.f32\ts\[0-9\]" 1 } } */ -- 2.43.0
Re: [PATCH 1/2] aarch64: Add FMA and FMAF intrinsics and tests
On Thu, Mar 13, 2025 at 08:31:24AM +0000, Kyrylo Tkachov wrote: > > I forgot during the review, but a patch needs a ChangeLog entry. > Could you provide one please to add to the commit log? > I have submitted the patch again to the mailing list: https://gcc.gnu.org/pipermail/gcc-patches/2025-March/677588.html Let me know if you need anything else. Thanks in advance, Ayan
[PATCH v3 1/2] Aarch64: Add FMA and FMAF intrinsic and corresponding tests
This patch introduces inline definitions for the __fma and __fmaf functions in arm_acle.h for Aarch64 targets. These definitions rely on __builtin_fma and __builtin_fmaf to ensure proper inlining and to meet the ACLE requirements [1]. The patch has been tested locally using a crosstool-NG sysroot for Aarch64, confirming that the generated code uses the expected fused multiply-accumulate instructions (fmadd). [1] https://arm-software.github.io/acle/main/acle.html#fused-multiply-accumulate-fma gcc/ChangeLog: * config/aarch64/arm_acle.h (__fma, __fmaf): New functions. gcc/testsuite/ChangeLog: * gcc.target/aarch64/acle/acle_fma.c: New test. --- gcc/config/aarch64/arm_acle.h | 14 ++ .../gcc.target/aarch64/acle/acle_fma.c| 19 +++ 2 files changed, 33 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index 7976c117daf..d9e2401ea9f 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -129,6 +129,20 @@ __jcvt (double __a) #pragma GCC pop_options +__extension__ extern __inline double +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fma (double __x, double __y, double __z) +{ + return __builtin_fma (__x, __y, __z); +} + +__extension__ extern __inline float +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +__fmaf (float __x, float __y, float __z) +{ + return __builtin_fmaf (__x, __y, __z); +} + #pragma GCC push_options #pragma GCC target ("+nothing+frintts") __extension__ extern __inline float diff --git a/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c b/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c new file mode 100644 index 000..d7986caba31 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/acle_fma.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "arm_acle.h" + +double +test_acle_fma (double x, double y, double z) +{ + return __fma (x, y, z); +} + 
+float +test_acle_fmaf (float x, float y, float z) +{ + return __fmaf (x, y, z); +} + +/* { dg-final { scan-assembler-times "fmadd\td\[0-9\]" 1 } } */ +/* { dg-final { scan-assembler-times "fmadd\ts\[0-9\]" 1 } } */ -- 2.43.0