https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/154558
Now that #152455 is done, we can make all the vector fma intrinsics that wrap __builtin_elementwise_fma to be constexpr Fixes #154555 >From ab819f4f6e556bee2fbf5e8fd00b2135748c4c1f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-...@redking.me.uk> Date: Wed, 20 Aug 2025 16:27:25 +0100 Subject: [PATCH] [Headers][X86] Allow FMA3/FMA4 vector intrinsics to be used in constexpr Now that #152455 is done, we can make all the vector fma intrinsics that wrap __builtin_elementwise_fma to be constexpr --- clang/lib/Headers/fma4intrin.h | 42 ++++++++++++++++---------- clang/lib/Headers/fmaintrin.h | 42 ++++++++++++++++---------- clang/test/CodeGen/X86/fma-builtins.c | 17 +++++++++++ clang/test/CodeGen/X86/fma4-builtins.c | 17 +++++++++++ 4 files changed, 86 insertions(+), 32 deletions(-) diff --git a/clang/lib/Headers/fma4intrin.h b/clang/lib/Headers/fma4intrin.h index 69977fb2ccee3..05261f6389274 100644 --- a/clang/lib/Headers/fma4intrin.h +++ b/clang/lib/Headers/fma4intrin.h @@ -20,14 +20,22 @@ #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128))) #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256))) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B, @@ -46,14 +54,14 @@ _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B, @@ -72,14 +80,14 @@ _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, @@ -98,14 +106,14 @@ _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, @@ -148,56 +156,56 @@ _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C) return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B, (__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B, -(__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, (__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, @@ -230,5 +238,7 @@ _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif /* __FMA4INTRIN_H */ diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h index 24584a928da4d..d8ea489022b8f 100644 --- a/clang/lib/Headers/fmaintrin.h +++ b/clang/lib/Headers/fmaintrin.h @@ -18,6 +18,14 @@ #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128))) #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + /// Computes a multiply-add of 128-bit vectors of [4 x float]. /// For each element, computes <c> (__A * __B) + __C </c>. /// @@ -32,7 +40,7 @@ /// \param __C /// A 128-bit vector of [4 x float] containing the addend. /// \returns A 128-bit vector of [4 x float] containing the result. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, @@ -53,7 +61,7 @@ _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) /// \param __C /// A 128-bit vector of [2 x double] containing the addend. /// \returns A 128-bit [2 x double] vector containing the result. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B, @@ -132,7 +140,7 @@ _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) /// \param __C /// A 128-bit vector of [4 x float] containing the subtrahend. /// \returns A 128-bit vector of [4 x float] containing the result. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, @@ -153,7 +161,7 @@ _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) /// \param __C /// A 128-bit vector of [2 x double] containing the addend. /// \returns A 128-bit vector of [2 x double] containing the result. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B, @@ -232,7 +240,7 @@ _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) /// \param __C /// A 128-bit vector of [4 x float] containing the addend. /// \returns A 128-bit [4 x float] vector containing the result. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, @@ -253,7 +261,7 @@ _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) /// \param __C /// A 128-bit vector of [2 x double] containing the addend. /// \returns A 128-bit vector of [2 x double] containing the result. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, @@ -332,7 +340,7 @@ _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) /// \param __C /// A 128-bit vector of [4 x float] containing the subtrahend. /// \returns A 128-bit vector of [4 x float] containing the result. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, @@ -353,7 +361,7 @@ _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) /// \param __C /// A 128-bit vector of [2 x double] containing the subtrahend. /// \returns A 128-bit vector of [2 x double] containing the result. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, @@ -536,7 +544,7 @@ _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) /// \param __C /// A 256-bit vector of [8 x float] containing the addend. /// \returns A 256-bit vector of [8 x float] containing the result. -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, @@ -557,7 +565,7 @@ _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) /// \param __C /// A 256-bit vector of [4 x double] containing the addend. /// \returns A 256-bit vector of [4 x double] containing the result. -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B, @@ -578,7 +586,7 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) /// \param __C /// A 256-bit vector of [8 x float] containing the subtrahend. /// \returns A 256-bit vector of [8 x float] containing the result. -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, @@ -599,7 +607,7 @@ _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) /// \param __C /// A 256-bit vector of [4 x double] containing the subtrahend. /// \returns A 256-bit vector of [4 x double] containing the result. -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B, @@ -620,7 +628,7 @@ _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) /// \param __C /// A 256-bit vector of [8 x float] containing the addend. /// \returns A 256-bit vector of [8 x float] containing the result. -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, @@ -641,7 +649,7 @@ _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) /// \param __C /// A 256-bit vector of [4 x double] containing the addend. /// \returns A 256-bit vector of [4 x double] containing the result. -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, @@ -662,7 +670,7 @@ _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) /// \param __C /// A 256-bit vector of [8 x float] containing the subtrahend. /// \returns A 256-bit vector of [8 x float] containing the result. -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, @@ -683,7 +691,7 @@ _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) /// \param __C /// A 256-bit vector of [4 x double] containing the subtrahend. /// \returns A 256-bit vector of [4 x double] containing the result. -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, @@ -808,5 +816,7 @@ _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif /* __FMAINTRIN_H */ diff --git a/clang/test/CodeGen/X86/fma-builtins.c b/clang/test/CodeGen/X86/fma-builtins.c index aa17dcc62fbc0..8e9822ec6ad2f 100644 --- a/clang/test/CodeGen/X86/fma-builtins.c +++ b/clang/test/CodeGen/X86/fma-builtins.c @@ -5,18 +5,21 @@ #include <immintrin.h> +#include "builtin_test_helpers.h" __m128 test_mm_fmadd_ps(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fmadd_ps // CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) return _mm_fmadd_ps(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_fmadd_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), -0.0f, 0.0f, -2.0f, -3.0f)); __m128d test_mm_fmadd_pd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_fmadd_pd // CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) return _mm_fmadd_pd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_fmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), -0.0, -3.0)); __m128 test_mm_fmadd_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fmadd_ss @@ -44,6 +47,7 @@ __m128 test_mm_fmsub_ps(__m128 a, __m128 b, __m128 c) { // CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) return _mm_fmsub_ps(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_fmsub_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), 0.0f, 8.0f, -6.0f, -5.0f)); __m128d test_mm_fmsub_pd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_fmsub_pd @@ -51,6 +55,7 @@ __m128d test_mm_fmsub_pd(__m128d a, __m128d b, __m128d c) { // CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) return _mm_fmsub_pd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_fmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), 0.0, -5.0)); __m128 test_mm_fmsub_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fmsub_ss @@ -80,6 +85,7 @@ __m128 test_mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { // CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) return _mm_fnmadd_ps(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_fnmadd_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), 0.0f, -8.0f, 6.0f, 5.0f)); __m128d test_mm_fnmadd_pd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_fnmadd_pd @@ -87,6 +93,7 @@ __m128d test_mm_fnmadd_pd(__m128d a, __m128d b, __m128d c) { // CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) return _mm_fnmadd_pd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_fnmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), 0.0, 5.0)); __m128 test_mm_fnmadd_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fnmadd_ss @@ -117,6 +124,7 @@ __m128 test_mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { // CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) return _mm_fnmsub_ps(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_fnmsub_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), 0.0f, 0.0f, 2.0f, 3.0f)); __m128d test_mm_fnmsub_pd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_fnmsub_pd @@ -125,6 +133,7 @@ __m128d test_mm_fnmsub_pd(__m128d a, __m128d b, __m128d c) { // CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) return _mm_fnmsub_pd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_fnmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), 0.0, 3.0)); __m128 test_mm_fnmsub_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fnmsub_ss @@ -183,12 +192,14 @@ __m256 test_mm256_fmadd_ps(__m256 a, __m256 b, __m256 c) { // CHECK: call {{.*}}<8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_fmadd_ps(a, b, c); } +TEST_CONSTEXPR(match_m256(_mm256_fmadd_ps((__m256){ 0.0f, 1.0f, -2.0f, -4.0f, -8.0f, -16.0f, 32.0f, 64.0f }, (__m256){ -0.0f, 64.0f, 32.0f, 16.0f, -8.0f, -4.0f, -2.0f, -1.0f }, (__m256){ -0.0f, -4.0f, 2.0f, 1.0f, -8.0f, -16.0f, 32.0f, 64.0f }), -0.0f, 60.0f, -62.0f, -63.0f, +56.0f, +48.0f, -32.0f, 0.0f)); __m256d test_mm256_fmadd_pd(__m256d a, __m256d b, __m256d c) { // CHECK-LABEL: test_mm256_fmadd_pd // CHECK: call {{.*}}<4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_fmadd_pd(a, b, c); } +TEST_CONSTEXPR(match_m256d(_mm256_fmadd_pd((__m256d){ 0.0, 1.0, -2.0, -4.0 }, (__m256d){ -0.0, 4.0, 2.0, 1.0 }, (__m256d){ -0.0, -4.0, 2.0, 1.0 }), -0.0, 0.0, -2.0, -3.0)); __m256 test_mm256_fmsub_ps(__m256 a, __m256 b, __m256 c) { // CHECK-LABEL: test_mm256_fmsub_ps @@ -196,6 +207,7 @@ __m256 test_mm256_fmsub_ps(__m256 a, __m256 b, __m256 c) { // CHECK: call {{.*}}<8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_fmsub_ps(a, b, c); } +TEST_CONSTEXPR(match_m256(_mm256_fmsub_ps((__m256){ 0.0f, 1.0f, -2.0f, -4.0f, -8.0f, -16.0f, 32.0f, 64.0f }, (__m256){ -0.0f, 64.0f, 32.0f, 16.0f, -8.0f, -4.0f, -2.0f, -1.0f }, (__m256){ -0.0f, -4.0f, 2.0f, 1.0f, -8.0f, -16.0f, 32.0f, 64.0f }), 0.0f, 68.0f, -66.0f, -65.0f, 72.0f, 80.0f, -96.0f, -128.0f)); __m256d test_mm256_fmsub_pd(__m256d a, __m256d b, __m256d c) { // CHECK-LABEL: test_mm256_fmsub_pd @@ -203,6 +215,7 @@ __m256d test_mm256_fmsub_pd(__m256d a, __m256d b, __m256d c) { // CHECK: call {{.*}}<4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_fmsub_pd(a, b, c); } +TEST_CONSTEXPR(match_m256d(_mm256_fmsub_pd((__m256d){ 0.0, 1.0, -2.0, -4.0 }, (__m256d){ -0.0, 4.0, 2.0, 1.0 }, (__m256d){ -0.0, -4.0, 2.0, 1.0 }), 0.0, 8.0, -6.0, -5.0)); __m256 test_mm256_fnmadd_ps(__m256 a, __m256 b, __m256 c) { // CHECK-LABEL: test_mm256_fnmadd_ps @@ -210,6 +223,7 @@ __m256 test_mm256_fnmadd_ps(__m256 a, __m256 b, __m256 c) { // CHECK: call {{.*}}<8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_fnmadd_ps(a, b, c); } +TEST_CONSTEXPR(match_m256(_mm256_fnmadd_ps((__m256){ 0.0f, 1.0f, -2.0f, -4.0f, -8.0f, -16.0f, 32.0f, 64.0f }, (__m256){ -0.0f, 64.0f, 32.0f, 16.0f, -8.0f, -4.0f, -2.0f, -1.0f }, (__m256){ -0.0f, -4.0f, 2.0f, 1.0f, -8.0f, -16.0f, 32.0f, 64.0f }), 0.0f, -68.0f, 66.0f, 65.0f, -72.0f, -80.0f, 96.0f, 128.0f)); __m256d test_mm256_fnmadd_pd(__m256d a, __m256d b, __m256d c) { // CHECK-LABEL: test_mm256_fnmadd_pd @@ -217,6 +231,7 @@ __m256d test_mm256_fnmadd_pd(__m256d a, __m256d b, __m256d c) { // CHECK: call {{.*}}<4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_fnmadd_pd(a, b, c); } +TEST_CONSTEXPR(match_m256d(_mm256_fnmadd_pd((__m256d){ 0.0, 1.0, -2.0, -4.0 }, (__m256d){ -0.0, 4.0, 2.0, 1.0 }, (__m256d){ -0.0, -4.0, 2.0, 1.0 }), 0.0, -8.0, 6.0, 5.0)); __m256 test_mm256_fnmsub_ps(__m256 a, __m256 b, __m256 c) { // CHECK-LABEL: test_mm256_fnmsub_ps @@ -225,6 +240,7 @@ __m256 test_mm256_fnmsub_ps(__m256 a, __m256 b, __m256 c) { // CHECK: call {{.*}}<8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_fnmsub_ps(a, b, c); } +TEST_CONSTEXPR(match_m256(_mm256_fnmsub_ps((__m256){ 0.0f, 1.0f, -2.0f, -4.0f, -8.0f, -16.0f, 32.0f, 64.0f }, (__m256){ -0.0f, 64.0f, 32.0f, 16.0f, -8.0f, -4.0f, -2.0f, -1.0f }, (__m256){ -0.0f, -4.0f, 2.0f, 1.0f, -8.0f, -16.0f, 32.0f, 64.0f }), 0.0f, -60.0f, 62.0f, 63.0f, -56.0f, -48.0f, 32.0f, 0.0f)); __m256d test_mm256_fnmsub_pd(__m256d a, __m256d b, __m256d c) { // CHECK-LABEL: test_mm256_fnmsub_pd @@ -233,6 +249,7 @@ __m256d test_mm256_fnmsub_pd(__m256d a, __m256d b, __m256d c) { // CHECK: call {{.*}}<4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_fnmsub_pd(a, b, c); } +TEST_CONSTEXPR(match_m256d(_mm256_fnmsub_pd((__m256d){ 0.0, 1.0, -2.0, -4.0 }, (__m256d){ -0.0, 4.0, 2.0, 1.0 }, (__m256d){ -0.0, -4.0, 2.0, 1.0 }), 0.0, 0.0, 2.0, 3.0)); __m256 test_mm256_fmaddsub_ps(__m256 a, __m256 b, __m256 c) { // CHECK-LABEL: test_mm256_fmaddsub_ps diff --git a/clang/test/CodeGen/X86/fma4-builtins.c b/clang/test/CodeGen/X86/fma4-builtins.c index ccdba8fea87b5..dcfd48a220e11 100644 --- a/clang/test/CodeGen/X86/fma4-builtins.c +++ b/clang/test/CodeGen/X86/fma4-builtins.c @@ -5,18 +5,21 @@ #include <x86intrin.h> +#include "builtin_test_helpers.h" __m128 test_mm_macc_ps(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_macc_ps // CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) return _mm_macc_ps(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_macc_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), -0.0f, 0.0f, -2.0f, -3.0f)); __m128d test_mm_macc_pd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_macc_pd // CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) return _mm_macc_pd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_macc_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), -0.0, -3.0)); __m128 test_mm_macc_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_macc_ss @@ -44,6 +47,7 @@ __m128 test_mm_msub_ps(__m128 a, __m128 b, __m128 c) { // CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) return _mm_msub_ps(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_msub_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), 0.0f, 8.0f, -6.0f, -5.0f)); __m128d test_mm_msub_pd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_msub_pd @@ -51,6 +55,7 @@ __m128d test_mm_msub_pd(__m128d a, __m128d b, __m128d c) { // CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) return _mm_msub_pd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_msub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), 0.0, -5.0)); __m128 test_mm_msub_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_msub_ss @@ -80,6 +85,7 @@ __m128 test_mm_nmacc_ps(__m128 a, __m128 b, __m128 c) { // CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) return _mm_nmacc_ps(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_nmacc_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), 0.0f, -8.0f, 6.0f, 5.0f)); __m128d test_mm_nmacc_pd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_nmacc_pd @@ -87,6 +93,7 @@ __m128d test_mm_nmacc_pd(__m128d a, __m128d b, __m128d c) { // CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) return _mm_nmacc_pd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_nmacc_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), 0.0, 5.0)); __m128 test_mm_nmacc_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_nmacc_ss @@ -117,6 +124,7 @@ __m128 test_mm_nmsub_ps(__m128 a, __m128 b, __m128 c) { // CHECK: call {{.*}}<4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) return _mm_nmsub_ps(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_nmsub_ps((__m128){ 0.0f, 1.0f, -2.0f, -4.0f }, (__m128){ -0.0f, 4.0f, 2.0f, 1.0f }, (__m128){ -0.0f, -4.0f, 2.0f, 1.0f }), 0.0f, 0.0f, 2.0f, 3.0f)); __m128d test_mm_nmsub_pd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_nmsub_pd @@ -125,6 +133,7 @@ __m128d test_mm_nmsub_pd(__m128d a, __m128d b, __m128d c) { // CHECK: call {{.*}}<2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) return _mm_nmsub_pd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_nmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, 1.0 }, (__m128d){ -0.0, 1.0 }), 0.0, 3.0)); __m128 test_mm_nmsub_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_nmsub_ss @@ -183,12 +192,14 @@ __m256 test_mm256_macc_ps(__m256 a, __m256 b, __m256 c) { // CHECK: call {{.*}}<8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_macc_ps(a, b, c); } +TEST_CONSTEXPR(match_m256(_mm256_macc_ps((__m256){ 0.0f, 1.0f, -2.0f, -4.0f, -8.0f, -16.0f, 32.0f, 64.0f }, (__m256){ -0.0f, 64.0f, 32.0f, 16.0f, -8.0f, -4.0f, -2.0f, -1.0f }, (__m256){ -0.0f, -4.0f, 2.0f, 1.0f, -8.0f, -16.0f, 32.0f, 64.0f }), -0.0f, 60.0f, -62.0f, -63.0f, +56.0f, +48.0f, -32.0f, 0.0f)); __m256d test_mm256_macc_pd(__m256d a, __m256d b, __m256d c) { // CHECK-LABEL: test_mm256_macc_pd // CHECK: call {{.*}}<4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_macc_pd(a, b, c); } +TEST_CONSTEXPR(match_m256d(_mm256_macc_pd((__m256d){ 0.0, 1.0, -2.0, -4.0 }, (__m256d){ -0.0, 4.0, 2.0, 1.0 }, (__m256d){ -0.0, -4.0, 2.0, 1.0 }), -0.0, 0.0, -2.0, -3.0)); __m256 test_mm256_msub_ps(__m256 a, __m256 b, __m256 c) { // CHECK-LABEL: test_mm256_msub_ps @@ -196,6 +207,7 @@ __m256 test_mm256_msub_ps(__m256 a, __m256 b, __m256 c) { // CHECK: call {{.*}}<8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_msub_ps(a, b, c); } +TEST_CONSTEXPR(match_m256(_mm256_msub_ps((__m256){ 0.0f, 1.0f, -2.0f, -4.0f, -8.0f, -16.0f, 32.0f, 64.0f }, (__m256){ -0.0f, 64.0f, 32.0f, 16.0f, -8.0f, -4.0f, -2.0f, -1.0f }, (__m256){ -0.0f, -4.0f, 2.0f, 1.0f, -8.0f, -16.0f, 32.0f, 64.0f }), 0.0f, 68.0f, -66.0f, -65.0f, 72.0f, 80.0f, -96.0f, -128.0f)); __m256d test_mm256_msub_pd(__m256d a, __m256d b, __m256d c) { // CHECK-LABEL: test_mm256_msub_pd @@ -203,6 +215,7 @@ __m256d test_mm256_msub_pd(__m256d a, __m256d b, __m256d c) { // CHECK: call {{.*}}<4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_msub_pd(a, b, c); } +TEST_CONSTEXPR(match_m256d(_mm256_msub_pd((__m256d){ 0.0, 1.0, -2.0, -4.0 }, (__m256d){ -0.0, 4.0, 2.0, 1.0 }, (__m256d){ -0.0, -4.0, 2.0, 1.0 }), 0.0, 8.0, -6.0, -5.0)); __m256 test_mm256_nmacc_ps(__m256 a, __m256 b, __m256 c) { // CHECK-LABEL: test_mm256_nmacc_ps @@ -210,6 +223,7 @@ __m256 test_mm256_nmacc_ps(__m256 a, __m256 b, __m256 c) { // CHECK: call {{.*}}<8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_nmacc_ps(a, b, c); } +TEST_CONSTEXPR(match_m256(_mm256_nmacc_ps((__m256){ 0.0f, 1.0f, -2.0f, -4.0f, -8.0f, -16.0f, 32.0f, 64.0f }, (__m256){ -0.0f, 64.0f, 32.0f, 16.0f, -8.0f, -4.0f, -2.0f, -1.0f }, (__m256){ -0.0f, -4.0f, 2.0f, 1.0f, -8.0f, -16.0f, 32.0f, 64.0f }), 0.0f, -68.0f, 66.0f, 65.0f, -72.0f, -80.0f, 96.0f, 128.0f)); __m256d test_mm256_nmacc_pd(__m256d a, __m256d b, __m256d c) { // CHECK-LABEL: test_mm256_nmacc_pd @@ -217,6 +231,7 @@ __m256d test_mm256_nmacc_pd(__m256d a, __m256d b, __m256d c) { // CHECK: call {{.*}}<4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_nmacc_pd(a, b, c); } +TEST_CONSTEXPR(match_m256d(_mm256_nmacc_pd((__m256d){ 0.0, 1.0, -2.0, -4.0 }, (__m256d){ -0.0, 4.0, 2.0, 1.0 }, (__m256d){ -0.0, -4.0, 2.0, 1.0 }), 0.0, -8.0, 6.0, 5.0)); __m256 test_mm256_nmsub_ps(__m256 a, __m256 b, __m256 c) { // CHECK-LABEL: test_mm256_nmsub_ps @@ -225,6 +240,7 @@ __m256 test_mm256_nmsub_ps(__m256 a, __m256 b, __m256 c) { // CHECK: call {{.*}}<8 x float> @llvm.fma.v8f32(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_nmsub_ps(a, b, c); } +TEST_CONSTEXPR(match_m256(_mm256_nmsub_ps((__m256){ 0.0f, 1.0f, -2.0f, -4.0f, -8.0f, -16.0f, 32.0f, 64.0f }, (__m256){ -0.0f, 64.0f, 32.0f, 16.0f, -8.0f, -4.0f, -2.0f, -1.0f }, (__m256){ -0.0f, -4.0f, 2.0f, 1.0f, -8.0f, -16.0f, 32.0f, 64.0f }), 0.0f, -60.0f, 62.0f, 63.0f, -56.0f, -48.0f, 32.0f, 0.0f)); __m256d test_mm256_nmsub_pd(__m256d a, __m256d b, __m256d c) { // CHECK-LABEL: test_mm256_nmsub_pd @@ -233,6 +249,7 @@ __m256d test_mm256_nmsub_pd(__m256d a, __m256d b, __m256d c) { // CHECK: call {{.*}}<4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_nmsub_pd(a, b, c); } +TEST_CONSTEXPR(match_m256d(_mm256_nmsub_pd((__m256d){ 0.0, 1.0, -2.0, -4.0 }, (__m256d){ -0.0, 4.0, 2.0, 1.0 }, (__m256d){ -0.0, -4.0, 2.0, 1.0 }), 0.0, 0.0, 2.0, 3.0)); __m256 test_mm256_maddsub_ps(__m256 a, __m256 b, __m256 c) { // CHECK-LABEL: test_mm256_maddsub_ps _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits