Author: Simon Pilgrim
Date: 2020-12-13T15:37:35Z
New Revision: 4855a1004d4d87b6c21c510c1724e74a8d37d91a
URL: https://github.com/llvm/llvm-project/commit/4855a1004d4d87b6c21c510c1724e74a8d37d91a
DIFF: https://github.com/llvm/llvm-project/commit/4855a1004d4d87b6c21c510c1724e74a8d37d91a.diff

LOG: [X86] Convert fadd/fmul _mm_reduce_* intrinsics to emit llvm.reduction intrinsics (PR47506)

Followup to D87604, having confirmed on PR47506 that we can use the llvm codegen expansion for fadd/fmul as well.

Differential Revision: https://reviews.llvm.org/D92940

Added:

Modified:
    clang/include/clang/Basic/BuiltinsX86.def
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/Headers/avx512fintrin.h
    clang/test/CodeGen/X86/avx512-reduceIntrin.c

Removed:

################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 0f5594f1a4e6..16fb7dd7b0e6 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1876,6 +1876,10 @@ TARGET_BUILTIN(__builtin_ia32_reduce_add_d512, "iV16i", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_add_q512, "OiV8Oi", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_and_d512, "iV16i", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_and_q512, "OiV8Oi", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fadd_pd512, "ddV8d", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ps512, "ffV16f", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fmul_pd512, "ddV8d", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ps512, "ffV16f", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_mul_d512, "iV16i", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_mul_q512, "OiV8Oi", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_or_d512, "iV16i", "ncV:512:", "avx512f")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 316a60c31fd4..74f6c9fee2c8 100644
---
a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -13631,6 +13631,18 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::vector_reduce_and, Ops[0]->getType()); return Builder.CreateCall(F, {Ops[0]}); } + case X86::BI__builtin_ia32_reduce_fadd_pd512: + case X86::BI__builtin_ia32_reduce_fadd_ps512: { + Function *F = + CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType()); + return Builder.CreateCall(F, {Ops[0], Ops[1]}); + } + case X86::BI__builtin_ia32_reduce_fmul_pd512: + case X86::BI__builtin_ia32_reduce_fmul_ps512: { + Function *F = + CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType()); + return Builder.CreateCall(F, {Ops[0], Ops[1]}); + } case X86::BI__builtin_ia32_reduce_mul_d512: case X86::BI__builtin_ia32_reduce_mul_q512: { Function *F = diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 2df399d978e3..2ee4350b14d4 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -9345,37 +9345,25 @@ _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { return __builtin_ia32_reduce_or_q512(__W); } -#define _mm512_mask_reduce_operator(op) \ - __m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \ - __m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \ - __m256d __t3 = __t1 op __t2; \ - __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \ - __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \ - __m128d __t6 = __t4 op __t5; \ - __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ - __m128d __t8 = __t6 op __t7; \ - return __t8[0] - static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) { - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_fadd_pd512(0.0, __W); } static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) { - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_fmul_pd512(1.0, __W); } static __inline__ double __DEFAULT_FN_ATTRS512 
_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) { __W = _mm512_maskz_mov_pd(__M, __W); - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_fadd_pd512(0.0, __W); } static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W); - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_fmul_pd512(1.0, __W); } -#undef _mm512_mask_reduce_operator static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi32(__m512i __W) { @@ -9421,41 +9409,27 @@ _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { return __builtin_ia32_reduce_or_d512((__v16si)__W); } -#define _mm512_mask_reduce_operator(op) \ - __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \ - __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \ - __m256 __t3 = __t1 op __t2; \ - __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \ - __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \ - __m128 __t6 = __t4 op __t5; \ - __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ - __m128 __t8 = __t6 op __t7; \ - __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ - __m128 __t10 = __t8 op __t9; \ - return __t10[0] - static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_reduce_add_ps(__m512 __W) { - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_fadd_ps512(0.0f, __W); } static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_ps(__m512 __W) { - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); } static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) { __W = _mm512_maskz_mov_ps(__M, __W); - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_fadd_ps512(0.0f, __W); } static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W); - 
_mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); } -#undef _mm512_mask_reduce_operator static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epi64(__m512i __V) { diff --git a/clang/test/CodeGen/X86/avx512-reduceIntrin.c b/clang/test/CodeGen/X86/avx512-reduceIntrin.c index accc3c64f2b9..d8a1130f3cef 100644 --- a/clang/test/CodeGen/X86/avx512-reduceIntrin.c +++ b/clang/test/CodeGen/X86/avx512-reduceIntrin.c @@ -115,67 +115,25 @@ int test_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W){ double test_mm512_reduce_add_pd(__m512d __W){ // CHECK-LABEL: @test_mm512_reduce_add_pd( -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: fadd <4 x double> %{{.*}}, %{{.*}} -// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1> -// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3> -// CHECK: fadd <2 x double> %{{.*}}, %{{.*}} -// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0> -// CHECK: fadd <2 x double> %{{.*}}, %{{.*}} -// CHECK: extractelement <2 x double> %{{.*}}, i32 0 +// CHECK: call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}}) return _mm512_reduce_add_pd(__W); } double test_mm512_reduce_mul_pd(__m512d __W){ // CHECK-LABEL: @test_mm512_reduce_mul_pd( -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: fmul <4 x double> %{{.*}}, %{{.*}} -// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1> -// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3> -// CHECK: fmul 
<2 x double> %{{.*}}, %{{.*}} -// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0> -// CHECK: fmul <2 x double> %{{.*}}, %{{.*}} -// CHECK: extractelement <2 x double> %{{.*}}, i32 0 +// CHECK: call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}}) return _mm512_reduce_mul_pd(__W); } float test_mm512_reduce_add_ps(__m512 __W){ // CHECK-LABEL: @test_mm512_reduce_add_ps( -// CHECK: bitcast <16 x float> %{{.*}} to <8 x double> -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> -// CHECK: fadd <8 x float> %{{.*}}, %{{.*}} -// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: fadd <4 x float> %{{.*}}, %{{.*}} -// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1> -// CHECK: fadd <4 x float> %{{.*}}, %{{.*}} -// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2> -// CHECK: fadd <4 x float> %{{.*}}, %{{.*}} -// CHECK: extractelement <4 x float> %{{.*}}, i32 0 +// CHECK: call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}}) return _mm512_reduce_add_ps(__W); } float test_mm512_reduce_mul_ps(__m512 __W){ // CHECK-LABEL: @test_mm512_reduce_mul_ps( -// CHECK: bitcast <16 x float> %{{.*}} to <8 x double> -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 
5, i32 6, i32 7> -// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> -// CHECK: fmul <8 x float> %{{.*}}, %{{.*}} -// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: fmul <4 x float> %{{.*}}, %{{.*}} -// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1> -// CHECK: fmul <4 x float> %{{.*}}, %{{.*}} -// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2> -// CHECK: fmul <4 x float> %{{.*}}, %{{.*}} -// CHECK: extractelement <4 x float> %{{.*}}, i32 0 +// CHECK: call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}}) return _mm512_reduce_mul_ps(__W); } @@ -183,15 +141,7 @@ double test_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W){ // CHECK-LABEL: @test_mm512_mask_reduce_add_pd( // CHECK: bitcast i8 %{{.*}} to <8 x i1> // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: fadd <4 x double> %{{.*}}, %{{.*}} -// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1> -// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3> -// CHECK: fadd <2 x double> %{{.*}}, %{{.*}} -// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0> -// CHECK: fadd <2 x double> %{{.*}}, %{{.*}} -// CHECK: extractelement <2 x double> %{{.*}}, i32 0 +// CHECK: call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}}) return _mm512_mask_reduce_add_pd(__M, __W); } @@ -199,37 +149,15 @@ double 
test_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W){ // CHECK-LABEL: @test_mm512_mask_reduce_mul_pd( // CHECK: bitcast i8 %{{.*}} to <8 x i1> // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: fmul <4 x double> %{{.*}}, %{{.*}} -// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1> -// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3> -// CHECK: fmul <2 x double> %{{.*}}, %{{.*}} -// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0> -// CHECK: fmul <2 x double> %{{.*}}, %{{.*}} -// CHECK: extractelement <2 x double> %{{.*}}, i32 0 +// CHECK: call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}}) return _mm512_mask_reduce_mul_pd(__M, __W); } float test_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W){ // CHECK-LABEL: @test_mm512_mask_reduce_add_ps( -// CHECK-NEXT: entry: // CHECK: bitcast i16 %{{.*}} to <16 x i1> // CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}} -// CHECK: bitcast <16 x float> %{{.*}} to <8 x double> -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> -// CHECK: fadd <8 x float> %{{.*}}, %{{.*}} -// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: fadd <4 x float> %{{.*}}, %{{.*}} -// CHECK: 
shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1> -// CHECK: fadd <4 x float> %{{.*}}, %{{.*}} -// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2> -// CHECK: fadd <4 x float> %{{.*}}, %{{.*}} -// CHECK: extractelement <4 x float> %{{.*}}, i32 0 +// CHECK: call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}}) return _mm512_mask_reduce_add_ps(__M, __W); } @@ -237,19 +165,6 @@ float test_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W){ // CHECK-LABEL: @test_mm512_mask_reduce_mul_ps( // CHECK: bitcast i16 %{{.*}} to <16 x i1> // CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> %{{.*}} -// CHECK: bitcast <16 x float> %{{.*}} to <8 x double> -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> -// CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: bitcast <4 x double> %{{.*}} to <8 x float> -// CHECK: fmul <8 x float> %{{.*}}, %{{.*}} -// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> -// CHECK: fmul <4 x float> %{{.*}}, %{{.*}} -// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1> -// CHECK: fmul <4 x float> %{{.*}}, %{{.*}} -// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2> -// CHECK: fmul <4 x float> %{{.*}}, %{{.*}} -// CHECK: extractelement <4 x float> %{{.*}}, i32 0 +// CHECK: call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}}) return _mm512_mask_reduce_mul_ps(__M, __W); } _______________________________________________ cfe-commits mailing list 
cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits