[PATCH] D92940: [X86] Convert fadd/fmul _mm_reduce_* intrinsics to emit llvm.reduction intrinsics (PR47506)

Simon Pilgrim via Phabricator via cfe-commits Sun, 13 Dec 2020 07:40:07 -0800

This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG4855a1004d4d: [X86] Convert fadd/fmul _mm_reduce_* intrinsics to emit llvm.reduction intrinsics (PR47506) (authored by RKSimon).


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D92940/new/

https://reviews.llvm.org/D92940

Files:
  clang/include/clang/Basic/BuiltinsX86.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Headers/avx512fintrin.h
  clang/test/CodeGen/X86/avx512-reduceIntrin.c

Index: clang/test/CodeGen/X86/avx512-reduceIntrin.c
===================================================================
--- clang/test/CodeGen/X86/avx512-reduceIntrin.c
+++ clang/test/CodeGen/X86/avx512-reduceIntrin.c
@@ -115,67 +115,25 @@
 
 double test_mm512_reduce_add_pd(__m512d __W){
 // CHECK-LABEL: @test_mm512_reduce_add_pd(
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK:    fadd <4 x double> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
-// CHECK:    fadd <2 x double> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
-// CHECK:    fadd <2 x double> %{{.*}}, %{{.*}}
-// CHECK:    extractelement <2 x double> %{{.*}}, i32 0
+// CHECK:    call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
   return _mm512_reduce_add_pd(__W); 
 }
 
 double test_mm512_reduce_mul_pd(__m512d __W){
 // CHECK-LABEL: @test_mm512_reduce_mul_pd(
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK:    fmul <4 x double> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
-// CHECK:    fmul <2 x double> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
-// CHECK:    fmul <2 x double> %{{.*}}, %{{.*}}
-// CHECK:    extractelement <2 x double> %{{.*}}, i32 0
+// CHECK:    call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
   return _mm512_reduce_mul_pd(__W); 
 }
 
 float test_mm512_reduce_add_ps(__m512 __W){
 // CHECK-LABEL: @test_mm512_reduce_add_ps(
-// CHECK:    bitcast <16 x float> %{{.*}} to <8 x double>
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK:    bitcast <4 x double> %{{.*}} to <8 x float>
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK:    bitcast <4 x double> %{{.*}} to <8 x float>
-// CHECK:    fadd <8 x float> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK:    fadd <4 x float> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK:    fadd <4 x float> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK:    fadd <4 x float> %{{.*}}, %{{.*}}
-// CHECK:    extractelement <4 x float> %{{.*}}, i32 0
+// CHECK:    call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
   return _mm512_reduce_add_ps(__W); 
 }
 
 float test_mm512_reduce_mul_ps(__m512 __W){
 // CHECK-LABEL: @test_mm512_reduce_mul_ps(
-// CHECK:    bitcast <16 x float> %{{.*}} to <8 x double>
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK:    bitcast <4 x double> %{{.*}} to <8 x float>
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK:    bitcast <4 x double> %{{.*}} to <8 x float>
-// CHECK:    fmul <8 x float> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK:    fmul <4 x float> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK:    fmul <4 x float> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK:    fmul <4 x float> %{{.*}}, %{{.*}}
-// CHECK:    extractelement <4 x float> %{{.*}}, i32 0
+// CHECK:    call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
   return _mm512_reduce_mul_ps(__W); 
 }
 
@@ -183,15 +141,7 @@
 // CHECK-LABEL: @test_mm512_mask_reduce_add_pd(
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK:    fadd <4 x double> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
-// CHECK:    fadd <2 x double> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
-// CHECK:    fadd <2 x double> %{{.*}}, %{{.*}}
-// CHECK:    extractelement <2 x double> %{{.*}}, i32 0
+// CHECK:    call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
   return _mm512_mask_reduce_add_pd(__M, __W); 
 }
 
@@ -199,37 +149,15 @@
 // CHECK-LABEL: @test_mm512_mask_reduce_mul_pd(
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK:    fmul <4 x double> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-// CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
-// CHECK:    fmul <2 x double> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
-// CHECK:    fmul <2 x double> %{{.*}}, %{{.*}}
-// CHECK:    extractelement <2 x double> %{{.*}}, i32 0
+// CHECK:    call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
   return _mm512_mask_reduce_mul_pd(__M, __W); 
 }
 
 float test_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W){
 // CHECK-LABEL: @test_mm512_mask_reduce_add_ps(
-// CHECK-NEXT:  entry:
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}}
-// CHECK:    bitcast <16 x float> %{{.*}} to <8 x double>
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK:    bitcast <4 x double> %{{.*}} to <8 x float>
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK:    bitcast <4 x double> %{{.*}} to <8 x float>
-// CHECK:    fadd <8 x float> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK:    fadd <4 x float> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK:    fadd <4 x float> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK:    fadd <4 x float> %{{.*}}, %{{.*}}
-// CHECK:    extractelement <4 x float> %{{.*}}, i32 0
+// CHECK:    call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
   return _mm512_mask_reduce_add_ps(__M, __W); 
 }
 
@@ -237,19 +165,6 @@
 // CHECK-LABEL: @test_mm512_mask_reduce_mul_ps(
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> %{{.*}}
-// CHECK:    bitcast <16 x float> %{{.*}} to <8 x double>
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK:    bitcast <4 x double> %{{.*}} to <8 x float>
-// CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK:    bitcast <4 x double> %{{.*}} to <8 x float>
-// CHECK:    fmul <8 x float> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK:    fmul <4 x float> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-// CHECK:    fmul <4 x float> %{{.*}}, %{{.*}}
-// CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK:    fmul <4 x float> %{{.*}}, %{{.*}}
-// CHECK:    extractelement <4 x float> %{{.*}}, i32 0
+// CHECK:    call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
   return _mm512_mask_reduce_mul_ps(__M, __W); 
 }
Index: clang/lib/Headers/avx512fintrin.h
===================================================================
--- clang/lib/Headers/avx512fintrin.h
+++ clang/lib/Headers/avx512fintrin.h
@@ -9345,37 +9345,25 @@
   return __builtin_ia32_reduce_or_q512(__W);
 }
 
-#define _mm512_mask_reduce_operator(op) \
-  __m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \
-  __m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \
-  __m256d __t3 = __t1 op __t2; \
-  __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
-  __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
-  __m128d __t6 = __t4 op __t5; \
-  __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
-  __m128d __t8 = __t6 op __t7; \
-  return __t8[0]
-
 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
-  _mm512_mask_reduce_operator(+);
+  return __builtin_ia32_reduce_fadd_pd512(0.0, __W);
 }
 
 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
-  _mm512_mask_reduce_operator(*);
+  return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
 }
 
 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
   __W = _mm512_maskz_mov_pd(__M, __W);
-  _mm512_mask_reduce_operator(+);
+  return __builtin_ia32_reduce_fadd_pd512(0.0, __W);
 }
 
 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
   __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
-  _mm512_mask_reduce_operator(*);
+  return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
 }
-#undef _mm512_mask_reduce_operator
 
 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_reduce_add_epi32(__m512i __W) {
@@ -9421,41 +9409,27 @@
   return __builtin_ia32_reduce_or_d512((__v16si)__W);
 }
 
-#define _mm512_mask_reduce_operator(op) \
-  __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \
-  __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \
-  __m256 __t3 = __t1 op __t2; \
-  __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
-  __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
-  __m128 __t6 = __t4 op __t5; \
-  __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
-  __m128 __t8 = __t6 op __t7; \
-  __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
-  __m128 __t10 = __t8 op __t9; \
-  return __t10[0]
-
 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_reduce_add_ps(__m512 __W) {
-  _mm512_mask_reduce_operator(+);
+  return __builtin_ia32_reduce_fadd_ps512(0.0f, __W);
 }
 
 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_reduce_mul_ps(__m512 __W) {
-  _mm512_mask_reduce_operator(*);
+  return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
 }
 
 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
   __W = _mm512_maskz_mov_ps(__M, __W);
-  _mm512_mask_reduce_operator(+);
+  return __builtin_ia32_reduce_fadd_ps512(0.0f, __W);
 }
 
 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
   __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
-  _mm512_mask_reduce_operator(*);
+  return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
 }
-#undef _mm512_mask_reduce_operator
 
 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_reduce_max_epi64(__m512i __V) {
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -13631,6 +13631,18 @@
         CGM.getIntrinsic(Intrinsic::vector_reduce_and, Ops[0]->getType());
     return Builder.CreateCall(F, {Ops[0]});
   }
+  case X86::BI__builtin_ia32_reduce_fadd_pd512:
+  case X86::BI__builtin_ia32_reduce_fadd_ps512: {
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
+    return Builder.CreateCall(F, {Ops[0], Ops[1]});
+  }
+  case X86::BI__builtin_ia32_reduce_fmul_pd512:
+  case X86::BI__builtin_ia32_reduce_fmul_ps512: {
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
+    return Builder.CreateCall(F, {Ops[0], Ops[1]});
+  }
   case X86::BI__builtin_ia32_reduce_mul_d512:
   case X86::BI__builtin_ia32_reduce_mul_q512: {
     Function *F =
Index: clang/include/clang/Basic/BuiltinsX86.def
===================================================================
--- clang/include/clang/Basic/BuiltinsX86.def
+++ clang/include/clang/Basic/BuiltinsX86.def
@@ -1876,6 +1876,10 @@
 TARGET_BUILTIN(__builtin_ia32_reduce_add_q512, "OiV8Oi", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_and_d512, "iV16i", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_and_q512, "OiV8Oi", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fadd_pd512, "ddV8d", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ps512, "ffV16f", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fmul_pd512, "ddV8d", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ps512, "ffV16f", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_mul_d512, "iV16i", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_mul_q512, "OiV8Oi", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_or_d512, "iV16i", "ncV:512:", "avx512f")

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D92940: [X86] Convert fadd/fmul _mm_reduce_* intrinsics to emit llvm.reduction intrinsics (PR47506)

Reply via email to