[PATCH] D93179: [X86] Convert fmin/fmax _mm_reduce_* intrinsics to emit llvm.reduction intrinsics (PR47506)

Pengfei Wang via Phabricator via cfe-commits Wed, 10 Feb 2021 01:22:55 -0800

pengfei updated this revision to Diff 322616.
pengfei added a comment.

Minor fixes in tests.



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93179/new/

https://reviews.llvm.org/D93179

Files:
  clang/include/clang/Basic/BuiltinsX86.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/Headers/avx512fintrin.h
  clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c

Index: clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c
===================================================================
--- clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c
+++ clang/test/CodeGen/X86/avx512-reduceMinMaxIntrin.c
@@ -14,18 +14,14 @@
   return _mm512_reduce_max_epu64(__W);
 }
 
-double test_mm512_reduce_max_pd(__m512d __W){
-  // CHECK-LABEL: @test_mm512_reduce_max_pd(
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK:    call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
-  // CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 0, i32 1>
-  // CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 2, i32 3>
-  // CHECK:    call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
-  // CHECK:    shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
-  // CHECK:    call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
-  // CHECK:    extractelement <2 x double> %{{.*}}, i32 0
-  return _mm512_reduce_max_pd(__W); 
+double test_mm512_reduce_max_pd(__m512d __W, double ExtraAddOp){
+// CHECK-LABEL: @test_mm512_reduce_max_pd(
+// CHECK-NOT: nnan
+// CHECK-NOT: nsz
+// CHECK:    call nnan nsz double @llvm.vector.reduce.fmax.v8f64(<8 x double> %{{.*}})
+// CHECK-NOT: nnan
+// CHECK-NOT: nsz
+  return _mm512_reduce_max_pd(__W) + ExtraAddOp;
 }
 
 long long test_mm512_reduce_min_epi64(__m512i __W){
@@ -40,18 +36,14 @@
   return _mm512_reduce_min_epu64(__W);
 }
 
-double test_mm512_reduce_min_pd(__m512d __W){
-  // CHECK-LABEL: @test_mm512_reduce_min_pd(
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK:    call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
-  // CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 0, i32 1>
-  // CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 2, i32 3>
-  // CHECK:    call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
-  // CHECK:    shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
-  // CHECK:    call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
-  // CHECK:    extractelement <2 x double> %{{.*}}, i32 0
-  return _mm512_reduce_min_pd(__W); 
+double test_mm512_reduce_min_pd(__m512d __W, double ExtraMulOp){
+// CHECK-LABEL: @test_mm512_reduce_min_pd(
+// CHECK-NOT: nnan
+// CHECK-NOT: nsz
+// CHECK:    call nnan nsz double @llvm.vector.reduce.fmin.v8f64(<8 x double> %{{.*}})
+// CHECK-NOT: nnan
+// CHECK-NOT: nsz
+  return _mm512_reduce_min_pd(__W) * ExtraMulOp;
 }
 
 long long test_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __W){
@@ -59,7 +51,7 @@
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
 // CHECK:    call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %{{.*}})
-  return _mm512_mask_reduce_max_epi64(__M, __W); 
+  return _mm512_mask_reduce_max_epi64(__M, __W);
 }
 
 unsigned long test_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __W){
@@ -67,23 +59,15 @@
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
 // CHECK:    call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %{{.*}})
-  return _mm512_mask_reduce_max_epu64(__M, __W); 
+  return _mm512_mask_reduce_max_epu64(__M, __W);
 }
 
 double test_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __W){
-  // CHECK-LABEL: @test_mm512_mask_reduce_max_pd(
-  // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK:    select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK:    call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
-  // CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 0, i32 1>
-  // CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 2, i32 3>
-  // CHECK:    call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
-  // CHECK:    shufflevector <2 x double> %{{.*}}, <2 x double>  %{{.*}}, <2 x i32> <i32 1, i32 0>
-  // CHECK:    call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
-  // CHECK:    extractelement <2 x double> %{{.*}}, i32 0
-  return _mm512_mask_reduce_max_pd(__M, __W); 
+// CHECK-LABEL: @test_mm512_mask_reduce_max_pd(
+// CHECK:    bitcast i8 %{{.*}} to <8 x i1>
+// CHECK:    select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+// CHECK:    call nnan nsz double @llvm.vector.reduce.fmax.v8f64(<8 x double> %{{.*}})
+  return _mm512_mask_reduce_max_pd(__M, __W);
 }
 
 long long test_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __W){
@@ -91,7 +75,7 @@
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
 // CHECK:    call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %{{.*}})
-  return _mm512_mask_reduce_min_epi64(__M, __W); 
+  return _mm512_mask_reduce_min_epi64(__M, __W);
 }
 
 unsigned long long test_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __W){
@@ -99,23 +83,15 @@
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
 // CHECK:    call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %{{.*}})
-  return _mm512_mask_reduce_min_epu64(__M, __W); 
+  return _mm512_mask_reduce_min_epu64(__M, __W);
 }
 
 double test_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __W){
-  // CHECK-LABEL: @test_mm512_mask_reduce_min_pd(
-  // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK:    select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK:    call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
-  // CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 0, i32 1>
-  // CHECK:    shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 2, i32 3>
-  // CHECK:    call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
-  // CHECK:    shufflevector <2 x double> %{{.*}}, <2 x double>  %{{.*}}, <2 x i32> <i32 1, i32 0>
-  // CHECK:    call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
-  // CHECK:    extractelement <2 x double> %{{.*}}, i32 0
-  return _mm512_mask_reduce_min_pd(__M, __W); 
+// CHECK-LABEL: @test_mm512_mask_reduce_min_pd(
+// CHECK:    bitcast i8 %{{.*}} to <8 x i1>
+// CHECK:    select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+// CHECK:    call nnan nsz double @llvm.vector.reduce.fmin.v8f64(<8 x double> %{{.*}})
+  return _mm512_mask_reduce_min_pd(__M, __W);
 }
 
 int test_mm512_reduce_max_epi32(__m512i __W){
@@ -131,19 +107,9 @@
 }
 
 float test_mm512_reduce_max_ps(__m512 __W){
-  // CHECK-LABEL: define{{.*}} float @test_mm512_reduce_max_ps(
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK:    call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
-  // CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK:    call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
-  // CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  // CHECK:    call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
-  // CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-  // CHECK:    call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
-  // CHECK:    extractelement <4 x float> %{{.*}}, i32 0
-  return _mm512_reduce_max_ps(__W); 
+// CHECK-LABEL: @test_mm512_reduce_max_ps(
+// CHECK:    call nnan nsz float @llvm.vector.reduce.fmax.v16f32(<16 x float> %{{.*}})
+  return _mm512_reduce_max_ps(__W);
 }
 
 int test_mm512_reduce_min_epi32(__m512i __W){
@@ -159,19 +125,9 @@
 }
 
 float test_mm512_reduce_min_ps(__m512 __W){
-  // CHECK-LABEL: define{{.*}} float @test_mm512_reduce_min_ps(
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK:    call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
-  // CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK:    call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
-  // CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  // CHECK:    call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
-  // CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-  // CHECK:    call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
-  // CHECK:    extractelement <4 x float> %{{.*}}, i32 0
-  return _mm512_reduce_min_ps(__W); 
+// CHECK-LABEL: @test_mm512_reduce_min_ps(
+// CHECK:    call nnan nsz float @llvm.vector.reduce.fmin.v16f32(<16 x float> %{{.*}})
+  return _mm512_reduce_min_ps(__W);
 }
 
 int test_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __W){
@@ -179,7 +135,7 @@
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
 // CHECK:    call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %{{.*}})
-  return _mm512_mask_reduce_max_epi32(__M, __W); 
+  return _mm512_mask_reduce_max_epi32(__M, __W);
 }
 
 unsigned int test_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __W){
@@ -187,25 +143,15 @@
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
 // CHECK:    call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %{{.*}})
-  return _mm512_mask_reduce_max_epu32(__M, __W); 
+  return _mm512_mask_reduce_max_epu32(__M, __W);
 }
 
 float test_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __W){
-  // CHECK-LABEL: define{{.*}} float @test_mm512_mask_reduce_max_ps(
-  // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK:    select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK:    call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
-  // CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK:    call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
-  // CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  // CHECK:    call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
-  // CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-  // CHECK:    call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
-  // CHECK:    extractelement <4 x float> %{{.*}}, i32 0
-  return _mm512_mask_reduce_max_ps(__M, __W); 
+// CHECK-LABEL: @test_mm512_mask_reduce_max_ps(
+// CHECK:    bitcast i16 %{{.*}} to <16 x i1>
+// CHECK:    select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+// CHECK:    call nnan nsz float @llvm.vector.reduce.fmax.v16f32(<16 x float> %{{.*}})
+  return _mm512_mask_reduce_max_ps(__M, __W);
 }
 
 int test_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __W){
@@ -213,7 +159,7 @@
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
 // CHECK:    call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %{{.*}})
-  return _mm512_mask_reduce_min_epi32(__M, __W); 
+  return _mm512_mask_reduce_min_epi32(__M, __W);
 }
 
 unsigned int test_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __W){
@@ -221,24 +167,14 @@
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
 // CHECK:    call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %{{.*}})
-  return _mm512_mask_reduce_min_epu32(__M, __W); 
+  return _mm512_mask_reduce_min_epu32(__M, __W);
 }
 
 float test_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __W){
-  // CHECK-LABEL: define{{.*}} float @test_mm512_mask_reduce_min_ps(
-  // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK:    select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:    shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK:    call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
-  // CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:    shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK:    call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
-  // CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  // CHECK:    call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
-  // CHECK:    shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-  // CHECK:    call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
-  // CHECK:    extractelement <4 x float> %{{.*}}, i32 0
-  return _mm512_mask_reduce_min_ps(__M, __W); 
+// CHECK-LABEL: @test_mm512_mask_reduce_min_ps(
+// CHECK:    bitcast i16 %{{.*}} to <16 x i1>
+// CHECK:    select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+// CHECK:    call nnan nsz float @llvm.vector.reduce.fmin.v16f32(<16 x float> %{{.*}})
+  return _mm512_mask_reduce_min_ps(__M, __W);
 }
 
Index: clang/lib/Headers/avx512fintrin.h
===================================================================
--- clang/lib/Headers/avx512fintrin.h
+++ clang/lib/Headers/avx512fintrin.h
@@ -9300,8 +9300,10 @@
  * computations. In vector-reduction arithmetic, the evaluation order is
  * independent of the order of the input elements of V.
 
- * For floating point types, we always assume the elements are reassociable even
- * if -fast-math is off.
+ * For floating point intrinsics, we have implicit assumptions:
+ * 1. The elements are reassociable when using fadd/fmul intrinsics;
+ * 2. There's no nan and signed zero in the elements when using fmin/max
+ intrinsics;
 
  * Used bisection method. At each step, we partition the vector with previous
  * step in half, and the operation is performed on its two halves.
@@ -9524,75 +9526,49 @@
   return __builtin_ia32_reduce_umin_d512((__v16si)__V);
 }
 
-#define _mm512_mask_reduce_operator(op) \
-  __m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \
-  __m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \
-  __m256d __t3 = _mm256_##op(__t1, __t2); \
-  __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
-  __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
-  __m128d __t6 = _mm_##op(__t4, __t5); \
-  __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
-  __m128d __t8 = _mm_##op(__t6, __t7); \
-  return __t8[0]
-
 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_reduce_max_pd(__m512d __V) {
-  _mm512_mask_reduce_operator(max_pd);
+  return __builtin_ia32_reduce_fmax_pd512(__V);
 }
 
 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_reduce_min_pd(__m512d __V) {
-  _mm512_mask_reduce_operator(min_pd);
+  return __builtin_ia32_reduce_fmin_pd512(__V);
 }
 
 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
   __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
-  _mm512_mask_reduce_operator(max_pd);
+  return __builtin_ia32_reduce_fmax_pd512(__V);
 }
 
 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
   __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
-  _mm512_mask_reduce_operator(min_pd);
-}
-#undef _mm512_mask_reduce_operator
-
-#define _mm512_mask_reduce_operator(op) \
-  __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 0); \
-  __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 1); \
-  __m256 __t3 = _mm256_##op(__t1, __t2); \
-  __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
-  __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
-  __m128 __t6 = _mm_##op(__t4, __t5); \
-  __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
-  __m128 __t8 = _mm_##op(__t6, __t7); \
-  __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
-  __m128 __t10 = _mm_##op(__t8, __t9); \
-  return __t10[0]
+  return __builtin_ia32_reduce_fmin_pd512(__V);
+}
 
 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_reduce_max_ps(__m512 __V) {
-  _mm512_mask_reduce_operator(max_ps);
+  return __builtin_ia32_reduce_fmax_ps512(__V);
 }
 
 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_reduce_min_ps(__m512 __V) {
-  _mm512_mask_reduce_operator(min_ps);
+  return __builtin_ia32_reduce_fmin_ps512(__V);
 }
 
 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
   __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
-  _mm512_mask_reduce_operator(max_ps);
+  return __builtin_ia32_reduce_fmax_ps512(__V);
 }
 
 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
   __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
-  _mm512_mask_reduce_operator(min_ps);
+  return __builtin_ia32_reduce_fmin_ps512(__V);
 }
-#undef _mm512_mask_reduce_operator
 
 /// Moves the least significant 32 bits of a vector of [16 x i32] to a
 ///    32-bit signed integer value.
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -13826,16 +13826,34 @@
   case X86::BI__builtin_ia32_reduce_fadd_ps512: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
-    Builder.getFastMathFlags().setAllowReassoc(true);
+    Builder.getFastMathFlags().setAllowReassoc();
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
   case X86::BI__builtin_ia32_reduce_fmul_pd512:
   case X86::BI__builtin_ia32_reduce_fmul_ps512: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
-    Builder.getFastMathFlags().setAllowReassoc(true);
+    Builder.getFastMathFlags().setAllowReassoc();
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
+  case X86::BI__builtin_ia32_reduce_fmax_pd512:
+  case X86::BI__builtin_ia32_reduce_fmax_ps512: {
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType());
+    FastMathFlags &FMF = Builder.getFastMathFlags();
+    FMF.setNoNaNs();
+    FMF.setNoSignedZeros();
+    return Builder.CreateCall(F, {Ops[0]});
+  }
+  case X86::BI__builtin_ia32_reduce_fmin_pd512:
+  case X86::BI__builtin_ia32_reduce_fmin_ps512: {
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType());
+    FastMathFlags &FMF = Builder.getFastMathFlags();
+    FMF.setNoNaNs();
+    FMF.setNoSignedZeros();
+    return Builder.CreateCall(F, {Ops[0]});
+  }
   case X86::BI__builtin_ia32_reduce_mul_d512:
   case X86::BI__builtin_ia32_reduce_mul_q512: {
     Function *F =
Index: clang/include/clang/Basic/BuiltinsX86.def
===================================================================
--- clang/include/clang/Basic/BuiltinsX86.def
+++ clang/include/clang/Basic/BuiltinsX86.def
@@ -1878,6 +1878,10 @@
 TARGET_BUILTIN(__builtin_ia32_reduce_and_q512, "OiV8Oi", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_fadd_pd512, "ddV8d", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ps512, "ffV16f", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fmax_pd512, "dV8d", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ps512, "fV16f", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fmin_pd512, "dV8d", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ps512, "fV16f", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_fmul_pd512, "ddV8d", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ps512, "ffV16f", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_reduce_mul_d512, "iV16i", "ncV:512:", "avx512f")

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D93179: [X86] Convert fmin/fmax _mm_reduce_* intrinsics to emit llvm.reduction intrinsics (PR47506)

Reply via email to