VEC_COND_EXPR optimizations

Marc Glisse Thu, 30 Jul 2020 00:49:24 -0700

When vector comparisons were forced to use vec_cond_expr, we lost a number of optimizations (my fault for not adding enough testcases to prevent that). This patch tries to unwrap vec_cond_expr a bit so some optimizations can still happen.

I wasn't planning to add all those transformations together, but adding one caused a regression, whose fix introduced a second regression, etc.

Using a simple fold_binary internally looks like an ok compromise to me. It remains cheap enough (not recursive, and vector instructions are not that frequent), while still allowing more than const_binop (X|0 or X&X for instance). The transformations are quite conservative with :s and folding only if everything simplifies; we may want to relax this later. And of course we are going to miss things like a?b:c + a?c:b -> b+c.

In terms of number of operations, some transformations turning 2 VEC_COND_EXPR into VEC_COND_EXPR + BIT_IOR_EXPR + BIT_NOT_EXPR might not look like a gain... I expect the bit_not disappears in most cases, and VEC_COND_EXPR looks more costly than a simpler BIT_IOR_EXPR.

I am a bit confused that with avx512 we get types like "vector(4) <signed-boolean:2>" with :2 and not :1 (is it a hack so true is 1 and not -1?), but that doesn't matter for this patch.


Regtest+bootstrap on x86_64-pc-linux-gnu

2020-07-30  Marc Glisse  <marc.gli...@inria.fr>

        PR tree-optimization/95906
        PR target/70314
        * match.pd ((c ? a : b) op d, (c ? a : b) op (c ? d : e),
        (v ? w : 0) ? a : b, c1 ? c2 ? a : b : b): New transformations.

        * gcc.dg/tree-ssa/andnot-2.c: New file.
        * gcc.dg/tree-ssa/pr95906.c: Likewise.
        * gcc.target/i386/pr70314.c: Likewise.

--
Marc Glisse

diff --git a/gcc/match.pd b/gcc/match.pd
index c6ae7a7db7a..af52d56162b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3451,6 +3451,77 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    (if (cst1 && cst2)
     (vec_cond @0 { cst1; } { cst2; })))))
 
+/* Sink binary operation to branches, but only if we can fold it.  */
+#if GIMPLE
+(for op (tcc_comparison plus minus mult bit_and bit_ior bit_xor
+	 rdiv trunc_div ceil_div floor_div round_div
+	 trunc_mod ceil_mod floor_mod round_mod min max)
+/* (c ? a : b) op d  -->  c ? (a op d) : (b op d) */
+ (simplify
+  (op (vec_cond:s @0 @1 @2) @3)
+  (with
+   {
+     tree rhs1, rhs2 = NULL;
+     rhs1 = fold_binary (op, type, @1, @3);
+     if (rhs1 && is_gimple_val (rhs1))
+       rhs2 = fold_binary (op, type, @2, @3);
+   }
+   (if (rhs2 && is_gimple_val (rhs2))
+    (vec_cond @0 { rhs1; } { rhs2; }))))
+ (simplify
+  (op @3 (vec_cond:s @0 @1 @2))
+  (with
+   {
+     tree rhs1, rhs2 = NULL;
+     rhs1 = fold_binary (op, type, @3, @1);
+     if (rhs1 && is_gimple_val (rhs1))
+       rhs2 = fold_binary (op, type, @3, @2);
+   }
+   (if (rhs2 && is_gimple_val (rhs2))
+    (vec_cond @0 { rhs1; } { rhs2; }))))
+
+/* (c ? a : b) op (c ? d : e)  -->  c ? (a op d) : (b op e) */
+ (simplify
+  (op (vec_cond:s @0 @1 @2) (vec_cond:s @0 @3 @4))
+  (with
+   {
+     tree rhs1, rhs2 = NULL;
+     rhs1 = fold_binary (op, type, @1, @3);
+     if (rhs1 && is_gimple_val (rhs1))
+       rhs2 = fold_binary (op, type, @2, @4);
+   }
+   (if (rhs2 && is_gimple_val (rhs2))
+    (vec_cond @0 { rhs1; } { rhs2; })))))
+#endif
+
+/* (v ? w : 0) ? a : b is just (v & w) ? a : b  */
+(simplify
+ (vec_cond (vec_cond:s @0 @3 integer_zerop) @1 @2)
+ (vec_cond (bit_and @0 @3) @1 @2))
+(simplify
+ (vec_cond (vec_cond:s @0 integer_all_onesp @3) @1 @2)
+ (vec_cond (bit_ior @0 @3) @1 @2))
+(simplify
+ (vec_cond (vec_cond:s @0 integer_zerop @3) @1 @2)
+ (vec_cond (bit_ior @0 (bit_not @3)) @2 @1))
+(simplify
+ (vec_cond (vec_cond:s @0 @3 integer_all_onesp) @1 @2)
+ (vec_cond (bit_and @0 (bit_not @3)) @2 @1))
+
+/* c1 ? c2 ? a : b : b  -->  (c1 & c2) ? a : b  */
+(simplify
+ (vec_cond @0 (vec_cond:s @1 @2 @3) @3)
+ (vec_cond (bit_and @0 @1) @2 @3))
+(simplify
+ (vec_cond @0 @2 (vec_cond:s @1 @2 @3))
+ (vec_cond (bit_ior @0 @1) @2 @3))
+(simplify
+ (vec_cond @0 (vec_cond:s @1 @2 @3) @2)
+ (vec_cond (bit_ior (bit_not @0) @1) @2 @3))
+(simplify
+ (vec_cond @0 @3 (vec_cond:s @1 @2 @3))
+ (vec_cond (bit_and (bit_not @0) @1) @2 @3))
+
 /* Simplification moved from fold_cond_expr_with_comparison.  It may also
    be extended.  */
 /* This pattern implements two kinds simplification:
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/andnot-2.c b/gcc/testsuite/gcc.dg/tree-ssa/andnot-2.c
new file mode 100644
index 00000000000..e0955ce3ffd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/andnot-2.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-forwprop3-raw -w -Wno-psabi" } */
+
+typedef long vec __attribute__((vector_size(16)));
+vec f(vec x){
+  vec y = x < 10;
+  return y & (y == 0);
+}
+
+/* { dg-final { scan-tree-dump-not "_expr" "forwprop3" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c b/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
new file mode 100644
index 00000000000..3d820a58e93
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr95906.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-forwprop3-raw -w -Wno-psabi" } */
+
+// FIXME: this should further optimize to a MAX_EXPR
+typedef signed char v16i8 __attribute__((vector_size(16)));
+v16i8 f(v16i8 a, v16i8 b)
+{
+    v16i8 cmp = (a > b);
+    return (cmp & a) | (~cmp & b);
+}
+
+/* { dg-final { scan-tree-dump-not "bit_(and|ior)_expr" "forwprop3" } } */
+/* { dg-final { scan-tree-dump-times "vec_cond_expr" 1 "forwprop3" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr70314.c b/gcc/testsuite/gcc.target/i386/pr70314.c
new file mode 100644
index 00000000000..aad8dd9b57e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr70314.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-march=skylake-avx512 -O2" } */
+/* { dg-final { scan-assembler-times "cmp" 2 } } */
+/* { dg-final { scan-assembler-not "and" } } */
+
+typedef long vec __attribute__((vector_size(16)));
+vec f(vec x, vec y){
+  return (x < 5) & (y < 8);
+}
+
+/* On x86_64, currently
+	vpcmpq	$2, .LC1(%rip), %xmm1, %k1
+	vpcmpq	$2, .LC0(%rip), %xmm0, %k0{%k1}
+	vpmovm2q	%k0, %xmm0
+*/

VEC_COND_EXPR optimizations

Reply via email to