Hi,
As commented in patch, this one simplifies (cond (cmp x c1) (op x c2) c3) into
(op (minmax x c1) c2) if:
1) OP is PLUS or MINUS.
2) CMP is LT, LE, GT or GE.
3) C3 == (C1 op C2), and the experation isn't undefined behavior.
This pattern also handles special cases like:
1) Comparison's operand x is a unsigned to signed type conversion
and c1 is integer zero. In this case,
(signed type)x < 0 <=> x > MAX_VAL(signed type)
(signed type)x >= 0 <=> x <= MAX_VAL(signed type)
2) Const c1 may not equal to (C3 op' C2). In this case we also
check equality for (c1+1) and (c1-1) by adjusting comparison
code.
Also note: Though signed type is handled by this pattern, it cannot be
simplified at the moment because C standard requires additional type promotion.
In order to match&simplify signed type cases, the IR needs to be cleaned up by
other optimizers, i.e, VRP.
For given loop:
int foo1 (unsigned short a[], unsigned int x)
{
unsigned int i;
for (i = 0; i < 1000; i++)
{
x = a[i];
a[i] = (unsigned short)(x <= 32768 ? x + 32768 : 0);
}
return x;
}
Generated assembly can be improved from:
.L4:
ldr q5, [x3, x1]
add w2, w2, 1
cmp w0, w2
ushll v1.4s, v5.4h, 0
ushll2 v0.4s, v5.8h, 0
add v4.4s, v1.4s, v2.4s
add v3.4s, v0.4s, v2.4s
cmhs v1.4s, v2.4s, v1.4s
cmhs v0.4s, v2.4s, v0.4s
and v1.16b, v4.16b, v1.16b
and v0.16b, v3.16b, v0.16b
xtn v3.4h, v1.4s
xtn2 v3.8h, v0.4s
str q3, [x3, x1]
add x1, x1, 16
bhi .L4
To:
.L4:
ldr q1, [x3, x1]
add w2, w2, 1
cmp w0, w2
umin v0.8h, v1.8h, v2.8h
add v0.8h, v0.8h, v2.8h
str q0, [x3, x1]
add x1, x1, 16
bhi .L4
Bootstrap and test on x86_64 and AArch64 for whole patch set. Any comments?
Thanks,
bin
2016-10-21 Bin Cheng <bin.ch...@arm.com>
* match.pd ((cond (cmp x c1) (op x c2) c3) -> (op (minmax x c1) c2)):
New pattern.
gcc/testsuite/ChangeLog
2016-10-21 Bin Cheng <bin.ch...@arm.com>
* gcc.dg/fold-bopcond-1.c: New test.
* gcc.dg/fold-bopcond-2.c: New test.
diff --git a/gcc/match.pd b/gcc/match.pd
index 071db2c..704421d 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1981,6 +1981,109 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(max @1 @2)
(if (code == MIN_EXPR)
(min @1 @2))))))
+
+/* (cond (cmp x c1) (op x c2) c3) -> (op (minmax x c1) c2) if:
+
+ 1) OP is PLUS or MINUS.
+ 2) CMP is LT, LE, GT or GE.
+ 3) C3 == (C1 op C2), and the experation isn't undefined behavior.
+
+ This pattern also handles special cases like:
+
+ 1) Comparison's operand x is a unsigned to signed type conversion
+ and c1 is integer zero. In this case,
+ (signed type)x < 0 <=> x > MAX_VAL(signed type)
+ (signed type)x >= 0 <=> x <= MAX_VAL(signed type)
+ 2) Const c1 may not equal to (C3 op' C2). In this case we also
+ check equality for (c1+1) and (c1-1) by adjusting comparison
+ code.
+
+ TODO: Though signed type is handled by this pattern, it cannot be
+ simplified at the moment because C standard requires additional
+ promoting conversion. In order to match&simplify it here, the IR
+ needs to be cleaned up by other optimizers, i.e, VRP. */
+(for op (plus minus)
+ (for cmp (lt le gt ge)
+ (simplify
+ (cond (cmp@0 (convert?@00 @10) INTEGER_CST@01)
+ (op @10 INTEGER_CST@11)
+ INTEGER_CST@2)
+ (with { tree from_type = TREE_TYPE (@10), to_type = TREE_TYPE (@00); }
+ (if (@00 == @10
+ || (TYPE_UNSIGNED (from_type)
+ && !TYPE_UNSIGNED (to_type)
+ && TYPE_PRECISION (from_type) == TYPE_PRECISION (to_type)
+ && integer_zerop (@01)
+ && (TREE_CODE (@0) == LT_EXPR || TREE_CODE (@0) == GE_EXPR)))
+ (with
+ {
+ bool overflow = false;
+ enum tree_code code, cmp_code = TREE_CODE (@0);
+ tree real_c1, c1 = @01, c2 = @11, c3 = @2;
+ tree op_type = TREE_TYPE (@10);
+ signop sgn = TYPE_SIGN (op_type);
+ widest_int wmin = widest_int::from (wi::min_value (op_type), sgn);
+ widest_int wmax = widest_int::from (wi::max_value (op_type), sgn);
+
+ /* Handle special cases, given x of unsigned type:
+ ((signed type)x < 0) <=> (x > MAX_VAL(signed type))
+ ((signed type)x >= 0) <=> (x <= MAX_VAL(signed type)) */
+ if (@00 != @10)
+ {
+ if (cmp_code == LT_EXPR)
+ cmp_code = GT_EXPR;
+ if (cmp_code == GE_EXPR)
+ cmp_code = LE_EXPR;
+ c1 = wide_int_to_tree (op_type, wi::max_value (to_type));
+ }
+ /* To simplify this pattern, we require c3 = (c1 op c2). Here we
+ compute (c3 op' c2) and check it equals to c1 in which op' is
+ the inverted operator of op. Also make sure overflow does not
+ happen if it is undefined. */
+ if (op == PLUS_EXPR)
+ real_c1 = wide_int_to_tree (op_type,
+ wi::sub (c3, c2, sgn, &overflow));
+ else
+ real_c1 = wide_int_to_tree (op_type,
+ wi::add (c3, c2, sgn, &overflow));
+
+ code = cmp_code;
+ if (!overflow || !TYPE_OVERFLOW_UNDEFINED (op_type))
+ {
+ /* Check if c1 equals to real_c1. Boundary condition is handled
+ by adjusting comparison operation if necessary. */
+ if (wi::to_widest (c1) == (wi::to_widest (real_c1) - 1))
+ {
+ /* X <= Y - 1 equals to X < Y. */
+ if (cmp_code == LE_EXPR)
+ code = LT_EXPR;
+ /* X > Y - 1 equals to X >= Y. */
+ if (cmp_code == GT_EXPR)
+ code = GE_EXPR;
+ }
+ if (wi::to_widest (c1) == (wi::to_widest (real_c1) + 1))
+ {
+ /* X < Y + 1 equals to X <= Y. */
+ if (cmp_code == LT_EXPR)
+ code = LE_EXPR;
+ /* X >= Y + 1 equals to X > Y. */
+ if (cmp_code == GE_EXPR)
+ code = GT_EXPR;
+ }
+ if (code != cmp_code
+ || wi::to_widest (c1) == wi::to_widest (real_c1))
+ {
+ if (cmp_code == LT_EXPR || cmp_code == LE_EXPR)
+ code = MIN_EXPR;
+ if (cmp_code == GT_EXPR || cmp_code == GE_EXPR)
+ code = MAX_EXPR;
+ }
+ }
+ }
+ (if (code == MAX_EXPR)
+ (op (max @10 { real_c1; }) { c2; })
+ (if (code == MIN_EXPR)
+ (op (min @10 { real_c1; }) { c2; })))))))))
#endif
(for cnd (cond vec_cond)
diff --git a/gcc/testsuite/gcc.dg/fold-bopcond-1.c
b/gcc/testsuite/gcc.dg/fold-bopcond-1.c
new file mode 100644
index 0000000..7324c16
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-bopcond-1.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-ifcvt" } */
+
+int foo1 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x <= 32768 ? x + 32768 : 0);
+ }
+ return x;
+}
+
+int foo2 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x < 32768 ? x + 32768 : 0);
+ }
+ return x;
+}
+
+int foo3 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x < 1000 ? x - 1000 : 0);
+ }
+ return x;
+}
+
+int foo4 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x <= 2 ? x + 999 : 1001);
+ }
+ return x;
+}
+
+/* { dg-final { scan-tree-dump-times "MIN_EXPR " 4 "ifcvt" } } */
diff --git a/gcc/testsuite/gcc.dg/fold-bopcond-2.c
b/gcc/testsuite/gcc.dg/fold-bopcond-2.c
new file mode 100644
index 0000000..7a47449
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-bopcond-2.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-ifcvt" } */
+
+int foo1 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x >= 32768 ? x - 32768 : 0);
+ }
+ return x;
+}
+
+int foo2 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x > 32768 ? x - 32768 : 0);
+ }
+ return x;
+}
+
+int foo3 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x > 1000 ? x - 1000 : 0);
+ }
+ return x;
+}
+
+int foo4 (unsigned short a[], unsigned int x)
+{
+ unsigned int i;
+ for (i = 0; i < 1000; i++)
+ {
+ x = a[i];
+ a[i] = (unsigned short)(x >= 2 ? x - 32768 : 32770);
+ }
+ return x;
+}
+
+/* { dg-final { scan-tree-dump-times "MAX_EXPR " 4 "ifcvt" } } */