A(1+-B) match.pd optimization

Jakub Jelinek Fri, 29 Nov 2019 16:28:48 -0800

Hi!

As discussed in the PR, we can't optimize e.g.
  int a = t - 1;
  int b = a * v;
  return b + v;
into return t * v; for signed non-wrapv arithmetics.  This can be done
by the match.pd (A * B) +- A -> (B +- 1) * A or
A +- (A * B) -> (1 +- B) * A canonicalizations.  Being a lazy guy,
I wrote attached brute force proglet to look for all the cases (for signed
char) where there is no UB in the original and the transformation would
introduce UB.  For the simple cases with just A and B, rather than A, B and
C, the problematic cases are for signed char only:
A*B+A -> (B+1)*A A==-1 && B==127
A*B+A -> (B+1)*A A==0 && B==127
A*B-A -> (B-1)*A A==0 && B==-128
A-A*B -> (1-B)*A A==-1 && B==-127
A-A*B -> (1-B)*A A==0 && B==-128
A-A*B -> (1-B)*A A==0 && B==-127
The current patterns already use VRP (tree_expr_nonzero_p and
expr_not_equal_to) to do the transformation only if A is known not to be 0
or -1.  But as the above problematic cases show, for A*B-A the -1
case is actually not problematic (transformation doesn't introduce UB;
if A is -1, -1*B+1 has UB only for minimum and minimum+1, and the
replacement (B-1)*-1 has also UB for those two cases only) and even when we
know nothing about A value range, if we know something about B value range,
we could still optimize.  So, for the
  int a = t - 1;
  int b = a * v;
  return b + v;
case, the a = t - 1 has value range that doesn't include maximum and
so we can conclude it is ok to transform it into return ((t - 1) + 1) * v
and thus t * v.


Unfortunately, the patch "broke" a few loop_versioning_*.f90 tests (CCing
author), where there are small differences between the lversion pass,
e.g. in loop_versioning_1.f90 (f1):
-  # RANGE ~[0, 0]
-  _4 = iftmp.11_6 * S.4_19;
-  _11 = _4 - iftmp.11_6;
+  # RANGE [0, 9223372036854775806] NONZERO 9223372036854775807
+  _4 = S.4_19 + -1;
+  _11 = _4 * iftmp.11_6;
and the lversion pass then emits just one message instead of two, but in the
end assembly is identical.  In loop_versioning_6.f90 (though, with -m32
only), the code before lversion pass actually looks better in f1:
-  # i_35 = PHI <1(8), i_28(9)>
-  _9 = iftmp.33_15 * i_35;
-  _10 = _9 * 2;
-  _21 = _10 - iftmp.33_15;
-  (*x.0_23)[_21] = 1.0e+2;
-  _11 = i_35 * 2;
-  _12 = _11 + 1;
-  _13 = _12 * iftmp.33_15;
-  _22 = _13 - iftmp.33_15;
-  (*x.0_23)[_22] = 1.01e+2;
-  i_28 = i_35 + 1;
-  if (iftmp.36_25 < i_28)
+  # i_31 = PHI <1(8), i_26(9)>
+  _10 = iftmp.33_13 * i_31;
+  _11 = _10 * 2;
+  _19 = _11 - iftmp.33_13;
+  (*x.0_21)[_19] = 1.0e+2;
+  (*x.0_21)[_11] = 1.01e+2;
+  i_26 = i_31 + 1;
+  if (iftmp.36_23 < i_26)
where due to the new canonicalizations we managed to avoid some
multiplications.  One index was iftmp*i*2-iftmp and the other
was iftmp*(i*2+1)-iftmp and with the patch we managed to simplify
the latter into iftmp*i*2 and use for that the temporary used for
the first expression.  f1 is actually in assembly smaller because of this.
The lp64 vs. ! lp64 is just a wild guess, guess testing on further targets
will show what is the target property that matters.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2019-11-29  Jakub Jelinek  <ja...@redhat.com>

        PR tree-optimization/92712
        * match.pd ((A * B) +- A -> (B +- 1) * A,
        A +- (A * B) -> (1 +- B) * A): Allow optimizing signed integers
        even when we don't know anything about range of A, but do know
        something about range of B and the simplification won't introduce
        new UB.

        * gcc.dg/tree-ssa/pr92712-1.c: New test.
        * gcc.dg/tree-ssa/pr92712-2.c: New test.
        * gcc.dg/tree-ssa/pr92712-3.c: New test.
        * gfortran.dg/loop_versioning_1.f90: Adjust expected number of
        likely to be innermost dimension messages.
        * gfortran.dg/loop_versioning_10.f90: Likewise.
        * gfortran.dg/loop_versioning_6.f90: Likewise.

--- gcc/match.pd.jj     2019-11-05 14:59:22.546873967 +0100
+++ gcc/match.pd        2019-11-29 18:17:27.472002727 +0100
@@ -2480,18 +2480,42 @@ (define_operator_list COND_TERNARY
     (plusminus @0 (mult:c@3 @0 @2))
     (if ((!ANY_INTEGRAL_TYPE_P (type)
          || TYPE_OVERFLOW_WRAPS (type)
+         /* For @0 + @0*@2 this transformation would introduce UB
+            (where there was none before) for @0 in [-1,0] and @2 max.
+            For @0 - @0*@2 this transformation would introduce UB
+            for @0 0 and @2 in [min,min+1] or @0 -1 and @2 min+1.  */
          || (INTEGRAL_TYPE_P (type)
-             && tree_expr_nonzero_p (@0)
-             && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type)))))
+             && ((tree_expr_nonzero_p (@0)
+                  && expr_not_equal_to (@0,
+                               wi::minus_one (TYPE_PRECISION (type))))
+                 || (plusminus == PLUS_EXPR
+                     ? expr_not_equal_to (@2,
+                           wi::max_value (TYPE_PRECISION (type), SIGNED))
+                     /* Let's ignore the @0 -1 and @2 min case.  */
+                     : (expr_not_equal_to (@2,
+                           wi::min_value (TYPE_PRECISION (type), SIGNED))
+                        && expr_not_equal_to (@2,
+                               wi::min_value (TYPE_PRECISION (type), SIGNED)
+                               + 1))))))
         && single_use (@3))
      (mult (plusminus { build_one_cst (type); } @2) @0)))
    (simplify
     (plusminus (mult:c@3 @0 @2) @0)
     (if ((!ANY_INTEGRAL_TYPE_P (type)
          || TYPE_OVERFLOW_WRAPS (type)
+         /* For @0*@2 + @0 this transformation would introduce UB
+            (where there was none before) for @0 in [-1,0] and @2 max.
+            For @0*@2 - @0 this transformation would introduce UB
+            for @0 0 and @2 min.  */
          || (INTEGRAL_TYPE_P (type)
-             && tree_expr_nonzero_p (@0)
-             && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type)))))
+             && ((tree_expr_nonzero_p (@0)
+                  && (plusminus == MINUS_EXPR
+                      || expr_not_equal_to (@0,
+                               wi::minus_one (TYPE_PRECISION (type)))))
+                 || expr_not_equal_to (@2,
+                       (plusminus == PLUS_EXPR
+                        ? wi::max_value (TYPE_PRECISION (type), SIGNED)
+                        : wi::min_value (TYPE_PRECISION (type), SIGNED))))))
         && single_use (@3))
      (mult (plusminus @2 { build_one_cst (type); }) @0))))))
 
--- gcc/testsuite/gcc.dg/tree-ssa/pr92712-1.c.jj        2019-11-29 
19:47:42.900389708 +0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr92712-1.c   2019-11-29 19:50:26.704925013 
+0100
@@ -0,0 +1,21 @@
+/* PR tree-optimization/92712 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump " = \[tv]_\[0-9]*\\\(D\\\) \\* 
\[tv]_\[0-9]*\\\(D\\\);" "optimized" } } */
+
+static int
+foo (int t, int v)
+{
+  int i, x = 0;
+  for (int i = 0; i < t; ++i)
+    x += v;
+  return x;
+}
+
+int
+bar (int t, int v)
+{
+  if (t < 0)
+    __builtin_unreachable ();
+  return foo (t, v);
+}
--- gcc/testsuite/gcc.dg/tree-ssa/pr92712-2.c.jj        2019-11-29 
19:51:47.169714383 +0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr92712-2.c   2019-11-29 20:12:21.792153466 
+0100
@@ -0,0 +1,66 @@
+/* PR tree-optimization/92712 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-ipa-icf -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times " = \[tv]_\[0-9]*\\\(D\\\) \\* 
\[tv]_\[0-9]*\\\(D\\\);" 7 "optimized" } } */
+
+int
+f1 (int t, int v)
+{
+  int a = t - 1;
+  int b = a * v;
+  return b + v;
+}
+
+int
+f2 (int t, int v)
+{
+  int a = t - 1;
+  int b = a * v;
+  return v + b;
+}
+
+int
+f3 (int t, int v)
+{
+  int a = t + 1;
+  int b = a * v;
+  return b - v;
+}
+
+int
+f4 (int t, int v)
+{
+  int a = 1 - t;
+  int b = a * v;
+  return v - b;
+}
+
+int
+f5 (int t, int v)
+{
+  if (v == 0 || v == -1)
+    __builtin_unreachable ();
+  int a = t - 1U;
+  int b = a * v;
+  return b + v;
+}
+
+int
+f6 (int t, int v)
+{
+  if (v == 0 || v == -1)
+    __builtin_unreachable ();
+  int a = t - 1U;
+  int b = a * v;
+  return v + b;
+}
+
+int
+f7 (int t, int v)
+{
+  if (v == 0)
+    __builtin_unreachable ();
+  int a = t + 1U;
+  int b = a * v;
+  return b - v;
+}
--- gcc/testsuite/gcc.dg/tree-ssa/pr92712-3.c.jj        2019-11-29 
20:07:52.313198937 +0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr92712-3.c   2019-11-29 20:08:16.890829972 
+0100
@@ -0,0 +1,36 @@
+/* PR tree-optimization/92712 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-ipa-icf -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not " = \[tv]_\[0-9]*\\\(D\\\) \\* 
\[tv]_\[0-9]*\\\(D\\\);" "optimized" } } */
+
+int
+f1 (int t, int v)
+{
+  int a = t - 1U;
+  int b = a * v;
+  return b + v;
+}
+
+int
+f2 (int t, int v)
+{
+  int a = t - 1U;
+  int b = a * v;
+  return v + b;
+}
+
+int
+f3 (int t, int v)
+{
+  int a = t + 1U;
+  int b = a * v;
+  return b - v;
+}
+
+int
+f4 (int t, int v)
+{
+  int a = 1U - t;
+  int b = a * v;
+  return v - b;
+}
--- gcc/testsuite/gfortran.dg/loop_versioning_1.f90.jj  2019-01-19 
18:27:58.396602572 +0100
+++ gcc/testsuite/gfortran.dg/loop_versioning_1.f90     2019-11-30 
00:49:42.250277985 +0100
@@ -23,6 +23,6 @@ subroutine f3(x, limit, step)
   end do
 end subroutine f3
 
-! { dg-final { scan-tree-dump-times {likely to be the innermost dimension} 2 
"lversion" } }
+! { dg-final { scan-tree-dump-times {likely to be the innermost dimension} 1 
"lversion" } }
 ! { dg-final { scan-tree-dump-times {want to version containing loop} 3 
"lversion" } }
 ! { dg-final { scan-tree-dump-times {versioned this loop} 3 "lversion" } }
--- gcc/testsuite/gfortran.dg/loop_versioning_10.f90.jj 2019-01-19 
18:27:58.396602572 +0100
+++ gcc/testsuite/gfortran.dg/loop_versioning_10.f90    2019-11-30 
00:49:53.159113815 +0100
@@ -26,6 +26,6 @@ subroutine f4(x, i)
   end do
 end subroutine f4
 
-! { dg-final { scan-tree-dump-times {likely to be the innermost dimension} 6 
"lversion" } }
+! { dg-final { scan-tree-dump-times {likely to be the innermost dimension} 4 
"lversion" } }
 ! { dg-final { scan-tree-dump-times {want to version} 4 "lversion" } }
 ! { dg-final { scan-tree-dump-times {versioned} 4 "lversion" } }
--- gcc/testsuite/gfortran.dg/loop_versioning_6.f90.jj  2018-12-17 
22:13:44.485132746 +0100
+++ gcc/testsuite/gfortran.dg/loop_versioning_6.f90     2019-11-30 
00:58:42.291150793 +0100
@@ -89,5 +89,7 @@ subroutine f9(x, limit, step)
   end do
 end subroutine f9
 
-! { dg-final { scan-tree-dump-times {want to version containing loop} 9 
"lversion" } }
-! { dg-final { scan-tree-dump-times {versioned this loop} 9 "lversion" } }
+! { dg-final { scan-tree-dump-times {want to version containing loop} 9 
"lversion" { target lp64 } } }
+! { dg-final { scan-tree-dump-times {versioned this loop} 9 "lversion" { 
target lp64 } } }
+! { dg-final { scan-tree-dump-times {want to version containing loop} 8 
"lversion" { target { ! lp64 } } } }
+! { dg-final { scan-tree-dump-times {versioned this loop} 8 "lversion" { 
target { ! lp64 } } } }

        Jakub

int
main ()
{
  for (int a = -128; a <= 127; a++)
    for (int b = -128; b <= 127; b++)
      {
        int ovf1, ovf2;
        int t;
        t = a * b;
        ovf1 = t != (signed char) t;
        t = (signed char) t + a;
        ovf1 |= t != (signed char) t;
        t = b + 1;
        ovf2 = t != (signed char) t;
        t = (signed char) t * a;
        ovf2 |= t != (signed char) t;
        if (!ovf1 && ovf2)
          __builtin_printf ("A*B+A -> (B+1)*A A==%d && B==%d\n", a, b);
      }
  for (int a = -128; a <= 127; a++)
    for (int b = -128; b <= 127; b++)
      {
        int ovf1, ovf2;
        int t;
        t = a * b;
        ovf1 = t != (signed char) t;
        t = (signed char) t - a;
        ovf1 |= t != (signed char) t;
        t = b - 1;
        ovf2 = t != (signed char) t;
        t = (signed char) t * a;
        ovf2 |= t != (signed char) t;
        if (!ovf1 && ovf2)
          __builtin_printf ("A*B-A -> (B-1)*A A==%d && B==%d\n", a, b);
      }
  for (int a = -128; a <= 127; a++)
    for (int b = -128; b <= 127; b++)
      {
        int ovf1, ovf2;
        int t;
        t = a * b;
        ovf1 = t != (signed char) t;
        t = a - (signed char) t;
        ovf1 |= t != (signed char) t;
        t = 1 - b;
        ovf2 = t != (signed char) t;
        t = (signed char) t * a;
        ovf2 |= t != (signed char) t;
        if (!ovf1 && ovf2)
          __builtin_printf ("A-A*B -> (1-B)*A A==%d && B==%d\n", a, b);
      }
  for (int a = -128; a <= 127; a++)
    for (int b = -128; b <= 127; b++)
    for (int c = -128; c <= 127; c++)
      {
        int ovf1, ovf2;
        int t, t2;
        t = a * b;
        ovf1 = t != (signed char) t;
        t2 = a * c;
        ovf1 |= t2 != (signed char) t2;
        t = (signed char) t + (signed char) t2;
        ovf1 |= t != (signed char) t;
        t = b + c;
        ovf2 = t != (signed char) t;
        t = (signed char) t * a;
        ovf2 |= t != (signed char) t;
        if (!ovf1 && ovf2)
          __builtin_printf ("A*B+A*C -> (B+C)*A A==%d && B==%d && C==%d\n", a, 
b, c);
      }
  for (int a = -128; a <= 127; a++)
    for (int b = -128; b <= 127; b++)
    for (int c = -128; c <= 127; c++)
      {
        int ovf1, ovf2;
        int t, t2;
        t = a * b;
        ovf1 = t != (signed char) t;
        t2 = a * c;
        ovf1 |= t2 != (signed char) t2;
        t = (signed char) t - (signed char) t2;
        ovf1 |= t != (signed char) t;
        t = b - c;
        ovf2 = t != (signed char) t;
        t = (signed char) t * a;
        ovf2 |= t != (signed char) t;
        if (!ovf1 && ovf2)
          __builtin_printf ("A*B-A*C -> (B-C)*A A==%d && B==%d && C==%d\n", a, 
b, c);
      }
  return 0;
}

[PATCH] Improve A*B+-A -> A*(B+-1) and A+-A*B -> A*(1+-B) match.pd optimization

Reply via email to

[PATCH] Improve AB+-A -> A(B+-1) and A+-AB -> A(1+-B) match.pd optimization