Hi! As discussed in the PR, we can't optimize e.g. int a = t - 1; int b = a * v; return b + v; into return t * v; for signed non-wrapv arithmetics. This can be done by the match.pd (A * B) +- A -> (B +- 1) * A or A +- (A * B) -> (1 +- B) * A canonicalizations. Being a lazy guy, I wrote attached brute force proglet to look for all the cases (for signed char) where there is no UB in the original and the transformation would introduce UB. For the simple cases with just A and B, rather than A, B and C, the problematic cases are for signed char only: A*B+A -> (B+1)*A A==-1 && B==127 A*B+A -> (B+1)*A A==0 && B==127 A*B-A -> (B-1)*A A==0 && B==-128 A-A*B -> (1-B)*A A==-1 && B==-127 A-A*B -> (1-B)*A A==0 && B==-128 A-A*B -> (1-B)*A A==0 && B==-127 The current patterns already use VRP (tree_expr_nonzero_p and expr_not_equal_to) to do the transformation only if A is known not to be 0 or -1. But as the above problematic cases show, for A*B-A the -1 case is actually not problematic (transformation doesn't introduce UB; if A is -1, -1*B+1 has UB only for minimum and minimum+1, and the replacement (B-1)*-1 has also UB for those two cases only) and even when we know nothing about A value range, if we know something about B value range, we could still optimize. So, for the int a = t - 1; int b = a * v; return b + v; case, the a = t - 1 has value range that doesn't include maximum and so we can conclude it is ok to transform it into return ((t - 1) + 1) * v and thus t * v.
Unfortunately, the patch "broke" a few loop_versioning_*.f90 tests (CCing author), where there are small differences between the lversion pass, e.g. in loop_versioning_1.f90 (f1): - # RANGE ~[0, 0] - _4 = iftmp.11_6 * S.4_19; - _11 = _4 - iftmp.11_6; + # RANGE [0, 9223372036854775806] NONZERO 9223372036854775807 + _4 = S.4_19 + -1; + _11 = _4 * iftmp.11_6; and the lversion pass then emits just one message instead of two, but in the end assembly is identical. In loop_versioning_6.f90 (though, with -m32 only), the code before lversion pass actually looks better in f1: - # i_35 = PHI <1(8), i_28(9)> - _9 = iftmp.33_15 * i_35; - _10 = _9 * 2; - _21 = _10 - iftmp.33_15; - (*x.0_23)[_21] = 1.0e+2; - _11 = i_35 * 2; - _12 = _11 + 1; - _13 = _12 * iftmp.33_15; - _22 = _13 - iftmp.33_15; - (*x.0_23)[_22] = 1.01e+2; - i_28 = i_35 + 1; - if (iftmp.36_25 < i_28) + # i_31 = PHI <1(8), i_26(9)> + _10 = iftmp.33_13 * i_31; + _11 = _10 * 2; + _19 = _11 - iftmp.33_13; + (*x.0_21)[_19] = 1.0e+2; + (*x.0_21)[_11] = 1.01e+2; + i_26 = i_31 + 1; + if (iftmp.36_23 < i_26) where due to the new canonicalizations we managed to avoid some multiplications. One index was iftmp*i*2-iftmp and the other was iftmp*(i*2+1)-iftmp and with the patch we managed to simplify the latter into iftmp*i*2 and use for that the temporary used for the first expression. f1 is actually in assembly smaller because of this. The lp64 vs. ! lp64 is just a wild guess, guess testing on further targets will show what is the target property that matters. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2019-11-29 Jakub Jelinek <ja...@redhat.com> PR tree-optimization/92712 * match.pd ((A * B) +- A -> (B +- 1) * A, A +- (A * B) -> (1 +- B) * A): Allow optimizing signed integers even when we don't know anything about range of A, but do know something about range of B and the simplification won't introduce new UB. * gcc.dg/tree-ssa/pr92712-1.c: New test. * gcc.dg/tree-ssa/pr92712-2.c: New test. * gcc.dg/tree-ssa/pr92712-3.c: New test. * gfortran.dg/loop_versioning_1.f90: Adjust expected number of likely to be innermost dimension messages. * gfortran.dg/loop_versioning_10.f90: Likewise. * gfortran.dg/loop_versioning_6.f90: Likewise. --- gcc/match.pd.jj 2019-11-05 14:59:22.546873967 +0100 +++ gcc/match.pd 2019-11-29 18:17:27.472002727 +0100 @@ -2480,18 +2480,42 @@ (define_operator_list COND_TERNARY (plusminus @0 (mult:c@3 @0 @2)) (if ((!ANY_INTEGRAL_TYPE_P (type) || TYPE_OVERFLOW_WRAPS (type) + /* For @0 + @0*@2 this transformation would introduce UB + (where there was none before) for @0 in [-1,0] and @2 max. + For @0 - @0*@2 this transformation would introduce UB + for @0 0 and @2 in [min,min+1] or @0 -1 and @2 min+1. */ || (INTEGRAL_TYPE_P (type) - && tree_expr_nonzero_p (@0) - && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) + && ((tree_expr_nonzero_p (@0) + && expr_not_equal_to (@0, + wi::minus_one (TYPE_PRECISION (type)))) + || (plusminus == PLUS_EXPR + ? expr_not_equal_to (@2, + wi::max_value (TYPE_PRECISION (type), SIGNED)) + /* Let's ignore the @0 -1 and @2 min case. */ + : (expr_not_equal_to (@2, + wi::min_value (TYPE_PRECISION (type), SIGNED)) + && expr_not_equal_to (@2, + wi::min_value (TYPE_PRECISION (type), SIGNED) + + 1)))))) && single_use (@3)) (mult (plusminus { build_one_cst (type); } @2) @0))) (simplify (plusminus (mult:c@3 @0 @2) @0) (if ((!ANY_INTEGRAL_TYPE_P (type) || TYPE_OVERFLOW_WRAPS (type) + /* For @0*@2 + @0 this transformation would introduce UB + (where there was none before) for @0 in [-1,0] and @2 max. + For @0*@2 - @0 this transformation would introduce UB + for @0 0 and @2 min. */ || (INTEGRAL_TYPE_P (type) - && tree_expr_nonzero_p (@0) - && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION (type))))) + && ((tree_expr_nonzero_p (@0) + && (plusminus == MINUS_EXPR + || expr_not_equal_to (@0, + wi::minus_one (TYPE_PRECISION (type))))) + || expr_not_equal_to (@2, + (plusminus == PLUS_EXPR + ? wi::max_value (TYPE_PRECISION (type), SIGNED) + : wi::min_value (TYPE_PRECISION (type), SIGNED)))))) && single_use (@3)) (mult (plusminus @2 { build_one_cst (type); }) @0)))))) --- gcc/testsuite/gcc.dg/tree-ssa/pr92712-1.c.jj 2019-11-29 19:47:42.900389708 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr92712-1.c 2019-11-29 19:50:26.704925013 +0100 @@ -0,0 +1,21 @@ +/* PR tree-optimization/92712 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump " = \[tv]_\[0-9]*\\\(D\\\) \\* \[tv]_\[0-9]*\\\(D\\\);" "optimized" } } */ + +static int +foo (int t, int v) +{ + int i, x = 0; + for (int i = 0; i < t; ++i) + x += v; + return x; +} + +int +bar (int t, int v) +{ + if (t < 0) + __builtin_unreachable (); + return foo (t, v); +} --- gcc/testsuite/gcc.dg/tree-ssa/pr92712-2.c.jj 2019-11-29 19:51:47.169714383 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr92712-2.c 2019-11-29 20:12:21.792153466 +0100 @@ -0,0 +1,66 @@ +/* PR tree-optimization/92712 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-ipa-icf -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump-times " = \[tv]_\[0-9]*\\\(D\\\) \\* \[tv]_\[0-9]*\\\(D\\\);" 7 "optimized" } } */ + +int +f1 (int t, int v) +{ + int a = t - 1; + int b = a * v; + return b + v; +} + +int +f2 (int t, int v) +{ + int a = t - 1; + int b = a * v; + return v + b; +} + +int +f3 (int t, int v) +{ + int a = t + 1; + int b = a * v; + return b - v; +} + +int +f4 (int t, int v) +{ + int a = 1 - t; + int b = a * v; + return v - b; +} + +int +f5 (int t, int v) +{ + if (v == 0 || v == -1) + __builtin_unreachable (); + int a = t - 1U; + int b = a * v; + return b + v; +} + +int +f6 (int t, int v) +{ + if (v == 0 || v == -1) + __builtin_unreachable (); + int a = t - 1U; + int b = a * v; + return v + b; +} + +int +f7 (int t, int v) +{ + if (v == 0) + __builtin_unreachable (); + int a = t + 1U; + int b = a * v; + return b - v; +} --- gcc/testsuite/gcc.dg/tree-ssa/pr92712-3.c.jj 2019-11-29 20:07:52.313198937 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr92712-3.c 2019-11-29 20:08:16.890829972 +0100 @@ -0,0 +1,36 @@ +/* PR tree-optimization/92712 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-ipa-icf -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump-not " = \[tv]_\[0-9]*\\\(D\\\) \\* \[tv]_\[0-9]*\\\(D\\\);" "optimized" } } */ + +int +f1 (int t, int v) +{ + int a = t - 1U; + int b = a * v; + return b + v; +} + +int +f2 (int t, int v) +{ + int a = t - 1U; + int b = a * v; + return v + b; +} + +int +f3 (int t, int v) +{ + int a = t + 1U; + int b = a * v; + return b - v; +} + +int +f4 (int t, int v) +{ + int a = 1U - t; + int b = a * v; + return v - b; +} --- gcc/testsuite/gfortran.dg/loop_versioning_1.f90.jj 2019-01-19 18:27:58.396602572 +0100 +++ gcc/testsuite/gfortran.dg/loop_versioning_1.f90 2019-11-30 00:49:42.250277985 +0100 @@ -23,6 +23,6 @@ subroutine f3(x, limit, step) end do end subroutine f3 -! { dg-final { scan-tree-dump-times {likely to be the innermost dimension} 2 "lversion" } } +! { dg-final { scan-tree-dump-times {likely to be the innermost dimension} 1 "lversion" } } ! { dg-final { scan-tree-dump-times {want to version containing loop} 3 "lversion" } } ! { dg-final { scan-tree-dump-times {versioned this loop} 3 "lversion" } } --- gcc/testsuite/gfortran.dg/loop_versioning_10.f90.jj 2019-01-19 18:27:58.396602572 +0100 +++ gcc/testsuite/gfortran.dg/loop_versioning_10.f90 2019-11-30 00:49:53.159113815 +0100 @@ -26,6 +26,6 @@ subroutine f4(x, i) end do end subroutine f4 -! { dg-final { scan-tree-dump-times {likely to be the innermost dimension} 6 "lversion" } } +! { dg-final { scan-tree-dump-times {likely to be the innermost dimension} 4 "lversion" } } ! { dg-final { scan-tree-dump-times {want to version} 4 "lversion" } } ! { dg-final { scan-tree-dump-times {versioned} 4 "lversion" } } --- gcc/testsuite/gfortran.dg/loop_versioning_6.f90.jj 2018-12-17 22:13:44.485132746 +0100 +++ gcc/testsuite/gfortran.dg/loop_versioning_6.f90 2019-11-30 00:58:42.291150793 +0100 @@ -89,5 +89,7 @@ subroutine f9(x, limit, step) end do end subroutine f9 -! { dg-final { scan-tree-dump-times {want to version containing loop} 9 "lversion" } } -! { dg-final { scan-tree-dump-times {versioned this loop} 9 "lversion" } } +! { dg-final { scan-tree-dump-times {want to version containing loop} 9 "lversion" { target lp64 } } } +! { dg-final { scan-tree-dump-times {versioned this loop} 9 "lversion" { target lp64 } } } +! { dg-final { scan-tree-dump-times {want to version containing loop} 8 "lversion" { target { ! lp64 } } } } +! { dg-final { scan-tree-dump-times {versioned this loop} 8 "lversion" { target { ! lp64 } } } } Jakub
int main () { for (int a = -128; a <= 127; a++) for (int b = -128; b <= 127; b++) { int ovf1, ovf2; int t; t = a * b; ovf1 = t != (signed char) t; t = (signed char) t + a; ovf1 |= t != (signed char) t; t = b + 1; ovf2 = t != (signed char) t; t = (signed char) t * a; ovf2 |= t != (signed char) t; if (!ovf1 && ovf2) __builtin_printf ("A*B+A -> (B+1)*A A==%d && B==%d\n", a, b); } for (int a = -128; a <= 127; a++) for (int b = -128; b <= 127; b++) { int ovf1, ovf2; int t; t = a * b; ovf1 = t != (signed char) t; t = (signed char) t - a; ovf1 |= t != (signed char) t; t = b - 1; ovf2 = t != (signed char) t; t = (signed char) t * a; ovf2 |= t != (signed char) t; if (!ovf1 && ovf2) __builtin_printf ("A*B-A -> (B-1)*A A==%d && B==%d\n", a, b); } for (int a = -128; a <= 127; a++) for (int b = -128; b <= 127; b++) { int ovf1, ovf2; int t; t = a * b; ovf1 = t != (signed char) t; t = a - (signed char) t; ovf1 |= t != (signed char) t; t = 1 - b; ovf2 = t != (signed char) t; t = (signed char) t * a; ovf2 |= t != (signed char) t; if (!ovf1 && ovf2) __builtin_printf ("A-A*B -> (1-B)*A A==%d && B==%d\n", a, b); } for (int a = -128; a <= 127; a++) for (int b = -128; b <= 127; b++) for (int c = -128; c <= 127; c++) { int ovf1, ovf2; int t, t2; t = a * b; ovf1 = t != (signed char) t; t2 = a * c; ovf1 |= t2 != (signed char) t2; t = (signed char) t + (signed char) t2; ovf1 |= t != (signed char) t; t = b + c; ovf2 = t != (signed char) t; t = (signed char) t * a; ovf2 |= t != (signed char) t; if (!ovf1 && ovf2) __builtin_printf ("A*B+A*C -> (B+C)*A A==%d && B==%d && C==%d\n", a, b, c); } for (int a = -128; a <= 127; a++) for (int b = -128; b <= 127; b++) for (int c = -128; c <= 127; c++) { int ovf1, ovf2; int t, t2; t = a * b; ovf1 = t != (signed char) t; t2 = a * c; ovf1 |= t2 != (signed char) t2; t = (signed char) t - (signed char) t2; ovf1 |= t != (signed char) t; t = b - c; ovf2 = t != (signed char) t; t = (signed char) t * a; ovf2 |= t != (signed char) t; if (!ovf1 && ovf2) __builtin_printf ("A*B-A*C -> (B-C)*A A==%d && B==%d && C==%d\n", a, b, c); } return 0; }