On Wed, Dec 11, 2013 at 7:54 AM, Jeff Law <l...@redhat.com> wrote:
>
> So for this source, compiled for x86_64 with -O3:
>
> typedef unsigned long int uint64_t;
> typedef long int int64_t;
> int summation_helper_1(int64_t* products, uint64_t count)
> {
> int s = 0;
> uint64_t i;
> for(i=0; i<count; i++)
> {
> int64_t val = (products[i]>0) ? 1 : -1;
> products[i] *= val;
> if(products[i] != i)
> val = -val;
> products[i] = val;
> s += val;
> }
> return s;
> }
>
>
> int summation_helper_2(int64_t* products, uint64_t count)
> {
> int s = 0;
> uint64_t i;
> for(i=0; i<count; i++)
> {
> int val = (products[i]>0) ? 1 : -1;
> products[i] *= val;
> if(products[i] != i)
> val = -val;
> products[i] = val;
> s += val;
> }
> return s;
> }
>
>
> The loops we generate are pretty bad and have regressed relative to older
> versions of GCC.
>
> For the first loop, we have the following .optimized output for the loop:
>
> <bb 4>:
> # s_28 = PHI <s_20(5), 0(3)>
> # i_27 = PHI <i_21(5), 0(3)>
> _11 = MEM[base: products_9(D), index: i_27, step: 8, offset: 0B];
> val_4 = _11 > 0 ? 1 : -1;
> prephitmp_38 = _11 > 0 ? -1 : 1;
> prephitmp_39 = _11 > 0 ? 4294967295 : 1;
> prephitmp_41 = _11 > 0 ? 1 : 4294967295;
> _12 = val_4 * _11;
> _14 = (long unsigned int) _12;
> val_3 = _14 != i_27 ? prephitmp_38 : val_4;
> prephitmp_44 = _14 != i_27 ? prephitmp_39 : prephitmp_41;
> MEM[base: products_9(D), index: i_27, step: 8, offset: 0B] = val_3;
> s.1_18 = (unsigned int) s_28;
> _19 = prephitmp_44 + s.1_18;
> s_20 = (int) _19;
> i_21 = i_27 + 1;
> if (i_21 != count_7(D))
> goto <bb 5>;
> else
> goto <bb 6>;
>
> <bb 5>:
> goto <bb 4>;
>
>
> Note the series of COND_EXPRs. A couple are just conditional negation which
> can be implemented with a straight-line code sequence. Using that
> straight-line sequence results in:
>
> <bb 4>:
> # s_31 = PHI <s_20(5), 0(3)>
> # i_32 = PHI <i_21(5), 0(3)>
> _11 = MEM[base: products_9(D), index: i_32, step: 8, offset: 0B];
> val_4 = _11 > 0 ? 1 : -1;
> _12 = val_4 * _11;
> _14 = (long unsigned int) _12;
> _24 = _14 != i_32;
> _25 = (int64_t) _24;
> _29 = -_25;
> _28 = _29 ^ val_4;
> _27 = _28 + _25;
> MEM[base: products_9(D), index: i_32, step: 8, offset: 0B] = _27;
> _17 = (unsigned int) _27;
> s.1_18 = (unsigned int) s_31;
> _19 = _17 + s.1_18;
> s_20 = (int) _19;
> i_21 = i_32 + 1;
> if (i_21 != count_7(D))
> goto <bb 5>;
> else
> goto <bb 6>;
>
> <bb 5>:
> goto <bb 4>;
>
> Which *appears* worse. However, that code can much more easily be handled
> by the RTL optimizers. When we look at what the trunk generates at the
> assembly level we have:
>
> .L3:
> movq (%rdi,%rcx,8), %rdx
> testq %rdx, %rdx
> setg %r8b
> movzbl %r8b, %r10d
> movzbl %r8b, %r8d
> leaq -1(%r10,%r10), %r10
> leal -1(%r8,%r8), %r8d
> movq %r10, %r11
> imulq %rdx, %r11
> testq %rdx, %rdx
> setle %dl
> movzbl %dl, %r9d
> movzbl %dl, %edx
> leaq -1(%r9,%r9), %r9
> leal -1(%rdx,%rdx), %edx
> cmpq %rcx, %r11
> cmove %r10, %r9
> cmove %r8d, %edx
> movq %r9, (%rdi,%rcx,8)
> addq $1, %rcx
> addl %edx, %eax
> cmpq %rsi, %rcx
> jne .L3
> (Ick)
>
> With the conditional negation patch that turns into:
>
> .L3:
> movq (%rdi,%rcx,8), %r8
> xorl %edx, %edx
> testq %r8, %r8
> setg %dl
> leaq -1(%rdx,%rdx), %rdx
> imulq %rdx, %r8
> cmpq %rcx, %r8
> setne %r8b
> movzbl %r8b, %r8d
> movq %r8, %r9
> negq %r9
> xorq %r9, %rdx
> addq %r8, %rdx
> movq %rdx, (%rdi,%rcx,8)
> addq $1, %rcx
> addl %edx, %eax
> cmpq %rsi, %rcx
> jne .L3
>
> No branches within the loop, no conditional moves either. In all it's 5
> instructions shorter.
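As an aside for readers of the dumps above: the straight-line sequence is
the usual two's-complement identity, cond ? -x : x == (x ^ -cond) + cond
for cond in {0, 1}.  A minimal standalone illustration (the function and
variable names below are purely illustrative, not taken from the patch):

#include <stdint.h>

/* Branch-free conditional negation; cond must be 0 or 1.
   cond == 0:  (x ^ 0) + 0  == x
   cond == 1:  (x ^ -1) + 1 == ~x + 1 == -x   (two's complement)
   This mirrors the GIMPLE above:
     _25 = (int64_t) _24;    c = (int64_t) cond
     _29 = -_25;             m = -c   (0 or an all-ones mask)
     _28 = _29 ^ val_4;      t = x ^ m
     _27 = _28 + _25;        result = t + c  */
int64_t
cond_neg (int64_t x, int64_t cond)
{
  int64_t m = -cond;
  return (x ^ m) + cond;
}

On x86_64 that is exactly the setne/negq/xorq/addq sequence visible in the
improved loop body.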
>
>
> The second loop shows similar effects, though they're not as dramatic.
>
> Before:
> <bb 4>:
> # s_27 = PHI <s_19(5), 0(3)>
> # i_26 = PHI <i_20(5), 0(3)>
> _11 = MEM[base: products_9(D), index: i_26, step: 8, offset: 0B];
> val_4 = _11 > 0 ? 1 : -1;
> prephitmp_32 = _11 > 0 ? 1 : -1;
> prephitmp_33 = _11 > 0 ? -1 : 1;
> prephitmp_34 = _11 > 0 ? -1 : 1;
> _13 = _11 * prephitmp_32;
> _15 = (long unsigned int) _13;
> val_3 = _15 != i_26 ? prephitmp_33 : val_4;
> prephitmp_36 = _15 != i_26 ? prephitmp_34 : prephitmp_32;
> MEM[base: products_9(D), index: i_26, step: 8, offset: 0B] = prephitmp_36;
> s_19 = val_3 + s_27;
> i_20 = i_26 + 1;
> if (i_20 != count_7(D))
> goto <bb 5>;
> else
> goto <bb 6>;
>
> <bb 5>:
> goto <bb 4>;
>
>
> Which results in the following assembly:
>
> .L8:
> movq (%rdi,%r8,8), %rdx
> testq %rdx, %rdx
> movq %rdx, %r11
> setg %cl
> movzbl %cl, %r10d
> movzbl %cl, %ecx
> leaq -1(%rcx,%rcx), %rcx
> leal -1(%r10,%r10), %r10d
> imulq %rcx, %r11
> testq %rdx, %rdx
> setle %dl
> movzbl %dl, %r9d
> movzbl %dl, %edx
> leaq -1(%r9,%r9), %r9
> leal -1(%rdx,%rdx), %edx
> cmpq %r8, %r11
> cmovne %r9, %rcx
> cmove %r10d, %edx
> movq %rcx, (%rdi,%r8,8)
> addq $1, %r8
> addl %edx, %eax
> cmpq %rsi, %r8
> jne .L8
>
>
>
> With the conditional negation patch:
>
> <bb 4>:
> # s_31 = PHI <s_20(5), 0(3)>
> # i_32 = PHI <i_21(5), 0(3)>
> _11 = MEM[base: products_9(D), index: i_32, step: 8, offset: 0B];
> val_4 = _11 > 0 ? 1 : -1;
> _12 = val_4 * _11;
> _14 = (long unsigned int) _12;
> _24 = _14 != i_32;
> _25 = (int64_t) _24;
> _29 = -_25;
> _28 = _29 ^ val_4;
> _27 = _28 + _25;
> MEM[base: products_9(D), index: i_32, step: 8, offset: 0B] = _27;
> _17 = (unsigned int) _27;
> s.1_18 = (unsigned int) s_31;
> _19 = _17 + s.1_18;
> s_20 = (int) _19;
> i_21 = i_32 + 1;
> if (i_21 != count_7(D))
> goto <bb 5>;
> else
> goto <bb 6>;
>
> <bb 5>:
> goto <bb 4>;
>
> Which again looks worse than the original, but optimizes well into:
>
> .L8:
> movq (%rdi,%r8,8), %r9
> testq %r9, %r9
> setg %cl
> movzbl %cl, %edx
> leaq -1(%rdx,%rdx), %rdx
> imulq %r9, %rdx
> xorl %r9d, %r9d
> cmpq %r8, %rdx
> movzbl %cl, %edx
> setne %r9b
> leal -1(%rdx,%rdx), %edx
> movl %r9d, %r10d
> negl %r10d
> xorl %r10d, %edx
> addl %r9d, %edx
> movslq %edx, %rcx
> addl %edx, %eax
> movq %rcx, (%rdi,%r8,8)
> addq $1, %r8
> cmpq %rsi, %r8
> jne .L8
>
>
> Bootstrapped and regression tested on x86_64-unknown-linux-gnu. OK for the
> trunk?
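As a quick sanity check (a throwaway harness of mine, not part of the patch
or of the new testcase), the straight-line form can be compared against the
obvious branchy form for both the 64-bit and 32-bit widths used by the two
helpers:

#include <assert.h>
#include <stdint.h>

/* Reference: the branchy form from the source above.  */
static int64_t ref_neg64 (int64_t v, int flip) { return flip ? -v : v; }
static int     ref_neg32 (int v, int flip)     { return flip ? -v : v; }

/* The straight-line form phiopt now emits.  */
static int64_t sl_neg64 (int64_t v, int flip)
{ int64_t c = flip; return (v ^ -c) + c; }
static int sl_neg32 (int v, int flip)
{ int c = flip; return (v ^ -c) + c; }

int
main (void)
{
  int64_t vals[] = { 0, 1, -1, 2, -2, 4294967296LL, -4294967296LL };
  for (unsigned i = 0; i < sizeof vals / sizeof vals[0]; i++)
    for (int flip = 0; flip <= 1; flip++)
      {
        assert (ref_neg64 (vals[i], flip) == sl_neg64 (vals[i], flip));
        assert (ref_neg32 ((int) vals[i], flip) == sl_neg32 ((int) vals[i], flip));
      }
  return 0;
}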
First of all, phiopt runs unconditionally for -On with n > 0, but the
conversion is clearly not suitable for non-speed optimizations.  Thus I'd
guard it with at least !optimize_size && optimize >= 2.  As you are
targeting a worse transformation done by if-conversion, you may want to add

  || (((flag_tree_loop_vectorize || cfun->has_force_vect_loops)
       && flag_tree_loop_if_convert != 0)
      || flag_tree_loop_if_convert == 1
      || flag_tree_loop_if_convert_stores == 1)

(ugh).  A rough sketch of such a guard is appended after the patch below.
More comments below.

>
> PR tree-optimization/45685
> * tree-ssa-phiopt.c (neg_replacement): New function.
> (tree_ssa_phiopt_worker): Call it.
>
> PR tree-optimization/45685
> * gcc.dg/tree-ssa/pr45685.c: New test.
>
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
> b/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
> new file mode 100644
> index 0000000..0628943
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
> @@ -0,0 +1,41 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-phiopt1-details" } */
> +
> +typedef unsigned long int uint64_t;
> +typedef long int int64_t;
> +int summation_helper_1(int64_t* products, uint64_t count)
> +{
> + int s = 0;
> + uint64_t i;
> + for(i=0; i<count; i++)
> + {
> + int64_t val = (products[i]>0) ? 1 : -1;
> + products[i] *= val;
> + if(products[i] != i)
> + val = -val;
> + products[i] = val;
> + s += val;
> + }
> + return s;
> +}
> +
> +
> +int summation_helper_2(int64_t* products, uint64_t count)
> +{
> + int s = 0;
> + uint64_t i;
> + for(i=0; i<count; i++)
> + {
> + int val = (products[i]>0) ? 1 : -1;
> + products[i] *= val;
> + if(products[i] != i)
> + val = -val;
> + products[i] = val;
> + s += val;
> + }
> + return s;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "converted to straightline code" 2
> "phiopt1" } } */
> +/* { dg-final { cleanup-tree-dump "phiopt1" } } */
> +
> diff --git a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c
> index 11e565f..2522255 100644
> --- a/gcc/tree-ssa-phiopt.c
> +++ b/gcc/tree-ssa-phiopt.c
> @@ -69,6 +69,8 @@ static bool minmax_replacement (basic_block, basic_block,
> edge, edge, gimple, tree, tree);
> static bool abs_replacement (basic_block, basic_block,
> edge, edge, gimple, tree, tree);
> +static bool neg_replacement (basic_block, basic_block,
> + edge, edge, gimple, tree, tree);
> static bool cond_store_replacement (basic_block, basic_block, edge, edge,
> struct pointer_set_t *);
> static bool cond_if_else_store_replacement (basic_block, basic_block,
> basic_block);
> @@ -489,6 +491,8 @@ tree_ssa_phiopt_worker (bool do_store_elim, bool
> do_hoist_loads)
> cfgchanged = true;
> else if (abs_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
> cfgchanged = true;
> + else if (neg_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
> + cfgchanged = true;
> else if (minmax_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
> cfgchanged = true;
> }
> @@ -1285,6 +1289,143 @@ abs_replacement (basic_block cond_bb, basic_block
> middle_bb,
> return true;
> }
>
> +/* The function neg_replacement replaces conditional negation with
> + equivalent straight line code. Returns TRUE if replacement is done,
> + otherwise returns FALSE.
> +
> + COND_BB branches around negation occurring in MIDDLE_BB.
> +
> + E0 and E1 are edges out of COND_BB. E0 reaches MIDDLE_BB and
> + E1 reaches the other successor which should contain PHI with
> + arguments ARG0 and ARG1.
> +
> + Assuming negation is to occur when the condition is true,
> + then the non-branching sequence is:
> +
> + result = (rhs ^ -cond) + cond
> +
> + Inverting the condition or its result gives us negation
> + when the original condition is false. */
> +
> +static bool
> +neg_replacement (basic_block cond_bb, basic_block middle_bb,
> + edge e0 ATTRIBUTE_UNUSED, edge e1,
> + gimple phi, tree arg0, tree arg1)
> +{
> + gimple new_stmt, cond;
> + gimple_stmt_iterator gsi;
> + gimple assign;
> + edge true_edge, false_edge;
> + tree rhs, lhs;
> + enum tree_code cond_code;
> + bool invert = false;
> +
> + /* This transformation performs logical operations on the
> + incoming arguments. So force them to be integral types. */
> + if (!INTEGRAL_TYPE_P (TREE_TYPE (arg0)))
> + return false;
> + /* OTHER_BLOCK must have only one executable statement which must have
> the
> + form arg0 = -arg1 or arg1 = -arg0. */
> +
> + assign = last_and_only_stmt (middle_bb);
> + /* If we did not find the proper negation assignment, then we can not
> + optimize. */
> + if (assign == NULL)
> + return false;
> +
> + /* If we got here, then we have found the only executable statement
> + in OTHER_BLOCK. If it is anything other than arg0 = -arg1 or
> + arg1 = -arg0, then we can not optimize. */
> + if (gimple_code (assign) != GIMPLE_ASSIGN)
> + return false;
> +
> + lhs = gimple_assign_lhs (assign);
> +
> + if (gimple_assign_rhs_code (assign) != NEGATE_EXPR)
> + return false;
> +
> + rhs = gimple_assign_rhs1 (assign);
> +
> + /* The assignment has to be arg0 = -arg1 or arg1 = -arg0. */
> + if (!(lhs == arg0 && rhs == arg1)
> + && !(lhs == arg1 && rhs == arg0))
> + return false;
> +
> + /* The basic sequence assumes we negate when the condition is true.
> + If we need the opposite, then we will either need to invert the
> + condition or its result. */
> + extract_true_false_edges_from_block (cond_bb, &true_edge, &false_edge);
> + invert = false_edge->dest == middle_bb;
> +
> + /* Unlike abs_replacement, we can handle arbitrary conditionals here. */
> + cond = last_stmt (cond_bb);
> + cond_code = gimple_cond_code (cond);
> +
> + /* If inversion is needed, first try to invert the test since
> + that's cheapest. */
> + if (invert)
> + {
> + enum tree_code new_code
> + = invert_tree_comparison (cond_code,
> + HONOR_NANS (TYPE_MODE (TREE_TYPE (rhs))));

That looks wrong - you want to look at HONOR_NANS on the mode of one of
the comparison operands, not of the actual value you want to negate (it's
integer and thus never has NaNs).  (A possible spelling is sketched after
the patch below.)

The rest of the patch looks ok to me.

Thanks,
Richard.

> +
> + /* If invert_tree_comparison was successful, then use its return
> + value as the new code and note that inversion is no longer
> + needed. */
> + if (new_code != ERROR_MARK)
> + {
> + cond_code = new_code;
> + invert = false;
> + }
> + }
> +
> + tree cond_val = make_ssa_name (boolean_type_node, NULL);
> + new_stmt = gimple_build_assign_with_ops (cond_code, cond_val,
> + gimple_cond_lhs (cond),
> + gimple_cond_rhs (cond));
> + gsi = gsi_last_bb (cond_bb);
> + gsi_insert_before (&gsi, new_stmt, GSI_NEW_STMT);
> +
> + /* If we still need inversion, then invert the result of the
> + condition. */
> + if (invert)
> + {
> + tree tmp = make_ssa_name (boolean_type_node, NULL);
> + new_stmt = gimple_build_assign_with_ops (BIT_XOR_EXPR, tmp,
> + cond_val, boolean_true_node);
> + gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> + cond_val = tmp;
> + }
> +
> + /* Get the condition in the right type so that we can perform
> + logical and arithmetic operations on it. */
> + tree cond_val_converted = make_ssa_name (TREE_TYPE (rhs), NULL);
> + new_stmt = gimple_build_assign_with_ops (NOP_EXPR, cond_val_converted,
> + cond_val, NULL_TREE);
> + gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +
> + tree neg_cond_val_converted = make_ssa_name (TREE_TYPE (rhs), NULL);
> + new_stmt = gimple_build_assign_with_ops (NEGATE_EXPR,
> neg_cond_val_converted,
> + cond_val_converted, NULL_TREE);
> + gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +
> + tree tmp = make_ssa_name (TREE_TYPE (rhs), NULL);
> + new_stmt = gimple_build_assign_with_ops (BIT_XOR_EXPR, tmp,
> + rhs, neg_cond_val_converted);
> + gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +
> + tree new_lhs = make_ssa_name (TREE_TYPE (rhs), NULL);
> + new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, new_lhs,
> + tmp, cond_val_converted);
> + gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
> +
> + replace_phi_edge_with_variable (cond_bb, e1, phi, new_lhs);
> +
> + /* Note that we optimized this PHI. */
> + return true;
> +}
> +
> /* Auxiliary functions to determine the set of memory accesses which
> can't trap because they are preceded by accesses to the same memory
> portion. We do that for MEM_REFs, so we only need to track
>
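Two concrete sketches for the points raised above, purely for illustration
and untested; the final spelling and placement are of course up to the
patch author.

For the guard at the top of this reply, transcribing the suggested
condition into tree_ssa_phiopt_worker might look roughly like:

  /* Conditional negation only pays off when optimizing for speed, or when
     if-conversion would otherwise be allowed to make things worse.
     Rough sketch only; untested.  */
  bool do_neg_replacement
    = ((!optimize_size && optimize >= 2)
       || ((flag_tree_loop_vectorize || cfun->has_force_vect_loops)
           && flag_tree_loop_if_convert != 0)
       || flag_tree_loop_if_convert == 1
       || flag_tree_loop_if_convert_stores == 1);

  /* ... and gate the new transform on it:  */
  else if (do_neg_replacement
           && neg_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
    cfgchanged = true;

For the HONOR_NANS comment inline in the patch, the mode presumably needs
to come from one of the comparison operands rather than from the value
being negated, i.e. something along the lines of:

  enum tree_code new_code
    = invert_tree_comparison (cond_code,
                              HONOR_NANS (TYPE_MODE (TREE_TYPE
                                            (gimple_cond_lhs (cond)))));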