Hi Jakub,
> -----Original Message-----
> From: Jakub Jelinek <ja...@redhat.com>
> Sent: Tuesday, March 27, 2018 2:40 PM
> To: Richard Biener <rguent...@suse.de>
> Cc: gcc-patches@gcc.gnu.org; Kumar, Venkataramanan <venkataramanan.ku...@amd.com>
> Subject: [PATCH] Improve TRUTH_{AND,OR}IF_EXPR expansion (PR rtl-optimization/78200)
>
> Hi!
>
> The following patch attempts to improve expansion, if we have code like:
>   <bb 16> [local count: 102513059]:
>   if_conversion_var_52 = MEM[base: st1_22, offset: 0B];
>   if (if_conversion_var_52 < 0)
>     goto <bb 17>; [41.00%]
>   else
>     goto <bb 18>; [59.00%]
>
>   ...
>
>   <bb 18> [local count: 60482706]:
>   _81 = _11 == 2;
>   _82 = if_conversion_var_52 > 0;
>   _79 = _81 & _82;
>   if (_79 != 0)
>     goto <bb 19>; [29.26%]
>   else
>     goto <bb 20>; [70.74%]
>
> Here, the single pred of the bb performed a similar comparison to what one
> of the & (or |) operands does, and as & (or |) is not ordered, we can choose
> which operand we'll expand first.  If we expand if_conversion_var_52 > 0
> first, there is a chance that we can reuse flags from the previous comparison.
> The patch does it only if there are no (non-virtual) phis on the current bb and
> all stmts before the current condition are TERable, so there is nothing that
> would clobber the flags in between.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk if it
> helps for SPEC?  Venkat, do you think you could benchmark it in the setup
> where you've measured the slowdown to see if it helps?  I see the patch
> changes the loop:

The patch causes a regression (>4%) on the mcf benchmark when I measured it
on my Ryzen box.

GCC trunk (-O3 -mavx2 -mprefer-avx128, -O3 -march=znver1):
               ref (s)   run (s)   ratio
  429.mcf       9120       238      38.3 *
                9120       227      40.2 *

GCC patch:
  429.mcf       9120       251      36.3 *
                9120       236      38.6 *

> 	.p2align 4,,10
> 	.p2align 3
>  .L11:
> +	jle	.L10
>  	cmpl	$2, %eax
> -	jne	.L10
> -	testq	%r8, %r8
> -	jg	.L12
> +	je	.L12
>  	.p2align 4,,10
>  	.p2align 3
>  .L10:
>  	addq	%r9, %rsi
>  	cmpq	%rsi, %rdx
>  	jbe	.L35
>  .L13:
>  	movl	24(%rsi), %eax
>  	testl	%eax, %eax
>  	jle	.L10
>  	movq	(%rsi), %r8
>  	testq	%r8, %r8
>  	jns	.L11
>  	cmpl	$1, %eax
>  	jne	.L10
>  .L12:
>  	addq	$1, %r10
>  	movl	$1, %r11d
>  	movq	st5(,%r10,8), %rax
>  	movq	%rsi, (%rax)
>  	addq	%r9, %rsi
>  	movq	%r8, 8(%rax)
>  	movq	st5(,%rdi,8), %rax
>  	movq	%r8, 16(%rax)
>  	cmpq	%rsi, %rdx
>  	ja	.L13
> which I assume should be an improvement, since we can save one extra
> comparison.

This issue was reported against GCC 7.x.  Looking at the assembly generated
now, GCC trunk is already emitting the compares in the order needed for mcf:

  GCC trunk                          GCC patch
  .L41:                           |  .L41:
  ------------------------------- |    jle     .L40
    cmpl    $2, %edi              |    cmpl    $2, %edi
    jne     .L40                  |    je      .L42
    testq   %rdx, %rdx            |  -------------------------------
    jg      .L42                  |

The patch avoids the extra compare, but it again emits the compares in an
order that is bad for mcf.

GCC trunk:
    cmpl    $2, %edi
    jne     .L40        <== is almost always true
    testq   %rdx, %rdx

With the patch:
    jle     .L40        <== is almost always false
    cmpl    $2, %edi
    je      .L42

On trunk the first branch is almost always taken, so the second compare is
rarely executed; with the patch the first branch is almost never taken, so
both compares execute on nearly every iteration.
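To make this concrete, the source pattern behind the GIMPLE dump above looks
roughly like the following.  This is only a minimal sketch; the struct and
field names are made up and this is not the actual 429.mcf code.

/* Illustrative sketch of the mcf-like pattern (hypothetical names).  */
struct arc { long red_cost; int ident; };

int
dual_infeasible (const struct arc *a)
{
  if (a->red_cost < 0)            /* corresponds to the first compare (bb 16)  */
    return a->ident == 1;
  /* After optimization the '&&' below becomes a plain '&' of the two
     comparisons (as in the dump above), which has no evaluation order,
     so expansion may test either operand first.  Testing a->red_cost > 0
     first can reuse the flags from the compare above, but a->ident == 2
     is the operand that is almost always false here, so testing it first
     lets the common case branch away after a single compare.  */
  return a->ident == 2 && a->red_cost > 0;
}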
> 2018-03-27  Jakub Jelinek  <ja...@redhat.com>
> 	    Richard Biener  <rgue...@suse.de>
>
> 	PR rtl-optimization/78200
> 	* cfgexpand.c (gimple_bb_info_for_bb): New variable.
> 	(expand_bb_seq, expand_phi_nodes): New functions.
> 	(expand_gimple_cond): Use heuristics if it is desirable to
> 	swap TRUTH_{AND,OR}IF_EXPR operands.
> 	(expand_gimple_basic_block): Remember GIMPLE il for bbs
> 	being expanded or already expanded.
> 	(pass_expand::execute): Initialize and then free the
> 	gimple_bb_info_for_bb vector.
>
> --- gcc/cfgexpand.c.jj	2018-02-09 06:44:36.413805556 +0100
> +++ gcc/cfgexpand.c	2018-03-26 13:35:57.536509844 +0200
> @@ -2357,6 +2357,34 @@ label_rtx_for_bb (basic_block bb ATTRIBU
>      return l;
>  }
>
> +/* Maps blocks to their GIMPLE IL.  */
> +static vec<gimple_bb_info, va_heap, vl_embed> *gimple_bb_info_for_bb;
> +
> +/* Like bb_seq, except that during expansion returns the GIMPLE seq
> +   even for the blocks that have been already expanded or are being
> +   currently expanded.  */
> +
> +static gimple_seq
> +expand_bb_seq (basic_block bb)
> +{
> +  if ((bb->flags & BB_RTL)
> +      && (unsigned) bb->index < vec_safe_length (gimple_bb_info_for_bb))
> +    return (*gimple_bb_info_for_bb)[bb->index].seq;
> +  return bb_seq (bb);
> +}
> +
> +/* Like phi_nodes, except that during expansion returns the GIMPLE PHIs
> +   even for the blocks that have been already expanded or are being
> +   currently expanded.  */
> +
> +static gimple_seq
> +expand_phi_nodes (basic_block bb)
> +{
> +  if ((bb->flags & BB_RTL)
> +      && (unsigned) bb->index < vec_safe_length (gimple_bb_info_for_bb))
> +    return (*gimple_bb_info_for_bb)[bb->index].phi_nodes;
> +  return phi_nodes (bb);
> +}
>
>  /* A subroutine of expand_gimple_cond.  Given E, a fallthrough edge
>     of a basic block where we just expanded the conditional at the end,
> @@ -2475,6 +2503,65 @@ expand_gimple_cond (basic_block bb, gcon
>  	  op0 = gimple_assign_rhs1 (second);
>  	  op1 = gimple_assign_rhs2 (second);
>  	}
> +
> +      /* We'll expand RTL for op0 first, see if we'd better expand RTL
> +	 for op1 first.  Do that if the previous bb ends with
> +	 if (x op cst), op1's def_stmt rhs is x op2 cst and there are
> +	 no non-virtual PHIs nor non-TERed stmts in BB before STMT.  */
> +      while (TREE_CODE (op1) == SSA_NAME
> +	     && (code == TRUTH_ANDIF_EXPR || code == TRUTH_ORIF_EXPR)
> +	     && single_pred_p (bb))
> +	{
> +	  gimple *def1 = SSA_NAME_DEF_STMT (op1);
> +	  if (!is_gimple_assign (def1)
> +	      || (TREE_CODE_CLASS (gimple_assign_rhs_code (def1))
> +		  != tcc_comparison))
> +	    break;
> +
> +	  basic_block pred = single_pred (bb);
> +	  gimple_seq pred_seq = expand_bb_seq (pred);
> +	  gimple_stmt_iterator i = gsi_last (pred_seq);
> +	  if (!gsi_end_p (i) && is_gimple_debug (gsi_stmt (i)))
> +	    gsi_prev_nondebug (&i);
> +
> +	  gimple *last = gsi_stmt (i);
> +	  if (!last
> +	      || gimple_code (last) != GIMPLE_COND
> +	      || gimple_assign_rhs1 (def1) != gimple_cond_lhs (last)
> +	      || !operand_equal_p (gimple_assign_rhs2 (def1),
> +				   gimple_cond_rhs (last), 0))
> +	    break;
> +
> +	  gimple_seq phi_seq = expand_phi_nodes (bb);
> +	  i = gsi_start (phi_seq);
> +	  if (!gsi_end_p (i)
> +	      && !virtual_operand_p (gimple_phi_result (gsi_stmt (i))))
> +	    break;
> +
> +	  /* Check if all stmts before STMT are TERed.  */
> +	  gimple_seq cur_seq = expand_bb_seq (bb);
> +	  i = gsi_start_nondebug (cur_seq);
> +	  int cnt = 100;
> +	  while (!gsi_end_p (i) && gsi_stmt (i) != stmt)
> +	    {
> +	      gimple *g = gsi_stmt (i);
> +	      if (!is_gimple_assign (g))
> +		break;
> +	      if (--cnt == 0)
> +		break;
> +	      tree lhs = gimple_assign_lhs (g);
> +	      if (TREE_CODE (lhs) != SSA_NAME
> +		  || !get_gimple_for_ssa_name (lhs))
> +		break;
> +	      gsi_next_nondebug (&i);
> +	    }
> +
> +	  if (gsi_stmt (i) != stmt)
> +	    break;
> +
> +	  std::swap (op0, op1);
> +	  break;
> +	}
>      }
>    }
>  }
> @@ -5501,6 +5588,10 @@ expand_gimple_basic_block (basic_block b
>    if (optimize)
>      reorder_operands (bb);
>    stmts = bb_seq (bb);
> +  if ((unsigned) bb->index >= vec_safe_length (gimple_bb_info_for_bb))
> +    vec_safe_grow_cleared (gimple_bb_info_for_bb, bb->index + 1);
> +  (*gimple_bb_info_for_bb)[bb->index].seq = stmts;
> +  (*gimple_bb_info_for_bb)[bb->index].phi_nodes = phi_nodes (bb);
>    bb->il.gimple.seq = NULL;
>    bb->il.gimple.phi_nodes = NULL;
>    rtl_profile_for_bb (bb);
> @@ -6419,6 +6510,8 @@ pass_expand::execute (function *fun)
> 	  >= PARAM_VALUE (PARAM_MAX_DEBUG_MARKER_COUNT))
> 	cfun->debug_nonbind_markers = false;
>
> +  vec_safe_grow_cleared (gimple_bb_info_for_bb,
> +			 n_basic_blocks_for_fn (cfun));
> +
>    lab_rtx_for_bb = new hash_map<basic_block, rtx_code_label *>;
>    FOR_BB_BETWEEN (bb, init_block->next_bb, EXIT_BLOCK_PTR_FOR_FN (fun),
> 		  next_bb)
> @@ -6433,6 +6526,8 @@ pass_expand::execute (function *fun)
>        deep_ter_debug_map = NULL;
>      }
>
> +  vec_free (gimple_bb_info_for_bb);
> +
>    /* Free stuff we no longer need after GIMPLE optimizations.  */
>    free_dominance_info (CDI_DOMINATORS);
>    free_dominance_info (CDI_POST_DOMINATORS);
>
> 	Jakub

Regards,
Venkat.