Hi Jakub,

> -----Original Message-----
> From: Jakub Jelinek <ja...@redhat.com>
> Sent: Tuesday, March 27, 2018 2:40 PM
> To: Richard Biener <rguent...@suse.de>
> Cc: gcc-patches@gcc.gnu.org; Kumar, Venkataramanan
> <venkataramanan.ku...@amd.com>
> Subject: [PATCH] Improve TRUTH_{AND,OR}IF_EXPR expansion (PR rtl-
> optimization/78200)
> 
> Hi!
> 
> The following patch attempts to improve expansion, if we have code like:
>   <bb 16> [local count: 102513059]:
>   if_conversion_var_52 = MEM[base: st1_22, offset: 0B];
>   if (if_conversion_var_52 < 0)
>     goto <bb 17>; [41.00%]
>   else
>     goto <bb 18>; [59.00%]
> 
> ...
> 
>   <bb 18> [local count: 60482706]:
>   _81 = _11 == 2;
>   _82 = if_conversion_var_52 > 0;
>   _79 = _81 & _82;
>   if (_79 != 0)
>     goto <bb 19>; [29.26%]
>   else
>     goto <bb 20>; [70.74%]
> 
> Here, the single pred of the bb performed a similar comparison to what one
> of the & (or |) operands does, and as & (or |) is not ordered, we can choose
> which operand we'll expand first.  If we expand if_conversion_var_52 > 0
> first, there is a chance that we can reuse flags from the previous comparison.
> The patch does it only if there are no (non-virtual) phis on the current bb
> and all stmts before the current condition are TERable, so there is nothing
> that would clobber the flags in between.
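
To make the shape above concrete: here is a minimal C sketch (my own
illustrative example, not taken from mcf or from the patch) that can produce
GIMPLE of roughly this form, where the predecessor block tests v < 0 and the
next block tests (x == 2) & (v > 0), so expanding the second operand first
would let the flags from the previous comparison be reused.  The exact GIMPLE
depends on the pass pipeline, so treat this only as a sketch:

/* foo.c -- hypothetical example, not from the thread.  */
long
foo (long *p, int x)
{
  long v = *p;
  if (v < 0)                /* predecessor bb: if (v_N < 0)              */
    return -1;
  if (x == 2 && v > 0)      /* _1 = x == 2; _2 = v > 0; _3 = _1 & _2;    */
    return 1;
  return 0;
}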
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk if it
> helps for SPEC?  Venkat, do you think you could benchmark it in the setup
> where you've measured the slowdown to see if it helps?  I see the patch
> changes the loop:

The patch causes a regression (>4%) on this benchmark when I measured it on my
Ryzen box.  (Columns below: SPEC reference time, run time in seconds, ratio;
a higher ratio is better.)

GCC trunk:       -O3 -mavx2 -mprefer-avx128         -O3 -march=znver1
429.mcf          9120        238       38.3 *       9120        227       40.2 *

GCC patch:
429.mcf          9120        251       36.3 *       9120        236       38.6 *

>       .p2align 4,,10
>       .p2align 3
>  .L11:
> +     jle     .L10
>       cmpl    $2, %eax
> -     jne     .L10
> -     testq   %r8, %r8
> -     jg      .L12
> +     je      .L12
>       .p2align 4,,10
>       .p2align 3
>  .L10:
>       addq    %r9, %rsi
>       cmpq    %rsi, %rdx
>       jbe     .L35
>  .L13:
>       movl    24(%rsi), %eax
>       testl   %eax, %eax
>       jle     .L10
>       movq    (%rsi), %r8
>       testq   %r8, %r8
>       jns     .L11
>       cmpl    $1, %eax
>       jne     .L10
>  .L12:
>       addq    $1, %r10
>       movl    $1, %r11d
>       movq    st5(,%r10,8), %rax
>       movq    %rsi, (%rax)
>       addq    %r9, %rsi
>       movq    %r8, 8(%rax)
>       movq    st5(,%rdi,8), %rax
>       movq    %r8, 16(%rax)
>       cmpq    %rsi, %rdx
>       ja      .L13
> which I assume shall be an improvement, since we can save one extra
> comparison.

This issue was originally reported against GCC 7.x.  Looking at the assembly
generated now, GCC trunk already emits the compares in the order mcf needs.


GCC trunk                                    GCC patch
.L41:                                        .L41:
                                                     jle     .L40
        cmpl    $2, %edi                             cmpl    $2, %edi
        jne     .L40                                 je      .L42
        testq   %rdx, %rdx
        jg      .L42


The patch does avoid the extra compare, but it again emits the compares in an
order that is bad for mcf.

GCC trunk:
        cmpl    $2, %edi
        jne     .L40          <== almost always taken
        testq   %rdx, %rdx

GCC patch:
        jle     .L40          <== almost always not taken
        cmpl    $2, %edi
        je      .L42
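
So with the patch the hot path through the loop gets longer: trunk falls out
of the block after cmpl+jne in the common case, while the patched code
executes jle, cmpl and je almost every iteration.  A purely illustrative
back-of-the-envelope model (my own sketch in C, with made-up placeholder
probabilities reflecting the annotations above, and ignoring branch-prediction
and macro-fusion effects) of the instructions executed per iteration:

/* cost_model.c -- hypothetical illustration, not measured data.  */
#include <stdio.h>

int
main (void)
{
  double p_eq2 = 0.05;  /* assumed: _11 == 2 rarely holds                  */
  double p_gt0 = 0.95;  /* assumed: if_conversion_var_52 > 0 usually holds */

  /* Trunk order: cmpl + jne always, then testq + jg only when _11 == 2.  */
  double trunk = 2.0 + p_eq2 * 2.0;
  /* Patched order: jle always (flags reused), then cmpl + je when > 0.   */
  double patched = 1.0 + p_gt0 * 2.0;

  printf ("trunk: %.2f insns/iter, patched: %.2f insns/iter\n",
          trunk, patched);
  return 0;
}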
 
> 
> 2018-03-27  Jakub Jelinek  <ja...@redhat.com>
>           Richard Biener  <rgue...@suse.de>
> 
>       PR rtl-optimization/78200
>       * cfgexpand.c (gimple_bb_info_for_bb): New variable.
>       (expand_bb_seq, expand_phi_nodes): New functions.
>       (expand_gimple_cond): Use heuristics if it is desirable to
>       swap TRUTH_{AND,OR}IF_EXPR operands.
>       (expand_gimple_basic_block): Remember GIMPLE il for bbs
>       being expanded or already expanded.
>       (pass_expand::execute): Initialize and then free the
>       gimple_bb_info_for_bb vector.
> 
> --- gcc/cfgexpand.c.jj        2018-02-09 06:44:36.413805556 +0100
> +++ gcc/cfgexpand.c   2018-03-26 13:35:57.536509844 +0200
> @@ -2357,6 +2357,34 @@ label_rtx_for_bb (basic_block bb ATTRIBU
>    return l;
>  }
> 
> +/* Maps blocks to their GIMPLE IL.  */
> +static vec<gimple_bb_info, va_heap, vl_embed> *gimple_bb_info_for_bb;
> +
> +/* Like bb_seq, except that during expansion returns the GIMPLE seq
> +   even for the blocks that have been already expanded or are being
> +   currently expanded.  */
> +
> +static gimple_seq
> +expand_bb_seq (basic_block bb)
> +{
> +  if ((bb->flags & BB_RTL)
> +      && (unsigned) bb->index < vec_safe_length (gimple_bb_info_for_bb))
> +    return (*gimple_bb_info_for_bb)[bb->index].seq;
> +  return bb_seq (bb);
> +}
> +
> +/* Like phi_nodes, except that during expansion returns the GIMPLE PHIs
> +   even for the blocks that have been already expanded or are being
> +   currently expanded.  */
> +
> +static gimple_seq
> +expand_phi_nodes (basic_block bb)
> +{
> +  if ((bb->flags & BB_RTL)
> +      && (unsigned) bb->index < vec_safe_length (gimple_bb_info_for_bb))
> +    return (*gimple_bb_info_for_bb)[bb->index].phi_nodes;
> +  return phi_nodes (bb);
> +}
> 
>  /* A subroutine of expand_gimple_cond.  Given E, a fallthrough edge
>     of a basic block where we just expanded the conditional at the end,
> @@ -2475,6 +2503,65 @@ expand_gimple_cond (basic_block bb, gcon
>                 op0 = gimple_assign_rhs1 (second);
>                 op1 = gimple_assign_rhs2 (second);
>               }
> +
> +           /* We'll expand RTL for op0 first, see if we'd better expand RTL
> +              for op1 first.  Do that if the previous bb ends with
> +              if (x op cst), op1's def_stmt rhs is x op2 cst and there are
> +              no non-virtual PHIs nor non-TERed stmts in BB before STMT.  */
> +           while (TREE_CODE (op1) == SSA_NAME
> +                  && (code == TRUTH_ANDIF_EXPR || code == TRUTH_ORIF_EXPR)
> +                  && single_pred_p (bb))
> +             {
> +               gimple *def1 = SSA_NAME_DEF_STMT (op1);
> +               if (!is_gimple_assign (def1)
> +                   || (TREE_CODE_CLASS (gimple_assign_rhs_code (def1))
> +                       != tcc_comparison))
> +                 break;
> +
> +               basic_block pred = single_pred (bb);
> +               gimple_seq pred_seq = expand_bb_seq (pred);
> +               gimple_stmt_iterator i = gsi_last (pred_seq);
> +               if (!gsi_end_p (i) && is_gimple_debug (gsi_stmt (i)))
> +                 gsi_prev_nondebug (&i);
> +
> +               gimple *last = gsi_stmt (i);
> +               if (!last
> +                   || gimple_code (last) != GIMPLE_COND
> +                   || gimple_assign_rhs1 (def1) != gimple_cond_lhs (last)
> +                   || !operand_equal_p (gimple_assign_rhs2 (def1),
> +                                        gimple_cond_rhs (last), 0))
> +                 break;
> +
> +               gimple_seq phi_seq = expand_phi_nodes (bb);
> +               i = gsi_start (phi_seq);
> +               if (!gsi_end_p (i)
> +                   && !virtual_operand_p (gimple_phi_result (gsi_stmt (i))))
> +                 break;
> +
> +               /* Check if all stmts before STMT are TERed.  */
> +               gimple_seq cur_seq = expand_bb_seq (bb);
> +               i = gsi_start_nondebug (cur_seq);
> +               int cnt = 100;
> +               while (!gsi_end_p (i) && gsi_stmt (i) != stmt)
> +                 {
> +                   gimple *g = gsi_stmt (i);
> +                   if (!is_gimple_assign (g))
> +                     break;
> +                   if (--cnt == 0)
> +                     break;
> +                   tree lhs = gimple_assign_lhs (g);
> +                   if (TREE_CODE (lhs) != SSA_NAME
> +                       || !get_gimple_for_ssa_name (lhs))
> +                     break;
> +                   gsi_next_nondebug (&i);
> +                 }
> +
> +               if (gsi_stmt (i) != stmt)
> +                 break;
> +
> +               std::swap (op0, op1);
> +               break;
> +             }
>           }
>       }
>      }
> @@ -5501,6 +5588,10 @@ expand_gimple_basic_block (basic_block b
>    if (optimize)
>      reorder_operands (bb);
>    stmts = bb_seq (bb);
> +  if ((unsigned) bb->index >= vec_safe_length (gimple_bb_info_for_bb))
> +    vec_safe_grow_cleared (gimple_bb_info_for_bb, bb->index + 1);
> +  (*gimple_bb_info_for_bb)[bb->index].seq = stmts;
> +  (*gimple_bb_info_for_bb)[bb->index].phi_nodes = phi_nodes (bb);
>    bb->il.gimple.seq = NULL;
>    bb->il.gimple.phi_nodes = NULL;
>    rtl_profile_for_bb (bb);
> @@ -6419,6 +6510,8 @@ pass_expand::execute (function *fun)
>        >= PARAM_VALUE (PARAM_MAX_DEBUG_MARKER_COUNT))
>      cfun->debug_nonbind_markers = false;
> 
> +  vec_safe_grow_cleared (gimple_bb_info_for_bb, n_basic_blocks_for_fn (cfun));
> +
>    lab_rtx_for_bb = new hash_map<basic_block, rtx_code_label *>;
>    FOR_BB_BETWEEN (bb, init_block->next_bb, EXIT_BLOCK_PTR_FOR_FN (fun),
>                 next_bb)
> @@ -6433,6 +6526,8 @@ pass_expand::execute (function *fun)
>        deep_ter_debug_map = NULL;
>      }
> 
> +  vec_free (gimple_bb_info_for_bb);
> +
>    /* Free stuff we no longer need after GIMPLE optimizations.  */
>    free_dominance_info (CDI_DOMINATORS);
>    free_dominance_info (CDI_POST_DOMINATORS);
> 
>       Jakub

Regards,
Venkat.
