Hi!

The following patch attempts to improve expansion when we have code like:
  <bb 16> [local count: 102513059]:
  if_conversion_var_52 = MEM[base: st1_22, offset: 0B];
  if (if_conversion_var_52 < 0)
    goto <bb 17>; [41.00%]
  else
    goto <bb 18>; [59.00%]

...

  <bb 18> [local count: 60482706]:
  _81 = _11 == 2;
  _82 = if_conversion_var_52 > 0;
  _79 = _81 & _82;
  if (_79 != 0)
    goto <bb 19>; [29.26%]
  else
    goto <bb 20>; [70.74%]

Here, the bb's single predecessor performed a comparison similar to what
one of the & (or |) operands does, and as & (or |) is not ordered, we can
choose which operand to expand first.  If we expand if_conversion_var_52 > 0
first, there is a chance that we can reuse the flags from the previous
comparison.  The patch does this only if there are no (non-virtual) PHIs in
the current bb and all stmts before the current condition are TERed, so there
is nothing in between that would clobber the flags.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk if it
helps for SPEC?  Venkat, do you think you could benchmark it in the setup
where you've measured the slowdown to see if it helps?  I see the patch
changes the loop:
        .p2align 4,,10
        .p2align 3
 .L11:
+       jle     .L10
        cmpl    $2, %eax
-       jne     .L10
-       testq   %r8, %r8
-       jg      .L12
+       je      .L12
        .p2align 4,,10
        .p2align 3
 .L10:
        addq    %r9, %rsi
        cmpq    %rsi, %rdx
        jbe     .L35
 .L13:
        movl    24(%rsi), %eax
        testl   %eax, %eax
        jle     .L10
        movq    (%rsi), %r8
        testq   %r8, %r8
        jns     .L11
        cmpl    $1, %eax
        jne     .L10
 .L12:
        addq    $1, %r10
        movl    $1, %r11d
        movq    st5(,%r10,8), %rax
        movq    %rsi, (%rax)
        addq    %r9, %rsi
        movq    %r8, 8(%rax)
        movq    st5(,%rdi,8), %rax
        movq    %r8, 16(%rax)
        cmpq    %rsi, %rdx
        ja      .L13
which I assume should be an improvement: the jle at .L11 can reuse the flags
set by the testq %r8, %r8 performed right before the jns .L11 jump, so we
save one extra comparison.

2018-03-27  Jakub Jelinek  <ja...@redhat.com>
            Richard Biener  <rgue...@suse.de>

        PR rtl-optimization/78200
        * cfgexpand.c (gimple_bb_info_for_bb): New variable.
        (expand_bb_seq, expand_phi_nodes): New functions.
        (expand_gimple_cond): Use heuristics if it is desirable to
        swap TRUTH_{AND,OR}IF_EXPR operands.
        (expand_gimple_basic_block): Remember GIMPLE il for bbs
        being expanded or already expanded.
        (pass_expand::execute): Initialize and then free the
        gimple_bb_info_for_bb vector.

--- gcc/cfgexpand.c.jj  2018-02-09 06:44:36.413805556 +0100
+++ gcc/cfgexpand.c     2018-03-26 13:35:57.536509844 +0200
@@ -2357,6 +2357,34 @@ label_rtx_for_bb (basic_block bb ATTRIBU
   return l;
 }
 
+/* Maps blocks to their GIMPLE IL.  */
+static vec<gimple_bb_info, va_heap, vl_embed> *gimple_bb_info_for_bb;
+
+/* Like bb_seq, except that during expansion returns the GIMPLE seq
+   even for the blocks that have been already expanded or are being
+   currently expanded.  */
+
+static gimple_seq
+expand_bb_seq (basic_block bb)
+{
+  if ((bb->flags & BB_RTL)
+      && (unsigned) bb->index < vec_safe_length (gimple_bb_info_for_bb))
+    return (*gimple_bb_info_for_bb)[bb->index].seq;
+  return bb_seq (bb);
+}
+
+/* Like phi_nodes, except that during expansion returns the GIMPLE PHIs
+   even for the blocks that have been already expanded or are being
+   currently expanded.  */
+
+static gimple_seq
+expand_phi_nodes (basic_block bb)
+{
+  if ((bb->flags & BB_RTL)
+      && (unsigned) bb->index < vec_safe_length (gimple_bb_info_for_bb))
+    return (*gimple_bb_info_for_bb)[bb->index].phi_nodes;
+  return phi_nodes (bb);
+}
 
 /* A subroutine of expand_gimple_cond.  Given E, a fallthrough edge
    of a basic block where we just expanded the conditional at the end,
@@ -2475,6 +2503,65 @@ expand_gimple_cond (basic_block bb, gcon
                  op0 = gimple_assign_rhs1 (second);
                  op1 = gimple_assign_rhs2 (second);
                }
+
+             /* We'll expand RTL for op0 first, see if we'd better expand RTL
+                for op1 first.  Do that if the previous bb ends with
+                if (x op cst), op1's def_stmt rhs is x op2 cst and there are
+                no non-virtual PHIs nor non-TERed stmts in BB before STMT.  */
+             while (TREE_CODE (op1) == SSA_NAME
+                    && (code == TRUTH_ANDIF_EXPR || code == TRUTH_ORIF_EXPR)
+                    && single_pred_p (bb))
+               {
+                 gimple *def1 = SSA_NAME_DEF_STMT (op1);
+                 if (!is_gimple_assign (def1)
+                     || (TREE_CODE_CLASS (gimple_assign_rhs_code (def1))
+                         != tcc_comparison))
+                   break;
+
+                 basic_block pred = single_pred (bb);
+                 gimple_seq pred_seq = expand_bb_seq (pred);
+                 gimple_stmt_iterator i = gsi_last (pred_seq);
+                 if (!gsi_end_p (i) && is_gimple_debug (gsi_stmt (i)))
+                   gsi_prev_nondebug (&i);
+
+                 gimple *last = gsi_stmt (i);
+                 if (!last
+                     || gimple_code (last) != GIMPLE_COND
+                     || gimple_assign_rhs1 (def1) != gimple_cond_lhs (last)
+                     || !operand_equal_p (gimple_assign_rhs2 (def1),
+                                          gimple_cond_rhs (last), 0))
+                   break;
+
+                 gimple_seq phi_seq = expand_phi_nodes (bb);
+                 i = gsi_start (phi_seq);
+                 if (!gsi_end_p (i)
+                     && !virtual_operand_p (gimple_phi_result (gsi_stmt (i))))
+                   break;
+
+                 /* Check if all stmts before STMT are TERed.  */
+                 gimple_seq cur_seq = expand_bb_seq (bb);
+                 i = gsi_start_nondebug (cur_seq);
+                 int cnt = 100;
+                 while (!gsi_end_p (i) && gsi_stmt (i) != stmt)
+                   {
+                     gimple *g = gsi_stmt (i);
+                     if (!is_gimple_assign (g))
+                       break;
+                     if (--cnt == 0)
+                       break;
+                     tree lhs = gimple_assign_lhs (g);
+                     if (TREE_CODE (lhs) != SSA_NAME
+                         || !get_gimple_for_ssa_name (lhs))
+                       break;
+                     gsi_next_nondebug (&i);
+                   }
+
+                 if (gsi_stmt (i) != stmt)
+                   break;
+
+                 std::swap (op0, op1);
+                 break;
+               }
            }
        }
     }
@@ -5501,6 +5588,10 @@ expand_gimple_basic_block (basic_block b
   if (optimize)
     reorder_operands (bb);
   stmts = bb_seq (bb);
+  if ((unsigned) bb->index >= vec_safe_length (gimple_bb_info_for_bb))
+    vec_safe_grow_cleared (gimple_bb_info_for_bb, bb->index + 1);
+  (*gimple_bb_info_for_bb)[bb->index].seq = stmts;
+  (*gimple_bb_info_for_bb)[bb->index].phi_nodes = phi_nodes (bb);
   bb->il.gimple.seq = NULL;
   bb->il.gimple.phi_nodes = NULL;
   rtl_profile_for_bb (bb);
@@ -6419,6 +6510,8 @@ pass_expand::execute (function *fun)
       >= PARAM_VALUE (PARAM_MAX_DEBUG_MARKER_COUNT))
     cfun->debug_nonbind_markers = false;
 
+  vec_safe_grow_cleared (gimple_bb_info_for_bb, n_basic_blocks_for_fn (cfun));
+
   lab_rtx_for_bb = new hash_map<basic_block, rtx_code_label *>;
   FOR_BB_BETWEEN (bb, init_block->next_bb, EXIT_BLOCK_PTR_FOR_FN (fun),
                  next_bb)
@@ -6433,6 +6526,8 @@ pass_expand::execute (function *fun)
       deep_ter_debug_map = NULL;
     }
 
+  vec_free (gimple_bb_info_for_bb);
+
   /* Free stuff we no longer need after GIMPLE optimizations.  */
   free_dominance_info (CDI_DOMINATORS);
   free_dominance_info (CDI_POST_DOMINATORS);

        Jakub
