On Mon, 2 Sep 2019, Richard Sandiford wrote:

> Richard Biener <rguent...@suse.de> writes:
> > This disables postreload GCSE the same way we disable GCSE/cprop.
> > On the PR36262 testcase this removes
> >
> >  load CSE after reload              : 129.00 ( 72%)   0.08 (  5%) 130.50 ( 72%)       6 kB (  0%)
> >
> > With a smaller testcase both PRE and postreload GCSE still run,
> > and GCSE shows up at roughly 2x the cost of postreload GCSE there
> > (still wondering why we have two implementations of the same thing?!)
> >
> > I've seen postreload CSE pop up a lot on larger testcases while
> > PRE turns itself off.
> >
> > So, does this look reasonable?
> >
> > Thanks,
> > Richard.
> >
> > 2019-09-02  Richard Biener  <rguent...@suse.de>
> >
> >     PR rtl-optimization/36262
> >     * postreload-gcse.c: Include intl.h and gcse.h.
> >     (gcse_after_reload_main): Skip pass if gcse_or_cprop_is_too_expensive
> >     says so.
> 
> LGTM.  At first I thought:
> 
>   unsigned int memory_request = (n_basic_blocks_for_fn (cfun)
>                                * SBITMAP_SET_SIZE (max_reg_num ())
>                                * sizeof (SBITMAP_ELT_TYPE));
> 
> might be a bad approximation after reload, but it looks like the
> sbitmap sizes are O(ninsns), and so it's probably pretty good after all.

Yeah.  Looking closer, postreload GCSE doesn't actually solve a dataflow
problem with "transp"; compile time is dominated by the quadratic
alias checks that compute_transp performs.  So it's throwing a stupidly
expensive thing at something that could have been done on the very
specific paths needed ... (get_bb_avail_insn checking whether it can
look at a single predecessor [recursively]).

Bah.
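To make the idea concrete, here is a minimal, self-contained C sketch of
that single-predecessor walk (struct blk and find_avail_upwards are
made-up illustrations, not the actual GCC types or functions):
transparency only has to hold for the blocks actually walked through, so
in principle it could be checked on demand there instead of being
precomputed for every block and every expression.

#include <stdbool.h>
#include <stddef.h>

/* Made-up stand-in for a basic block, for illustration only.  */
struct blk
{
  struct blk *single_pred;  /* Sole predecessor, or NULL.  */
  bool has_occurrence;      /* The expression is computed here.  */
  bool transparent;         /* No store here can clobber it.  */
};

/* Walk up the single-predecessor chain looking for a block with an
   available occurrence of the expression.  An occurrence further up
   is only usable if every block walked through is transparent,
   mirroring the transp check done in get_bb_avail_insn.  */
static struct blk *
find_avail_upwards (struct blk *bb)
{
  while (bb)
    {
      if (bb->has_occurrence)
        return bb;
      if (!bb->transparent)
        return NULL;
      bb = bb->single_pred;
    }
  return NULL;
}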

Well, it turns out one can easily gate off compute_transp and preserve
the "cheap" parts of postreload-CSE.  Not sure how important that is,
but hey - it seems to work (maybe also something for -O[1g]).

Bootstrapped and tested on x86_64-unknown-linux-gnu.

OK?  I actually tested with do_transp forced to false btw.

Thanks,
Richard.

2019-09-02  Richard Biener  <rguent...@suse.de>

        PR rtl-optimization/36262
        * postreload-gcse.c: Include intl.h and gcse.h.
        (insert_expr_in_table): Insert at the head of cur_expr->avail_occr
        to avoid linear list walk.
        (record_last_mem_set_info): Gate off if not computing transparency.
        (get_bb_avail_insn): If transparency isn't computed, give up
        early.
        (gcse_after_reload_main): Skip compute_transp and extended PRE
        if gcse_or_cprop_is_too_expensive says so.

Index: gcc/postreload-gcse.c
===================================================================
--- gcc/postreload-gcse.c       (revision 275294)
+++ gcc/postreload-gcse.c       (working copy)
@@ -38,7 +38,9 @@ along with GCC; see the file COPYING3.
 #include "params.h"
 #include "tree-pass.h"
 #include "dbgcnt.h"
+#include "intl.h"
 #include "gcse-common.h"
+#include "gcse.h"
 
 /* The following code implements gcse after reload, the purpose of this
    pass is to cleanup redundant loads generated by reload and other
@@ -364,7 +366,7 @@ insert_expr_in_table (rtx x, rtx_insn *i
   int do_not_record_p;
   hashval_t hash;
   struct expr *cur_expr, **slot;
-  struct occr *avail_occr, *last_occr = NULL;
+  struct occr *avail_occr;
 
   hash = hash_expr (x, &do_not_record_p);
 
@@ -405,38 +407,22 @@ insert_expr_in_table (rtx x, rtx_insn *i
       cur_expr = *slot;
     }
 
-  /* Search for another occurrence in the same basic block.  */
+  /* Search for another occurrence in the same basic block.  We insert
+     insns blockwise from start to end, so prepend to the head of the
+     list so we only have to check a single element.  */
   avail_occr = cur_expr->avail_occr;
-  while (avail_occr
-        && BLOCK_FOR_INSN (avail_occr->insn) != BLOCK_FOR_INSN (insn))
-    {
-      /* If an occurrence isn't found, save a pointer to the end of
-        the list.  */
-      last_occr = avail_occr;
-      avail_occr = avail_occr->next;
-    }
-
-  if (avail_occr)
-    /* Found another instance of the expression in the same basic block.
-       Prefer this occurrence to the currently recorded one.  We want
-       the last one in the block and the block is scanned from start
-       to end.  */
+  if (avail_occr
+      && BLOCK_FOR_INSN (avail_occr->insn) == BLOCK_FOR_INSN (insn))
     avail_occr->insn = insn;
   else
     {
       /* First occurrence of this expression in this basic block.  */
       avail_occr = (struct occr *) obstack_alloc (&occr_obstack,
                                                  sizeof (struct occr));
-
-      /* First occurrence of this expression in any block?  */
-      if (cur_expr->avail_occr == NULL)
-        cur_expr->avail_occr = avail_occr;
-      else
-        last_occr->next = avail_occr;
-
       avail_occr->insn = insn;
-      avail_occr->next = NULL;
+      avail_occr->next = cur_expr->avail_occr;
       avail_occr->deleted_p = 0;
+      cur_expr->avail_occr = avail_occr;
     }
 }
 
@@ -710,6 +696,9 @@ record_last_reg_set_info_regno (rtx_insn
 static void
 record_last_mem_set_info (rtx_insn *insn)
 {
+  if (!transp)
+    return;
+
   struct modifies_mem *list_entry;
 
   list_entry = (struct modifies_mem *) obstack_alloc (&modifies_mem_obstack,
@@ -995,7 +984,8 @@ get_bb_avail_insn (basic_block bb, struc
   /* If we could not find an occurrence in BB, see if BB
      has a single predecessor with an occurrence that is
      transparent through BB.  */
-  if (single_pred_p (bb)
+  if (transp
+      && single_pred_p (bb)
       && bitmap_bit_p (transp[bb->index], bitmap_index)
       && (occr = get_bb_avail_insn (single_pred (bb), orig_occr, bitmap_index)))
     {
@@ -1371,6 +1361,10 @@ delete_redundant_insns (void)
 static void
 gcse_after_reload_main (rtx f ATTRIBUTE_UNUSED)
 {
+  /* Disable computing transparency if it is too expensive.  */
+  bool do_transp
+    = !gcse_or_cprop_is_too_expensive (_("using simple load CSE after register "
+                                        "allocation"));
 
   memset (&stats, 0, sizeof (stats));
 
@@ -1392,15 +1386,21 @@ gcse_after_reload_main (rtx f ATTRIBUTE_
         increase the number of redundant loads found.  So compute transparency
         information for each memory expression in the hash table.  */
       df_analyze ();
-      /* This cannot be part of the normal allocation routine because
-        we have to know the number of elements in the hash table.  */
-      transp = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
-                                    expr_table->elements ());
-      bitmap_vector_ones (transp, last_basic_block_for_fn (cfun));
-      expr_table->traverse <FILE *, compute_expr_transp> (dump_file);
+      if (do_transp)
+       {
+         /* This cannot be part of the normal allocation routine because
+            we have to know the number of elements in the hash table.  */
+         transp = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
+                                        expr_table->elements ());
+         bitmap_vector_ones (transp, last_basic_block_for_fn (cfun));
+         expr_table->traverse <FILE *, compute_expr_transp> (dump_file);
+       }
+      else
+       transp = NULL;
       eliminate_partially_redundant_loads ();
       delete_redundant_insns ();
-      sbitmap_vector_free (transp);
+      if (do_transp)
+       sbitmap_vector_free (transp);
 
       if (dump_file)
        {
