Richard Biener <rguent...@suse.de> writes:
> On Mon, 2 Sep 2019, Richard Sandiford wrote:
>
>> Richard Biener <rguent...@suse.de> writes:
>> > This disables postreload GCSE the same way we disable GCSE/cprop.
>> > On the PR36262 testcase this removes
>> >
>> >  load CSE after reload              : 129.00 ( 72%)   0.08 (  5%) 130.50 ( 72%)       6 kB (  0%)
>> >
>> > With a smaller testcase both PRE and postreload GCSE still run,
>> > and GCSE shows up at roughly 2x the cost of postreload GCSE there
>> > (still wondering why we have two implementations of the same thing?!)
>> >
>> > I've seen postreload CSE pop up a lot on larger testcases while
>> > PRE turns itself off.
>> >
>> > So, does this look reasonable?
>> >
>> > Thanks,
>> > Richard.
>> >
>> > 2019-09-02  Richard Biener  <rguent...@suse.de>
>> >
>> >    PR rtl-optimization/36262
>> >    * postreload-gcse.c: Include intl.h and gcse.h.
>> >    (gcse_after_reload_main): Skip pass if gcse_or_cprop_is_too_expensive
>> >    says so.
>> 
>> LGTM.  At first I thought:
>> 
>>   unsigned int memory_request = (n_basic_blocks_for_fn (cfun)
>>                               * SBITMAP_SET_SIZE (max_reg_num ())
>>                               * sizeof (SBITMAP_ELT_TYPE));
>> 
>> might be a bad approximation after reload, but it looks like the
>> sbitmap sizes are O(ninsns), and so it's probably pretty good after all.
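>> 
>> For reference, the gate is just that estimate compared against
>> --param max-gcse-memory (rough sketch from memory, not the exact
>> source; IIRC the real function also bails out on very dense CFGs
>> and warns with -Wdisabled-optimization):
>> 
>>   unsigned int memory_request = (n_basic_blocks_for_fn (cfun)
>>                                  * SBITMAP_SET_SIZE (max_reg_num ())
>>                                  * sizeof (SBITMAP_ELT_TYPE));
>>   /* If the dataflow bitmaps would need more memory than the user
>>      allows, tell the caller to skip the pass.  */
>>   if (memory_request > (unsigned int) PARAM_VALUE (PARAM_MAX_GCSE_MEMORY))
>>     return true;
>>   return false;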
>
> Yeah.  Looking closer, postreload GCSE doesn't actually solve a dataflow
> problem with "transp"; compile time is dominated by the quadratic cost
> of the alias checks compute_transp performs.  So it's throwing a
> stupidly expensive computation at something that could have been done
> on just the very specific paths that need it ... (get_bb_avail_insn
> checking whether it can look at a single predecessor [recursively]).
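>
> In pseudo-C the cross-block part is just a walk up single-predecessor
> chains, gated by the transparency bit for the block (simplified,
> untested sketch; find_avail_occr is a made-up name and the real
> get_bb_avail_insn also records the occurrence it finds):
>
>   static struct occr *
>   find_avail_occr (basic_block bb, struct expr *expr, unsigned bitmap_index)
>   {
>     /* An occurrence recorded in BB itself wins.  */
>     for (struct occr *occr = expr->avail_occr; occr; occr = occr->next)
>       if (BLOCK_FOR_INSN (occr->insn) == bb)
>         return occr;
>
>     /* Otherwise look at the single predecessor, but only if the
>        expression is transparent through BB, i.e. no store in BB
>        can clobber it (and only if we computed transp at all).  */
>     if (transp
>         && single_pred_p (bb)
>         && bitmap_bit_p (transp[bb->index], bitmap_index))
>       return find_avail_occr (single_pred (bb), expr, bitmap_index);
>
>     return NULL;
>   }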
>
> Bah.
>
> Well, it turns out one can easily gate off compute_transp and preserve
> the "cheap" parts of postreload GCSE.  Not sure how important that is,
> but hey - it seems to work (maybe also something for -O[1g]).
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> OK?  I actually tested with do_transp forced to false btw.
>
> Thanks,
> Richard.
>
> 2019-09-02  Richard Biener  <rguent...@suse.de>
>
>       PR rtl-optimization/36262
>       * postreload-gcse.c: Include intl.h and gcse.h.
>       (insert_expr_in_table): Insert at the head of cur_expr->avail_occr
>       to avoid linear list walk.
>       (record_last_mem_set_info): Gate off if not computing transparentness.
>       (get_bb_avail_insn): If transparentness isn't computed give up
>       early.
>       (gcse_after_reload_main): Skip compute_transp and extended PRE
>       if gcse_or_cprop_is_too_expensive says so.

OK, thanks.

Richard

>
> Index: gcc/postreload-gcse.c
> ===================================================================
> --- gcc/postreload-gcse.c     (revision 275294)
> +++ gcc/postreload-gcse.c     (working copy)
> @@ -38,7 +38,9 @@ along with GCC; see the file COPYING3.
>  #include "params.h"
>  #include "tree-pass.h"
>  #include "dbgcnt.h"
> +#include "intl.h"
>  #include "gcse-common.h"
> +#include "gcse.h"
>  
>  /* The following code implements gcse after reload, the purpose of this
>     pass is to cleanup redundant loads generated by reload and other
> @@ -364,7 +366,7 @@ insert_expr_in_table (rtx x, rtx_insn *i
>    int do_not_record_p;
>    hashval_t hash;
>    struct expr *cur_expr, **slot;
> -  struct occr *avail_occr, *last_occr = NULL;
> +  struct occr *avail_occr;
>  
>    hash = hash_expr (x, &do_not_record_p);
>  
> @@ -405,38 +407,22 @@ insert_expr_in_table (rtx x, rtx_insn *i
>        cur_expr = *slot;
>      }
>  
> -  /* Search for another occurrence in the same basic block.  */
> +  /* Search for another occurrence in the same basic block.  We insert
> +     insns blockwise from start to end, so keep appending to the
> +     start of the list so we have to check only a single element.  */
>    avail_occr = cur_expr->avail_occr;
> -  while (avail_occr
> -      && BLOCK_FOR_INSN (avail_occr->insn) != BLOCK_FOR_INSN (insn))
> -    {
> -      /* If an occurrence isn't found, save a pointer to the end of
> -      the list.  */
> -      last_occr = avail_occr;
> -      avail_occr = avail_occr->next;
> -    }
> -
> -  if (avail_occr)
> -    /* Found another instance of the expression in the same basic block.
> -       Prefer this occurrence to the currently recorded one.  We want
> -       the last one in the block and the block is scanned from start
> -       to end.  */
> +  if (avail_occr
> +      && BLOCK_FOR_INSN (avail_occr->insn) == BLOCK_FOR_INSN (insn))
>      avail_occr->insn = insn;
>    else
>      {
>        /* First occurrence of this expression in this basic block.  */
>        avail_occr = (struct occr *) obstack_alloc (&occr_obstack,
>                                                 sizeof (struct occr));
> -
> -      /* First occurrence of this expression in any block?  */
> -      if (cur_expr->avail_occr == NULL)
> -        cur_expr->avail_occr = avail_occr;
> -      else
> -        last_occr->next = avail_occr;
> -
>        avail_occr->insn = insn;
> -      avail_occr->next = NULL;
> +      avail_occr->next = cur_expr->avail_occr;
>        avail_occr->deleted_p = 0;
> +      cur_expr->avail_occr = avail_occr;
>      }
>  }
>  
> @@ -710,6 +696,9 @@ record_last_reg_set_info_regno (rtx_insn
>  static void
>  record_last_mem_set_info (rtx_insn *insn)
>  {
> +  if (!transp)
> +    return;
> +
>    struct modifies_mem *list_entry;
>  
>    list_entry = (struct modifies_mem *) obstack_alloc (&modifies_mem_obstack,
> @@ -995,7 +984,8 @@ get_bb_avail_insn (basic_block bb, struc
>    /* If we could not find an occurrence in BB, see if BB
>       has a single predecessor with an occurrence that is
>       transparent through BB.  */
> -  if (single_pred_p (bb)
> +  if (transp
> +      && single_pred_p (bb)
>        && bitmap_bit_p (transp[bb->index], bitmap_index)
>        && (occr = get_bb_avail_insn (single_pred (bb), orig_occr, bitmap_index)))
>      {
> @@ -1371,6 +1361,10 @@ delete_redundant_insns (void)
>  static void
>  gcse_after_reload_main (rtx f ATTRIBUTE_UNUSED)
>  {
> +  /* Disable computing transparentness if it is too expensive.  */
> +  bool do_transp
> +    = !gcse_or_cprop_is_too_expensive (_("using simple load CSE after register "
> +                                      "allocation"));
>  
>    memset (&stats, 0, sizeof (stats));
>  
> @@ -1392,15 +1386,21 @@ gcse_after_reload_main (rtx f ATTRIBUTE_
>        increase the number of redundant loads found.  So compute transparency
>        information for each memory expression in the hash table.  */
>        df_analyze ();
> -      /* This cannot be part of the normal allocation routine because
> -      we have to know the number of elements in the hash table.  */
> -      transp = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
> -                                  expr_table->elements ());
> -      bitmap_vector_ones (transp, last_basic_block_for_fn (cfun));
> -      expr_table->traverse <FILE *, compute_expr_transp> (dump_file);
> +      if (do_transp)
> +     {
> +       /* This cannot be part of the normal allocation routine because
> +          we have to know the number of elements in the hash table.  */
> +       transp = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
> +                                      expr_table->elements ());
> +       bitmap_vector_ones (transp, last_basic_block_for_fn (cfun));
> +       expr_table->traverse <FILE *, compute_expr_transp> (dump_file);
> +     }
> +      else
> +     transp = NULL;
>        eliminate_partially_redundant_loads ();
>        delete_redundant_insns ();
> -      sbitmap_vector_free (transp);
> +      if (do_transp)
> +     sbitmap_vector_free (transp);
>  
>        if (dump_file)
>       {
