Hi,

On Tue, 29 May 2012, Richard Guenther wrote:

> > [... patch about making conflict handling for stack slot sharing 
> > faster...]
> 
> The volatile handling is very odd - the objects_must_conflict_p code 
> basically says that two volatile vars may always share stack slots?!  
> What's the reasoning for this, or handling volatiles special in any way?  
> I'd rather remove this fishy code (and if at all, ensure that volatile 
> automatics are never coalesced with any other stack slot).

After some pondering and experiments I propose something even more 
aggressive: disable the restriction on stack slot sharing altogether.  
The current code tries to avoid sharing slots for decls whose alias 
sets don't conflict.  The thought behind that is that accesses to such 
decls could be reordered by any transformation (on the basis of their 
non-conflicting alias sets), which is a problem if those accesses 
actually go to the same memory.
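
For illustration, a minimal sketch of the kind of pair the old code 
kept apart (my own example, not from the patch; use_int and use_float 
are made-up helpers):

  /* i and f have disjoint lifetimes, so their slots could be shared,
     but the alias sets of int and float don't conflict, so
     add_alias_set_conflicts forced two separate stack slots.  */
  extern void use_int (int *);
  extern void use_float (float *);

  void
  foo (int c)
  {
    if (c)
      {
        int i[32];
        use_int (i);
      }
    else
      {
        float f[32];
        use_float (f);
      }
  }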

Now, since this code was invented, some TLC went into the RTL 
disambiguators.  In particular write_dependence_1 (and hence anti_ and 
output_dependence) don't use TBAA anymore (see below for a caveat).  
true_dependence_1 still does, and it's correct to do so.

As TBAA isn't used in the dependence testers that could create problems 
(namely by swapping a write-write or a read-write pair), we also don't 
need to restrict stack slot sharing based on TBAA.  We will still 
happily reorder a write-read pair (i.e. read-after-write), but that's 
okay, because for this to have a bad effect the initial code must have 
been invalid already (in our memory model a read must be done with a 
type matching the last write to that memory area).
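
To illustrate (again a sketch of mine, not a testsuite case), the only 
code that could be hurt by reordering such a write-read pair is already 
invalid:

  extern void use (float);

  void
  bar (void)
  {
    int i = 1;                /* write as int ...  */
    float f = *(float *) &i;  /* ... read as float: the read type
                                 doesn't match the last write, so this
                                 is invalid in our memory model, and
                                 true_dependence may rightfully use
                                 TBAA to allow the swap.  */
    use (f);
  }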

I verified this with some tweaks to the scheduler on x86(-64).  I 
disabled the stack slot sharing restrictions, forced a first scheduling 
pass with -O2, and implemented a shuffle mode that reorders everything 
it can:

Index: common/config/i386/i386-common.c
===================================================================
--- common/config/i386/i386-common.c    (revision 187959)
+++ common/config/i386/i386-common.c    (working copy)
@@ -618,7 +618,8 @@ static const struct default_options ix86
     { OPT_LEVELS_2_PLUS, OPT_free, NULL, 1 },
     /* Turn off -fschedule-insns by default.  It tends to make the
        problem with not enough registers even worse.  */
-    { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
+    /*{ OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },*/
+    { OPT_LEVELS_2_PLUS, OPT_fschedule_insns, NULL, 1 },

 #ifdef SUBTARGET_OPTIMIZATION_OPTIONS
     SUBTARGET_OPTIMIZATION_OPTIONS,
Index: haifa-sched.c
===================================================================
--- haifa-sched.c       (revision 187959)
+++ haifa-sched.c       (working copy)
@@ -2436,6 +2436,9 @@ rank_for_schedule (const void *x, const
   /* Make sure that priority of TMP and TMP2 are initialized.  */
   gcc_assert (INSN_PRIORITY_KNOWN (tmp) && INSN_PRIORITY_KNOWN (tmp2));

+  if (!reload_completed)
+    return INSN_LUID (tmp2) - INSN_LUID (tmp);
+
   if (sched_pressure != SCHED_PRESSURE_NONE)
     {
       int diff;

The only regression with this change on a regstrap on x86_64-linux, with 
and without -m32, is gcc.target/i386/pr49095.c, which already fails with 
just -fschedule-insns and no patches (unexpected input to the peephole 
patterns disables the optimization that this testcase checks for via asm 
string matching).

The caveat: there's a pre-existing problem in the RTL disambiguators 
where nonoverlapping_component_refs_p still uses a sort of TBAA based on 
component_ref paths rather than alias sets.  This problem wasn't worked 
around by the stack sharing restrictions anyway, so it's untouched by 
this proposal.
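
A hypothetical sketch of that caveat: if decls of the two distinct 
struct types below ever ended up occupying the same memory, the two 
accesses would still be judged disjoint purely from their differing 
field paths:

  struct A { int x; };
  struct B { int y; };

  /* p->x and q->y are COMPONENT_REFs with different FIELD_DECL paths;
     nonoverlapping_component_refs_p treats them as non-overlapping
     independent of alias sets, so they may be reordered even when p
     and q point to the same memory.  */
  int
  baz (struct A *p, struct B *q)
  {
    p->x = 1;
    return q->y;
  }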

What I propose is hence the patch below.

(Btw, what about adding this shuffle schedule mode as a general testing 
facility, perhaps under an uck-me flag?)

I regstrapped this patch (all languages + Ada) on x86_64-linux, with and 
without the above scheduler hacks; no regressions (in the run without 
the scheduler hacks).


Ciao,
Michael.
--------------------------
        PR middle-end/38474
        * cfgexpand.c (add_alias_set_conflicts): Remove.
        (expand_used_vars): Don't call it.
        (fini_vars_expansion): Clear nonconflicting_type.
        * function.c (n_temp_slots_in_use): New static data.
        (make_slot_available, assign_stack_temp_for_type): Update it.
        (init_temp_slots): Zero it.
        (remove_unused_temp_slot_addresses): Use it for quicker removal.
        (remove_unused_temp_slot_addresses_1): Use htab_clear_slot.

Index: cfgexpand.c
===================================================================
--- cfgexpand.c.orig    2012-05-29 16:24:46.000000000 +0200
+++ cfgexpand.c 2012-06-06 14:30:33.000000000 +0200
@@ -353,47 +353,6 @@ aggregate_contains_union_type (tree type
   return false;
 }
 
-/* A subroutine of expand_used_vars.  If two variables X and Y have alias
-   sets that do not conflict, then do add a conflict for these variables
-   in the interference graph.  We also need to make sure to add conflicts
-   for union containing structures.  Else RTL alias analysis comes along
-   and due to type based aliasing rules decides that for two overlapping
-   union temporaries { short s; int i; } accesses to the same mem through
-   different types may not alias and happily reorders stores across
-   life-time boundaries of the temporaries (See PR25654).  */
-
-static void
-add_alias_set_conflicts (void)
-{
-  size_t i, j, n = stack_vars_num;
-
-  for (i = 0; i < n; ++i)
-    {
-      tree type_i = TREE_TYPE (stack_vars[i].decl);
-      bool aggr_i = AGGREGATE_TYPE_P (type_i);
-      bool contains_union;
-
-      contains_union = aggregate_contains_union_type (type_i);
-      for (j = 0; j < i; ++j)
-       {
-         tree type_j = TREE_TYPE (stack_vars[j].decl);
-         bool aggr_j = AGGREGATE_TYPE_P (type_j);
-         if (aggr_i != aggr_j
-             /* Either the objects conflict by means of type based
-                aliasing rules, or we need to add a conflict.  */
-             || !objects_must_conflict_p (type_i, type_j)
-             /* In case the types do not conflict ensure that access
-                to elements will conflict.  In case of unions we have
-                to be careful as type based aliasing rules may say
-                access to the same memory does not conflict.  So play
-                safe and add a conflict in this case when
-                 -fstrict-aliasing is used.  */
-              || (contains_union && flag_strict_aliasing))
-           add_stack_var_conflict (i, j);
-       }
-    }
-}
-
 /* Callback for walk_stmt_ops.  If OP is a decl touched by add_stack_var
    enter its partition number into bitmap DATA.  */
 
@@ -1626,10 +1585,6 @@ expand_used_vars (void)
   if (stack_vars_num > 0)
     {
       add_scope_conflicts ();
-      /* Due to the way alias sets work, no variables with non-conflicting
-        alias sets may be assigned the same address.  Add conflicts to
-        reflect this.  */
-      add_alias_set_conflicts ();
 
       /* If stack protection is enabled, we don't share space between
         vulnerable data and non-vulnerable data.  */
Index: function.c
===================================================================
--- function.c.orig     2012-05-29 16:42:31.000000000 +0200
+++ function.c  2012-05-29 16:45:33.000000000 +0200
@@ -572,6 +572,7 @@ struct GTY(()) temp_slot {
 /* A table of addresses that represent a stack slot.  The table is a mapping
    from address RTXen to a temp slot.  */
 static GTY((param_is(struct temp_slot_address_entry))) htab_t 
temp_slot_address_table;
+static size_t n_temp_slots_in_use;
 
 /* Entry for the above hash table.  */
 struct GTY(()) temp_slot_address_entry {
@@ -648,6 +649,7 @@ make_slot_available (struct temp_slot *t
   insert_slot_to_list (temp, &avail_temp_slots);
   temp->in_use = 0;
   temp->level = -1;
+  n_temp_slots_in_use--;
 }
 
 /* Compute the hash value for an address -> temp slot mapping.
@@ -700,7 +702,7 @@ remove_unused_temp_slot_addresses_1 (voi
   const struct temp_slot_address_entry *t;
   t = (const struct temp_slot_address_entry *) *slot;
   if (! t->temp_slot->in_use)
-    *slot = NULL;
+    htab_clear_slot (temp_slot_address_table, slot);
   return 1;
 }
 
@@ -708,9 +710,13 @@ remove_unused_temp_slot_addresses_1 (voi
 static void
 remove_unused_temp_slot_addresses (void)
 {
-  htab_traverse (temp_slot_address_table,
-                remove_unused_temp_slot_addresses_1,
-                NULL);
+  /* Use quicker clearing if there aren't any active temp slots.  */
+  if (n_temp_slots_in_use)
+    htab_traverse (temp_slot_address_table,
+                  remove_unused_temp_slot_addresses_1,
+                  NULL);
+  else
+    htab_empty (temp_slot_address_table);
 }
 
 /* Find the temp slot corresponding to the object at address X.  */
@@ -902,6 +908,7 @@ assign_stack_temp_for_type (enum machine
   p->in_use = 1;
   p->type = type;
   p->level = temp_slot_level;
+  n_temp_slots_in_use++;
 
   pp = temp_slots_at_level (p->level);
   insert_slot_to_list (p, pp);
@@ -1213,6 +1220,7 @@ init_temp_slots (void)
   avail_temp_slots = 0;
   used_temp_slots = 0;
   temp_slot_level = 0;
+  n_temp_slots_in_use = 0;
 
   /* Set up the table to map addresses to temp slots.  */
   if (! temp_slot_address_table)
