This finally computes a valid partition ordering (or if not possible
merge partitions again) in loop distribution.  This is the only
place where data dependences are relevant, so we delay computing them
(and also do not compute all N^2 dependences).

Bootstrapped and tested on x86_64-unknown-linux-gnu with BOOT_CFLAGS
-O2 -ftree-loop-distribution, BOOT_CFLAGS -O3 still running.

Richard.

2013-10-24  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/58626
        * tree-loop-distribution.c (enum rdg_dep_type): Remove
        anti_dd, output_dd and input_dd.
        (struct rdg_edge): Remove level and relation members.
        (RDGE_LEVEL, RDGE_RELATION): Remove.
        (dot_rdg_1): Adjust.
        (create_rdg_edge_for_ddr): Remove.
        (create_rdg_edges_for_scalar): Adjust.
        (create_edge_for_control_dependence): Likewise.
        (create_rdg_edges): Split into ...
        (create_rdg_flow_edges): ... this
        (create_rdg_cd_edges): ... and this.
        (free_rdg): Adjust.
        (build_rdg): Likewise, do not compute data dependences or
        add edges for them.
        (pg_add_dependence_edges): New function.
        (pgcmp): Likewise.
        (distribute_loop): First apply all non-dependence based
        partition mergings.  Then compute dependences between partitions
        and merge and order partitions according to them.

        * gcc.dg/torture/pr58626.c: New testcase.

Index: gcc/tree-loop-distribution.c
===================================================================
*** gcc/tree-loop-distribution.c.orig   2013-10-24 12:44:13.000000000 +0200
--- gcc/tree-loop-distribution.c        2013-10-24 16:01:29.163862370 +0200
*************** enum rdg_dep_type
*** 96,110 ****
    /* Read After Write (RAW).  */
    flow_dd = 'f',
  
-   /* Write After Read (WAR).  */
-   anti_dd = 'a',
- 
-   /* Write After Write (WAW).  */
-   output_dd = 'o',
- 
-   /* Read After Read (RAR).  */
-   input_dd = 'i',
- 
    /* Control dependence (execute conditional on).  */
    control_dd = 'c'
  };
--- 96,101 ----
*************** typedef struct rdg_edge
*** 115,133 ****
  {
    /* Type of the dependence.  */
    enum rdg_dep_type type;
- 
-   /* Levels of the dependence: the depth of the loops that carry the
-      dependence.  */
-   unsigned level;
- 
-   /* Dependence relation between data dependences, NULL when one of
-      the vertices is a scalar.  */
-   ddr_p relation;
  } *rdg_edge_p;
  
  #define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type
- #define RDGE_LEVEL(E)       ((struct rdg_edge *) ((E)->data))->level
- #define RDGE_RELATION(E)    ((struct rdg_edge *) ((E)->data))->relation
  
  /* Dump vertex I in RDG to FILE.  */
  
--- 106,114 ----
*************** dot_rdg_1 (FILE *file, struct graph *rdg
*** 215,237 ****
         for (e = v->succ; e; e = e->succ_next)
           switch (RDGE_TYPE (e))
             {
-            case input_dd:
-              fprintf (file, "%d -> %d [label=input] \n", i, e->dest);
-              break;
- 
-            case output_dd:
-              fprintf (file, "%d -> %d [label=output] \n", i, e->dest);
-              break;
- 
             case flow_dd:
               /* These are the most common dependences: don't print these. */
               fprintf (file, "%d -> %d \n", i, e->dest);
               break;
  
-            case anti_dd:
-              fprintf (file, "%d -> %d [label=anti] \n", i, e->dest);
-              break;
- 
           case control_dd:
               fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
               break;
--- 196,206 ----
*************** rdg_vertex_for_stmt (struct graph *rdg A
*** 273,324 ****
    return index;
  }
  
- /* Creates an edge in RDG for each distance vector from DDR.  The
-    order that we keep track of in the RDG is the order in which
-    statements have to be executed.  */
- 
- static void
- create_rdg_edge_for_ddr (struct graph *rdg, ddr_p ddr)
- {
-   struct graph_edge *e;
-   int va, vb;
-   data_reference_p dra = DDR_A (ddr);
-   data_reference_p drb = DDR_B (ddr);
-   unsigned level = ddr_dependence_level (ddr);
- 
-   /* For non scalar dependences, when the dependence is REVERSED,
-      statement B has to be executed before statement A.  */
-   if (level > 0
-       && !DDR_REVERSED_P (ddr))
-     {
-       data_reference_p tmp = dra;
-       dra = drb;
-       drb = tmp;
-     }
- 
-   va = rdg_vertex_for_stmt (rdg, DR_STMT (dra));
-   vb = rdg_vertex_for_stmt (rdg, DR_STMT (drb));
- 
-   if (va < 0 || vb < 0)
-     return;
- 
-   e = add_edge (rdg, va, vb);
-   e->data = XNEW (struct rdg_edge);
- 
-   RDGE_LEVEL (e) = level;
-   RDGE_RELATION (e) = ddr;
- 
-   /* Determines the type of the data dependence.  */
-   if (DR_IS_READ (dra) && DR_IS_READ (drb))
-     RDGE_TYPE (e) = input_dd;
-   else if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
-     RDGE_TYPE (e) = output_dd;
-   else if (DR_IS_WRITE (dra) && DR_IS_READ (drb))
-     RDGE_TYPE (e) = flow_dd;
-   else if (DR_IS_READ (dra) && DR_IS_WRITE (drb))
-     RDGE_TYPE (e) = anti_dd;
- }
- 
  /* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
     the index of DEF in RDG.  */
  
--- 242,247 ----
*************** create_rdg_edges_for_scalar (struct grap
*** 339,345 ****
        e = add_edge (rdg, idef, use);
        e->data = XNEW (struct rdg_edge);
        RDGE_TYPE (e) = flow_dd;
-       RDGE_RELATION (e) = NULL;
      }
  }
  
--- 262,267 ----
*************** create_edge_for_control_dependence (stru
*** 366,372 ****
          e = add_edge (rdg, c, v);
          e->data = XNEW (struct rdg_edge);
          RDGE_TYPE (e) = control_dd;
-         RDGE_RELATION (e) = NULL;
        }
      }
  }
--- 288,293 ----
*************** create_edge_for_control_dependence (stru
*** 374,411 ****
  /* Creates the edges of the reduced dependence graph RDG.  */
  
  static void
! create_rdg_edges (struct graph *rdg, vec<ddr_p> ddrs, control_dependences *cd)
  {
    int i;
-   struct data_dependence_relation *ddr;
    def_operand_p def_p;
    ssa_op_iter iter;
  
-   FOR_EACH_VEC_ELT (ddrs, i, ddr)
-     if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
-       create_rdg_edge_for_ddr (rdg, ddr);
-     else
-       free_dependence_relation (ddr);
- 
    for (i = 0; i < rdg->n_vertices; i++)
      FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
                              iter, SSA_OP_DEF)
        create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
  
!   if (cd)
!     for (i = 0; i < rdg->n_vertices; i++)
!       {
!       gimple stmt = RDG_STMT (rdg, i);
!       if (gimple_code (stmt) == GIMPLE_PHI)
!         {
!           edge_iterator ei;
!           edge e;
!           FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
              create_edge_for_control_dependence (rdg, e->src, i, cd);
!         }
!       else
!         create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
!       }
  }
  
  /* Build the vertices of the reduced dependence graph RDG.  Return false
--- 295,332 ----
  /* Creates the edges of the reduced dependence graph RDG.  */
  
  static void
! create_rdg_flow_edges (struct graph *rdg)
  {
    int i;
    def_operand_p def_p;
    ssa_op_iter iter;
  
    for (i = 0; i < rdg->n_vertices; i++)
      FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
                              iter, SSA_OP_DEF)
        create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
+ }
  
! /* Creates the edges of the reduced dependence graph RDG.  */
! 
! static void
! create_rdg_cd_edges (struct graph *rdg, control_dependences *cd)
! {
!   int i;
! 
!   for (i = 0; i < rdg->n_vertices; i++)
!     {
!       gimple stmt = RDG_STMT (rdg, i);
!       if (gimple_code (stmt) == GIMPLE_PHI)
!       {
!         edge_iterator ei;
!         edge e;
!         FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
              create_edge_for_control_dependence (rdg, e->src, i, cd);
!       }
!       else
!       create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
!     }
  }
  
  /* Build the vertices of the reduced dependence graph RDG.  Return false
*************** free_rdg (struct graph *rdg)
*** 494,503 ****
        struct graph_edge *e;
  
        for (e = v->succ; e; e = e->succ_next)
!       {
!         free_dependence_relation (RDGE_RELATION (e));
!         free (e->data);
!       }
  
        if (v->data)
        {
--- 415,421 ----
        struct graph_edge *e;
  
        for (e = v->succ; e; e = e->succ_next)
!       free (e->data);
  
        if (v->data)
        {
*************** build_rdg (vec<loop_p> loop_nest, contro
*** 520,526 ****
    struct graph *rdg;
    vec<gimple> stmts;
    vec<data_reference_p> datarefs;
-   vec<ddr_p> dependence_relations;
  
    /* Create the RDG vertices from the stmts of the loop nest.  */
    stmts.create (10);
--- 438,443 ----
*************** build_rdg (vec<loop_p> loop_nest, contro
*** 536,554 ****
      }
    stmts.release ();
  
!   /* Create the RDG edges from the data dependences in the loop nest.  */
!   dependence_relations.create (100);
!   if (!compute_all_dependences (datarefs, &dependence_relations, loop_nest,
!                               false)
!       || !known_dependences_p (dependence_relations))
!     {
!       free_dependence_relations (dependence_relations);
!       datarefs.release ();
!       free_rdg (rdg);
!       return NULL;
!     }
!   create_rdg_edges (rdg, dependence_relations, cd);
!   dependence_relations.release ();
    datarefs.release ();
  
    return rdg;
--- 453,462 ----
      }
    stmts.release ();
  
!   create_rdg_flow_edges (rdg);
!   if (cd)
!     create_rdg_cd_edges (rdg, cd);
! 
    datarefs.release ();
  
    return rdg;
*************** partition_contains_all_rw (struct graph
*** 1405,1410 ****
--- 1313,1382 ----
    return false;
  }
  
+ /* Compute partition dependence created by the data references in DRS1
+    and DRS2 and modify and return DIR according to that.  */
+ 
+ static int
+ pg_add_dependence_edges (struct graph *rdg, vec<loop_p> loops, int dir,
+                        vec<data_reference_p> drs1,
+                        vec<data_reference_p> drs2)
+ {
+   data_reference_p dr1, dr2;
+ 
+   /* dependence direction - 0 is no dependence, -1 is back,
+      1 is forth, 2 is both (we can stop then, merging will occur).  */
+   for (int ii = 0; drs1.iterate (ii, &dr1); ++ii)
+     for (int jj = 0; drs2.iterate (jj, &dr2); ++jj)
+       {
+       int this_dir = 1;
+       ddr_p ddr;
+       /* Re-shuffle data-refs to be in dominator order.  */
+       if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
+           > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
+         {
+           data_reference_p tem = dr1;
+           dr1 = dr2;
+           dr2 = tem;
+           this_dir = -this_dir;
+         }
+       ddr = initialize_data_dependence_relation (dr1, dr2, loops);
+       compute_affine_dependence (ddr, loops[0]);
+       if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
+         this_dir = 2;
+       else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
+         {
+           if (DDR_REVERSED_P (ddr))
+             {
+               data_reference_p tem = dr1;
+               dr1 = dr2;
+               dr2 = tem;
+               this_dir = -this_dir;
+             }
+           /* Known dependences can still be unordered througout the
+              iteration space, see gcc.dg/tree-ssa/ldist-16.c.  */
+           if (DDR_NUM_DIST_VECTS (ddr) == 0)
+             this_dir = 2;
+         }
+       else
+         this_dir = 0;
+       free_dependence_relation (ddr);
+       if (dir == 0)
+         dir = this_dir;
+       else if (dir != this_dir)
+         return 2;
+       }
+   return dir;
+ }
+ 
+ /* Compare postorder number of the partition graph vertices V1 and V2.  */
+ 
+ static int
+ pgcmp (const void *v1_, const void *v2_)
+ {
+   const vertex *v1 = (const vertex *)v1_;
+   const vertex *v2 = (const vertex *)v2_;
+   return v2->post - v1->post;
+ }
  
  /* Distributes the code from LOOP in such a way that producer
     statements are placed before consumer statements.  Tries to separate
*************** distribute_loop (struct loop *loop, vec<
*** 1421,1426 ****
--- 1393,1400 ----
    partition_t partition;
    bool any_builtin;
    int i, nbp;
+   graph *pg = NULL;
+   int num_sccs = 1;
  
    *nb_calls = 0;
    loop_nest.create (3);
*************** distribute_loop (struct loop *loop, vec<
*** 1455,1462 ****
        any_builtin |= partition_builtin_p (partition);
      }
  
!   /* If we did not detect any builtin but are not asked to apply
!      regular loop distribution simply bail out.  */
    if (!flag_tree_loop_distribution
        && !any_builtin)
      {
--- 1429,1436 ----
        any_builtin |= partition_builtin_p (partition);
      }
  
!   /* If we are only distributing patterns but did not detect any,
!      simply bail out.  */
    if (!flag_tree_loop_distribution
        && !any_builtin)
      {
*************** distribute_loop (struct loop *loop, vec<
*** 1464,1472 ****
        goto ldist_done;
      }
  
    /* Apply our simple cost model - fuse partitions with similar
       memory accesses.  */
-   partition_t into;
    for (i = 0; partitions.iterate (i, &into); ++i)
      {
        if (partition_builtin_p (into))
--- 1438,1493 ----
        goto ldist_done;
      }
  
+   /* If we are only distributing patterns fuse all partitions that
+      were not classified as builtins.  This also avoids chopping
+      a loop into pieces, separated by builtin calls.  That is, we
+      only want no or a single loop body remaining.  */
+   partition_t into;
+   if (!flag_tree_loop_distribution)
+     {
+       for (i = 0; partitions.iterate (i, &into); ++i)
+       if (!partition_builtin_p (into))
+         break;
+       for (++i; partitions.iterate (i, &partition); ++i)
+       if (!partition_builtin_p (partition))
+         {
+           if (dump_file && (dump_flags & TDF_DETAILS))
+             {
+               fprintf (dump_file, "fusing non-builtin partitions\n");
+               dump_bitmap (dump_file, into->stmts);
+               dump_bitmap (dump_file, partition->stmts);
+             }
+           partition_merge_into (into, partition);
+           partitions.unordered_remove (i);
+           partition_free (partition);
+           i--;
+         }
+     }
+ 
+   /* Due to limitations in the transform phase we have to fuse all
+      reduction partitions into the last partition so the existing
+      loop will contain all loop-closed PHI nodes.  */
+   for (i = 0; partitions.iterate (i, &into); ++i)
+     if (partition_reduction_p (into))
+       break;
+   for (i = i + 1; partitions.iterate (i, &partition); ++i)
+     if (partition_reduction_p (partition))
+       {
+       if (dump_file && (dump_flags & TDF_DETAILS))
+         {
+           fprintf (dump_file, "fusing partitions\n");
+           dump_bitmap (dump_file, into->stmts);
+           dump_bitmap (dump_file, partition->stmts);
+           fprintf (dump_file, "because they have reductions\n");
+         }
+       partition_merge_into (into, partition);
+       partitions.unordered_remove (i);
+       partition_free (partition);
+       i--;
+       }
+ 
    /* Apply our simple cost model - fuse partitions with similar
       memory accesses.  */
    for (i = 0; partitions.iterate (i, &into); ++i)
      {
        if (partition_builtin_p (into))
*************** distribute_loop (struct loop *loop, vec<
*** 1486,1546 ****
                           "memory accesses\n");
                }
              partition_merge_into (into, partition);
!             partitions.ordered_remove (j);
              partition_free (partition);
              j--;
            }
        }
      }
  
!   /* If we are only distributing patterns fuse all partitions that
!      were not properly classified as builtins.  */
!   if (!flag_tree_loop_distribution)
      {
!       partition_t into;
!       /* Only fuse adjacent non-builtin partitions, see PR53616.
!          ???  Use dependence information to improve partition ordering.  */
!       i = 0;
!       do
        {
!         for (; partitions.iterate (i, &into); ++i)
!           if (!partition_builtin_p (into))
              break;
!         for (++i; partitions.iterate (i, &partition); ++i)
!           if (!partition_builtin_p (partition))
              {
!               partition_merge_into (into, partition);
!               partitions.ordered_remove (i);
                partition_free (partition);
!               i--;
              }
-           else
-             break;
        }
-       while ((unsigned) i < partitions.length ());
-     }
  
!   /* Fuse all reduction partitions into the last.  */
!   if (partitions.length () > 1)
!     {
!       partition_t into = partitions.last ();
!       for (i = partitions.length () - 2; i >= 0; --i)
        {
!         partition_t what = partitions[i];
!         if (partition_reduction_p (what))
!           {
!             if (dump_file && (dump_flags & TDF_DETAILS))
!               {
!                 fprintf (dump_file, "fusing partitions\n");
!                 dump_bitmap (dump_file, into->stmts);
!                 dump_bitmap (dump_file, what->stmts);
!                 fprintf (dump_file, "because the latter has reductions\n");
!               }
!             partition_merge_into (into, what);
!             partitions.ordered_remove (i);
!             partition_free (what);
!           }
        }
      }
  
    nbp = partitions.length ();
--- 1507,1625 ----
                           "memory accesses\n");
                }
              partition_merge_into (into, partition);
!             partitions.unordered_remove (j);
              partition_free (partition);
              j--;
            }
        }
      }
  
!   /* Build the partition dependency graph.  */
!   if (partitions.length () > 1)
      {
!       pg = new_graph (partitions.length ());
!       struct pgdata {
!         partition_t partition;
!         vec<data_reference_p> writes;
!         vec<data_reference_p> reads;
!       };
! #define PGDATA(i) ((pgdata *)(pg->vertices[i].data))
!       for (i = 0; partitions.iterate (i, &partition); ++i)
!       {
!         vertex *v = &pg->vertices[i];
!         pgdata *data = new pgdata;
!         data_reference_p dr;
!         /* FIXME - leaks.  */
!         v->data = data;
!         bitmap_iterator bi;
!         unsigned j;
!         data->partition = partition;
!         data->reads = vNULL;
!         data->writes = vNULL;
!         EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, j, bi)
!           for (int k = 0; RDG_DATAREFS (rdg, j).iterate (k, &dr); ++k)
!             if (DR_IS_READ (dr))
!               data->reads.safe_push (dr);
!             else
!               data->writes.safe_push (dr);
!       }
!       partition_t partition1, partition2;
!       for (i = 0; partitions.iterate (i, &partition1); ++i)
!       for (int j = i + 1; partitions.iterate (j, &partition2); ++j)
!         {
!           /* dependence direction - 0 is no dependence, -1 is back,
!              1 is forth, 2 is both (we can stop then, merging will occur).  */
!           int dir = 0;
!           dir = pg_add_dependence_edges (rdg, loop_nest, dir,
!                                          PGDATA(i)->writes,
!                                          PGDATA(j)->reads);
!           if (dir != 2)
!             dir = pg_add_dependence_edges (rdg, loop_nest, dir,
!                                            PGDATA(i)->reads,
!                                            PGDATA(j)->writes);
!           if (dir != 2)
!             dir = pg_add_dependence_edges (rdg, loop_nest, dir,
!                                            PGDATA(i)->writes,
!                                            PGDATA(j)->writes);
!           if (dir == 1 || dir == 2)
!             add_edge (pg, i, j);
!           if (dir == -1 || dir == 2)
!             add_edge (pg, j, i);
!         }
! 
!       /* Add edges to the reduction partition (if any) to force it last.  */
!       unsigned j;
!       for (j = 0; partitions.iterate (j, &partition); ++j)
!       if (partition_reduction_p (partition))
!         break;
!       if (j < partitions.length ())
        {
!         for (unsigned i = 0; partitions.iterate (i, &partition); ++i)
!           if (i != j)
!             add_edge (pg, i, j);
!       }
! 
!       /* Compute partitions we cannot separate and fuse them.  */
!       num_sccs = graphds_scc (pg, NULL);
!       for (i = 0; i < num_sccs; ++i)
!       {
!         partition_t first;
!         int j;
!         for (j = 0; partitions.iterate (j, &first); ++j)
!           if (pg->vertices[j].component == i)
              break;
!         for (j = j + 1; partitions.iterate (j, &partition); ++j)
!           if (pg->vertices[j].component == i)
              {
!               if (dump_file && (dump_flags & TDF_DETAILS))
!                 {
!                   fprintf (dump_file, "fusing partitions\n");
!                   dump_bitmap (dump_file, first->stmts);
!                   dump_bitmap (dump_file, partition->stmts);
!                   fprintf (dump_file, "because they are in the same "
!                            "dependence SCC\n");
!                 }
!               partition_merge_into (first, partition);
!               partitions[j] = NULL;
                partition_free (partition);
!               PGDATA (j)->partition = NULL;
              }
        }
  
!       /* Now order the remaining nodes in postorder.  */
!       qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
!       partitions.truncate (0);
!       for (i = 0; i < pg->n_vertices; ++i)
        {
!         pgdata *data = PGDATA (i);
!         if (data->partition)
!           partitions.safe_push (data->partition);
!         data->reads.release ();
!         data->writes.release ();
!         delete data;
        }
+       gcc_assert (partitions.length () == (unsigned)num_sccs);
+       free_graph (pg);
      }
  
    nbp = partitions.length ();
Index: gcc/testsuite/gcc.dg/torture/pr58626.c
===================================================================
*** /dev/null   1970-01-01 00:00:00.000000000 +0000
--- gcc/testsuite/gcc.dg/torture/pr58626.c      2013-10-24 12:48:45.987057105 
+0200
***************
*** 0 ****
--- 1,20 ----
+ /* { dg-do run } */
+ 
+ extern void abort (void);
+ 
+ int a[8][6] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 };
+ int b;
+ 
+ int main(void)
+ {
+   for (b = 0; b <= 1; b++) {
+       a[1][3] = 0;
+       int c;
+       for (c = 0; c <= 1; c++) {
+         a[c + 1][b] = a[c + 2][b];
+       }
+   }
+   if (a[1][1] != 1)
+     abort ();
+   return 0;
+ }

Reply via email to