Hello world,

this is a draft patch which interchanges the indices for FORALL and
DO CONCURRENT loops for cases like PR 82471, where code like

  DO CONCURRENT( K=1:N, J=1:M, I=1:L)
     C(I,J,K) = A(I,J,K) + B(I,J,K)
  END DO

led to very poor code because of stride issues.  Currently,
Graphite is not able to do this.

Without the patch, the code above is translated as


        i.7 = 1;
        count.10 = 512;
        while (1)
          {
            if (ANNOTATE_EXPR <count.10 <= 0, ivdep>) goto L.4;
            j.6 = 1;
            count.9 = 512;
            while (1)
              {
                if (ANNOTATE_EXPR <count.9 <= 0, ivdep>) goto L.3;
                k.5 = 1;
                count.8 = 512;
                while (1)
                  {
                    if (ANNOTATE_EXPR <count.8 <= 0, ivdep>) goto L.2;
(*(real(kind=4)[0:] * restrict) c.data)[((c.offset + (integer(kind=8)) k.5 * c.dim[2].stride) + (integer(kind=8)) j.6 * c.dim[1].stride) + (integer(kind=8)) i.7] = (*(real(kind=4)[0:] * restrict) a.data)[((a.offset + (integer(kind=8)) k.5 * a.dim[2].stride) + (integer(kind=8)) j.6 * a.dim[1].stride) + (integer(kind=8)) i.7] + (*(real(kind=4)[0:] * restrict) b.data)[((b.offset + (integer(kind=8)) k.5 * b.dim[2].stride) + (integer(kind=8)) j.6 * b.dim[1].stride) + (integer(kind=8)) i.7];
                    L.1:;
                    k.5 = k.5 + 1;
                    count.8 = count.8 + -1;
                  }
                L.2:;
                j.6 = j.6 + 1;
                count.9 = count.9 + -1;
              }
            L.3:;
            i.7 = i.7 + 1;
            count.10 = count.10 + -1;
          }
        L.4:;

so the innermost loop has the biggest stride. With the patch
(and front-end optimization turned on), this is turned into

        k.7 = 1;
        count.10 = 512;
        while (1)
          {
            if (ANNOTATE_EXPR <count.10 <= 0, ivdep>) goto L.4;
            j.6 = 1;
            count.9 = 512;
            while (1)
              {
                if (ANNOTATE_EXPR <count.9 <= 0, ivdep>) goto L.3;
                i.5 = 1;
                count.8 = 512;
                while (1)
                  {
                    if (ANNOTATE_EXPR <count.8 <= 0, ivdep>) goto L.2;
(*(real(kind=4)[0:] * restrict) c.data)[((c.offset + (integer(kind=8)) k.7 * c.dim[2].stride) + (integer(kind=8)) j.6 * c.dim[1].stride) + (integer(kind=8)) i.5] = (*(real(kind=4)[0:] * restrict) a.data)[((a.offset + (integer(kind=8)) k.7 * a.dim[2].stride) + (integer(kind=8)) j.6 * a.dim[1].stride) + (integer(kind=8)) i.5] + (*(real(kind=4)[0:] * restrict) b.data)[((b.offset + (integer(kind=8)) k.7 * b.dim[2].stride) + (integer(kind=8)) j.6 * b.dim[1].stride) + (integer(kind=8)) i.5];
                    L.1:;
                    i.5 = i.5 + 1;
                    count.8 = count.8 + -1;
                  }
                L.2:;
                j.6 = j.6 + 1;
                count.9 = count.9 + -1;
              }
            L.3:;
            k.7 = k.7 + 1;
            count.10 = count.10 + -1;
          }
        L.4:;

so the innermost loop is the one that gets traversed with unity stride
(the way it should have been done).

Although the algorithm here is quite simple, it resolves the issues
raised in the PR so far, and it definitely worth fixing.

If we do this kind of thing, it might also be possible to
interchange normal DO loops in a similar way (which Graphite also
cannot do at the moment, at least not if the bounds are variables).

So, comments? Suggestions? Other ideas? Any ideas how to write
a test case? Any volunteers to re-implement Graphite in the
Fortran front end (didn't think so) or to make Graphite catch
this sort of pattern (which it currently doesn't) instead?

Regards

        Thomas

2017-10-27  Thomas Koenig  <tkoe...@gcc.gnu.org>

        * frontend-passes.c (index_interchange): New funciton,
        prototype.
        (optimize_namespace): Call index_interchange.
        (ind_type): New function.
        (has_var): New function.
        (index_cost): New function.
        (loop_comp): New function.
Index: frontend-passes.c
===================================================================
--- frontend-passes.c	(Revision 253872)
+++ frontend-passes.c	(Arbeitskopie)
@@ -55,6 +55,7 @@ static gfc_expr* check_conjg_transpose_variable (g
 						 bool *);
 static bool has_dimen_vector_ref (gfc_expr *);
 static int matmul_temp_args (gfc_code **, int *,void *data);
+static int index_interchange (gfc_code **, int*, void *);
 
 #ifdef CHECKING_P
 static void check_locus (gfc_namespace *);
@@ -1385,6 +1386,9 @@ optimize_namespace (gfc_namespace *ns)
 		       NULL);
     }
 
+  gfc_code_walker (&ns->code, index_interchange, dummy_expr_callback,
+		   NULL);
+
   /* BLOCKs are handled in the expression walker below.  */
   for (ns = ns->contained; ns; ns = ns->sibling)
     {
@@ -4225,6 +4229,157 @@ inline_matmul_assign (gfc_code **c, int *walk_subt
   return 0;
 }
 
+
+/* Code for index interchange for loops which are grouped together in DO
+   CONCURRENT or FORALL statements.  This is currently only applied if the
+   iterations are grouped together in a single statement.
+
+   For this transformation, tt is assumed that memory access in strides is
+   expensive, and that loops which access later indices (which access memory
+   in bigger strides) shoud be moved to the first loops.
+
+   For this, a loop over all the statements is executed, counting the times
+   that the loop iteration values are acessed in each index.  The loop
+   indices are then sorted to minimize access to later indces from inner
+   loops.  */
+
+/* Type for holding index information.  */
+
+typepedef struct {
+  gfc_symbol *sym;
+  gfc_forall_iterator *fa;
+  int num;
+  int n[GFC_MAX_DIMENSIONS];
+} ind_type;
+
+/* Callback function to determine if an expression is the 
+   corresponding variable.  */
+
+static int
+has_var (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, void *data)
+{
+  gfc_expr *expr = *e;
+  gfc_symbol *sym;
+
+  if (expr->expr_type != EXPR_VARIABLE)
+    return 0;
+
+  sym = (gfc_symbol *) data;
+  return sym == expr->symtree->n.sym;
+}
+
+/* Callback function to calculate the cost of a certain index.  */
+
+static int
+index_cost (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED,
+	    void *data)
+{
+  ind_type *ind;
+  gfc_expr *expr;
+  gfc_array_ref *ar;
+  gfc_ref *ref;
+  int i,j;
+
+  expr = *e;
+  if (expr->expr_type != EXPR_VARIABLE)
+    return 0;
+
+  ar = NULL;
+  for (ref = expr->ref; ref; ref = ref->next)
+    {
+      if (ref->type == REF_ARRAY)
+	{
+	  ar = &ref->u.ar;
+	  break;
+	}
+    }
+  if (ar == NULL || ar->type != AR_ELEMENT)
+    return 0;
+
+  ind = (ind_type *) data;
+  for (i = 0; i < ar->dimen; i++)
+    {
+      for (j=0; ind[j].sym != NULL; j++)
+	{
+	  if (gfc_expr_walker (&ar->start[i], has_var, (void *) (ind[j].sym)))
+	      ind[j].n[i]++;
+	}
+    }
+  return 0;
+}
+
+/* Callback function for qsort, to sort the loop indices. */
+
+static int
+loop_comp (const void *e1, const void *e2)
+{
+  const ind_type *i1 = (const ind_type *) e1;
+  const ind_type *i2 = (const ind_type *) e2;
+  int i;
+
+  for (i=GFC_MAX_DIMENSIONS-1; i >= 0; i--)
+    {
+      if (i1->n[i] != i2->n[i])
+	return i1->n[i] - i2->n[i];
+    }
+  /* All other things being equal, let's not change the ordering.  */
+  return i2->num - i1->num;
+}
+
+/* Main function to do the index interchange.  */
+
+static int
+index_interchange (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
+		  void *data ATTRIBUTE_UNUSED)
+{
+  gfc_code *co;
+  co = *c;
+  int n_iter;
+  gfc_forall_iterator *fa;
+  ind_type *ind;
+  int i, j;
+  
+  if (co->op != EXEC_FORALL && co->op != EXEC_DO_CONCURRENT)
+    return 0;
+
+  n_iter = 0;
+  for (fa = co->ext.forall_iterator; fa; fa = fa->next)
+    n_iter ++;
+
+  /* Nothing to reorder. */
+  if (n_iter < 2)
+    return 0;
+
+  ind = XALLOCAVEC (ind_type, n_iter + 1);
+
+  i = 0;
+  for (fa = co->ext.forall_iterator; fa; fa = fa->next)
+    {
+      ind[i].sym = fa->var->symtree->n.sym;
+      ind[i].fa = fa;
+      for (j=0; j<GFC_MAX_DIMENSIONS; j++)
+	ind[i].n[j] = 0;
+      ind[i].num = i;
+      i++;
+    }
+  ind[n_iter].sym = NULL;
+  ind[n_iter].fa = NULL;
+
+  gfc_code_walker (c, gfc_dummy_code_callback, index_cost, (void *) ind);
+  qsort ((void *) ind, n_iter, sizeof (ind_type), loop_comp);
+
+  /* Do the actual index interchange.  */
+  co->ext.forall_iterator = fa = ind[0].fa;
+  for (i=1; i<n_iter; i++)
+    {
+      fa->next = ind[i].fa;
+      fa = fa->next;
+    }
+  fa->next = NULL;
+
+  return 0;
+}
+
 #define WALK_SUBEXPR(NODE) \
   do							\
     {							\

Reply via email to