On Fri, Oct 23, 2015 at 5:20 PM, Alan Lawrence <alan.lawre...@arm.com> wrote:
> vect_analyze_slp_instance currently only creates an slp_instance if _all_ 
> stores
> in a group fitted the same pattern. This patch splits non-matching groups up
> on vector boundaries, allowing only part of the group to be SLP'd, or multiple
> subgroups to be SLP'd differently.
>
> The algorithm could be made more efficient: we have more info available in
> the matches vector, and a single match in a vector full of non-matches, means
> we will be unable to SLP). But the double-recursion case has at most log(N)
> depth and the single-recursion case is at worst N^2 in *the group width*, 
> which
> is generally small.
>
> This could possibly be extended to hybrid SLP, but I believe that would also
> require splitting the load groups, as at present removing the bb_vinfo check
> ends up with data being stored to the wrong locations e.g. in slp-11a.c. 
> Hence,
> leaving that extension to a future patch.
>
> Bootstrapped + check-{gcc,g++,fortran} on aarch64, x86_64 and arm (v7-a neon).
>
> Thanks, Alan
>
> gcc/ChangeLog:
>
>         * tree-vect-slp.c (vect_split_slp_group): New.
>         (vect_analyze_slp_instance): Recurse on subgroup(s) if
>         vect_build_slp_tree fails during basic block SLP.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/vect/bb-slp-7.c: Make that SLP group even more 
> non-isomorphic.
>         * gcc.dg/vect/bb-slp-subgroups-1.c: New.
>         * gcc.dg/vect/bb-slp-subgroups-2.c: New.
> ---
>  gcc/testsuite/gcc.dg/vect/bb-slp-7.c           |  8 ++--
>  gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c | 44 ++++++++++++++++++
>  gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c | 42 +++++++++++++++++
>  gcc/tree-vect-slp.c                            | 63 
> +++++++++++++++++++++++++-
>  4 files changed, 152 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-7.c 
> b/gcc/testsuite/gcc.dg/vect/bb-slp-7.c
> index ab54a48..b012d78 100644
> --- a/gcc/testsuite/gcc.dg/vect/bb-slp-7.c
> +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-7.c
> @@ -16,12 +16,12 @@ main1 (unsigned int x, unsigned int y)
>    unsigned int *pout = &out[0];
>    unsigned int a0, a1, a2, a3;
>
> -  /* Non isomorphic.  */
> +  /* Non isomorphic, even 64-bit subgroups.  */
>    a0 = *pin++ + 23;
> -  a1 = *pin++ + 142;
> +  a1 = *pin++ * 142;
>    a2 = *pin++ + 2;
>    a3 = *pin++ * 31;
> -
> +
>    *pout++ = a0 * x;
>    *pout++ = a1 * y;
>    *pout++ = a2 * x;
> @@ -47,4 +47,4 @@ int main (void)
>  }
>
>  /* { dg-final { scan-tree-dump-times "basic block vectorized" 0 "slp2" } } */
> -
> +
> diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c 
> b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c
> new file mode 100644
> index 0000000..39c23c3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c
> @@ -0,0 +1,44 @@
> +/* { dg-require-effective-target vect_int } */
> +/* PR tree-optimization/67682.  */
> +
> +#include "tree-vect.h"
> +
> +int __attribute__((__aligned__(8))) a[8];
> +int __attribute__((__aligned__(8))) b[4];
> +
> +__attribute__ ((noinline)) void
> +test ()
> +{
> +    a[0] = b[0];
> +    a[1] = b[1];
> +    a[2] = b[2];
> +    a[3] = b[3];
> +    a[4] = 0;
> +    a[5] = 0;
> +    a[6] = 0;
> +    a[7] = 0;
> +}
> +
> +int
> +main (int argc, char **argv)
> +{
> +  check_vect ();
> +
> +  for (int i = 0; i < 8; i++)
> +    a[i] = 1;
> +  for (int i = 0; i < 4; i++)
> +    b[i] = i + 4;
> +  __asm__ volatile ("" : : : "memory");
> +  test (a, b);
> +  __asm__ volatile ("" : : : "memory");
> +  for (int i = 0; i < 4; i++)
> +    if (a[i] != i+4)
> +      abort ();
> +  for (int i = 4; i < 8; i++)
> +    if (a[i] != 0)
> +      abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using 
> SLP" 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c 
> b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c
> new file mode 100644
> index 0000000..06099fd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c
> @@ -0,0 +1,42 @@
> +/* { dg-require-effective-target vect_int } */
> +/* PR tree-optimization/67682.  */
> +
> +#include "tree-vect.h"
> +
> +int __attribute__((__aligned__(8))) a[8];
> +int __attribute__((__aligned__(8))) b[8];
> +
> +__attribute__ ((noinline)) void
> +test ()
> +{
> +    a[0] = b[0];
> +    a[1] = b[1] + 1;
> +    a[2] = b[2] * 2;
> +    a[3] = b[3] + 3;
> +
> +    a[4] = b[4] + 4;
> +    a[5] = b[5] + 4;
> +    a[6] = b[6] + 4;
> +    a[7] = b[7] + 4;
> +}
> +
> +int
> +main (int argc, char **argv)
> +{
> +  check_vect ();
> +
> +  for (int i = 0; i < 8; i++)
> +    b[i] = i + 1;
> +  __asm__ volatile ("" : : : "memory");
> +  test (a, b);
> +  __asm__ volatile ("" : : : "memory");
> +  if ((a[0] != 1) || (a[1] != 3) || (a[2] != 6) || (a[3] != 7))
> +    abort ();
> +  for (int i = 4; i < 8; i++)
> +    if (a[i] != i + 5)
> +      abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using 
> SLP" 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 7a2d623..fb8d11e 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -1627,6 +1627,33 @@ vect_analyze_slp_cost (slp_instance instance, void 
> *data)
>    body_cost_vec.release ();
>  }
>
> +/* Splits a group of statements currently beginning at FIRST_STMT into two
> +   groups, one (still beginning at FIRST_STMT) containing the first I stmts,
> +   the second containing the remainder.  Return the first stmt in the second
> +   group.  */
> +
> +static gimple *
> +vect_split_slp_group (gimple *first_stmt, unsigned int i)
> +{
> +  gcc_assert (GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt)) == 
> first_stmt);
> +  gcc_assert (i > 0);
> +  int group2_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)) - i;
> +  gcc_assert (group2_size > 0);
> +  GROUP_SIZE (vinfo_for_stmt (first_stmt)) = i;
> +
> +  gimple *stmt = first_stmt;
> +  while (i-- > 1)
> +    stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
> +  /* STMT is now the last element of the first group.  */
> +  gimple *first = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
> +  GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;
> +
> +  GROUP_SIZE (vinfo_for_stmt (first)) = group2_size;
> +  for (stmt = first; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
> +    GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = first;

apart from the fact that you'll post a new version you need to adjust GROUP_GAP.
You also seem to somewhat "confuse" "first I stmts" and "a group of
size I", those
are not the same when the group has haps.  I'd say "a group of size i" makes the
most sense here thus I suggest to adjust the function comment accordingly.

GROUP_GAP is the gap from the previous group element.  The goup first
elements GROUP_GAP is the gap at the _end_ of the whole group.  Thus
for the 2nd group the first stmts GROUP_GAP needs to be GROUP_GAP
of the 1st stmt of the original group.  For the new 1st group GROUP_GAP
of the 1st stmt should be the old GROUP_GAP of the 1st element of the
new 2nd group.  And of course the counting of 'i' needs to properly factor
in GROUP_GAP as well.

This is probably what broke the loop-vectorization side but I'm quite sure
I added some intermediate-gap BB vectorization support this year as well.

Like

   ... = a[0];
   ... = a[2];
   ... = a[2];
   ... = a[3];

just no gaps at the boundaries of the group (but after being split they might
be on a boundary).

Richard.

> +  return first;
> +}
> +
>  /* Analyze an SLP instance starting from a group of grouped stores.  Call
>     vect_build_slp_tree to build a tree of packed stmts if possible.
>     Return FALSE if it's impossible to SLP any stmt in the loop.  */
> @@ -1642,7 +1669,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, 
> bb_vec_info bb_vinfo,
>    tree vectype, scalar_type = NULL_TREE;
>    gimple *next;
>    unsigned int vectorization_factor = 0;
> -  int i;
> +  unsigned int i;
>    unsigned int max_nunits = 0;
>    vec<slp_tree> loads;
>    struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
> @@ -1836,6 +1863,40 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, 
> bb_vec_info bb_vinfo,
>    vect_free_slp_tree (node);
>    loads.release ();
>
> +  /* For basic block vectorization, try to break the group up into multiples
> +     of the vectorization factor.  */
> +  if (bb_vinfo && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
> +    {
> +      /* We consider breaking the group only on VF boundaries from the 
> existing
> +        start.  */
> +      for (i = 0; i < group_size; i++)
> +       if (!matches[i]) break;
> +
> +      if (i < vectorization_factor && vectorization_factor < group_size)
> +       {
> +         /* First vector is a mix of (non-/)matches, or first element was
> +            impossible for another reason.  Retry without the first vector.  
> */
> +         stmt = vect_split_slp_group (stmt, vectorization_factor);
> +         return vect_analyze_slp_instance (loop_vinfo, bb_vinfo,
> +                                           stmt, max_tree_size);
> +       }
> +      else if (i >= vectorization_factor && i < group_size)
> +       {
> +         /* Split into two groups at the first vector boundary before i.  */
> +         gcc_assert ((vectorization_factor & (vectorization_factor - 1)) == 
> 0);
> +         i &= ~(vectorization_factor - 1);
> +         gimple *group2start = vect_split_slp_group (stmt, i);
> +         return vect_analyze_slp_instance (loop_vinfo, bb_vinfo,
> +                                           stmt, max_tree_size)
> +                | vect_analyze_slp_instance (loop_vinfo, bb_vinfo,
> +                                             group2start, max_tree_size);
> +        }
> +    }
> +  else
> +    {
> +      /* TODO: handle reduction case?  */
> +    }
> +
>    return false;
>  }
>
> --
> 1.9.1
>

Reply via email to