On Wed, Apr 7, 2021 at 11:10 AM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > On Tue, Apr 6, 2021 at 12:03 PM Richard Sandiford via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> As noted in the PR, we were no longer using ST3 for the testcase and
> >> instead stored each lane individually.  This is because we'd split
> >> the store group during SLP and couldn't recover when SLP failed.
> >>
> >> However, we seem to get better code with ST3 and ST4 even if
> >> SLP would have succeeded, such as for vect-complex-5.c.
> >> I think the best thing for GCC 11 would therefore be to skip
> >> the split entirely if we could use IFN_STORE_LANES.  A downside
> >> of this is that SLP can handle smaller iteration counts than
> >> IFN_STORE_LANES can, but we don't have the infrastructure to
> >> choose reliably based on that.
> >>
> >> Tested on aarch64-linux-gnu (with and without SVE), arm-linux-gnueabihf,
> >> armeb-eabi and x86_64-linux-gnu.  OK to install?
> >
> > One of the cases where splitting helps is if you have say V2DFmode
> > and a group size of 4 but if you break up the group into sizes of 2
> > then you get two V2DFmode group size 2 SLP subgraphs.  So I wonder
> > if, since you look for a vector type, want to only disable splitting
> > in case the resulting vector type has the same number of lanes
> > as the group size?  (and if not, instead limit where we consider splitting)
>
> Hmm, yeah.  If we can apply SLP to full vectors within a loop iteration
> then I agree it's still better to split.  I think the test should favour
> ST3/ST4 more though: split only if one of the new group sizes is a
> multiple of the vector size.  If the vector type has more lanes than the
> group size then we should still use ST3/ST4.
>
> How does this version look?  Tested as before.

Looks good.

Thanks,
Richard.

> Thanks,
> Richard
>
>
> gcc/
>         PR tree-optimization/99873
>         * tree-vect-slp.c (vect_slp_prefer_store_lanes_p): New function.
>         (vect_build_slp_instance): Don't split store groups that could
>         use IFN_STORE_LANES.
>
> gcc/testsuite/
>         * gcc.dg/vect/slp-21.c: Only expect 2 of the loops to use SLP
>         if IFN_STORE_LANES is available.
>         * vect/vect-complex-5.c: Expect no loops to use SLP if
>         IFN_STORE_LANES is available.
>         * gcc.target/aarch64/pr99873_1.c: New test.
>         * gcc.target/aarch64/pr99873_2.c: Likewise.
>         * gcc.target/aarch64/pr99873_3.c: Likewise.
>         * gcc.target/aarch64/sve/pr99873_1.c: Likewise.
>         * gcc.target/aarch64/sve/pr99873_2.c: Likewise.
>         * gcc.target/aarch64/sve/pr99873_3.c: Likewise.
> ---
>  gcc/testsuite/gcc.dg/vect/slp-21.c            |  4 +--
>  gcc/testsuite/gcc.dg/vect/vect-complex-5.c    |  3 +-
>  gcc/testsuite/gcc.target/aarch64/pr99873_1.c  | 17 ++++++++++
>  gcc/testsuite/gcc.target/aarch64/pr99873_2.c  | 20 ++++++++++++
>  gcc/testsuite/gcc.target/aarch64/pr99873_3.c  | 20 ++++++++++++
>  .../gcc.target/aarch64/sve/pr99873_1.c        | 15 +++++++++
>  .../gcc.target/aarch64/sve/pr99873_2.c        | 18 +++++++++++
>  .../gcc.target/aarch64/sve/pr99873_3.c        | 18 +++++++++++
>  gcc/tree-vect-slp.c                           | 31 ++++++++++++++++++-
>  9 files changed, 142 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr99873_1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr99873_2.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr99873_3.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c
>
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index ceec7f5c410..eadbd521966 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -2458,6 +2458,34 @@ vect_match_slp_patterns (slp_instance instance, 
> vec_info *vinfo,
>    return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
>  }
>
> +/* STMT_INFO is a store group of size GROUP_SIZE that we are considering
> +   splitting into two, with the first split group having size NEW_GROUP_SIZE.
> +   Return true if we could use IFN_STORE_LANES instead and if that appears
> +   to be the better approach.  */
> +
> +static bool
> +vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
> +                              unsigned int group_size,
> +                              unsigned int new_group_size)
> +{
> +  tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
> +  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
> +  if (!vectype)
> +    return false;
> +  /* Allow the split if one of the two new groups would operate on full
> +     vectors *within* rather than across one scalar loop iteration.
> +     This is purely a heuristic, but it should work well for group
> +     sizes of 3 and 4, where the possible splits are:
> +
> +       3->2+1:  OK if the vector has exactly two elements
> +       4->2+2:  Likewise
> +       4->3+1:  Less clear-cut.  */
> +  if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS 
> (vectype))
> +      || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
> +    return false;
> +  return vect_store_lanes_supported (vectype, group_size, false);
> +}
> +
>  /* Analyze an SLP instance starting from a group of grouped stores.  Call
>     vect_build_slp_tree to build a tree of packed stmts if possible.
>     Return FALSE if it's impossible to SLP any stmt in the loop.  */
> @@ -2693,7 +2721,8 @@ vect_build_slp_instance (vec_info *vinfo,
>
>        /* For loop vectorization split into arbitrary pieces of size > 1.  */
>        if (is_a <loop_vec_info> (vinfo)
> -         && (i > 1 && i < group_size))
> +         && (i > 1 && i < group_size)
> +         && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
>         {
>           unsigned group1_size = i;
>
> diff --git a/gcc/testsuite/gcc.dg/vect/slp-21.c 
> b/gcc/testsuite/gcc.dg/vect/slp-21.c
> index bf8f434dd50..85393975b45 100644
> --- a/gcc/testsuite/gcc.dg/vect/slp-21.c
> +++ b/gcc/testsuite/gcc.dg/vect/slp-21.c
> @@ -210,7 +210,7 @@ int main (void)
>
>     Not all vect_perm targets support that, and it's a bit too specific to 
> have
>     its own effective-target selector, so we just test targets directly.  */
> -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" 
> { target { aarch64*-*-* arm*-*-* powerpc64*-*-* } } } } */
> -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" 
> { target { vect_strided4 && { ! { aarch64*-*-* arm*-*-* powerpc64*-*-* } } } 
> } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" 
> { target powerpc64*-*-* } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" 
> { target { vect_strided4 && { ! powerpc64*-*-* } } } } } */
>  /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect"  
> { target { ! { vect_strided4 } } } } } */
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c 
> b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c
> index 81fdb67ce81..addcf60438c 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c
> @@ -40,4 +40,5 @@ main (void)
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" 
> { xfail { ! vect_hw_misalign } } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" 
> { target vect_load_lanes } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" 
> { target { ! vect_load_lanes } xfail { ! vect_hw_misalign } } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr99873_1.c 
> b/gcc/testsuite/gcc.target/aarch64/pr99873_1.c
> new file mode 100644
> index 00000000000..bc4d81e3ae5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr99873_1.c
> @@ -0,0 +1,17 @@
> +/* { dg-options "-O3" } */
> +
> +#pragma GCC target "+nosve"
> +
> +void
> +f (int *restrict x, int *restrict y, int *restrict z, int n)
> +{
> +  for (int i = 0; i < n; i += 3)
> +    {
> +      x[i] = y[i] + z[i];
> +      x[i + 1] = y[i + 1] - z[i + 1];
> +      x[i + 2] = y[i + 2] | z[i + 2];
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tld3\t} 2 } } */
> +/* { dg-final { scan-assembler-times {\tst3\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr99873_2.c 
> b/gcc/testsuite/gcc.target/aarch64/pr99873_2.c
> new file mode 100644
> index 00000000000..b73fbdc0a18
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr99873_2.c
> @@ -0,0 +1,20 @@
> +/* { dg-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +#pragma GCC target "+nosve"
> +
> +void __attribute ((noipa))
> +foo (uint64_t *__restrict x, uint64_t *__restrict y, int n)
> +{
> +  for (int i = 0; i < n; i += 4)
> +    {
> +      x[i] += y[i];
> +      x[i + 1] += y[i + 1];
> +      x[i + 2] |= y[i + 2];
> +      x[i + 3] |= y[i + 3];
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-not {\tld4\t} } } */
> +/* { dg-final { scan-assembler-not {\tst4\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr99873_3.c 
> b/gcc/testsuite/gcc.target/aarch64/pr99873_3.c
> new file mode 100644
> index 00000000000..ccbab6d74be
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr99873_3.c
> @@ -0,0 +1,20 @@
> +/* { dg-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +#pragma GCC target "+nosve"
> +
> +void __attribute ((noipa))
> +foo (uint32_t *__restrict x, uint32_t *__restrict y, int n)
> +{
> +  for (int i = 0; i < n; i += 4)
> +    {
> +      x[i] += y[i];
> +      x[i + 1] += y[i + 1];
> +      x[i + 2] |= y[i + 2];
> +      x[i + 3] |= y[i + 3];
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tld4\t} 2 } } */
> +/* { dg-final { scan-assembler-times {\tst4\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c
> new file mode 100644
> index 00000000000..f4b95da2afa
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c
> @@ -0,0 +1,15 @@
> +/* { dg-options "-O3" } */
> +
> +void
> +f (int *restrict x, int *restrict y, int *restrict z, int n)
> +{
> +  for (int i = 0; i < n; i += 3)
> +    {
> +      x[i] = y[i] + z[i];
> +      x[i + 1] = y[i + 1] - z[i + 1];
> +      x[i + 2] = y[i + 2] | z[i + 2];
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tld3w\t} 2 } } */
> +/* { dg-final { scan-assembler-times {\tst3w\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c
> new file mode 100644
> index 00000000000..03dc4ef930d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c
> @@ -0,0 +1,18 @@
> +/* { dg-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +void __attribute ((noipa))
> +foo (uint64_t *__restrict x, uint64_t *__restrict y, int n)
> +{
> +  for (int i = 0; i < n; i += 4)
> +    {
> +      x[i] += y[i];
> +      x[i + 1] += y[i + 1];
> +      x[i + 2] |= y[i + 2];
> +      x[i + 3] |= y[i + 3];
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tld4d\t} 2 } } */
> +/* { dg-final { scan-assembler-times {\tst4d\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c
> new file mode 100644
> index 00000000000..87a0141e508
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c
> @@ -0,0 +1,18 @@
> +/* { dg-options "-O3" } */
> +
> +#include <stdint.h>
> +
> +void __attribute ((noipa))
> +foo (uint32_t *__restrict x, uint32_t *__restrict y, int n)
> +{
> +  for (int i = 0; i < n; i += 4)
> +    {
> +      x[i] += y[i];
> +      x[i + 1] += y[i + 1];
> +      x[i + 2] |= y[i + 2];
> +      x[i + 3] |= y[i + 3];
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tld4w\t} 2 } } */
> +/* { dg-final { scan-assembler-times {\tst4w\t} 1 } } */

Reply via email to