> -----Original Message-----
> From: Richard Biener <rguent...@suse.de>
> Sent: Wednesday, July 10, 2024 10:04 AM
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH 2/3] Support group-size of three in SLP load permutation 
> lowering
> 
> The following adds support for group-size three in SLP load permutation
> lowering to match the non-SLP capabilities.  This is done by using
> the non-interleaving fallback code which then creates at VF == 4 from
> { { a0, b0, c0 }, { a1, b1, c1 }, { a2, b2, c2 }, { a3, b3, c3 } }
> the intermediate vectors { c0, c0, c1, c1 } and { c2, c2, c3, c3 }
> to produce { c0, c1, c2, c3 }.
> 

Just curious, is this only for the 3rd component then? I'm assuming the firs 
two get handled by
{a0, b0, a1, b1} {a2, b2, a3, b3} still?

Regards,
Tamar

> This turns out to be more effective than the scheme implemented
> for non-SLP for SSE and only slightly worse for AVX512 and a bit
> more worse for AVX2.  It seems to me that this would extend to
> other non-power-of-two group-sizes though (but the patch does not).
> Optimal schemes are likely difficult to lay out in VF agnostic form.
> 
> I'll note that while the lowering assumes even/odd extract is
> generally available for all vector element sizes (which is probably
> a good assumption), it doesn't in any way constrain the other
> permutes it generates based on target availability.  Again difficult
> to do in a VF agnostic way (but at least currently the vector type
> is fixed).
> 
> I'll also note that the SLP store side merges lanes in a way
> producing three-vector permutes for store group-size of three, so
> the testcase uses a store group-size of four.
> 
>       * tree-vect-slp.cc (vect_lower_load_permutations): Support
>       group-size of three.
> 
>       * gcc.dg/vect/slp-52.c: New testcase.
> ---
>  gcc/testsuite/gcc.dg/vect/slp-52.c | 14 ++++++++++++
>  gcc/tree-vect-slp.cc               | 35 +++++++++++++++++-------------
>  2 files changed, 34 insertions(+), 15 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/slp-52.c
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/slp-52.c 
> b/gcc/testsuite/gcc.dg/vect/slp-52.c
> new file mode 100644
> index 00000000000..ba49f0046e2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/slp-52.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +
> +void foo (int * __restrict x, int *y)
> +{
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      x[4*i+0] = y[3*i+0];
> +      x[4*i+1] = y[3*i+1] * 2;
> +      x[4*i+2] = y[3*i+2] + 3;
> +      x[4*i+3] = y[3*i+2] * 2 - 5;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target 
> {
> vect_int && vect_int_mult } } } } */
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 0f830c1ad9c..2dc6d365303 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -3710,7 +3710,8 @@ vect_build_slp_instance (vec_info *vinfo,
>                with the least number of lanes to one and then repeat until
>                we end up with two inputs.  That scheme makes sure we end
>                up with permutes satisfying the restriction of requiring at
> -              most two vector inputs to produce a single vector output.  */
> +              most two vector inputs to produce a single vector output
> +              when the number of lanes is even.  */
>             while (SLP_TREE_CHILDREN (perm).length () > 2)
>               {
>                 /* When we have three equal sized groups left the pairwise
> @@ -4050,11 +4051,10 @@ vect_lower_load_permutations (loop_vec_info
> loop_vinfo,
>      = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
> 
>    /* Only a power-of-two number of lanes matches interleaving with N levels.
> -     The non-SLP path also supports DR_GROUP_SIZE == 3.
>       ???  An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
>       at each step.  */
>    unsigned group_lanes = DR_GROUP_SIZE (first);
> -  if (exact_log2 (group_lanes) == -1)
> +  if (exact_log2 (group_lanes) == -1 && group_lanes != 3)
>      return;
> 
>    for (slp_tree load : loads)
> @@ -4071,7 +4071,7 @@ vect_lower_load_permutations (loop_vec_info
> loop_vinfo,
>        with a non-1:1 load permutation around instead of canonicalizing
>        those into a load and a permute node.  Removing this early
>        check would do such canonicalization.  */
> -      if (SLP_TREE_LANES (load) >= group_lanes / 2)
> +      if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
>       continue;
> 
>        /* First build (and possibly re-use) a load node for the
> @@ -4107,7 +4107,7 @@ vect_lower_load_permutations (loop_vec_info
> loop_vinfo,
>        while (1)
>       {
>         unsigned group_lanes = SLP_TREE_LANES (l0);
> -       if (SLP_TREE_LANES (load) >= group_lanes / 2)
> +       if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
>           break;
> 
>         /* Try to lower by reducing the group to half its size using an
> @@ -4117,19 +4117,24 @@ vect_lower_load_permutations (loop_vec_info
> loop_vinfo,
>            Thus { e, e, o, o, e, e, o, o } woud be an even/odd decomposition
>            with N == 2.  */
>         /* ???  Only an even number of lanes can be handed this way, but the
> -          fallback below could work for any number.  */
> -       gcc_assert ((group_lanes & 1) == 0);
> -       unsigned even = (1 << ceil_log2 (group_lanes)) - 1;
> -       unsigned odd = even;
> -       for (auto l : final_perm)
> +          fallback below could work for any number.  We have to make sure
> +          to round up in that case.  */
> +       gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
> +       unsigned even = 0, odd = 0;
> +       if ((group_lanes & 1) == 0)
>           {
> -           even &= ~l.second;
> -           odd &= l.second;
> +           even = (1 << ceil_log2 (group_lanes)) - 1;
> +           odd = even;
> +           for (auto l : final_perm)
> +             {
> +               even &= ~l.second;
> +               odd &= l.second;
> +             }
>           }
> 
>         /* Now build an even or odd extraction from the unpermuted load.  */
>         lane_permutation_t perm;
> -       perm.create (group_lanes / 2);
> +       perm.create ((group_lanes + 1) / 2);
>         unsigned level;
>         if (even
>             && ((level = 1 << ctz_hwi (even)), true)
> @@ -4164,7 +4169,7 @@ vect_lower_load_permutations (loop_vec_info
> loop_vinfo,
>             bitmap_iterator bi;
>             EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
>                 perm.quick_push (std::make_pair (0, i));
> -           while (perm.length () < group_lanes / 2)
> +           while (perm.length () < (group_lanes + 1) / 2)
>               perm.quick_push (perm.last ());
>           }
> 
> @@ -4200,7 +4205,7 @@ vect_lower_load_permutations (loop_vec_info
> loop_vinfo,
>            have a "local" CSE map here.  */
>         SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
> 
> -       /* We now have a node for group_lanes / 2 lanes.  */
> +       /* We now have a node for (group_lanes + 1) / 2 lanes.  */
>         l0 = p;
>       }
> 
> --
> 2.35.3

Reply via email to