On Fri, 2 Sep 2022, Richard Sandiford wrote:

> vect_optimize_slp_pass always treats the starting layout as valid,
> to avoid having to "optimise" when every possible choice is invalid.
> But it gives the starting layout a high cost if it seems like the
> target might reject it, in the hope that this will encourage other
> (valid) layouts.
> 
> The testcase for PR106787 showed that this was flawed, since it was
> triggering even in cases where the number of input lanes is different
> from the number of output lanes.  Picking such a high cost could also
> make costs for loop-invariant nodes overwhelm the costs for inner-loop
> nodes.
> 
> This patch makes the costing less aggressive by (a) restricting
> it to N-to-N permutations and (b) assigning the maximum cost of
> a permute.
> 
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

OK.

Thanks,
Richard.

> Richard
> 
> 
> gcc/
>       * tree-vect-slp.cc (vect_optimize_slp_pass::internal_node_cost):
>       Reduce the fallback cost to 1.  Only use it if the number of
>       input lanes is equal to the number of output lanes.
> 
> gcc/testsuite/
>       * gcc.dg/vect/bb-slp-layout-20.c: New test.
> ---
>  gcc/testsuite/gcc.dg/vect/bb-slp-layout-20.c | 33 ++++++++++++++++
>  gcc/tree-vect-slp.cc                         | 40 +++++++++++++++-----
>  2 files changed, 63 insertions(+), 10 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-layout-20.c
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-20.c 
> b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-20.c
> new file mode 100644
> index 00000000000..ed7816b3f7b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-20.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */
> +
> +extern int a[][4], b[][4], c[][4], d[4], e[4];
> +void f()
> +{
> +  int t0 = a[0][3];
> +  int t1 = a[1][3];
> +  int t2 = a[2][3];
> +  int t3 = a[3][3];
> +  int a0 = 0, a1 = 0, a2 = 0, a3 = 0, b0 = 0, b1 = 0, b2 = 0, b3 = 0;
> +  for (int i = 0; i < 400; i += 4)
> +    {
> +      a0 += b[i][3] * t0;
> +      a1 += b[i][2] * t1;
> +      a2 += b[i][1] * t2;
> +      a3 += b[i][0] * t3;
> +      b0 += c[i][3] * t0;
> +      b1 += c[i][2] * t1;
> +      b2 += c[i][1] * t2;
> +      b3 += c[i][0] * t3;
> +    }
> +  d[0] = a0;
> +  d[1] = a1;
> +  d[2] = a2;
> +  d[3] = a3;
> +  e[0] = b0;
> +  e[1] = b1;
> +  e[2] = b2;
> +  e[3] = b3;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = 
> VEC_PERM_EXPR" 3 "slp1" { target { vect_int_mult && vect_perm } } } } */
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 59ec66a6f96..b10f69da133 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -4436,18 +4436,19 @@ change_vec_perm_layout (slp_tree node, 
> lane_permutation_t &perm,
>  
>     IN_LAYOUT_I has no meaning for other types of node.
>  
> -   Keeping the node as-is is always valid.  If the target doesn't appear to
> -   support the node as-is then layout 0 has a high and arbitrary cost instead
> -   of being invalid.  On the one hand, this ensures that every node has at
> -   least one valid layout, avoiding what would otherwise be an awkward
> -   special case.  On the other, it still encourages the pass to change
> -   an invalid pre-existing layout choice into a valid one.  */
> +   Keeping the node as-is is always valid.  If the target doesn't appear
> +   to support the node as-is, but might realistically support other layouts,
> +   then layout 0 instead has the cost of a worst-case permutation.  On the
> +   one hand, this ensures that every node has at least one valid layout,
> +   avoiding what would otherwise be an awkward special case.  On the other,
> +   it still encourages the pass to change an invalid pre-existing layout
> +   choice into a valid one.  */
>  
>  int
>  vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
>                                           unsigned int out_layout_i)
>  {
> -  const int fallback_cost = 100;
> +  const int fallback_cost = 1;
>  
>    if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
>      {
> @@ -4457,8 +4458,9 @@ vect_optimize_slp_pass::internal_node_cost (slp_tree 
> node, int in_layout_i,
>        /* Check that the child nodes support the chosen layout.  Checking
>        the first child is enough, since any second child would have the
>        same shape.  */
> +      auto first_child = SLP_TREE_CHILDREN (node)[0];
>        if (in_layout_i > 0
> -       && !is_compatible_layout (SLP_TREE_CHILDREN (node)[0], in_layout_i))
> +       && !is_compatible_layout (first_child, in_layout_i))
>       return -1;
>  
>        change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
> @@ -4469,7 +4471,15 @@ vect_optimize_slp_pass::internal_node_cost (slp_tree 
> node, int in_layout_i,
>        if (count < 0)
>       {
>         if (in_layout_i == 0 && out_layout_i == 0)
> -         return fallback_cost;
> +         {
> +           /* Use the fallback cost if the node could in principle support
> +              some nonzero layout for both the inputs and the outputs.
> +              Otherwise assume that the node will be rejected later
> +              and rebuilt from scalars.  */
> +           if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
> +             return fallback_cost;
> +           return 0;
> +         }
>         return -1;
>       }
>  
> @@ -4498,8 +4508,18 @@ vect_optimize_slp_pass::internal_node_cost (slp_tree 
> node, int in_layout_i,
>        if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
>                                          nullptr, vf, true, false, &n_perms))
>       {
> +       auto rep = SLP_TREE_REPRESENTATIVE (node);
>         if (out_layout_i == 0)
> -         return fallback_cost;
> +         {
> +           /* Use the fallback cost if the load is an N-to-N permutation.
> +              Otherwise assume that the node will be rejected later
> +              and rebuilt from scalars.  */
> +           if (STMT_VINFO_GROUPED_ACCESS (rep)
> +               && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
> +                   == SLP_TREE_LANES (node)))
> +             return fallback_cost;
> +           return 0;
> +         }
>         return -1;
>       }
>  
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

Reply via email to