This series should fix the target-independent parts of PR116583.
(We also need some target-specific patches, to be posted separately.)

The explanations are in the individual commit messages, but I've
attached a -b diff below in case my attempt to split the patch up
has just obfuscated things instead.

Tested on aarch64-linux-gnu (with and without SVE enabled by default)
and x86_64-linux-gnu.  Also tested by running the vect testsuite
with vect-force-slp=1.

Richard Sandiford (4):
  vect: Variable lane indices in vectorizable_slp_permutation_1
  vect: Restructure repeating_p case for SLP permutations
  vect: Support more VLA SLP permutations
  vect: Add more dump messages for VLA SLP permutation

 gcc/testsuite/gcc.dg/vect/slp-13-big-array.c |   2 +-
 gcc/testsuite/gcc.dg/vect/slp-13.c           |   2 +-
 gcc/tree-vect-slp.cc                         | 190 +++++++++++++------
 3 files changed, 134 insertions(+), 60 deletions(-)

-- 
2.25.1


diff --git a/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c 
b/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
index ca70856c1dd..e45f8aab133 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
@@ -137,4 +137,4 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { ! vect_pack_trunc } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-13.c 
b/gcc/testsuite/gcc.dg/vect/slp-13.c
index b7f947e6dbe..d6346aef978 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-13.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-13.c
@@ -131,4 +131,4 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { ! vect_pack_trunc } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 482b9d50496..56fb55cb628 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -10194,6 +10194,13 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
   unsigned i;
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
+  /* True if we're permuting a single input of 2N vectors down
+     to N vectors.  This case doesn't generalize beyond 2 since
+     VEC_PERM_EXPR only takes 2 inputs.  */
+  bool pack_p = false;
+  /* If we're permuting inputs of N vectors each into X*N outputs,
+     this is the value of X, otherwise it is 1.  */
+  unsigned int unpack_factor = 1;
   tree op_vectype = NULL_TREE;
   FOR_EACH_VEC_ELT (children, i, child)
     if (SLP_TREE_VECTYPE (child))
@@ -10215,7 +10222,20 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
                             "Unsupported vector types in lane permutation\n");
          return -1;
        }
-      if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
+      auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
+      unsigned int this_unpack_factor;
+      /* Check whether the input has twice as many lanes per vector.  */
+      if (children.length () == 1
+         && known_eq (SLP_TREE_LANES (child) * nunits,
+                      SLP_TREE_LANES (node) * op_nunits * 2))
+       pack_p = true;
+      /* Check whether the output has N times as many lanes per vector.  */
+      else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
+                                   SLP_TREE_LANES (child) * nunits,
+                                   &this_unpack_factor)
+              && (i == 0 || unpack_factor == this_unpack_factor))
+       unpack_factor = this_unpack_factor;
+      else
        repeating_p = false;
     }
 
@@ -10243,29 +10263,47 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
       return 1;
     }
 
-  /* REPEATING_P is true if every output vector is guaranteed to use the
-     same permute vector.  We can handle that case for both variable-length
-     and constant-length vectors, but we only handle other cases for
-     constant-length vectors.
+  /* Set REPEATING_P to true if the permutations are cylical wrt UNPACK_FACTOR
+     and if we can generate the vectors in a vector-length agnostic way.
+     This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
+     compile time.
+
+     The significance of UNPACK_STEP is that, when PACK_P is false,
+     output vector I operates on a window of UNPACK_STEP elements from each
+     input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR).  For example,
+     when UNPACK_FACTOR is 2, the first output vector operates on lanes
+     [0, NUNITS / 2 - 1] of each input vector and the second output vector
+     operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
+
+     When REPEATING_P is true, NOUTPUTS holds the total number of outputs
+     that we actually need to generate.  */
+  uint64_t noutputs = 0;
+  poly_uint64 unpack_step = 0;
+  loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
+  if (!linfo
+      || !multiple_p (nunits, unpack_factor, &unpack_step)
+      || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
+                              * SLP_TREE_LANES (node), nunits, &noutputs))
+    repeating_p = false;
+
+  /* We can handle the conditions described for REPEATING_P above for
+     both variable- and constant-length vectors.  The fallback requires
+     us to generate every element of every permute vector explicitly,
+     which is only possible for constant-length permute vectors.
 
      Set:
 
      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
-       mask vector that we want to build.
+       mask vectors that we want to build.
 
      - NCOPIES to the number of copies of PERM that we need in order
-       to build the necessary permute mask vectors.
-
-     - NOUTPUTS_PER_MASK to the number of output vectors we want to create
-       for each permute mask vector.  This is only relevant when GSI is
-       nonnull.  */
+       to build the necessary permute mask vectors.  */
   uint64_t npatterns;
   unsigned nelts_per_pattern;
   uint64_t ncopies;
-  unsigned noutputs_per_mask;
   if (repeating_p)
     {
-      /* We need a single permute mask vector that has the form:
+      /* We need permute mask vectors that have the form:
 
           { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
 
@@ -10274,7 +10312,6 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
         that we use for permutes requires 3n elements.  */
       npatterns = SLP_TREE_LANES (node);
       nelts_per_pattern = ncopies = 3;
-      noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
     }
   else
     {
@@ -10282,24 +10319,40 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
         instead of relying on the pattern described above.  */
       if (!nunits.is_constant (&npatterns)
          || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
+       {
+         if (dump_p)
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "unsupported permutation %p on variable-length"
+                            " vectors\n", (void *) node);
          return -1;
+       }
       nelts_per_pattern = ncopies = 1;
-      if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
-       if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
+      if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
+       {
+         if (dump_p)
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "unsupported permutation %p for variable VF\n",
+                            (void *) node);
          return -1;
-      noutputs_per_mask = 1;
        }
-  unsigned olanes = ncopies * SLP_TREE_LANES (node);
+      pack_p = false;
+      unpack_factor = 1;
+    }
+  unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
   gcc_assert (repeating_p || multiple_p (olanes, nunits));
 
   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
      from the { SLP operand, scalar lane } permutation as recorded in the
      SLP node as intermediate step.  This part should already work
      with SLP children with arbitrary number of lanes.  */
-  auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
-  auto_vec<unsigned> active_lane;
+  auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
+  auto_vec<poly_uint64> active_lane;
   vperm.create (olanes);
   active_lane.safe_grow_cleared (children.length (), true);
+  for (unsigned int ui = 0; ui < unpack_factor; ++ui)
+    {
+      for (unsigned j = 0; j < children.length (); ++j)
+       active_lane[j] = ui * unpack_step;
       for (unsigned i = 0; i < ncopies; ++i)
        {
          for (unsigned pi = 0; pi < perm.length (); ++pi)
@@ -10307,13 +10360,16 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
              std::pair<unsigned, unsigned> p = perm[pi];
              tree vtype = SLP_TREE_VECTYPE (children[p.first]);
              if (repeating_p)
-           vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
+               vperm.quick_push ({{p.first, 0},
+                                  p.second + active_lane[p.first]});
              else
                {
                  /* We checked above that the vectors are constant-length.  */
-             unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
-             unsigned vi = (active_lane[p.first] + p.second) / vnunits;
-             unsigned vl = (active_lane[p.first] + p.second) % vnunits;
+                 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
+                   .to_constant ();
+                 unsigned lane = active_lane[p.first].to_constant ();
+                 unsigned vi = (lane + p.second) / vnunits;
+                 unsigned vl = (lane + p.second) % vnunits;
                  vperm.quick_push ({{p.first, vi}, vl});
                }
            }
@@ -10321,6 +10377,7 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
          for (unsigned j = 0; j < children.length (); ++j)
            active_lane[j] += SLP_TREE_LANES (children[j]);
        }
+    }
 
   if (dump_p)
     {
@@ -10339,9 +10396,10 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
                  ? multiple_p (i, npatterns)
                  : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
            dump_printf (MSG_NOTE, ",");
-         dump_printf (MSG_NOTE, " vops%u[%u][%u]",
-                      vperm[i].first.first, vperm[i].first.second,
-                      vperm[i].second);
+         dump_printf (MSG_NOTE, " vops%u[%u][",
+                      vperm[i].first.first, vperm[i].first.second);
+         dump_dec (MSG_NOTE, vperm[i].second);
+         dump_printf (MSG_NOTE, "]");
        }
       dump_printf (MSG_NOTE, "\n");
     }
@@ -10362,16 +10420,37 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
   mask.quick_grow (count);
   vec_perm_indices indices;
   unsigned nperms = 0;
-  for (unsigned i = 0; i < vperm.length (); ++i)
-    {
-      mask_element = vperm[i].second;
-      if (first_vec.first == -1U
-         || first_vec == vperm[i].first)
-       first_vec = vperm[i].first;
+  /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
+     vectors to check during analysis, but we need to generate NOUTPUTS
+     vectors during transformation.  */
+  unsigned total_nelts = olanes;
+  if (repeating_p && gsi)
+    total_nelts = (total_nelts / unpack_factor) * noutputs;
+  for (unsigned i = 0; i < total_nelts; ++i)
+    {
+      /* VI is the input vector index when generating code for REPEATING_P.  */
+      unsigned vi = i / olanes * (pack_p ? 2 : 1);
+      unsigned ei = i % olanes;
+      mask_element = vperm[ei].second;
+      if (pack_p)
+       {
+         /* In this case, we have N outputs and the single child provides 2N
+            inputs.  Output X permutes inputs 2X and 2X+1.
+
+            The mask indices are taken directly from the SLP permutation node.
+            Index X selects from the first vector if (X / NUNITS) % 2 == 0;
+            X selects from the second vector otherwise.  These conditions
+            are only known at compile time for constant-length vectors.  */
+         first_vec = std::make_pair (0, 0);
+         second_vec = std::make_pair (0, 1);
+       }
+      else if (first_vec.first == -1U
+              || first_vec == vperm[ei].first)
+       first_vec = vperm[ei].first;
       else if (second_vec.first == -1U
-              || second_vec == vperm[i].first)
+              || second_vec == vperm[ei].first)
        {
-         second_vec = vperm[i].first;
+         second_vec = vperm[ei].first;
          mask_element += nunits;
        }
       else
@@ -10435,18 +10514,13 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
              if (!identity_p)
                mask_vec = vect_gen_perm_mask_checked (vectype, indices);
 
-             for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
-               {
              tree first_def
-                   = vect_get_slp_vect_def (first_node,
-                                            first_vec.second + vi);
+               = vect_get_slp_vect_def (first_node, first_vec.second + vi);
              tree second_def
-                   = vect_get_slp_vect_def (second_node,
-                                            second_vec.second + vi);
+               = vect_get_slp_vect_def (second_node, second_vec.second + vi);
              vect_add_slp_permutation (vinfo, gsi, node, first_def,
                                        second_def, mask_vec, mask[0]);
            }
-           }
 
          index = 0;
          first_vec = std::make_pair (-1U, -1U);

Reply via email to