The following does a simple legitimising attempt on the SLP graph
permutations before trying to optimize them.  For the case we have
a single non-zero layout we can force that to all partitions if
it is compatible.  This way we end up with the most canonical
(and possibly no-op) load permutations and permutes.

I have refrained from trying to use internal_node_cost to actually
check if the result is legitimate (it would need at least the
change to anticipate redundant load permute eliding).  This relies
on start_choosing_layouts chosing layout zero for everything we
cannot handle (like non-bijective permutes).

What's still missing is to try to process disconnected parts of the
SLP graph separately.  We should possibly try to handle those
separately through all of the SLP optimize process for simplicity.

This also includes a fix for a dumping ICE when we permute the
root node of a reduction chain that was associated.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

For

int
frd (int *p, int *lastone)
{
  int sum = 0;
  for (; p <= lastone; p += 16)
    sum += p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7]
        + p[8] + p[9] + p[10] + p[11] + p[12] + p[13] + p[14] + p[15];
  return sum;
}

we now can use variable-length vectors on AARCH64 as well, but I think
the main loop body shows that in the above cases we want to re-roll,
aka support a fractional VF:

.L3:
        ld1w    z25.s, p7/z, [x0, #1, mul vl]
        ld1w    z24.s, p5/z, [x0, #2, mul vl]
        ld1w    z23.s, p4/z, [x0, #3, mul vl]
        add     z28.s, p7/m, z28.s, z25.s
        add     z29.s, p5/m, z29.s, z24.s
        whilelo p7.s, x2, x3
        whilelo p5.s, x2, x4
        add     z30.s, p4/m, z30.s, z23.s
        whilelo p4.s, x2, x5
        ld1w    z26.s, p6/z, [x0]
        incb    x0, all, mul #4
        add     z27.s, p6/m, z27.s, z26.s
        whilelo p6.s, x2, x1
        incb    x2
        b.any   .L3

while that's using VLA vectors we seem to limit the active elements
to the minimal vector size here?  For RVV the degenerated way of
handling this is load-lanes (up to 10).  The fixed-length aarch64
code chosen with -march=armv8.3-a+sve2 was before

.L3:
        ldp     q2, q1, [x0]
        add     x2, x2, 1
        ldp     q0, q26, [x0, 32]
        add     x0, x0, 64
        add     v27.4s, v2.4s, v27.4s
        add     v28.4s, v1.4s, v28.4s
        add     v29.4s, v0.4s, v29.4s
        add     v30.4s, v26.4s, v30.4s
        cmp     x1, x2
        bhi     .L3

where I don't quite see the vector loads.  So this patch might
expose a cost model issue.  With neoverse-v2 tuning I get a
fixed-size main loop and a VLA vector epilogue though.

I'll wait for the RISC-V CI and then plan to push this.

Richard.

        PR tree-optimization/120687
        * tree-vect-slp.cc (vect_optimize_slp_pass::is_compatible_layout):
        New overload for checking a whole partition.
        (vect_optimize_slp_pass::legitimize): New function trying
        a single layout for all partitions for now.
        (vect_optimize_slp_pass::run): Try legitimizing to a single
        layout before propagating.
        (vect_slp_analyze_operations): For dumping deal with
        SLP_TREE_SCALAR_STMTS being empty or element zero being NULL.
---
 gcc/tree-vect-slp.cc | 93 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 90 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index e02b3379bb4..c1983adeef5 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -6297,6 +6297,7 @@ private:
 
   /* Layout selection.  */
   bool is_compatible_layout (slp_tree, unsigned int);
+  bool is_compatible_layout (const slpg_partition_info &, unsigned int);
   int change_layout_cost (slp_tree, unsigned int, unsigned int);
   slpg_partition_layout_costs &partition_layout_costs (unsigned int,
                                                       unsigned int);
@@ -6304,6 +6305,7 @@ private:
                               int, unsigned int);
   int internal_node_cost (slp_tree, int, unsigned int);
   void start_choosing_layouts ();
+  bool legitimize ();
 
   /* Cost propagation.  */
   slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
@@ -6710,6 +6712,29 @@ vect_optimize_slp_pass::is_compatible_layout (slp_tree 
node,
   return true;
 }
 
+/* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
+   that NODE would operate on for each NODE in PARTITION.
+   This test is independent of NODE's actual operations.  */
+
+bool
+vect_optimize_slp_pass::is_compatible_layout (const slpg_partition_info
+                                               &partition,
+                                             unsigned int layout_i)
+{
+  for (unsigned int order_i = partition.node_begin;
+       order_i < partition.node_end; ++order_i)
+    {
+      unsigned int node_i = m_partitioned_nodes[order_i];
+      auto &vertex = m_vertices[node_i];
+
+      /* The layout is incompatible if it is individually incompatible
+        with any node in the partition.  */
+      if (!is_compatible_layout (vertex.node, layout_i))
+       return false;
+    }
+  return true;
+}
+
 /* Return the cost (in arbtirary units) of going from layout FROM_LAYOUT_I
    to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
    layouts is incompatible with NODE or if the change is not possible for
@@ -8029,6 +8054,62 @@ vect_optimize_slp_pass::decide_masked_load_lanes ()
     }
 }
 
+/* Perform legitimizing attempts.  This is intended to improve the
+   situation when layout 0 is not valid which is a situation the cost
+   based propagation does not handle well.
+   Return true if further layout optimization is possible, false if
+   the layout configuration should be considered final.  */
+
+bool
+vect_optimize_slp_pass::legitimize ()
+{
+  /* Perform a very simple legitimizing attempt by attempting to choose
+     a single layout for all partitions that will make all permutations
+     a noop.  That should also be the optimal layout choice in case
+     layout zero is legitimate.
+     ???  Disconnected components of the SLP graph could have distinct
+     single layouts.  */
+  int single_layout_i = -1;
+  unsigned deferred_up_to = -1U;
+  for (unsigned partition_i = 0; partition_i < m_partitions.length ();
+       ++partition_i)
+    {
+      auto &partition = m_partitions[partition_i];
+      if (single_layout_i == -1)
+       {
+         single_layout_i = partition.layout;
+         deferred_up_to = partition_i;
+       }
+      else if (partition.layout == single_layout_i || partition.layout == -1)
+       ;
+      else
+       single_layout_i = 0;
+      if (single_layout_i == 0)
+       return true;
+
+      if (single_layout_i != -1
+         && !is_compatible_layout (partition, single_layout_i))
+       return true;
+    }
+
+  if (single_layout_i <= 0)
+    return true;
+
+  for (unsigned partition_i = 0; partition_i < deferred_up_to; ++partition_i)
+    if (!is_compatible_layout (m_partitions[partition_i],
+                              single_layout_i))
+      return true;
+
+  for (unsigned partition_i = 0; partition_i < m_partitions.length ();
+       ++partition_i)
+    {
+      auto &partition = m_partitions[partition_i];
+      partition.layout = single_layout_i;
+    }
+
+  return false;
+}
+
 /* Main entry point for the SLP graph optimization pass.  */
 
 void
@@ -8039,8 +8120,11 @@ vect_optimize_slp_pass::run ()
   start_choosing_layouts ();
   if (m_perms.length () > 1)
     {
-      forward_pass ();
-      backward_pass ();
+      if (legitimize ())
+       {
+         forward_pass ();
+         backward_pass ();
+       }
       if (dump_enabled_p ())
        dump ();
       materialize ();
@@ -9031,8 +9115,11 @@ vect_slp_analyze_operations (vec_info *vinfo)
          stmt_vec_info stmt_info;
          if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
            stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
-         else
+         else if (!SLP_TREE_SCALAR_STMTS (node).is_empty ()
+                  && SLP_TREE_SCALAR_STMTS (node)[0])
            stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
+         else
+           stmt_info = SLP_TREE_REPRESENTATIVE (node);
          if (is_a <loop_vec_info> (vinfo))
            {
              if (dump_enabled_p ())
-- 
2.51.0

Reply via email to