[PATCH 3/4] Avoid splitting store dataref groups during SLP discovery

Richard Biener Tue, 21 May 2024 05:45:53 -0700

The following avoids splitting store dataref groups during SLP
discovery but instead forces (eventually single-lane) consecutive
lane SLP discovery for all lanes of the group, creating VEC_PERM
SLP nodes merging them so the store will always cover the whole group.


With this for example

int x[1024], y[1024], z[1024], w[1024];
void foo (void)
{
  for (int i = 0; i < 256; i++)
    {
      x[4*i+0] = y[2*i+0];
      x[4*i+1] = y[2*i+1];
      x[4*i+2] = z[i];
      x[4*i+3] = w[i];
    }
}

which was previously using hybrid SLP can now be fully SLPed and
SSE code generated looks better (but of course you never know,
I didn't actually benchmark).  We of course need a VF of four here.

.L2:
        movdqa  z(%rax), %xmm0
        movdqa  w(%rax), %xmm4
        movdqa  y(%rax,%rax), %xmm2
        movdqa  y+16(%rax,%rax), %xmm1
        movdqa  %xmm0, %xmm3
        punpckhdq       %xmm4, %xmm0
        punpckldq       %xmm4, %xmm3
        movdqa  %xmm2, %xmm4
        shufps  $238, %xmm3, %xmm2
        movaps  %xmm2, x+16(,%rax,4)
        movdqa  %xmm1, %xmm2
        shufps  $68, %xmm3, %xmm4
        shufps  $68, %xmm0, %xmm2
        movaps  %xmm4, x(,%rax,4)
        shufps  $238, %xmm0, %xmm1
        movaps  %xmm2, x+32(,%rax,4)
        movaps  %xmm1, x+48(,%rax,4)
        addq    $16, %rax
        cmpq    $1024, %rax
        jne     .L2

The extra permute nodes merging distinct branches of the SLP
tree might be unexpected for some code, esp. since
SLP_TREE_REPRESENTATIVE cannot be meaningfully set and we
cannot populate SLP_TREE_SCALAR_STMTS or SLP_TREE_SCALAR_OPS
consistently as we can have a mix of both.

The patch keeps the sub-trees form consecutive lanes but that's
in principle not necessary if we for example have an even/odd
split which now would result in N single-lane sub-trees.  That's
left for future improvements.

The interesting part is how VLA vector ISAs handle merging of
two vectors that's not trivial even/odd merging.  The strathegy
of how to build the permute tree might need adjustments for that
(in the end splitting each branch to single lanes and then doing
even/odd merging would be the brute-force fallback).  Not sure
how much we can or should rely on the SLP optimize pass to handle
this.

        * tree-vect-slp.cc (vect_build_slp_instance): Do not split
        store dataref groups on loop SLP discovery failure but create
        a single SLP instance for the stores but branch to SLP sub-trees
        and merge with a series of VEC_PERM nodes.
---
 gcc/tree-vect-slp.cc | 240 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 214 insertions(+), 26 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 43f2c153bf0..873748b0a72 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3468,12 +3468,7 @@ vect_build_slp_instance (vec_info *vinfo,
          return true;
        }
     }
-  else
-    {
-      /* Failed to SLP.  */
-      /* Free the allocated memory.  */
-      scalar_stmts.release ();
-    }
+  /* Failed to SLP.  */
 
   stmt_vec_info stmt_info = stmt_info_;
   /* Try to break the group up into pieces.  */
@@ -3491,6 +3486,9 @@ vect_build_slp_instance (vec_info *vinfo,
       if (is_a <bb_vec_info> (vinfo)
          && (i > 1 && i < group_size))
        {
+         /* Free the allocated memory.  */
+         scalar_stmts.release ();
+
          tree scalar_type
            = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
          tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
@@ -3535,38 +3533,228 @@ vect_build_slp_instance (vec_info *vinfo,
            }
        }
 
-      /* For loop vectorization split into arbitrary pieces of size > 1.  */
-      if (is_a <loop_vec_info> (vinfo)
-         && (i > 1 && i < group_size)
-         && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
+      /* For loop vectorization split the RHS into arbitrary pieces of
+        size >= 1.  */
+      else if (is_a <loop_vec_info> (vinfo)
+              && (i > 0 && i < group_size)
+              && !vect_slp_prefer_store_lanes_p (vinfo,
+                                                 stmt_info, group_size, i))
        {
-         unsigned group1_size = i;
-
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Splitting SLP group at stmt %u\n", i);
 
-         stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
-                                                          group1_size);
-         /* Loop vectorization cannot handle gaps in stores, make sure
-            the split group appears as strided.  */
-         STMT_VINFO_STRIDED_P (rest) = 1;
-         DR_GROUP_GAP (rest) = 0;
-         STMT_VINFO_STRIDED_P (stmt_info) = 1;
-         DR_GROUP_GAP (stmt_info) = 0;
+         /* Analyze the stored values and pinch them together with
+            a permute node so we can preserve the whole store group.  */
+         auto_vec<slp_tree> rhs_nodes;
+
+         /* Calculate the unrolling factor based on the smallest type.  */
+         poly_uint64 unrolling_factor = 1;
+
+         unsigned int start = 0, end = i;
+         while (start < group_size)
+           {
+             gcc_assert (end - start >= 1);
+             vec<stmt_vec_info> substmts;
+             substmts.create (end - start);
+             for (unsigned j = start; j < end; ++j)
+               substmts.quick_push (scalar_stmts[j]);
+             max_nunits = 1;
+             node = vect_build_slp_tree (vinfo, substmts, end - start,
+                                         &max_nunits,
+                                         matches, limit, &tree_size, bst_map);
+             if (node)
+               {
+                 /* ???  Possibly not safe, but not sure how to check
+                    and fail SLP build?  */
+                 unrolling_factor
+                   = force_common_multiple (unrolling_factor,
+                                            calculate_unrolling_factor
+                                              (max_nunits, end - start));
+                 rhs_nodes.safe_push (node);
+                 start = end;
+                 end = group_size;
+               }
+             else
+               {
+                 substmts.release ();
+                 if (end - start == 1)
+                   {
+                     /* Single-lane discovery failed.  Free ressources.  */
+                     for (auto node : rhs_nodes)
+                       vect_free_slp_tree (node);
+                     scalar_stmts.release ();
+                     if (dump_enabled_p ())
+                       dump_printf_loc (MSG_NOTE, vect_location,
+                                        "SLP discovery failed\n");
+                     return false;
+                   }
+
+                 /* ???  It really happens that we soft-fail SLP
+                    build at a mismatch but the matching part hard-fails
+                    later.  As we know we arrived here with a group
+                    larger than one try a group of size one!  */
+                 if (!matches[0])
+                   end = start + 1;
+                 else
+                   for (unsigned j = start; j < end; j++)
+                     if (!matches[j - start])
+                       {
+                         end = j;
+                         break;
+                       }
+               }
+           }
+
+         /* Now we assume we can build the root SLP node from all
+            stores.  */
+         node = vect_create_new_slp_node (scalar_stmts, 1);
+         SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+         /* And a permute merging all RHS SLP trees.  */
+         slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
+                                                   VEC_PERM_EXPR);
+         SLP_TREE_CHILDREN (node).quick_push (perm);
+         SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
+         SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
+         SLP_TREE_LANES (perm) = group_size;
+         /* ???  We should set this NULL but that's not expected.  */
+         SLP_TREE_REPRESENTATIVE (perm)
+           = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[0]);
+         for (unsigned j = 0; j < rhs_nodes.length (); ++j)
+           {
+             SLP_TREE_CHILDREN (perm)
+               .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
+             for (unsigned k = 0;
+                  k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
+               {
+                 /* ???  We should populate SLP_TREE_SCALAR_STMTS
+                    or SLP_TREE_SCALAR_OPS but then we might have
+                    a mix of both in our children.  */
+                 SLP_TREE_LANE_PERMUTATION (perm)
+                   .quick_push (std::make_pair (j, k));
+               }
+           }
+
+         /* Now we have a single permute node but we cannot code-generate
+            the case with more than two inputs.
+            Perform pairwise reduction, reducing the two inputs
+            with the least number of lanes to one and then repeat until
+            we end up with two inputs.  That scheme makes sure we end
+            up with permutes satisfying the restriction of requiring
+            at most two vector inputs to produce a single vector output.  */
+         while (SLP_TREE_CHILDREN (perm).length () > 2)
+           {
+             /* Pick the two nodes with the least number of lanes,
+                prefer the earliest candidate and maintain ai < bi.  */
+             int ai = -1;
+             int bi = -1;
+             for (unsigned ci = 0;
+                  ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
+               {
+                 if (ai == -1)
+                   ai = ci;
+                 else if (bi == -1)
+                   bi = ci;
+                 else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
+                           < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
+                          || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
+                              < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
+                   {
+                     if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
+                         <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
+                       bi = ci;
+                     else
+                       {
+                         ai = bi;
+                         bi = ci;
+                       }
+                   }
+               }
+
+             /* Produce a merge of nodes ai and bi.  */
+             slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
+             slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
+             unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
+             slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
+             SLP_TREE_LANES (permab) = n;
+             SLP_TREE_LANE_PERMUTATION (permab).create (n);
+             SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm); /* ??? */
+             /* ???  We should set this NULL but that's not expected.  */
+             SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
+             SLP_TREE_CHILDREN (permab).quick_push (a);
+             for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
+               SLP_TREE_LANE_PERMUTATION (permab)
+                 .quick_push (std::make_pair (0, k));
+             SLP_TREE_CHILDREN (permab).quick_push (b);
+             for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
+               SLP_TREE_LANE_PERMUTATION (permab)
+                 .quick_push (std::make_pair (1, k));
+
+             /* Put the merged node into 'perm', in place of a  */
+             SLP_TREE_CHILDREN (perm)[ai] = permab;
+             /* Adjust the references to b in the permutation
+                of perm and to the later children which we'll
+                remove.  */
+             for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
+               {
+                 std::pair<unsigned, unsigned> &p
+                     = SLP_TREE_LANE_PERMUTATION (perm)[k];
+                 if (p.first == (unsigned) bi)
+                   {
+                     p.first = ai;
+                     p.second += SLP_TREE_LANES (a);
+                   }
+                 else if (p.first > (unsigned) bi)
+                   p.first--;
+               }
+             SLP_TREE_CHILDREN (perm).ordered_remove (bi);
+           }
+
+         /* Create a new SLP instance.  */
+         slp_instance new_instance = XNEW (class _slp_instance);
+         SLP_INSTANCE_TREE (new_instance) = node;
+         SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+         SLP_INSTANCE_LOADS (new_instance) = vNULL;
+         SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+         SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+         SLP_INSTANCE_KIND (new_instance) = kind;
+         new_instance->reduc_phis = NULL;
+         new_instance->cost_vec = vNULL;
+         new_instance->subgraph_entries = vNULL;
+
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "SLP size %u vs. limit %u.\n",
+                            tree_size, max_tree_size);
 
-         bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
-                                               kind, max_tree_size, limit);
-         if (i + 1 < group_size)
-           res |= vect_analyze_slp_instance (vinfo, bst_map,
-                                             rest, kind, max_tree_size, limit);
+         vinfo->slp_instances.safe_push (new_instance);
+
+         /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+            the number of scalar stmts in the root in a few places.
+            Verify that assumption holds.  */
+         gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+                       .length () == group_size);
 
-         return res;
+         if (dump_enabled_p ())
+           {
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "Final SLP tree for instance %p:\n",
+                              (void *) new_instance);
+             vect_print_slp_graph (MSG_NOTE, vect_location,
+                                   SLP_INSTANCE_TREE (new_instance));
+           }
+         return true;
        }
+      else
+       /* Free the allocated memory.  */
+       scalar_stmts.release ();
 
       /* Even though the first vector did not all match, we might be able to 
SLP
         (some) of the remainder.  FORNOW ignore this possibility.  */
     }
+  else
+    /* Free the allocated memory.  */
+    scalar_stmts.release ();
 
   /* Failed to SLP.  */
   if (dump_enabled_p ())
-- 
2.35.3

[PATCH 3/4] Avoid splitting store dataref groups during SLP discovery

Reply via email to