The following tries to avoid eating into the SLP discovery limit
when cheaper checks can be performed first.  Together with the
previous patch this allows two-lane SLP discovery to be used for
mult_su3_an in 433.milc.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

        PR tree-optimization/117874
        * tree-vect-slp.cc (vect_build_slp_tree_2): Perform early
        reassoc checks before eating into discovery limit.
---
 gcc/tree-vect-slp.cc | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 1799d5a619b..425135a9ee0 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -2292,6 +2292,9 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                }
            }
          /* 2. try to build children nodes, associating as necessary.  */
+         /* 2a. prepare and perform early checks to avoid eating into
+            discovery limit unnecessarily.  */
+         vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
          for (unsigned n = 0; n < chain_len; ++n)
            {
              vect_def_type dt = chains[0][n].dt;
@@ -2319,6 +2322,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                    matches[0] = false;
                  goto out;
                }
+             dts[n] = dt;
              if (dt == vect_constant_def
                  || dt == vect_external_def)
                {
@@ -2333,16 +2337,6 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                      matches[0] = false;
                      goto out;
                    }
-                 vec<tree> ops;
-                 ops.create (group_size);
-                 for (lane = 0; lane < group_size; ++lane)
-                   if (stmts[lane])
-                     ops.quick_push (chains[lane][n].op);
-                   else
-                     ops.quick_push (NULL_TREE);
-                 slp_tree child = vect_create_new_slp_node (ops);
-                 SLP_TREE_DEF_TYPE (child) = dt;
-                 children.safe_push (child);
                }
              else if (dt != vect_internal_def)
                {
@@ -2354,6 +2348,26 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                  hard_fail = false;
                  goto out;
                }
+           }
+         /* 2b. do the actual build.  */
+         for (unsigned n = 0; n < chain_len; ++n)
+           {
+             vect_def_type dt = dts[n];
+             unsigned lane;
+             if (dt == vect_constant_def
+                 || dt == vect_external_def)
+               {
+                 vec<tree> ops;
+                 ops.create (group_size);
+                 for (lane = 0; lane < group_size; ++lane)
+                   if (stmts[lane])
+                     ops.quick_push (chains[lane][n].op);
+                   else
+                     ops.quick_push (NULL_TREE);
+                 slp_tree child = vect_create_new_slp_node (ops);
+                 SLP_TREE_DEF_TYPE (child) = dt;
+                 children.safe_push (child);
+               }
              else
                {
                  vec<stmt_vec_info> op_stmts;
@@ -2396,6 +2410,11 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                                term = true;
                                break;
                              }
+                           if (dump_enabled_p ())
+                             dump_printf_loc (MSG_NOTE, vect_location,
+                                              "swapping operand %d and %d "
+                                              "of lane %d\n",
+                                              n, n + perms[lane] + 1, lane);
                            std::swap (chains[lane][n],
                                       chains[lane][n + perms[lane] + 1]);
                            perms[lane]++;
-- 
2.43.0

Reply via email to