The following tries to avoid eating into the SLP discovery limit when we can do cheaper checks first. Together with the previous patch, this allows two-lane SLP discovery to be used for mult_su3_an in 433.milc.
Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed. PR tree-optimization/117874 * tree-vect-slp.cc (vect_build_slp_tree_2): Perform early reassoc checks before eating into discovery limit. --- gcc/tree-vect-slp.cc | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 1799d5a619b..425135a9ee0 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -2292,6 +2292,9 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, } } /* 2. try to build children nodes, associating as necessary. */ + /* 2a. prepare and perform early checks to avoid eating into + discovery limit unnecessarily. */ + vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len); for (unsigned n = 0; n < chain_len; ++n) { vect_def_type dt = chains[0][n].dt; @@ -2319,6 +2322,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, matches[0] = false; goto out; } + dts[n] = dt; if (dt == vect_constant_def || dt == vect_external_def) { @@ -2333,16 +2337,6 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, matches[0] = false; goto out; } - vec<tree> ops; - ops.create (group_size); - for (lane = 0; lane < group_size; ++lane) - if (stmts[lane]) - ops.quick_push (chains[lane][n].op); - else - ops.quick_push (NULL_TREE); - slp_tree child = vect_create_new_slp_node (ops); - SLP_TREE_DEF_TYPE (child) = dt; - children.safe_push (child); } else if (dt != vect_internal_def) { @@ -2354,6 +2348,26 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, hard_fail = false; goto out; } + } + /* 2b. do the actual build. 
*/ + for (unsigned n = 0; n < chain_len; ++n) + { + vect_def_type dt = dts[n]; + unsigned lane; + if (dt == vect_constant_def + || dt == vect_external_def) + { + vec<tree> ops; + ops.create (group_size); + for (lane = 0; lane < group_size; ++lane) + if (stmts[lane]) + ops.quick_push (chains[lane][n].op); + else + ops.quick_push (NULL_TREE); + slp_tree child = vect_create_new_slp_node (ops); + SLP_TREE_DEF_TYPE (child) = dt; + children.safe_push (child); + } else { vec<stmt_vec_info> op_stmts; @@ -2396,6 +2410,11 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, term = true; break; } + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "swapping operand %d and %d " + "of lane %d\n", + n, n + perms[lane] + 1, lane); std::swap (chains[lane][n], chains[lane][n + perms[lane] + 1]); perms[lane]++; -- 2.43.0