This series should fix the target-independent parts of PR116583. (We also need some target-specific patches, to be posted separately.)
The explanations are in the individual commit messages, but I've attached a -b diff below in case my attempt to split the patch up has just obfuscated things instead. Tested on aarch64-linux-gnu (with and without SVE enabled by default) and x86_64-linux-gnu. Also tested by running the vect testsuite with vect-force-slp=1. Richard Sandiford (4): vect: Variable lane indices in vectorizable_slp_permutation_1 vect: Restructure repeating_p case for SLP permutations vect: Support more VLA SLP permutations vect: Add more dump messages for VLA SLP permutation gcc/testsuite/gcc.dg/vect/slp-13-big-array.c | 2 +- gcc/testsuite/gcc.dg/vect/slp-13.c | 2 +- gcc/tree-vect-slp.cc | 190 +++++++++++++------ 3 files changed, 134 insertions(+), 60 deletions(-) -- 2.25.1 diff --git a/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c b/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c index ca70856c1dd..e45f8aab133 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c +++ b/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c @@ -137,4 +137,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */ /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { ! vect_pack_trunc } } } } */ /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc xfail vect_variable_length } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-13.c b/gcc/testsuite/gcc.dg/vect/slp-13.c index b7f947e6dbe..d6346aef978 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-13.c +++ b/gcc/testsuite/gcc.dg/vect/slp-13.c @@ -131,4 +131,4 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */ /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { ! vect_pack_trunc } } } } */ /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc xfail vect_variable_length } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc } } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 482b9d50496..56fb55cb628 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -10194,6 +10194,13 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, unsigned i; poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node)); + /* True if we're permuting a single input of 2N vectors down + to N vectors. This case doesn't generalize beyond 2 since + VEC_PERM_EXPR only takes 2 inputs. */ + bool pack_p = false; + /* If we're permuting inputs of N vectors each into X*N outputs, + this is the value of X, otherwise it is 1. */ + unsigned int unpack_factor = 1; tree op_vectype = NULL_TREE; FOR_EACH_VEC_ELT (children, i, child) if (SLP_TREE_VECTYPE (child)) @@ -10215,7 +10222,20 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, "Unsupported vector types in lane permutation\n"); return -1; } - if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node)) + auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype); + unsigned int this_unpack_factor; + /* Check whether the input has twice as many lanes per vector. */ + if (children.length () == 1 + && known_eq (SLP_TREE_LANES (child) * nunits, + SLP_TREE_LANES (node) * op_nunits * 2)) + pack_p = true; + /* Check whether the output has N times as many lanes per vector. */ + else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits, + SLP_TREE_LANES (child) * nunits, + &this_unpack_factor) + && (i == 0 || unpack_factor == this_unpack_factor)) + unpack_factor = this_unpack_factor; + else repeating_p = false; } @@ -10243,29 +10263,47 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, return 1; } - /* REPEATING_P is true if every output vector is guaranteed to use the - same permute vector. We can handle that case for both variable-length - and constant-length vectors, but we only handle other cases for - constant-length vectors. + /* Set REPEATING_P to true if the permutations are cylical wrt UNPACK_FACTOR + and if we can generate the vectors in a vector-length agnostic way. + This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at + compile time. + + The significance of UNPACK_STEP is that, when PACK_P is false, + output vector I operates on a window of UNPACK_STEP elements from each + input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR). For example, + when UNPACK_FACTOR is 2, the first output vector operates on lanes + [0, NUNITS / 2 - 1] of each input vector and the second output vector + operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector. + + When REPEATING_P is true, NOUTPUTS holds the total number of outputs + that we actually need to generate. */ + uint64_t noutputs = 0; + poly_uint64 unpack_step = 0; + loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo); + if (!linfo + || !multiple_p (nunits, unpack_factor, &unpack_step) + || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo) + * SLP_TREE_LANES (node), nunits, &noutputs)) + repeating_p = false; + + /* We can handle the conditions described for REPEATING_P above for + both variable- and constant-length vectors. The fallback requires + us to generate every element of every permute vector explicitly, + which is only possible for constant-length permute vectors. Set: - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute - mask vector that we want to build. + mask vectors that we want to build. - NCOPIES to the number of copies of PERM that we need in order - to build the necessary permute mask vectors. - - - NOUTPUTS_PER_MASK to the number of output vectors we want to create - for each permute mask vector. This is only relevant when GSI is - nonnull. */ + to build the necessary permute mask vectors. */ uint64_t npatterns; unsigned nelts_per_pattern; uint64_t ncopies; - unsigned noutputs_per_mask; if (repeating_p) { - /* We need a single permute mask vector that has the form: + /* We need permute mask vectors that have the form: { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... } @@ -10274,7 +10312,6 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, that we use for permutes requires 3n elements. */ npatterns = SLP_TREE_LANES (node); nelts_per_pattern = ncopies = 3; - noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node); } else { @@ -10282,24 +10319,40 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, instead of relying on the pattern described above. */ if (!nunits.is_constant (&npatterns) || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ()) + { + if (dump_p) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "unsupported permutation %p on variable-length" + " vectors\n", (void *) node); return -1; + } nelts_per_pattern = ncopies = 1; - if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo)) - if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies)) + if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies)) + { + if (dump_p) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "unsupported permutation %p for variable VF\n", + (void *) node); return -1; - noutputs_per_mask = 1; } - unsigned olanes = ncopies * SLP_TREE_LANES (node); + pack_p = false; + unpack_factor = 1; + } + unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node); gcc_assert (repeating_p || multiple_p (olanes, nunits)); /* Compute the { { SLP operand, vector index}, lane } permutation sequence from the { SLP operand, scalar lane } permutation as recorded in the SLP node as intermediate step. This part should already work with SLP children with arbitrary number of lanes. */ - auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm; - auto_vec<unsigned> active_lane; + auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm; + auto_vec<poly_uint64> active_lane; vperm.create (olanes); active_lane.safe_grow_cleared (children.length (), true); + for (unsigned int ui = 0; ui < unpack_factor; ++ui) + { + for (unsigned j = 0; j < children.length (); ++j) + active_lane[j] = ui * unpack_step; for (unsigned i = 0; i < ncopies; ++i) { for (unsigned pi = 0; pi < perm.length (); ++pi) @@ -10307,13 +10360,16 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, std::pair<unsigned, unsigned> p = perm[pi]; tree vtype = SLP_TREE_VECTYPE (children[p.first]); if (repeating_p) - vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]}); + vperm.quick_push ({{p.first, 0}, + p.second + active_lane[p.first]}); else { /* We checked above that the vectors are constant-length. */ - unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant (); - unsigned vi = (active_lane[p.first] + p.second) / vnunits; - unsigned vl = (active_lane[p.first] + p.second) % vnunits; + unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype) + .to_constant (); + unsigned lane = active_lane[p.first].to_constant (); + unsigned vi = (lane + p.second) / vnunits; + unsigned vl = (lane + p.second) % vnunits; vperm.quick_push ({{p.first, vi}, vl}); } } @@ -10321,6 +10377,7 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, for (unsigned j = 0; j < children.length (); ++j) active_lane[j] += SLP_TREE_LANES (children[j]); } + } if (dump_p) { @@ -10339,9 +10396,10 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, ? multiple_p (i, npatterns) : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))) dump_printf (MSG_NOTE, ","); - dump_printf (MSG_NOTE, " vops%u[%u][%u]", - vperm[i].first.first, vperm[i].first.second, - vperm[i].second); + dump_printf (MSG_NOTE, " vops%u[%u][", + vperm[i].first.first, vperm[i].first.second); + dump_dec (MSG_NOTE, vperm[i].second); + dump_printf (MSG_NOTE, "]"); } dump_printf (MSG_NOTE, "\n"); } @@ -10362,16 +10420,37 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, mask.quick_grow (count); vec_perm_indices indices; unsigned nperms = 0; - for (unsigned i = 0; i < vperm.length (); ++i) - { - mask_element = vperm[i].second; - if (first_vec.first == -1U - || first_vec == vperm[i].first) - first_vec = vperm[i].first; + /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute + vectors to check during analysis, but we need to generate NOUTPUTS + vectors during transformation. */ + unsigned total_nelts = olanes; + if (repeating_p && gsi) + total_nelts = (total_nelts / unpack_factor) * noutputs; + for (unsigned i = 0; i < total_nelts; ++i) + { + /* VI is the input vector index when generating code for REPEATING_P. */ + unsigned vi = i / olanes * (pack_p ? 2 : 1); + unsigned ei = i % olanes; + mask_element = vperm[ei].second; + if (pack_p) + { + /* In this case, we have N outputs and the single child provides 2N + inputs. Output X permutes inputs 2X and 2X+1. + + The mask indices are taken directly from the SLP permutation node. + Index X selects from the first vector if (X / NUNITS) % 2 == 0; + X selects from the second vector otherwise. These conditions + are only known at compile time for constant-length vectors. */ + first_vec = std::make_pair (0, 0); + second_vec = std::make_pair (0, 1); + } + else if (first_vec.first == -1U + || first_vec == vperm[ei].first) + first_vec = vperm[ei].first; else if (second_vec.first == -1U - || second_vec == vperm[i].first) + || second_vec == vperm[ei].first) { - second_vec = vperm[i].first; + second_vec = vperm[ei].first; mask_element += nunits; } else @@ -10435,18 +10514,13 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi, if (!identity_p) mask_vec = vect_gen_perm_mask_checked (vectype, indices); - for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi) - { tree first_def - = vect_get_slp_vect_def (first_node, - first_vec.second + vi); + = vect_get_slp_vect_def (first_node, first_vec.second + vi); tree second_def - = vect_get_slp_vect_def (second_node, - second_vec.second + vi); + = vect_get_slp_vect_def (second_node, second_vec.second + vi); vect_add_slp_permutation (vinfo, gsi, node, first_def, second_def, mask_vec, mask[0]); } - } index = 0; first_vec = std::make_pair (-1U, -1U);