For the following test case (compiled with load/store lanes disabled locally):
void
f (uint32_t *restrict x, uint8_t *restrict y, int n)
{
  for (int i = 0; i < n; ++i)
    {
      x[i * 2] = x[i * 2] + y[i * 2];
      x[i * 2 + 1] = x[i * 2 + 1] + y[i * 2];
    }
}

we have a redundant no-op permute on the x[] load node:

   node 0x4472350 (max_nunits=8, refcnt=2)
   stmt 0 _5 = *_4;
   stmt 1 _13 = *_12;
   load permutation { 0 1 }

Then, when costing it, we pick a cost of 1, even though we need 4 copies
of the x[] load to match a single y[] load:

   ==> examining statement: _5 = *_4;
   Vectorizing an unaligned access.
   vect_model_load_cost: unaligned supported by hardware.
   vect_model_load_cost: inside_cost = 1, prologue_cost = 0 .

The problem is that the code only considers the permutation for the
first scalar iteration, rather than for all VF iterations.

This patch tries to fix that by using similar logic to
vect_transform_slp_perm_load.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Richard

[-b version also included]

gcc/
	* tree-vect-stmts.c (vect_model_load_cost): Use similar logic
	to vect_transform_slp_perm_load when counting the number of
	loads in a permuted SLP operation.
---
 gcc/tree-vect-stmts.c | 67 +++++++++++++++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 3575f25241f..6eacd641e6b 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1099,38 +1099,53 @@ vect_model_load_cost (vec_info *vinfo,
       stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
       /* Record the cost for the permutation.  */
       unsigned n_perms;
-      unsigned assumed_nunits
-	= vect_nunits_for_cost (STMT_VINFO_VECTYPE (first_stmt_info));
       vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
 				    vf, true, &n_perms);
       inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
 				       first_stmt_info, 0, vect_body);
-      /* And adjust the number of loads performed.  This handles
-	 redundancies as well as loads that are later dead.  */
-      auto_sbitmap perm (DR_GROUP_SIZE (first_stmt_info));
-      bitmap_clear (perm);
-      for (unsigned i = 0;
-	   i < SLP_TREE_LOAD_PERMUTATION (slp_node).length (); ++i)
-	bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (slp_node)[i]);
-      ncopies = 0;
-      bool load_seen = false;
-      for (unsigned i = 0; i < DR_GROUP_SIZE (first_stmt_info); ++i)
-	{
-	  if (i % assumed_nunits == 0)
+
+      /* And if this is not a simple "load N vectors and then permute each
+	 vector internally" operation, adjust the number of loads performed.
+	 This handles redundancies as well as loads that are later dead.  */
+      unsigned int nscalars = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+      unsigned int dr_group_size = DR_GROUP_SIZE (first_stmt_info);
+      tree vectype = STMT_VINFO_VECTYPE (first_stmt_info);
+      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+      if (nscalars != dr_group_size
+	  || !multiple_p (nunits, nscalars))
+	{
+	  /* The constant VF and nunits are enforced by
+	     vect_transform_slp_perm_load for this case.  */
+	  unsigned int const_nunits = nunits.to_constant ();
+	  unsigned int in_nlanes = dr_group_size * vf.to_constant ();
+	  unsigned int out_nlanes = nscalars * vf.to_constant ();
+	  auto_sbitmap perm (in_nlanes);
+	  bitmap_clear (perm);
+	  for (unsigned i = 0; i < out_nlanes; ++i)
+	    {
+	      unsigned int iter_num = i / nscalars;
+	      unsigned int stmt_num = i % nscalars;
+	      unsigned int in_lane
+		= (iter_num * dr_group_size
+		   + SLP_TREE_LOAD_PERMUTATION (slp_node)[stmt_num]);
+	      bitmap_set_bit (perm, in_lane);
+	    }
+	  ncopies = 0;
+	  bool load_seen = false;
+	  for (unsigned i = 0; i < in_nlanes; ++i)
 	    {
-	      if (load_seen)
-		ncopies++;
-	      load_seen = false;
+	      if (i % const_nunits == 0)
+		{
+		  if (load_seen)
+		    ncopies++;
+		  load_seen = false;
+		}
+	      if (bitmap_bit_p (perm, i))
+		load_seen = true;
 	    }
-	  if (bitmap_bit_p (perm, i))
-	    load_seen = true;
-	}
-      if (load_seen)
-	ncopies++;
-      gcc_assert (ncopies
-		  <= (DR_GROUP_SIZE (first_stmt_info)
-		      - DR_GROUP_GAP (first_stmt_info)
-		      + assumed_nunits - 1) / assumed_nunits);
+	  if (load_seen)
+	    ncopies++;
+	}
     }
 
   /* Grouped loads read all elements in the group at once,
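
As an aside, the following standalone sketch (not part of the patch) applies
the same per-VF-iteration counting to the x[] node above.  The concrete
numbers are assumptions for a 128-bit vector target: VF = 8, 4 uint32_t lanes
per vector, group size 2 and load permutation { 0 1 }.  In the patch itself
this particular node takes the new "simple" early-out and keeps the SLP
vector-statement count, but the generic counting gives the same answer,
4 vector loads, where counting only a single group's worth of lanes gave 1:

/* Illustration only: per-VF-iteration load counting for one SLP load node.
   The parameters mirror the quantities used in the patch; the values passed
   from main are assumptions for the example above.  */

#include <stdio.h>
#include <stdbool.h>

static unsigned int
count_loads (unsigned int dr_group_size, unsigned int nscalars,
	     unsigned int vf, unsigned int nunits,
	     const unsigned int *load_permutation)
{
  unsigned int in_nlanes = dr_group_size * vf;
  unsigned int out_nlanes = nscalars * vf;
  bool perm[256] = { false };	/* Stands in for the auto_sbitmap.  */

  /* Mark every input lane that some output lane reads, across all VF
     iterations rather than just the first one.  */
  for (unsigned int i = 0; i < out_nlanes; ++i)
    {
      unsigned int iter_num = i / nscalars;
      unsigned int stmt_num = i % nscalars;
      perm[iter_num * dr_group_size + load_permutation[stmt_num]] = true;
    }

  /* Count how many nunits-sized vectors contain at least one live lane.  */
  unsigned int ncopies = 0;
  bool load_seen = false;
  for (unsigned int i = 0; i < in_nlanes; ++i)
    {
      if (i % nunits == 0)
	{
	  if (load_seen)
	    ncopies++;
	  load_seen = false;
	}
      if (perm[i])
	load_seen = true;
    }
  if (load_seen)
    ncopies++;
  return ncopies;
}

int
main (void)
{
  static const unsigned int x_perm[] = { 0, 1 };
  /* x[] node: group size 2, 2 scalar stmts, assumed VF 8 and 4-lane vectors.
     Prints 4, matching the 4 copies of the x[] load needed per y[] load.  */
  printf ("%u\n", count_loads (2, 2, 8, 4, x_perm));
  return 0;
}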
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 3575f25241f..6eacd641e6b 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1099,24 +1099,42 @@ vect_model_load_cost (vec_info *vinfo,
       stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
       /* Record the cost for the permutation.  */
       unsigned n_perms;
-      unsigned assumed_nunits
-	= vect_nunits_for_cost (STMT_VINFO_VECTYPE (first_stmt_info));
       vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
 				    vf, true, &n_perms);
       inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
 				       first_stmt_info, 0, vect_body);
-      /* And adjust the number of loads performed.  This handles
-	 redundancies as well as loads that are later dead.  */
-      auto_sbitmap perm (DR_GROUP_SIZE (first_stmt_info));
+
+      /* And if this is not a simple "load N vectors and then permute each
+	 vector internally" operation, adjust the number of loads performed.
+	 This handles redundancies as well as loads that are later dead.  */
+      unsigned int nscalars = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+      unsigned int dr_group_size = DR_GROUP_SIZE (first_stmt_info);
+      tree vectype = STMT_VINFO_VECTYPE (first_stmt_info);
+      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+      if (nscalars != dr_group_size
+	  || !multiple_p (nunits, nscalars))
+	{
+	  /* The constant VF and nunits are enforced by
+	     vect_transform_slp_perm_load for this case.  */
+	  unsigned int const_nunits = nunits.to_constant ();
+	  unsigned int in_nlanes = dr_group_size * vf.to_constant ();
+	  unsigned int out_nlanes = nscalars * vf.to_constant ();
+	  auto_sbitmap perm (in_nlanes);
       bitmap_clear (perm);
-      for (unsigned i = 0;
-	   i < SLP_TREE_LOAD_PERMUTATION (slp_node).length (); ++i)
-	bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (slp_node)[i]);
+	  for (unsigned i = 0; i < out_nlanes; ++i)
+	    {
+	      unsigned int iter_num = i / nscalars;
+	      unsigned int stmt_num = i % nscalars;
+	      unsigned int in_lane
+		= (iter_num * dr_group_size
+		   + SLP_TREE_LOAD_PERMUTATION (slp_node)[stmt_num]);
+	      bitmap_set_bit (perm, in_lane);
+	    }
       ncopies = 0;
       bool load_seen = false;
-      for (unsigned i = 0; i < DR_GROUP_SIZE (first_stmt_info); ++i)
+	  for (unsigned i = 0; i < in_nlanes; ++i)
 	{
-	  if (i % assumed_nunits == 0)
+	      if (i % const_nunits == 0)
 	    {
 	      if (load_seen)
 		ncopies++;
@@ -1127,10 +1145,7 @@ vect_model_load_cost (vec_info *vinfo,
 	}
       if (load_seen)
 	ncopies++;
-      gcc_assert (ncopies
-		  <= (DR_GROUP_SIZE (first_stmt_info)
-		      - DR_GROUP_GAP (first_stmt_info)
-		      + assumed_nunits - 1) / assumed_nunits);
+	}
     }
 
   /* Grouped loads read all elements in the group at once,