Hi Hongtao,

On 2023/6/14 16:17, Hongtao Liu wrote:
> On Tue, Jun 13, 2023 at 10:07 AM Kewen Lin via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> This patch adjusts the cost handling on VMAT_CONTIGUOUS_PERMUTE in
>> function vectorizable_load.  We don't call function
>> vect_model_load_cost for it any more.
>>
>> As the affected test case gcc.target/i386/pr70021.c shows, the
>> previous costing can under-cost the total generated vector loads,
>> since for VMAT_CONTIGUOUS_PERMUTE function vect_model_load_cost
>> doesn't consider the group size, which is treated as vec_num during
>> the transformation.
> The original PR is about a correctness issue, and I'm not sure how
> much of a performance impact the patch will have, but the change
> looks reasonable, so the test change looks OK to me.
> I'll track the performance impact on SPEC2017 to see if the patch
> causes any regression (guess probably not).
Thanks for the feedback and the further tracking!  I hope this (and
the whole series) doesn't impact SPEC2017 performance on x86. :)

BR,
Kewen

>>
>> This patch makes the count of vector loads in costing consistent
>> with what we generate during the transformation.  To be more
>> specific, for the given test case, for the memory access b[i_20] we
>> previously costed 2 vector loads; with this patch we cost 8 instead,
>> which matches the final count of vector loads generated from base b.
>> This costing change makes the cost model analysis conclude that it's
>> not profitable to vectorize the first loop, so this patch adjusts
>> the test case to run without the vect cost model.
>>
>> Note that this test case also exposes something we can improve
>> further: although the numbers of vector permutations we cost and
>> generate are consistent, DCE can later optimize some unused
>> permutations away.  It would be good if we could predict that and
>> generate only the necessary permutations.
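As a side illustration of the 2-vs-8 load counts above, here is a
minimal standalone sketch (not GCC code; the group_size and ncopies
values are hypothetical, picked only to reproduce those numbers):

  #include <stdio.h>

  /* Sketch: the old costing charged ncopies vector loads for the whole
     grouped access, while the transformation (and the new costing)
     issues vec_num == DR_GROUP_SIZE loads per copy for the first stmt
     of the group.  */
  int
  main (void)
  {
    int group_size = 4;                /* hypothetical DR_GROUP_SIZE  */
    int ncopies = 2;                   /* hypothetical copy count  */
    int vec_num = group_size;          /* loads per copy for the group  */

    int old_loads = ncopies;           /* previously costed: 2  */
    int new_loads = ncopies * vec_num; /* generated and now costed: 8  */

    printf ("old costing: %d loads, new costing: %d loads\n",
            old_loads, new_loads);
    return 0;
  }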
>>
>> gcc/ChangeLog:
>>
>>     * tree-vect-stmts.cc (vect_model_load_cost): Assert this function
>>     only handles memory_access_type VMAT_CONTIGUOUS, remove some
>>     VMAT_CONTIGUOUS_PERMUTE related handling.
>>     (vectorizable_load): Adjust the cost handling on
>>     VMAT_CONTIGUOUS_PERMUTE without calling vect_model_load_cost.
>>
>> gcc/testsuite/ChangeLog:
>>
>>     * gcc.target/i386/pr70021.c: Adjust with -fno-vect-cost-model.
>> ---
>>  gcc/testsuite/gcc.target/i386/pr70021.c |  2 +-
>>  gcc/tree-vect-stmts.cc                  | 88 ++++++++++++++-----------
>>  2 files changed, 51 insertions(+), 39 deletions(-)
>>
>> diff --git a/gcc/testsuite/gcc.target/i386/pr70021.c b/gcc/testsuite/gcc.target/i386/pr70021.c
>> index 6562c0f2bd0..d509583601e 100644
>> --- a/gcc/testsuite/gcc.target/i386/pr70021.c
>> +++ b/gcc/testsuite/gcc.target/i386/pr70021.c
>> @@ -1,7 +1,7 @@
>>  /* PR target/70021 */
>>  /* { dg-do run } */
>>  /* { dg-require-effective-target avx2 } */
>> -/* { dg-options "-O2 -ftree-vectorize -mavx2 -fdump-tree-vect-details -mtune=skylake" } */
>> +/* { dg-options "-O2 -ftree-vectorize -mavx2 -fdump-tree-vect-details -mtune=skylake -fno-vect-cost-model" } */
>>
>>  #include "avx2-check.h"
>>
>> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
>> index 7f8d9db5363..e7a97dbe05d 100644
>> --- a/gcc/tree-vect-stmts.cc
>> +++ b/gcc/tree-vect-stmts.cc
>> @@ -1134,8 +1134,7 @@ vect_model_load_cost (vec_info *vinfo,
>>                        slp_tree slp_node,
>>                        stmt_vector_for_cost *cost_vec)
>>  {
>> -  gcc_assert (memory_access_type == VMAT_CONTIGUOUS
>> -              || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
>> +  gcc_assert (memory_access_type == VMAT_CONTIGUOUS);
>>
>>    unsigned int inside_cost = 0, prologue_cost = 0;
>>    bool grouped_access_p = STMT_VINFO_GROUPED_ACCESS (stmt_info);
>> @@ -1174,26 +1173,6 @@ vect_model_load_cost (vec_info *vinfo,
>>           once per group anyhow.  */
>>        bool first_stmt_p = (first_stmt_info == stmt_info);
>>
>> -      /* We assume that the cost of a single load-lanes instruction is
>> -         equivalent to the cost of DR_GROUP_SIZE separate loads.  If a grouped
>> -         access is instead being provided by a load-and-permute operation,
>> -         include the cost of the permutes.  */
>> -      if (first_stmt_p
>> -          && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
>> -        {
>> -          /* Uses an even and odd extract operations or shuffle operations
>> -             for each needed permute.  */
>> -          int group_size = DR_GROUP_SIZE (first_stmt_info);
>> -          int nstmts = ncopies * ceil_log2 (group_size) * group_size;
>> -          inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
>> -                                           stmt_info, 0, vect_body);
>> -
>> -          if (dump_enabled_p ())
>> -            dump_printf_loc (MSG_NOTE, vect_location,
>> -                             "vect_model_load_cost: strided group_size = %d .\n",
>> -                             group_size);
>> -        }
>> -
>>    vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
>>                        misalignment, first_stmt_p, &inside_cost, &prologue_cost,
>>                        cost_vec, cost_vec, true);
>> @@ -10652,11 +10631,22 @@ vectorizable_load (vec_info *vinfo,
>>                       alignment support schemes.  */
>>                    if (costing_p)
>>                      {
>> -                      if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
>> +                      /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
>> +                         only need to take care of the first stmt, whose
>> +                         stmt_info is first_stmt_info, vec_num iterating on it
>> +                         will cover the cost for the remaining, it's consistent
>> +                         with transforming.  For the prologue cost for realign,
>> +                         we only need to count it once for the whole group.  */
>> +                      bool first_stmt_info_p = first_stmt_info == stmt_info;
>> +                      bool add_realign_cost = first_stmt_info_p && i == 0;
>> +                      if (memory_access_type == VMAT_CONTIGUOUS_REVERSE
>> +                          || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
>> +                              && (!grouped_load || first_stmt_info_p)))
>>                          vect_get_load_cost (vinfo, stmt_info, 1,
>>                                              alignment_support_scheme, misalignment,
>> -                                            false, &inside_cost, &prologue_cost,
>> -                                            cost_vec, cost_vec, true);
>> +                                            add_realign_cost, &inside_cost,
>> +                                            &prologue_cost, cost_vec, cost_vec,
>> +                                            true);
>>                      }
>>                    else
>>                      {
>> @@ -10774,8 +10764,7 @@ vectorizable_load (vec_info *vinfo,
>>               ??? This is a hack to prevent compile-time issues as seen
>>               in PR101120 and friends.  */
>>            if (costing_p
>> -              && memory_access_type != VMAT_CONTIGUOUS
>> -              && memory_access_type != VMAT_CONTIGUOUS_PERMUTE)
>> +              && memory_access_type != VMAT_CONTIGUOUS)
>>              {
>>                vect_transform_slp_perm_load (vinfo, slp_node, vNULL, nullptr, vf,
>>                                              true, &n_perms, nullptr);
>> @@ -10790,20 +10779,44 @@ vectorizable_load (vec_info *vinfo,
>>                gcc_assert (ok);
>>              }
>>          }
>> -      else if (!costing_p)
>> +      else
>>          {
>>            if (grouped_load)
>>              {
>>                if (memory_access_type != VMAT_LOAD_STORE_LANES)
>> -                vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
>> -                                             group_size, gsi);
>> -              *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>> -            }
>> -          else
>> -            {
>> -              STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> +                {
>> +                  gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
>> +                  /* We assume that the cost of a single load-lanes instruction
>> +                     is equivalent to the cost of DR_GROUP_SIZE separate loads.
>> +                     If a grouped access is instead being provided by a
>> +                     load-and-permute operation, include the cost of the
>> +                     permutes.  */
>> +                  if (costing_p && first_stmt_info == stmt_info)
>> +                    {
>> +                      /* Uses an even and odd extract operations or shuffle
>> +                         operations for each needed permute.  */
>> +                      int group_size = DR_GROUP_SIZE (first_stmt_info);
>> +                      int nstmts = ceil_log2 (group_size) * group_size;
>> +                      inside_cost
>> +                        += record_stmt_cost (cost_vec, nstmts, vec_perm,
>> +                                             stmt_info, 0, vect_body);
>> +
>> +                      if (dump_enabled_p ())
>> +                        dump_printf_loc (
>> +                          MSG_NOTE, vect_location,
>> +                          "vect_model_load_cost: strided group_size = %d .\n",
>> +                          group_size);
>> +                    }
>> +                  else if (!costing_p)
>> +                    vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
>> +                                                 group_size, gsi);
>> +                }
>> +              if (!costing_p)
>> +                *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>>              }
>> -        }
>> +          else if (!costing_p)
>> +            STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> +        }
>>        dr_chain.release ();
>>      }
>>    if (!slp && !costing_p)
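As a quick cross-check of the nstmts formula in the hunk above: the
even/odd extraction scheme separates a group of group_size interleaved
vectors in ceil_log2 (group_size) stages of group_size permutes each.
A tiny standalone sketch of that arithmetic (not GCC code; my_ceil_log2
is a local stand-in for GCC's ceil_log2):

  #include <stdio.h>

  /* Smallest l with (1 << l) >= x, i.e. ceil (log2 (x)).  */
  static int
  my_ceil_log2 (int x)
  {
    int l = 0;
    while ((1 << l) < x)
      l++;
    return l;
  }

  int
  main (void)
  {
    /* Per-copy vec_perm count charged for a load-and-permute group.  */
    for (int group_size = 2; group_size <= 8; group_size *= 2)
      printf ("group_size %d -> %d vec_perm stmts per copy\n",
              group_size, my_ceil_log2 (group_size) * group_size);
    return 0;
  }

So group sizes 2, 4 and 8 cost 2, 8 and 24 vec_perm stmts per copy
respectively; the old code multiplied this by ncopies up front, while
the new code charges it once per copy from within the transform loop.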
>> @@ -10814,8 +10827,7 @@ vectorizable_load (vec_info *vinfo,
>>        gcc_assert (memory_access_type != VMAT_INVARIANT
>>                    && memory_access_type != VMAT_ELEMENTWISE
>>                    && memory_access_type != VMAT_STRIDED_SLP);
>> -      if (memory_access_type != VMAT_CONTIGUOUS
>> -          && memory_access_type != VMAT_CONTIGUOUS_PERMUTE)
>> +      if (memory_access_type != VMAT_CONTIGUOUS)
>>          {
>>            if (dump_enabled_p ())
>>              dump_printf_loc (MSG_NOTE, vect_location,
>> --
>> 2.31.1
>>
>
>
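As a closing aside for readers unfamiliar with VMAT_CONTIGUOUS_PERMUTE,
here is a hypothetical loop shape (a sketch, not the pr70021.c source)
that is typically classified this way:

  /* The accesses b[2 * i] and b[2 * i + 1] form a grouped load with
     group size 2: the vectorizer loads contiguous vectors covering
     both lanes and then separates them with even/odd permutes, which
     is exactly the load-and-permute cost this patch moves.  */
  void
  foo (int *__restrict a, int *__restrict b, int n)
  {
    for (int i = 0; i < n; i++)
      a[i] = b[2 * i] + b[2 * i + 1];
  }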