on 2023/5/23 14:19, Richard Biener wrote: > On Tue, May 23, 2023 at 5:01 AM Kewen.Lin <li...@linux.ibm.com> wrote: >> >> Hi Richi, >> >> Thanks for the review! >> >> on 2023/5/22 21:44, Richard Biener wrote: >>> On Wed, May 17, 2023 at 8:15 AM Kewen.Lin <li...@linux.ibm.com> wrote: >>>> >>>> Hi, >>>> >>>> Following Richi's suggestion in [1], I'm working on deferring >>>> cost evaluation next to the transformation, this patch is >>>> to enhance function vect_transform_slp_perm_load_1 which >>>> could under-cost for vector permutation, since the costing >>>> doesn't try to consider nvectors_per_build, it's inconsistent >>>> with the transformation part. >>>> >>>> Bootstrapped and regtested on x86_64-redhat-linux, >>>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu. >>>> >>>> Is it ok for trunk? >>>> >>>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2021-January/563624.html >>>> >>>> BR, >>>> Kewen >>>> ----- >>>> gcc/ChangeLog: >>>> >>>> * tree-vect-slp.cc (vect_transform_slp_perm_load_1): Adjust the >>>> calculation on n_perms by considering nvectors_per_build. >>>> >>>> gcc/testsuite/ChangeLog: >>>> >>>> * gcc.dg/vect/costmodel/ppc/costmodel-slp-perm.c: New test. >>>> --- >>>> .../vect/costmodel/ppc/costmodel-slp-perm.c | 23 +++++++ >>>> gcc/tree-vect-slp.cc | 66 ++++++++++--------- >>>> 2 files changed, 57 insertions(+), 32 deletions(-) >>>> create mode 100644 >>>> gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-perm.c >>>> >>>> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-perm.c >>>> b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-perm.c >>>> new file mode 100644 >>>> index 00000000000..e5c4dceddfb >>>> --- /dev/null >>>> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-perm.c >>>> @@ -0,0 +1,23 @@ >>>> +/* { dg-do compile } */ >>>> +/* { dg-require-effective-target vect_int } */ >>>> +/* { dg-require-effective-target powerpc_p9vector_ok } */ >>>> +/* Specify power9 to ensure the vectorization is profitable >>>> + and test point stands, otherwise it could be not profitable >>>> + to vectorize. */ >>>> +/* { dg-additional-options "-mdejagnu-cpu=power9 -mpower9-vector" } */ >>>> + >>>> +/* Verify we cost the exact count for required vec_perm. */ >>>> + >>>> +int x[1024], y[1024]; >>>> + >>>> +void >>>> +foo () >>>> +{ >>>> + for (int i = 0; i < 512; ++i) >>>> + { >>>> + x[2 * i] = y[1023 - (2 * i)]; >>>> + x[2 * i + 1] = y[1023 - (2 * i + 1)]; >>>> + } >>>> +} >>>> + >>>> +/* { dg-final { scan-tree-dump-times "2 times vec_perm" 1 "vect" } } */ >>>> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc >>>> index e5c9d7e766e..af9a6dd4fa9 100644 >>>> --- a/gcc/tree-vect-slp.cc >>>> +++ b/gcc/tree-vect-slp.cc >>>> @@ -8115,12 +8115,12 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, >>>> slp_tree node, >>>> >>>> mode = TYPE_MODE (vectype); >>>> poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); >>>> + unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node); >>>> >>>> /* Initialize the vect stmts of NODE to properly insert the generated >>>> stmts later. */ >>>> if (! analyze_only) >>>> - for (unsigned i = SLP_TREE_VEC_STMTS (node).length (); >>>> - i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++) >>>> + for (unsigned i = SLP_TREE_VEC_STMTS (node).length (); i < nstmts; >>>> i++) >>>> SLP_TREE_VEC_STMTS (node).quick_push (NULL); >>>> >>>> /* Generate permutation masks for every NODE. Number of masks for each >>>> NODE >>>> @@ -8161,7 +8161,10 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, >>>> slp_tree node, >>>> (b) the permutes only need a single vector input. */ >>>> mask.new_vector (nunits, group_size, 3); >>>> nelts_to_build = mask.encoded_nelts (); >>>> - nvectors_per_build = SLP_TREE_VEC_STMTS (node).length (); >>>> + /* It's possible to obtain zero nstmts during analyze_only, so make >>>> + it at least one to ensure the later computation for n_perms >>>> + proceed. */ >>>> + nvectors_per_build = nstmts > 0 ? nstmts : 1; >>>> in_nlanes = DR_GROUP_SIZE (stmt_info) * 3; >>>> } >>>> else >>>> @@ -8252,40 +8255,39 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, >>>> slp_tree node, >>>> return false; >>>> } >>>> >>>> - ++*n_perms; >>>> - >>>> + tree mask_vec = NULL_TREE; >>>> if (!analyze_only) >>>> - { >>>> - tree mask_vec = vect_gen_perm_mask_checked (vectype, >>>> indices); >>>> + mask_vec = vect_gen_perm_mask_checked (vectype, indices); >>>> >>>> - if (second_vec_index == -1) >>>> - second_vec_index = first_vec_index; >>>> + if (second_vec_index == -1) >>>> + second_vec_index = first_vec_index; >>>> >>>> - for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) >>>> + for (unsigned int ri = 0; ri < nvectors_per_build; ++ri) >>>> + { >>>> + ++*n_perms; >>> >>> So the "real" change is doing >>> >>> *n_perms += nvectors_per_build; >>> >>> and *n_perms was unused when !analyze_only? And since at >> >> Yes, although both !analyze_only and analyze_only calls pass n_perms in, now >> only the call sites with analyze_only will use the returned n_perms further. >> >>> analysis time we (sometimes?) have zero nvectors you have to >>> fixup above? Which cases are that? >> >> Yes, the fixup is to avoid to result in unexpected n_perms in function >> vect_optimize_slp_pass::internal_node_cost。 One typical case is >> gcc.dg/vect/bb-slp-50.c, without special casing zero, slp2 fails to optimize >> out one more vec_perm unexpectedly. >> >> In vect_optimize_slp_pass::internal_node_cost, it checks if the returned >> n_perms >> is zero or not (vec_perm not needed or needed). >> >> if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL, >> nullptr, vf, true, false, >> &n_perms)) >> { >> auto rep = SLP_TREE_REPRESENTATIVE (node); >> if (out_layout_i == 0) >> { >> /* Use the fallback cost if the load is an N-to-N permutation. >> Otherwise assume that the node will be rejected later >> and rebuilt from scalars. */ >> if (STMT_VINFO_GROUPED_ACCESS (rep) >> && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep)) >> == SLP_TREE_LANES (node))) >> return fallback_cost; >> return 0; >> } >> return -1; >> } >> >> /* See the comment above the corresponding VEC_PERM_EXPR handling. */ >> return n_perms == 0 ? 0 : 1; >> >> In vect_optimize_slp_pass::forward_pass (), it only considers the case that >> factor > 0 (there is some vec_perm needed). >> >> /* Accumulate the cost of using LAYOUT_I within NODE, >> both for the inputs and the outputs. */ >> int factor = internal_node_cost (vertex.node, layout_i, >> layout_i); >> if (factor < 0) >> { >> is_possible = false; >> break; >> } >> else if (factor) >> layout_costs.internal_cost.add_serial_cost >> ({ vertex.weight * factor, m_optimize_size }); > > Ah, OK - thanks for clarifying. > > The patch is OK.
Thanks! Committed in r14-1151. BR, Kewen