> On 19 Dec 2024, at 14:10, Jennifer Schmitz <jschm...@nvidia.com> wrote:
>
>> On 19 Dec 2024, at 11:14, Richard Sandiford <richard.sandif...@arm.com> wrote:
>>
>> External email: Use caution opening links or attachments
>>
>> Jennifer Schmitz <jschm...@nvidia.com> writes:
>>> @@ -8834,22 +8834,7 @@ vectorizable_store (vec_info *vinfo,
>>>             {
>>>               if (costing_p)
>>>                 {
>>> -                 /* Only need vector extracting when there are more
>>> -                    than one stores.  */
>>> -                 if (nstores > 1)
>>> -                   inside_cost
>>> -                     += record_stmt_cost (cost_vec, 1, vec_to_scalar,
>>> -                                          stmt_info, slp_node,
>>> -                                          0, vect_body);
>>> -                 /* Take a single lane vector type store as scalar
>>> -                    store to avoid ICE like 110776.  */
>>> -                 if (VECTOR_TYPE_P (ltype)
>>> -                     && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
>>> -                   n_adjacent_stores++;
>>> -                 else
>>> -                   inside_cost
>>> -                     += record_stmt_cost (cost_vec, 1, scalar_store,
>>> -                                          stmt_info, 0, vect_body);
>>> +                 n_adjacent_stores++;
>>>                   continue;
>>>                 }
>>>               tree newref, newoff;
>>> @@ -8905,9 +8890,26 @@ vectorizable_store (vec_info *vinfo,
>>>       if (costing_p)
>>>         {
>>>           if (n_adjacent_stores > 0)
>>> -           vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
>>> -                                alignment_support_scheme, misalignment,
>>> -                                &inside_cost, cost_vec);
>>> +           {
>>> +             /* Take a single lane vector type store as scalar
>>> +                store to avoid ICE like 110776.  */
>>> +             if (VECTOR_TYPE_P (ltype)
>>> +                 && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
>>
>> Sorry to ask, since it's pre-existing, but could you change this to
>> maybe_ne while you're there?  nunits==1+1X should be treated as a vector
>> rather than a scalar.
> Sure, I made the change (see patch below) and re-validated on aarch64.
>
> It would also be good to check for performance regressions, now that we have
> a patch to test:
> I will run SPEC2017 with -mcpu=generic and -mcpu=native on Grace, but we
> would appreciate help with benchmarking on other platforms.
> Tamar, would you still be willing to test the patch on other platforms?
>
> If there are no other changes necessary and assuming there are no performance
> regressions, I was planning to commit the patch in January after returning
> from Christmas break.
>
> In the meantime I wish everyone happy holidays.
> Jennifer

On Grace, the patch has no non-noise impact on performance for SPEC2017 with
-mcpu=generic and -mcpu=native. I also re-validated on aarch64 today, no
regression.
Do you advise running additional performance tests, or is the patch ready to
be pushed to trunk?
Thanks,
Jennifer

>
> This patch removes the AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS tunable and
> use_new_vector_costs entry in aarch64-tuning-flags.def and makes the
> AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS paths in the backend the
> default. To that end, the function aarch64_use_new_vector_costs_p and its
> uses were removed. To prevent costing vec_to_scalar operations with 0, as
> described in
> https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665481.html,
> we adjusted vectorizable_store such that the variable n_adjacent_stores
> also covers vec_to_scalar operations. This way vec_to_scalar operations
> are not costed individually, but as a group.
> As suggested by Richard Sandiford, the "known_ne" in the multilane check
> was replaced by "maybe_ne" in order to treat nunits==1+1X as a vector
> rather than a scalar.
>
> Two tests were adjusted due to changes in codegen.
> In both cases, the old code performed loop unrolling once, but the new code
> does not:
> Example from gcc.target/aarch64/sve/strided_load_2.c (compiled with
> -O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic
> -moverride=tune=none):
> f_int64_t_32:
>         cbz     w3, .L92
>         mov     x4, 0
>         uxtw    x3, w3
> +       cntd    x5
> +       whilelo p7.d, xzr, x3
> +       mov     z29.s, w5
>         mov     z31.s, w2
> -       whilelo p6.d, xzr, x3
> -       mov     x2, x3
> -       index   z30.s, #0, #1
> -       uqdecd  x2
> -       ptrue   p5.b, all
> -       whilelo p7.d, xzr, x2
> +       index   z30.d, #0, #1
> +       ptrue   p6.b, all
>         .p2align 3,,7
> .L94:
> -       ld1d    z27.d, p7/z, [x0, #1, mul vl]
> -       ld1d    z28.d, p6/z, [x0]
> -       movprfx z29, z31
> -       mul     z29.s, p5/m, z29.s, z30.s
> -       incw    x4
> -       uunpklo z0.d, z29.s
> -       uunpkhi z29.d, z29.s
> -       ld1d    z25.d, p6/z, [x1, z0.d, lsl 3]
> -       ld1d    z26.d, p7/z, [x1, z29.d, lsl 3]
> -       add     z25.d, z28.d, z25.d
> +       ld1d    z27.d, p7/z, [x0, x4, lsl 3]
> +       movprfx z28, z31
> +       mul     z28.s, p6/m, z28.s, z30.s
> +       ld1d    z26.d, p7/z, [x1, z28.d, uxtw 3]
>         add     z26.d, z27.d, z26.d
> -       st1d    z26.d, p7, [x0, #1, mul vl]
> -       whilelo p7.d, x4, x2
> -       st1d    z25.d, p6, [x0]
> -       incw    z30.s
> -       incb    x0, all, mul #2
> -       whilelo p6.d, x4, x3
> +       st1d    z26.d, p7, [x0, x4, lsl 3]
> +       add     z30.s, z30.s, z29.s
> +       incd    x4
> +       whilelo p7.d, x4, x3
>         b.any   .L94
> .L92:
>         ret
>
> Example from gcc.target/aarch64/sve/strided_store_2.c (compiled with
> -O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic
> -moverride=tune=none):
> f_int64_t_32:
>         cbz     w3, .L84
> -       addvl   x5, x1, #1
>         mov     x4, 0
>         uxtw    x3, w3
> -       mov     z31.s, w2
> +       cntd    x5
>         whilelo p7.d, xzr, x3
> -       mov     x2, x3
> -       index   z30.s, #0, #1
> -       uqdecd  x2
> -       ptrue   p5.b, all
> -       whilelo p6.d, xzr, x2
> +       mov     z29.s, w5
> +       mov     z31.s, w2
> +       index   z30.d, #0, #1
> +       ptrue   p6.b, all
>         .p2align 3,,7
> .L86:
> -       ld1d    z28.d, p7/z, [x1, x4, lsl 3]
> -       ld1d    z27.d, p6/z, [x5, x4, lsl 3]
> -       movprfx z29, z30
> -       mul     z29.s, p5/m, z29.s, z31.s
> -       add     z28.d, z28.d, #1
> -       uunpklo z26.d, z29.s
> -       st1d    z28.d, p7, [x0, z26.d, lsl 3]
> -       incw    x4
> -       uunpkhi z29.d, z29.s
> +       ld1d    z27.d, p7/z, [x1, x4, lsl 3]
> +       movprfx z28, z30
> +       mul     z28.s, p6/m, z28.s, z31.s
>         add     z27.d, z27.d, #1
> -       whilelo p6.d, x4, x2
> -       st1d    z27.d, p7, [x0, z29.d, lsl 3]
> -       incw    z30.s
> +       st1d    z27.d, p7, [x0, z28.d, uxtw 3]
> +       incd    x4
> +       add     z30.s, z30.s, z29.s
>         whilelo p7.d, x4, x3
>         b.any   .L86
> .L84:
>         ret
>
> The patch was bootstrapped and tested on aarch64-linux-gnu, no
> regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
>       * tree-vect-stmts.cc (vectorizable_store): Extend the use of
>       n_adjacent_stores to also cover vec_to_scalar operations.
>       * config/aarch64/aarch64-tuning-flags.def: Remove
>       use_new_vector_costs as tuning option.
>       * config/aarch64/aarch64.cc (aarch64_use_new_vector_costs_p):
>       Remove.
>       (aarch64_vector_costs::add_stmt_cost): Remove use of
>       aarch64_use_new_vector_costs_p.
>       (aarch64_vector_costs::finish_cost): Remove use of
>       aarch64_use_new_vector_costs_p.
>       * config/aarch64/tuning_models/cortexx925.h: Remove
>       AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS.
>       * config/aarch64/tuning_models/fujitsu_monaka.h: Likewise.
>       * config/aarch64/tuning_models/generic_armv8_a.h: Likewise.
>       * config/aarch64/tuning_models/generic_armv9_a.h: Likewise.
>       * config/aarch64/tuning_models/neoverse512tvb.h: Likewise.
>       * config/aarch64/tuning_models/neoversen2.h: Likewise.
>       * config/aarch64/tuning_models/neoversen3.h: Likewise.
>       * config/aarch64/tuning_models/neoversev1.h: Likewise.
>       * config/aarch64/tuning_models/neoversev2.h: Likewise.
>       * config/aarch64/tuning_models/neoversev3.h: Likewise.
>       * config/aarch64/tuning_models/neoversev3ae.h: Likewise.
>
> gcc/testsuite/
>       * gcc.target/aarch64/sve/strided_load_2.c: Adjust expected outcome.
>       * gcc.target/aarch64/sve/strided_store_2.c: Likewise.
> ---
>  gcc/config/aarch64/aarch64-tuning-flags.def   |  2 -
>  gcc/config/aarch64/aarch64.cc                 | 20 ++--------
>  gcc/config/aarch64/tuning_models/cortexx925.h |  1 -
>  .../aarch64/tuning_models/fujitsu_monaka.h    |  1 -
>  .../aarch64/tuning_models/generic_armv8_a.h   |  1 -
>  .../aarch64/tuning_models/generic_armv9_a.h   |  1 -
>  .../aarch64/tuning_models/neoverse512tvb.h    |  1 -
>  gcc/config/aarch64/tuning_models/neoversen2.h |  1 -
>  gcc/config/aarch64/tuning_models/neoversen3.h |  1 -
>  gcc/config/aarch64/tuning_models/neoversev1.h |  1 -
>  gcc/config/aarch64/tuning_models/neoversev2.h |  1 -
>  gcc/config/aarch64/tuning_models/neoversev3.h |  1 -
>  .../aarch64/tuning_models/neoversev3ae.h      |  1 -
>  .../gcc.target/aarch64/sve/strided_load_2.c   |  2 +-
>  .../gcc.target/aarch64/sve/strided_store_2.c  |  2 +-
>  gcc/tree-vect-stmts.cc                        | 40 ++++++++++---------
>  16 files changed, 27 insertions(+), 50 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
> index ffbff20e29c..1de633c739b 100644
> --- a/gcc/config/aarch64/aarch64-tuning-flags.def
> +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
> @@ -38,8 +38,6 @@ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
>
>  AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
>
> -AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
> -
>  AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGHPUT)
>
>  AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 77a2a6bfa3a..71fba9cc63b 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -16627,16 +16627,6 @@ aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
>    return new aarch64_vector_costs (vinfo, costing_for_scalar);
>  }
>
> -/* Return true if the current CPU should use the new costs defined
> -   in GCC 11.  This should be removed for GCC 12 and above, with the
> -   costs applying to all CPUs instead.  */
> -static bool
> -aarch64_use_new_vector_costs_p ()
> -{
> -  return (aarch64_tune_params.extra_tuning_flags
> -         & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
> -}
> -
>  /* Return the appropriate SIMD costs for vectors of type VECTYPE.  */
>  static const simd_vec_cost *
>  aarch64_simd_vec_costs (tree vectype)
> @@ -17555,7 +17545,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>
>    /* Do one-time initialization based on the vinfo.  */
>    loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
> -  if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
> +  if (!m_analyzed_vinfo)
>      {
>        if (loop_vinfo)
>         analyze_loop_vinfo (loop_vinfo);
> @@ -17573,7 +17563,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>
>    /* Try to get a more accurate cost by looking at STMT_INFO instead
>       of just looking at KIND.  */
> -  if (stmt_info && aarch64_use_new_vector_costs_p ())
> +  if (stmt_info)
>      {
>        /* If we scalarize a strided store, the vectorizer costs one
>          vec_to_scalar for each element.  However, we can store the first
> @@ -17638,7 +17628,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>    else
>      m_num_last_promote_demote = 0;
>
> -  if (stmt_info && aarch64_use_new_vector_costs_p ())
> +  if (stmt_info)
>      {
>        /* Account for any extra "embedded" costs that apply additively
>          to the base cost calculated above.  */
> @@ -17999,9 +17989,7 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
>
>    auto *scalar_costs
>      = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
> -  if (loop_vinfo
> -      && m_vec_flags
> -      && aarch64_use_new_vector_costs_p ())
> +  if (loop_vinfo && m_vec_flags)
>      {
>        m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
>                                              m_costs[vect_body]);
> diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h b/gcc/config/aarch64/tuning_models/cortexx925.h
> index 5ebaf66e986..74772f3e15f 100644
> --- a/gcc/config/aarch64/tuning_models/cortexx925.h
> +++ b/gcc/config/aarch64/tuning_models/cortexx925.h
> @@ -221,7 +221,6 @@ static const struct tune_params cortexx925_tunings =
>    tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_BASE
>     | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
>     | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
>     | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags.  */
>    &generic_armv9a_prefetch_tune,
> diff --git a/gcc/config/aarch64/tuning_models/fujitsu_monaka.h b/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
> index 2d704ecd110..a564528f43d 100644
> --- a/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
> +++ b/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
> @@ -55,7 +55,6 @@ static const struct tune_params fujitsu_monaka_tunings =
>    0,   /* max_case_values.  */
>    tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_BASE
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
>     | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),     /* tune_flags.  */
>    &generic_prefetch_tune,
>    AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> index bdd309ab03d..f090d5cde50 100644
> --- a/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> +++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> @@ -183,7 +183,6 @@ static const struct tune_params generic_armv8_a_tunings =
>    tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_BASE
>     | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
>     | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),     /* tune_flags.  */
>    &generic_prefetch_tune,
>    AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> index 785e00946bc..7b5821183bc 100644
> --- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> +++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> @@ -251,7 +251,6 @@ static const struct tune_params generic_armv9_a_tunings =
>    0,   /* max_case_values.  */
>    tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_BASE
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
>     | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),     /* tune_flags.  */
>    &generic_armv9a_prefetch_tune,
>    AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> index 007f987154c..f7457df59e5 100644
> --- a/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> +++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> @@ -156,7 +156,6 @@ static const struct tune_params neoverse512tvb_tunings =
>    0,   /* max_case_values.  */
>    tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
>     | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),     /* tune_flags.  */
>    &generic_armv9a_prefetch_tune,
>    AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
> index 32560d2f5f8..541b61c8179 100644
> --- a/gcc/config/aarch64/tuning_models/neoversen2.h
> +++ b/gcc/config/aarch64/tuning_models/neoversen2.h
> @@ -219,7 +219,6 @@ static const struct tune_params neoversen2_tunings =
>    tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_BASE
>     | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
>     | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
>     | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags.  */
>    &generic_armv9a_prefetch_tune,
> diff --git a/gcc/config/aarch64/tuning_models/neoversen3.h b/gcc/config/aarch64/tuning_models/neoversen3.h
> index 2010bc4645b..eff668132a8 100644
> --- a/gcc/config/aarch64/tuning_models/neoversen3.h
> +++ b/gcc/config/aarch64/tuning_models/neoversen3.h
> @@ -219,7 +219,6 @@ static const struct tune_params neoversen3_tunings =
>    tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_BASE
>     | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
>     | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),     /* tune_flags.  */
>    &generic_armv9a_prefetch_tune,
>    AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
> index c3751e32696..d11472b6e1e 100644
> --- a/gcc/config/aarch64/tuning_models/neoversev1.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev1.h
> @@ -228,7 +228,6 @@ static const struct tune_params neoversev1_tunings =
>    tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_BASE
>     | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
>     | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
>     | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags.  */
>    &generic_armv9a_prefetch_tune,
> diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
> index 80dbe5c806c..ee77ffdd3bc 100644
> --- a/gcc/config/aarch64/tuning_models/neoversev2.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev2.h
> @@ -219,7 +219,6 @@ static const struct tune_params neoversev2_tunings =
>    tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_BASE
>     | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
>     | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
>     | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
>     | AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),   /* tune_flags.  */
> diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h b/gcc/config/aarch64/tuning_models/neoversev3.h
> index efe09e16d1e..6ef143ef7d5 100644
> --- a/gcc/config/aarch64/tuning_models/neoversev3.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev3.h
> @@ -219,7 +219,6 @@ static const struct tune_params neoversev3_tunings =
>    tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_BASE
>     | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
>     | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
>     | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags.  */
>    &generic_armv9a_prefetch_tune,
> diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h b/gcc/config/aarch64/tuning_models/neoversev3ae.h
> index 66849f30889..96bdbf971f1 100644
> --- a/gcc/config/aarch64/tuning_models/neoversev3ae.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h
> @@ -219,7 +219,6 @@ static const struct tune_params neoversev3ae_tunings =
>    tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_BASE
>     | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
> -   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
>     | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
>     | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags.  */
>    &generic_armv9a_prefetch_tune,
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
> index 762805ff54b..c334b7a6875 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
> @@ -15,4 +15,4 @@
>     so we vectorize the offset calculation.  This means that the
>     64-bit version needs two copies.  */
>  /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
> -/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
> +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 9 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
> index f0ea58e38e2..94cc63049bc 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
> @@ -15,4 +15,4 @@
>     so we vectorize the offset calculation.  This means that the
>     64-bit version needs two copies.  */
>  /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
> -/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
> +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 9 } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index be1139a423c..09c048ec00c 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -8834,22 +8834,7 @@ vectorizable_store (vec_info *vinfo,
>             {
>               if (costing_p)
>                 {
> -                 /* Only need vector extracting when there are more
> -                    than one stores.  */
> -                 if (nstores > 1)
> -                   inside_cost
> -                     += record_stmt_cost (cost_vec, 1, vec_to_scalar,
> -                                          stmt_info, slp_node,
> -                                          0, vect_body);
> -                 /* Take a single lane vector type store as scalar
> -                    store to avoid ICE like 110776.  */
> -                 if (VECTOR_TYPE_P (ltype)
> -                     && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
> -                   n_adjacent_stores++;
> -                 else
> -                   inside_cost
> -                     += record_stmt_cost (cost_vec, 1, scalar_store,
> -                                          stmt_info, 0, vect_body);
> +                 n_adjacent_stores++;
>                   continue;
>                 }
>               tree newref, newoff;
> @@ -8905,9 +8890,26 @@ vectorizable_store (vec_info *vinfo,
>    if (costing_p)
>      {
>        if (n_adjacent_stores > 0)
> -       vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
> -                            alignment_support_scheme, misalignment,
> -                            &inside_cost, cost_vec);
> +       {
> +         /* Take a single lane vector type store as scalar
> +            store to avoid ICE like 110776.  */
> +         if (VECTOR_TYPE_P (ltype)
> +             && maybe_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
> +           vect_get_store_cost (vinfo, stmt_info, slp_node,
> +                                n_adjacent_stores, alignment_support_scheme,
> +                                misalignment, &inside_cost, cost_vec);
> +         else
> +           inside_cost
> +             += record_stmt_cost (cost_vec, n_adjacent_stores,
> +                                  scalar_store, stmt_info, 0, vect_body);
> +         /* Only need vector extracting when there are more
> +            than one stores.  */
> +         if (nstores > 1)
> +           inside_cost
> +             += record_stmt_cost (cost_vec, n_adjacent_stores,
> +                                  vec_to_scalar, stmt_info, slp_node,
> +                                  0, vect_body);
> +       }
>        if (dump_enabled_p ())
>         dump_printf_loc (MSG_NOTE, vect_location,
>                          "vect_model_store_cost: inside_cost = %d, "
> --
> 2.44.0
>
>>
>> Thanks,
>> Richard
>>
>>> +           vect_get_store_cost (vinfo, stmt_info, slp_node,
>>> +                                n_adjacent_stores, alignment_support_scheme,
>>> +                                misalignment, &inside_cost, cost_vec);
>>> +         else
>>> +           inside_cost
>>> +             += record_stmt_cost (cost_vec, n_adjacent_stores,
>>> +                                  scalar_store, stmt_info, 0, vect_body);
>>> +         /* Only need vector extracting when there are more
>>> +            than one stores.  */
>>> +         if (nstores > 1)
>>> +           inside_cost
>>> +             += record_stmt_cost (cost_vec, n_adjacent_stores,
>>> +                                  vec_to_scalar, stmt_info, slp_node,
>>> +                                  0, vect_body);
>>> +       }
>>>       if (dump_enabled_p ())
>>>         dump_printf_loc (MSG_NOTE, vect_location,
>>>                          "vect_model_store_cost: inside_cost = %d, "
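
As a side note for readers following the costing change: the snippet below is
a minimal standalone sketch, not GCC code. record_cost, the unit costs, and
the lane count are illustrative assumptions; it only models the idea that the
per-lane loop now merely counts adjacent stores and the group is costed once
with that count, instead of recording one cost of count 1 per lane.

/* Standalone model of the grouped costing; all names are hypothetical
   stand-ins, not the vectorizer's real hooks.  */
#include <cstdio>

/* Hypothetical stand-in for record_stmt_cost: COUNT statements of a
   kind whose unit cost is UNIT.  */
static int
record_cost (int count, int unit)
{
  return count * unit;
}

int
main ()
{
  const int nstores = 4;          /* lanes stored from one vector.  */
  const bool multi_lane = true;   /* models maybe_ne (subparts, 1U).  */
  const int store_unit = 1, scalar_store_unit = 1, vec_to_scalar_unit = 2;

  int n_adjacent_stores = 0;
  for (int lane = 0; lane < nstores; ++lane)
    n_adjacent_stores++;          /* count only, no per-lane costing.  */

  int inside_cost = 0;
  if (multi_lane)
    inside_cost += record_cost (n_adjacent_stores, store_unit);
  else
    inside_cost += record_cost (n_adjacent_stores, scalar_store_unit);
  if (nstores > 1)
    inside_cost += record_cost (n_adjacent_stores, vec_to_scalar_unit);

  printf ("inside_cost = %d\n", inside_cost);   /* 4*1 + 4*2 = 12 here.  */
  return 0;
}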