https://gcc.gnu.org/g:70035b6c13852435d7ae396c0762ee26897d4d45

commit r15-6614-g70035b6c13852435d7ae396c0762ee26897d4d45
Author: Jennifer Schmitz <jschm...@nvidia.com>
Date:   Tue Nov 26 00:43:48 2024 -0800

    AArch64: Remove AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    
    This patch removes the AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS tunable
    and the use_new_vector_costs entry in aarch64-tuning-flags.def and makes
    the AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS paths in the backend the
    default. To that end, the function aarch64_use_new_vector_costs_p and
    its uses were removed. To prevent vec_to_scalar operations from being
    costed as 0, as described in
    https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665481.html,
    vectorizable_store was adjusted so that the variable n_adjacent_stores
    also covers vec_to_scalar operations. This way, vec_to_scalar operations
    are costed as a group rather than individually (see the toy model
    below).
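    To illustrate why grouping matters, here is a toy model of the costing
    shape (hypothetical, with illustrative weights; it is not the actual
    aarch64 cost hook): a backend that stores the first extracted lane for
    free can only apply that discount if it sees the whole group in one
    query, whereas per-operation queries each look like a "first" lane and
    the total degenerates to zero.

    /* Toy cost hook: the first vec_to_scalar of a scalarized store group
       is free (its lane can be stored directly), the rest cost 1 each.  */
    static unsigned
    vec_to_scalar_cost (unsigned count)
    {
      return count > 0 ? count - 1 : 0;
    }

    int main ()
    {
      unsigned n_adjacent_stores = 4;

      /* Old shape: one query per operation -> 4 * cost(1) == 0.  */
      unsigned old_cost = 0;
      for (unsigned i = 0; i < n_adjacent_stores; ++i)
        old_cost += vec_to_scalar_cost (1);

      /* New shape: one query for the whole group -> cost(4) == 3.  */
      unsigned new_cost = vec_to_scalar_cost (n_adjacent_stores);

      return old_cost == 0 && new_cost == 3 ? 0 : 1;
    }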
    As suggested by Richard Sandiford, the "known_ne" in the multilane
    check was replaced by "maybe_ne" in order to treat nunits == 1+1X as a
    vector rather than a scalar (see the sketch below).
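    For reference, a minimal self-contained sketch of the poly_int
    semantics involved (a simplified stand-in for GCC's poly_uint64 with a
    single runtime indeterminate X >= 0, not the real poly-int.h
    implementation):

    #include <cassert>

    /* Toy model of a length-polymorphic count c0 + c1*X, where X >= 0 is
       unknown at compile time (e.g. SVE vector-length quanta).  */
    struct poly { unsigned c0, c1; };

    /* maybe_eq: some X >= 0 makes the value equal B.  */
    static bool maybe_eq (poly a, unsigned b)
    {
      if (a.c1 == 0)
        return a.c0 == b;
      return b >= a.c0 && (b - a.c0) % a.c1 == 0;
    }

    /* known_ne: no X makes them equal; maybe_ne: some X makes them differ.  */
    static bool known_ne (poly a, unsigned b) { return !maybe_eq (a, b); }
    static bool maybe_ne (poly a, unsigned b) { return a.c1 != 0 || a.c0 != b; }

    int main ()
    {
      poly nunits = {1, 1};            /* nunits == 1+1X.  */
      assert (!known_ne (nunits, 1));  /* Equal when X == 0...  */
      assert (maybe_ne (nunits, 1));   /* ...unequal when X > 0.  */
      /* So the old known_ne check costed 1+1X lanes as a scalar store,
         while the new maybe_ne check keeps them on the vector path.  */
      return 0;
    }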
    
    Two tests were adjusted due to changes in codegen. In both cases, the
    old code unrolled the loop once, whereas the new code does not:
    Example from gcc.target/aarch64/sve/strided_load_2.c (compiled with
    -O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic
    -moverride=tune=none):
    f_int64_t_32:
            cbz     w3, .L92
            mov     x4, 0
            uxtw    x3, w3
    +       cntd    x5
    +       whilelo p7.d, xzr, x3
    +       mov     z29.s, w5
            mov     z31.s, w2
    -       whilelo p6.d, xzr, x3
    -       mov     x2, x3
    -       index   z30.s, #0, #1
    -       uqdecd  x2
    -       ptrue   p5.b, all
    -       whilelo p7.d, xzr, x2
    +       index   z30.d, #0, #1
    +       ptrue   p6.b, all
            .p2align 3,,7
     .L94:
    -       ld1d    z27.d, p7/z, [x0, #1, mul vl]
    -       ld1d    z28.d, p6/z, [x0]
    -       movprfx z29, z31
    -       mul     z29.s, p5/m, z29.s, z30.s
    -       incw    x4
    -       uunpklo z0.d, z29.s
    -       uunpkhi z29.d, z29.s
    -       ld1d    z25.d, p6/z, [x1, z0.d, lsl 3]
    -       ld1d    z26.d, p7/z, [x1, z29.d, lsl 3]
    -       add     z25.d, z28.d, z25.d
    +       ld1d    z27.d, p7/z, [x0, x4, lsl 3]
    +       movprfx z28, z31
    +       mul     z28.s, p6/m, z28.s, z30.s
    +       ld1d    z26.d, p7/z, [x1, z28.d, uxtw 3]
            add     z26.d, z27.d, z26.d
    -       st1d    z26.d, p7, [x0, #1, mul vl]
    -       whilelo p7.d, x4, x2
    -       st1d    z25.d, p6, [x0]
    -       incw    z30.s
    -       incb    x0, all, mul #2
    -       whilelo p6.d, x4, x3
    +       st1d    z26.d, p7, [x0, x4, lsl 3]
    +       add     z30.s, z30.s, z29.s
    +       incd    x4
    +       whilelo p7.d, x4, x3
            b.any   .L94
     .L92:
            ret
    
    Example from gcc.target/aarch64/sve/strided_store_2.c (compiled with
    -O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic
    -moverride=tune=none):
    f_int64_t_32:
            cbz     w3, .L84
    -       addvl   x5, x1, #1
            mov     x4, 0
            uxtw    x3, w3
    -       mov     z31.s, w2
    +       cntd    x5
            whilelo p7.d, xzr, x3
    -       mov     x2, x3
    -       index   z30.s, #0, #1
    -       uqdecd  x2
    -       ptrue   p5.b, all
    -       whilelo p6.d, xzr, x2
    +       mov     z29.s, w5
    +       mov     z31.s, w2
    +       index   z30.d, #0, #1
    +       ptrue   p6.b, all
            .p2align 3,,7
     .L86:
    -       ld1d    z28.d, p7/z, [x1, x4, lsl 3]
    -       ld1d    z27.d, p6/z, [x5, x4, lsl 3]
    -       movprfx z29, z30
    -       mul     z29.s, p5/m, z29.s, z31.s
    -       add     z28.d, z28.d, #1
    -       uunpklo z26.d, z29.s
    -       st1d    z28.d, p7, [x0, z26.d, lsl 3]
    -       incw    x4
    -       uunpkhi z29.d, z29.s
    +       ld1d    z27.d, p7/z, [x1, x4, lsl 3]
    +       movprfx z28, z30
    +       mul     z28.s, p6/m, z28.s, z31.s
            add     z27.d, z27.d, #1
    -       whilelo p6.d, x4, x2
    -       st1d    z27.d, p7, [x0, z29.d, lsl 3]
    -       incw    z30.s
    +       st1d    z27.d, p7, [x0, z28.d, uxtw 3]
    +       incd    x4
    +       add     z30.s, z30.s, z29.s
            whilelo p7.d, x4, x3
            b.any   .L86
     .L84:
            ret
    
    The patch was bootstrapped and tested on aarch64-linux-gnu with no
    regressions.
    OK for mainline?
    
    Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
    
    gcc/
            * tree-vect-stmts.cc (vectorizable_store): Extend the use of
            n_adjacent_stores to also cover vec_to_scalar operations.
            * config/aarch64/aarch64-tuning-flags.def: Remove
            use_new_vector_costs as tuning option.
            * config/aarch64/aarch64.cc (aarch64_use_new_vector_costs_p):
            Remove.
            (aarch64_vector_costs::add_stmt_cost): Remove use of
            aarch64_use_new_vector_costs_p.
            (aarch64_vector_costs::finish_cost): Remove use of
            aarch64_use_new_vector_costs_p.
            * config/aarch64/tuning_models/cortexx925.h: Remove
            AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS.
            * config/aarch64/tuning_models/fujitsu_monaka.h: Likewise.
            * config/aarch64/tuning_models/generic_armv8_a.h: Likewise.
            * config/aarch64/tuning_models/generic_armv9_a.h: Likewise.
            * config/aarch64/tuning_models/neoverse512tvb.h: Likewise.
            * config/aarch64/tuning_models/neoversen2.h: Likewise.
            * config/aarch64/tuning_models/neoversen3.h: Likewise.
            * config/aarch64/tuning_models/neoversev1.h: Likewise.
            * config/aarch64/tuning_models/neoversev2.h: Likewise.
            * config/aarch64/tuning_models/neoversev3.h: Likewise.
            * config/aarch64/tuning_models/neoversev3ae.h: Likewise.
    
    gcc/testsuite/
            * gcc.target/aarch64/sve/strided_load_2.c: Adjust expected outcome.
            * gcc.target/aarch64/sve/strided_store_2.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-tuning-flags.def        |  2 --
 gcc/config/aarch64/aarch64.cc                      | 20 +++--------
 gcc/config/aarch64/tuning_models/cortexx925.h      |  1 -
 gcc/config/aarch64/tuning_models/fujitsu_monaka.h  |  1 -
 gcc/config/aarch64/tuning_models/generic_armv8_a.h |  1 -
 gcc/config/aarch64/tuning_models/generic_armv9_a.h |  1 -
 gcc/config/aarch64/tuning_models/neoverse512tvb.h  |  1 -
 gcc/config/aarch64/tuning_models/neoversen2.h      |  1 -
 gcc/config/aarch64/tuning_models/neoversen3.h      |  1 -
 gcc/config/aarch64/tuning_models/neoversev1.h      |  1 -
 gcc/config/aarch64/tuning_models/neoversev2.h      |  1 -
 gcc/config/aarch64/tuning_models/neoversev3.h      |  1 -
 gcc/config/aarch64/tuning_models/neoversev3ae.h    |  1 -
 .../gcc.target/aarch64/sve/strided_load_2.c        |  2 +-
 .../gcc.target/aarch64/sve/strided_store_2.c       |  2 +-
 gcc/tree-vect-stmts.cc                             | 40 ++++++++++++----------
 16 files changed, 27 insertions(+), 50 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 9bfa1f207df8..7ebf390ef818 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -38,8 +38,6 @@ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 
 AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
 
-AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
-
 AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", 
MATCHED_VECTOR_THROUGHPUT)
 
 AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9e69bc744499..d6a8e4c20952 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16627,16 +16627,6 @@ aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
   return new aarch64_vector_costs (vinfo, costing_for_scalar);
 }
 
-/* Return true if the current CPU should use the new costs defined
-   in GCC 11.  This should be removed for GCC 12 and above, with the
-   costs applying to all CPUs instead.  */
-static bool
-aarch64_use_new_vector_costs_p ()
-{
-  return (aarch64_tune_params.extra_tuning_flags
-         & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
-}
-
 /* Return the appropriate SIMD costs for vectors of type VECTYPE.  */
 static const simd_vec_cost *
 aarch64_simd_vec_costs (tree vectype)
@@ -17555,7 +17545,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 
   /* Do one-time initialization based on the vinfo.  */
   loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
-  if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
+  if (!m_analyzed_vinfo)
     {
       if (loop_vinfo)
        analyze_loop_vinfo (loop_vinfo);
@@ -17573,7 +17563,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 
   /* Try to get a more accurate cost by looking at STMT_INFO instead
      of just looking at KIND.  */
-  if (stmt_info && aarch64_use_new_vector_costs_p ())
+  if (stmt_info)
     {
       /* If we scalarize a strided store, the vectorizer costs one
         vec_to_scalar for each element.  However, we can store the first
@@ -17638,7 +17628,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
   else
     m_num_last_promote_demote = 0;
 
-  if (stmt_info && aarch64_use_new_vector_costs_p ())
+  if (stmt_info)
     {
       /* Account for any extra "embedded" costs that apply additively
         to the base cost calculated above.  */
@@ -17999,9 +17989,7 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
 
   auto *scalar_costs
     = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
-  if (loop_vinfo
-      && m_vec_flags
-      && aarch64_use_new_vector_costs_p ())
+  if (loop_vinfo && m_vec_flags)
     {
       m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
                                             m_costs[vect_body]);
diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h b/gcc/config/aarch64/tuning_models/cortexx925.h
index 2823fd841c00..7d0162eae54c 100644
--- a/gcc/config/aarch64/tuning_models/cortexx925.h
+++ b/gcc/config/aarch64/tuning_models/cortexx925.h
@@ -221,7 +221,6 @@ static const struct tune_params cortexx925_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/fujitsu_monaka.h b/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
index 3850ed7479bc..5dc40243fe38 100644
--- a/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
+++ b/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
@@ -55,7 +55,6 @@ static const struct tune_params fujitsu_monaka_tunings =
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),    /* tune_flags.  */
   &generic_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
index e092a34e3561..35de3f032963 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv8_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
@@ -183,7 +183,6 @@ static const struct tune_params generic_armv8_a_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),    /* tune_flags.  */
   &generic_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
index 32b5d0e491de..f76a2506f384 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
@@ -251,7 +251,6 @@ static const struct tune_params generic_armv9_a_tunings =
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),    /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
index 2530217347c8..50eb058e23d1 100644
--- a/gcc/config/aarch64/tuning_models/neoverse512tvb.h
+++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
@@ -156,7 +156,6 @@ static const struct tune_params neoverse512tvb_tunings =
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),    /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
index dbfebc50f163..9fbc059ea12c 100644
--- a/gcc/config/aarch64/tuning_models/neoversen2.h
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversen2_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/neoversen3.h b/gcc/config/aarch64/tuning_models/neoversen3.h
index 5ca0e13af587..78177e78e070 100644
--- a/gcc/config/aarch64/tuning_models/neoversen3.h
+++ b/gcc/config/aarch64/tuning_models/neoversen3.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversen3_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),    /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
index 871929708523..f1ec7dcdda7b 100644
--- a/gcc/config/aarch64/tuning_models/neoversev1.h
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -228,7 +228,6 @@ static const struct tune_params neoversev1_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index 219016f32588..4fabe4df2de7 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev2_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
    | AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),  /* tune_flags.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h b/gcc/config/aarch64/tuning_models/neoversev3.h
index 092de7b9cb72..ad3cd222512d 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev3_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h b/gcc/config/aarch64/tuning_models/neoversev3ae.h
index f395799b344f..a0adef00824d 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3ae.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev3ae_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
index 762805ff54ba..c334b7a68757 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
@@ -15,4 +15,4 @@
    so we vectorize the offset calculation.  This means that the
    64-bit version needs two copies.  */
 /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 9 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
index f0ea58e38e23..94cc63049bc9 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
@@ -15,4 +15,4 @@
    so we vectorize the offset calculation.  This means that the
    64-bit version needs two copies.  */
 /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 9 } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 2765dcd428c0..c0e38d00246d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8834,22 +8834,7 @@ vectorizable_store (vec_info *vinfo,
                {
                  if (costing_p)
                    {
-                     /* Only need vector extracting when there are more
-                        than one stores.  */
-                     if (nstores > 1)
-                       inside_cost
-                         += record_stmt_cost (cost_vec, 1, vec_to_scalar,
-                                              stmt_info, slp_node,
-                                              0, vect_body);
-                     /* Take a single lane vector type store as scalar
-                        store to avoid ICE like 110776.  */
-                     if (VECTOR_TYPE_P (ltype)
-                         && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
-                       n_adjacent_stores++;
-                     else
-                       inside_cost
-                         += record_stmt_cost (cost_vec, 1, scalar_store,
-                                              stmt_info, 0, vect_body);
+                     n_adjacent_stores++;
                      continue;
                    }
                  tree newref, newoff;
@@ -8905,9 +8890,26 @@ vectorizable_store (vec_info *vinfo,
       if (costing_p)
        {
          if (n_adjacent_stores > 0)
-           vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
-                                alignment_support_scheme, misalignment,
-                                &inside_cost, cost_vec);
+           {
+             /* Take a single lane vector type store as scalar
+                store to avoid ICE like 110776.  */
+             if (VECTOR_TYPE_P (ltype)
+                 && maybe_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
+               vect_get_store_cost (vinfo, stmt_info, slp_node,
+                                    n_adjacent_stores, alignment_support_scheme,
+                                    misalignment, &inside_cost, cost_vec);
+             else
+               inside_cost
+                 += record_stmt_cost (cost_vec, n_adjacent_stores,
+                                      scalar_store, stmt_info, 0, vect_body);
+             /* Only need vector extracting when there are more
+                than one stores.  */
+             if (nstores > 1)
+               inside_cost
+                 += record_stmt_cost (cost_vec, n_adjacent_stores,
+                                      vec_to_scalar, stmt_info, slp_node,
+                                      0, vect_body);
+           }
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "vect_model_store_cost: inside_cost = %d, "
