Hi Tamar,

> On 26 Jul 2024, at 11:21, Tamar Christina <tamar.christ...@arm.com> wrote:
>
> Hi All,
>
> Gathers and scatters are usually not beneficial when the loop count is small.
> This is because there is not only a cost to their execution within the loop,
> but also some cost to enter loops with them.
>
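To check my understanding of the model: the vectorizer roughly compares

  scalar_iter_cost * niters   vs.   vec_outside_cost + vec_iter_cost * (niters / VF)

so charging the gather/scatter startup overhead to vect_prologue (part of vec_outside_cost) raises the trip count at which the vector loop breaks even, while leaving high trip count loops essentially unaffected. As a made-up example using the generic tuning below, which sets gather_load_x32_init_cost to 12: a loop whose vector body is 3 cost units per iteration cheaper than the scalar version would need roughly 12 / 3 = 4 extra iterations just to amortize the gather setup.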
That makes sense, and the benchmark numbers back it up, so I'm sympathetic to the idea.

> As such this patch models this overhead.  For generic tuning we however still
> prefer gathers/scatters when the loop costs work out.

I don't have a strong preference either way about the generic option, but I'm okay with it.

>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> This improves performance of Exchange in SPECCPU 2017 by 3% with SVE enabled.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64-protos.h (struct sve_vec_cost): Add
>   gather_load_x32_init_cost and gather_load_x64_init_cost.
>   * config/aarch64/aarch64.cc (aarch64_vector_costs): Add
>   m_sve_gather_scatter_x32 and m_sve_gather_scatter_x64.
>   (aarch64_vector_costs::add_stmt_cost): Use them.
>   (aarch64_vector_costs::finish_cost): Likewise.
>   * config/aarch64/tuning_models/a64fx.h: Update.
>   * config/aarch64/tuning_models/cortexx925.h: Update.
>   * config/aarch64/tuning_models/generic.h: Update.
>   * config/aarch64/tuning_models/generic_armv8_a.h: Update.
>   * config/aarch64/tuning_models/generic_armv9_a.h: Update.
>   * config/aarch64/tuning_models/neoverse512tvb.h: Update.
>   * config/aarch64/tuning_models/neoversen2.h: Update.
>   * config/aarch64/tuning_models/neoversen3.h: Update.
>   * config/aarch64/tuning_models/neoversev1.h: Update.
>   * config/aarch64/tuning_models/neoversev2.h: Update.
>   * config/aarch64/tuning_models/neoversev3.h: Update.
>   * config/aarch64/tuning_models/neoversev3ae.h: Update.
>
> ---
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 42639e9efcf1e0f9362f759ae63a31b8eeb0d581..16eb8edab4d9fdfc6e3672c56ef5c9f6962d0c0b 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -262,6 +262,8 @@ struct sve_vec_cost : simd_vec_cost
>                 unsigned int fadda_f64_cost,
>                 unsigned int gather_load_x32_cost,
>                 unsigned int gather_load_x64_cost,
> +               unsigned int gather_load_x32_init_cost,
> +               unsigned int gather_load_x64_init_cost,
>                 unsigned int scatter_store_elt_cost)
>     : simd_vec_cost (base),
>       clast_cost (clast_cost),
> @@ -270,6 +272,8 @@ struct sve_vec_cost : simd_vec_cost
>       fadda_f64_cost (fadda_f64_cost),
>       gather_load_x32_cost (gather_load_x32_cost),
>       gather_load_x64_cost (gather_load_x64_cost),
> +     gather_load_x32_init_cost (gather_load_x32_init_cost),
> +     gather_load_x64_init_cost (gather_load_x64_init_cost),
>       scatter_store_elt_cost (scatter_store_elt_cost)
>   {}
>
> @@ -289,6 +293,12 @@ struct sve_vec_cost : simd_vec_cost
>   const int gather_load_x32_cost;
>   const int gather_load_x64_cost;
>
> +  /* Additional loop initialization cost of using a gather load instruction.
> +     The x32 value is for loads of 32-bit elements and the x64 value is for
> +     loads of 64-bit elements.  */
> +  const int gather_load_x32_init_cost;
> +  const int gather_load_x64_init_cost;
> +
>   /* The per-element cost of a scatter store.  */
>   const int scatter_store_elt_cost;
> };
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index eafa377cb095f49408d8a926fb49ce13e2155ba2..1e14c3c0d24b449d404724e436ba57e1996ec062 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -16227,6 +16227,12 @@ private:
>      supported by Advanced SIMD and SVE2.  */
>   bool m_has_avg = false;
>
> +  /* This loop uses an SVE 32-bit element gather or scatter operation.  */
> +  bool m_sve_gather_scatter_x32 = false;
> +
> +  /* This loop uses an SVE 64-bit element gather or scatter operation.  */
> +  bool m_sve_gather_scatter_x64 = false;
> +
>   /* True if the vector body contains a store to a decl and if the
>      function is known to have a vld1 from the same decl.
>
> @@ -17291,6 +17297,17 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>       stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
>                                                       stmt_info, vectype,
>                                                       where, stmt_cost);
> +
> +      /* Check if we've seen an SVE gather/scatter operation and which
> +         size.  */
> +      if (kind == scalar_load
> +          && aarch64_sve_mode_p (TYPE_MODE (vectype))
> +          && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
> +        {
> +          if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
> +            m_sve_gather_scatter_x64 = true;
> +          else
> +            m_sve_gather_scatter_x32 = true;

This is a bit academic at this stage, but SVE2.1 adds quadword gather loads. I know we're not vectorizing for those yet, but maybe it's worth explicitly checking for a 32-bit size and calling gcc_unreachable () otherwise? (See the first sketch below, before my sign-off.)

> +        }
>     }
>
>   /* Do any SVE-specific adjustments to the cost.  */
> @@ -17676,6 +17693,18 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
>       m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
>                                              m_costs[vect_body]);
>       m_suggested_unroll_factor = determine_suggested_unroll_factor ();
> +
> +      /* For gather and scatters there's an additional overhead for the first
> +         iteration.  For low count loops they're not beneficial so model the
> +         overhead as loop prologue costs.  */
> +      if (m_sve_gather_scatter_x32 || m_sve_gather_scatter_x64)
> +        {
> +          const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
> +          if (m_sve_gather_scatter_x32)
> +            m_costs[vect_prologue] += sve_costs->gather_load_x32_init_cost;
> +          else
> +            m_costs[vect_prologue] += sve_costs->gather_load_x64_init_cost;

Shouldn't this be an independent "if" rather than an "else", i.e.

  if (m_sve_gather_scatter_x64)
    m_costs[vect_prologue] += sve_costs->gather_load_x64_init_cost;

in case the loop has both 32-bit and 64-bit gathers/scatters? (Also sketched below.)

> +        }
>     }
>
>  /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
> diff --git a/gcc/config/aarch64/tuning_models/a64fx.h b/gcc/config/aarch64/tuning_models/a64fx.h
> index 6091289d4c3c66f01d7e4dbf97a85c1f8c40bb0b..378a1b3889ee265859786c1ff6525fce2305b615 100644
> --- a/gcc/config/aarch64/tuning_models/a64fx.h
> +++ b/gcc/config/aarch64/tuning_models/a64fx.h
> @@ -104,6 +104,8 @@ static const sve_vec_cost a64fx_sve_vector_cost =
>   13, /* fadda_f64_cost */
>   64, /* gather_load_x32_cost */
>   32, /* gather_load_x64_cost */
> +  0, /* gather_load_x32_init_cost */
> +  0, /* gather_load_x64_init_cost */
>   1 /* scatter_store_elt_cost */
> };
>
> diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h b/gcc/config/aarch64/tuning_models/cortexx925.h
> index fb95e87526985b02410d54a5a3ec8539c1b0ba6d..c4206018a3ff707f89ff3300700ec7dc2a5bc6b0 100644
> --- a/gcc/config/aarch64/tuning_models/cortexx925.h
> +++ b/gcc/config/aarch64/tuning_models/cortexx925.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost cortexx925_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost */
>   12, /* gather_load_x64_cost */
> +  42, /* gather_load_x32_init_cost */
> +  24, /* gather_load_x64_init_cost */

Can you comment on how these numbers are derived?
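For the SVE2.1 quadword point above, what I had in mind is something like the following (an untested sketch; the quadword case is speculative until we vectorize for it):

  /* Classify the gather/scatter by element size.  Only the 32-bit and
     64-bit element forms can be vectorized today.  */
  unsigned int bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
  if (bits == 64)
    m_sve_gather_scatter_x64 = true;
  else if (bits == 32)
    m_sve_gather_scatter_x32 = true;
  else
    /* SVE2.1 quadword gathers would land here once supported.  */
    gcc_unreachable ();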
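Similarly, for the else/if point, I would expect the finish_cost block to end up roughly as follows (again untested):

  if (m_sve_gather_scatter_x32 || m_sve_gather_scatter_x64)
    {
      const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
      /* A loop can contain both 32-bit and 64-bit element gathers/scatters,
         in which case both init costs should be charged.  */
      if (m_sve_gather_scatter_x32)
        m_costs[vect_prologue] += sve_costs->gather_load_x32_init_cost;
      if (m_sve_gather_scatter_x64)
        m_costs[vect_prologue] += sve_costs->gather_load_x64_init_cost;
    }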
Thanks,
Kyrill

>   1 /* scatter_store_elt_cost */
> };
>
> diff --git a/gcc/config/aarch64/tuning_models/generic.h b/gcc/config/aarch64/tuning_models/generic.h
> index 2b1f68b3052117814161a32f426422736ad6462b..101969bdbb9ccf7eafbd9a1cd6e25f0b584fb261 100644
> --- a/gcc/config/aarch64/tuning_models/generic.h
> +++ b/gcc/config/aarch64/tuning_models/generic.h
> @@ -105,6 +105,8 @@ static const sve_vec_cost generic_sve_vector_cost =
>   2, /* fadda_f64_cost */
>   4, /* gather_load_x32_cost */
>   2, /* gather_load_x64_cost */
> +  12, /* gather_load_x32_init_cost */
> +  4, /* gather_load_x64_init_cost */
>   1 /* scatter_store_elt_cost */
> };
>
> diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> index b38b9a8c5cad7d12aa38afdb610a14a25e755010..b5088afe068aa4be7f9dd614cfdd2a51fa96e524 100644
> --- a/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> +++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> @@ -106,6 +106,8 @@ static const sve_vec_cost generic_armv8_a_sve_vector_cost =
>   2, /* fadda_f64_cost */
>   4, /* gather_load_x32_cost */
>   2, /* gather_load_x64_cost */
> +  12, /* gather_load_x32_init_cost */
> +  4, /* gather_load_x64_init_cost */
>   1 /* scatter_store_elt_cost */
> };
>
> diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> index b39a0c73db910888168790888d24ddf4406bf1ee..fd72de542862909ccb9a9260a16bb01935d97f36 100644
> --- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> +++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> @@ -136,6 +136,8 @@ static const sve_vec_cost generic_armv9_a_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost */
>   12, /* gather_load_x64_cost */
> +  42, /* gather_load_x32_init_cost */
> +  24, /* gather_load_x64_init_cost */
>   3 /* scatter_store_elt_cost */
> };
>
> diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> index 825c6a64990b72cda3641737957dc94d75db1509..d2a0b647791de8fca6d7684849d2ab1e9104b045 100644
> --- a/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> +++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> @@ -79,6 +79,8 @@ static const sve_vec_cost neoverse512tvb_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost */
>   12, /* gather_load_x64_cost */
> +  42, /* gather_load_x32_init_cost */
> +  24, /* gather_load_x64_init_cost */
>   3 /* scatter_store_elt_cost */
> };
>
> diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
> index 3430eb9c06819e00ab38966bb960bd6525ff2b5c..00d2c12e739ffd371dd4720826894e980d577ca7 100644
> --- a/gcc/config/aarch64/tuning_models/neoversen2.h
> +++ b/gcc/config/aarch64/tuning_models/neoversen2.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversen2_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost */
>   12, /* gather_load_x64_cost */
> +  42, /* gather_load_x32_init_cost */
> +  24, /* gather_load_x64_init_cost */
>   3 /* scatter_store_elt_cost */
> };
>
> diff --git a/gcc/config/aarch64/tuning_models/neoversen3.h b/gcc/config/aarch64/tuning_models/neoversen3.h
> index 7438e39a4bbe43de624b63fdd20d3fde9dfb6fc9..fc4333ffdeaef0115ac162e2da9d8d548bacf576 100644
> --- a/gcc/config/aarch64/tuning_models/neoversen3.h
> +++ b/gcc/config/aarch64/tuning_models/neoversen3.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversen3_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost */
>   12, /* gather_load_x64_cost */
> +  42, /* gather_load_x32_init_cost */
> +  24, /* gather_load_x64_init_cost */
>   1 /* scatter_store_elt_cost */
> };
>
> diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
> index 0fc41ce6a41b3135fa06d2bda1f517fdf4f8dbcf..705ed025730f6683109a4796c6eefa55b437cec9 100644
> --- a/gcc/config/aarch64/tuning_models/neoversev1.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev1.h
> @@ -126,6 +126,8 @@ static const sve_vec_cost neoversev1_sve_vector_cost =
>   8, /* fadda_f64_cost */
>   32, /* gather_load_x32_cost */
>   16, /* gather_load_x64_cost */
> +  96, /* gather_load_x32_init_cost */
> +  32, /* gather_load_x64_init_cost */
>   3 /* scatter_store_elt_cost */
> };
>
> diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
> index cca459e32c1384f57f8345d86b42b7814ae44115..680feeb9e4ee7bf21d5a258d83e522e079fdc156 100644
> --- a/gcc/config/aarch64/tuning_models/neoversev2.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev2.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversev2_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost */
>   12, /* gather_load_x64_cost */
> +  42, /* gather_load_x32_init_cost */
> +  24, /* gather_load_x64_init_cost */
>   3 /* scatter_store_elt_cost */
> };
>
> diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h b/gcc/config/aarch64/tuning_models/neoversev3.h
> index 3daa3d2365c817d03c6c0d5e66fe832620d8fb2c..812c6ad304e8d4c503dcd444437bf6528d6f3176 100644
> --- a/gcc/config/aarch64/tuning_models/neoversev3.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev3.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversev3_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost */
>   12, /* gather_load_x64_cost */
> +  42, /* gather_load_x32_init_cost */
> +  24, /* gather_load_x64_init_cost */
>   1 /* scatter_store_elt_cost */
> };
>
> diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h b/gcc/config/aarch64/tuning_models/neoversev3ae.h
> index 29c6f22e941b26ee333c87b9fac22aea86625e97..280b5abb27d3c9f404d5f96f14d0cba1e13b9bd1 100644
> --- a/gcc/config/aarch64/tuning_models/neoversev3ae.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversev3ae_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost */
>   12, /* gather_load_x64_cost */
> +  42, /* gather_load_x32_init_cost */
> +  24, /* gather_load_x64_init_cost */
>   1 /* scatter_store_elt_cost */
> };
>
> --
> <rb18671.patch>