Hi Tamar,

> On 26 Jul 2024, at 11:21, Tamar Christina <tamar.christ...@arm.com> wrote:
> 
> Hi All,
> 
> Gather and scatters are not usually beneficial when the loop count is small.
> This is because there's not only a cost to their execution within the loop but
> there is also some cost to enter loops with them.
> 

That makes sense and the benchmark numbers back it up, so I’m sympathetic to the idea.
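
For anyone following along, the kind of loop this affects is an indirect
access such as (hypothetical example, not taken from the patch):

  void f (double *__restrict out, double *__restrict in,
          int *__restrict idx, int n)
  {
    for (int i = 0; i < n; i++)
      out[i] = in[idx[i]];   /* Indirect load vectorizes to an SVE gather.  */
  }

When n is small, the one-off cost of setting up the gather sequence isn’t
amortized over enough iterations to pay for itself.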


> As such this patch models this overhead.  For generic tuning we however still
> prefer gathers/scatters when the loop costs work out.

I don’t have a strong preference either way about the generic option, but I’m 
okay with it.


> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> This improves performance of Exchange in SPECCPU 2017 by 3% with SVE enabled.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>        * config/aarch64/aarch64-protos.h (struct sve_vec_cost): Add
>        gather_load_x32_init_cost and gather_load_x64_init_cost.
>        * config/aarch64/aarch64.cc (aarch64_vector_costs): Add
>        m_sve_gather_scatter_x32 and m_sve_gather_scatter_x64.
>        (aarch64_vector_costs::add_stmt_cost): Use them.
>        (aarch64_vector_costs::finish_cost): Likewise.
>        * config/aarch64/tuning_models/a64fx.h: Update.
>        * config/aarch64/tuning_models/cortexx925.h: Update.
>        * config/aarch64/tuning_models/generic.h: Update.
>        * config/aarch64/tuning_models/generic_armv8_a.h: Update.
>        * config/aarch64/tuning_models/generic_armv9_a.h: Update.
>        * config/aarch64/tuning_models/neoverse512tvb.h: Update.
>        * config/aarch64/tuning_models/neoversen2.h: Update.
>        * config/aarch64/tuning_models/neoversen3.h: Update.
>        * config/aarch64/tuning_models/neoversev1.h: Update.
>        * config/aarch64/tuning_models/neoversev2.h: Update.
>        * config/aarch64/tuning_models/neoversev3.h: Update.
>        * config/aarch64/tuning_models/neoversev3ae.h: Update.
> 
> ---
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 42639e9efcf1e0f9362f759ae63a31b8eeb0d581..16eb8edab4d9fdfc6e3672c56ef5c9f6962d0c0b 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -262,6 +262,8 @@ struct sve_vec_cost : simd_vec_cost
>                          unsigned int fadda_f64_cost,
>                          unsigned int gather_load_x32_cost,
>                          unsigned int gather_load_x64_cost,
> +                         unsigned int gather_load_x32_init_cost,
> +                         unsigned int gather_load_x64_init_cost,
>                          unsigned int scatter_store_elt_cost)
>     : simd_vec_cost (base),
>       clast_cost (clast_cost),
> @@ -270,6 +272,8 @@ struct sve_vec_cost : simd_vec_cost
>       fadda_f64_cost (fadda_f64_cost),
>       gather_load_x32_cost (gather_load_x32_cost),
>       gather_load_x64_cost (gather_load_x64_cost),
> +      gather_load_x32_init_cost (gather_load_x32_init_cost),
> +      gather_load_x64_init_cost (gather_load_x64_init_cost),
>       scatter_store_elt_cost (scatter_store_elt_cost)
>   {}
> 
> @@ -289,6 +293,12 @@ struct sve_vec_cost : simd_vec_cost
>   const int gather_load_x32_cost;
>   const int gather_load_x64_cost;
> 
> +  /* Additional loop initialization cost of using a gather load instruction.  The x32
> +     value is for loads of 32-bit elements and the x64 value is for loads of
> +     64-bit elements.  */
> +  const int gather_load_x32_init_cost;
> +  const int gather_load_x64_init_cost;
> +
>   /* The per-element cost of a scatter store.  */
>   const int scatter_store_elt_cost;
> };
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index eafa377cb095f49408d8a926fb49ce13e2155ba2..1e14c3c0d24b449d404724e436ba57e1996ec062 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -16227,6 +16227,12 @@ private:
>      supported by Advanced SIMD and SVE2.  */
>   bool m_has_avg = false;
> 
> +  /* This loop uses an SVE 32-bit element gather or scatter operation.  */
> +  bool m_sve_gather_scatter_x32 = false;
> +
> +  /* This loop uses an SVE 64-bit element gather or scatter operation.  */
> +  bool m_sve_gather_scatter_x64 = false;
> +
>   /* True if the vector body contains a store to a decl and if the
>      function is known to have a vld1 from the same decl.
> 
> @@ -17291,6 +17297,17 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>        stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
>                                                        stmt_info, vectype,
>                                                        where, stmt_cost);
> +
> +      /* Check if we've seen an SVE gather/scatter operation and which size.  */
> +      if (kind == scalar_load
> +         && aarch64_sve_mode_p (TYPE_MODE (vectype))
> +         && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
> +       {
> +         if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
> +           m_sve_gather_scatter_x64 = true;
> +         else
> +           m_sve_gather_scatter_x32 = true;

This is a bit academic at this stage but SVE2.1 adds quadword gather loads. I 
know we’re not vectorizing for those yet, but maybe it’s worth explicitly 
checking for 32-bit size and calling gcc_unreachable () otherwise?
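
Something along these lines (untested sketch):

  if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
    m_sve_gather_scatter_x64 = true;
  else if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 32)
    m_sve_gather_scatter_x32 = true;
  else
    gcc_unreachable ();

That way any future quadword gather would fail loudly here rather than being
silently costed as a 32-bit one.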


> +       }
>     }
> 
>   /* Do any SVE-specific adjustments to the cost.  */
> @@ -17676,6 +17693,18 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
>       m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
>                                             m_costs[vect_body]);
>       m_suggested_unroll_factor = determine_suggested_unroll_factor ();
> +
> +      /* For gather and scatters there's an additional overhead for the first
> +        iteration.  For low count loops they're not beneficial so model the
> +        overhead as loop prologue costs.  */
> +      if (m_sve_gather_scatter_x32 || m_sve_gather_scatter_x64)
> +       {
> +         const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
> +         if (m_sve_gather_scatter_x32)
> +           m_costs[vect_prologue] += sve_costs->gather_load_x32_init_cost;
> +         else
> +           m_costs[vect_prologue] += sve_costs->gather_load_x64_init_cost;

Shouldn’t this be a separate if rather than an else:

  if (m_sve_gather_scatter_x64)
    m_costs[vect_prologue] += sve_costs->gather_load_x64_init_cost;

in case the loop has both 32-bit and 64-bit gathers/scatters?
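
That is, with the two checks independent the whole block would become
(untested sketch):

  if (m_sve_gather_scatter_x32)
    m_costs[vect_prologue] += sve_costs->gather_load_x32_init_cost;
  if (m_sve_gather_scatter_x64)
    m_costs[vect_prologue] += sve_costs->gather_load_x64_init_cost;

so that both init costs are accounted for when both element sizes appear in
the same loop.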


> +       }
>     }
> 
>   /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
> diff --git a/gcc/config/aarch64/tuning_models/a64fx.h b/gcc/config/aarch64/tuning_models/a64fx.h
> index 6091289d4c3c66f01d7e4dbf97a85c1f8c40bb0b..378a1b3889ee265859786c1ff6525fce2305b615 100644
> --- a/gcc/config/aarch64/tuning_models/a64fx.h
> +++ b/gcc/config/aarch64/tuning_models/a64fx.h
> @@ -104,6 +104,8 @@ static const sve_vec_cost a64fx_sve_vector_cost =
>   13, /* fadda_f64_cost  */
>   64, /* gather_load_x32_cost  */
>   32, /* gather_load_x64_cost  */
> +  0, /* gather_load_x32_init_cost  */
> +  0, /* gather_load_x64_init_cost  */
>   1 /* scatter_store_elt_cost  */
> };
> 
> diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h b/gcc/config/aarch64/tuning_models/cortexx925.h
> index fb95e87526985b02410d54a5a3ec8539c1b0ba6d..c4206018a3ff707f89ff3300700ec7dc2a5bc6b0 100644
> --- a/gcc/config/aarch64/tuning_models/cortexx925.h
> +++ b/gcc/config/aarch64/tuning_models/cortexx925.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost cortexx925_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost  */
>   12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */


Can you comment on how these numbers are derived?
Thanks,
Kyrill


>   1 /* scatter_store_elt_cost  */
> };
> 
> diff --git a/gcc/config/aarch64/tuning_models/generic.h b/gcc/config/aarch64/tuning_models/generic.h
> index 2b1f68b3052117814161a32f426422736ad6462b..101969bdbb9ccf7eafbd9a1cd6e25f0b584fb261 100644
> --- a/gcc/config/aarch64/tuning_models/generic.h
> +++ b/gcc/config/aarch64/tuning_models/generic.h
> @@ -105,6 +105,8 @@ static const sve_vec_cost generic_sve_vector_cost =
>   2, /* fadda_f64_cost  */
>   4, /* gather_load_x32_cost  */
>   2, /* gather_load_x64_cost  */
> +  12, /* gather_load_x32_init_cost  */
> +  4, /* gather_load_x64_init_cost  */
>   1 /* scatter_store_elt_cost  */
> };
> 
> diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> index b38b9a8c5cad7d12aa38afdb610a14a25e755010..b5088afe068aa4be7f9dd614cfdd2a51fa96e524 100644
> --- a/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> +++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
> @@ -106,6 +106,8 @@ static const sve_vec_cost generic_armv8_a_sve_vector_cost =
>   2, /* fadda_f64_cost  */
>   4, /* gather_load_x32_cost  */
>   2, /* gather_load_x64_cost  */
> +  12, /* gather_load_x32_init_cost  */
> +  4, /* gather_load_x64_init_cost  */
>   1 /* scatter_store_elt_cost  */
> };
> 
> diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> index b39a0c73db910888168790888d24ddf4406bf1ee..fd72de542862909ccb9a9260a16bb01935d97f36 100644
> --- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> +++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
> @@ -136,6 +136,8 @@ static const sve_vec_cost generic_armv9_a_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost  */
>   12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>   3 /* scatter_store_elt_cost  */
> };
> 
> diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> index 825c6a64990b72cda3641737957dc94d75db1509..d2a0b647791de8fca6d7684849d2ab1e9104b045 100644
> --- a/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> +++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
> @@ -79,6 +79,8 @@ static const sve_vec_cost neoverse512tvb_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost  */
>   12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>   3 /* scatter_store_elt_cost  */
> };
> 
> diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
> index 3430eb9c06819e00ab38966bb960bd6525ff2b5c..00d2c12e739ffd371dd4720826894e980d577ca7 100644
> --- a/gcc/config/aarch64/tuning_models/neoversen2.h
> +++ b/gcc/config/aarch64/tuning_models/neoversen2.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversen2_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost  */
>   12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>   3 /* scatter_store_elt_cost  */
> };
> 
> diff --git a/gcc/config/aarch64/tuning_models/neoversen3.h b/gcc/config/aarch64/tuning_models/neoversen3.h
> index 7438e39a4bbe43de624b63fdd20d3fde9dfb6fc9..fc4333ffdeaef0115ac162e2da9d8d548bacf576 100644
> --- a/gcc/config/aarch64/tuning_models/neoversen3.h
> +++ b/gcc/config/aarch64/tuning_models/neoversen3.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversen3_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost  */
>   12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>   1 /* scatter_store_elt_cost  */
> };
> 
> diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
> index 0fc41ce6a41b3135fa06d2bda1f517fdf4f8dbcf..705ed025730f6683109a4796c6eefa55b437cec9 100644
> --- a/gcc/config/aarch64/tuning_models/neoversev1.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev1.h
> @@ -126,6 +126,8 @@ static const sve_vec_cost neoversev1_sve_vector_cost =
>   8, /* fadda_f64_cost  */
>   32, /* gather_load_x32_cost  */
>   16, /* gather_load_x64_cost  */
> +  96, /* gather_load_x32_init_cost  */
> +  32, /* gather_load_x64_init_cost  */
>   3 /* scatter_store_elt_cost  */
> };
> 
> diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
> index cca459e32c1384f57f8345d86b42b7814ae44115..680feeb9e4ee7bf21d5a258d83e522e079fdc156 100644
> --- a/gcc/config/aarch64/tuning_models/neoversev2.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev2.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversev2_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost  */
>   12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>   3 /* scatter_store_elt_cost  */
> };
> 
> diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h b/gcc/config/aarch64/tuning_models/neoversev3.h
> index 3daa3d2365c817d03c6c0d5e66fe832620d8fb2c..812c6ad304e8d4c503dcd444437bf6528d6f3176 100644
> --- a/gcc/config/aarch64/tuning_models/neoversev3.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev3.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversev3_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost  */
>   12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>   1 /* scatter_store_elt_cost  */
> };
> 
> diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h b/gcc/config/aarch64/tuning_models/neoversev3ae.h
> index 29c6f22e941b26ee333c87b9fac22aea86625e97..280b5abb27d3c9f404d5f96f14d0cba1e13b9bd1 100644
> --- a/gcc/config/aarch64/tuning_models/neoversev3ae.h
> +++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h
> @@ -135,6 +135,8 @@ static const sve_vec_cost neoversev3ae_sve_vector_cost =
>      operation more than a 64-bit gather.  */
>   14, /* gather_load_x32_cost  */
>   12, /* gather_load_x64_cost  */
> +  42, /* gather_load_x32_init_cost  */
> +  24, /* gather_load_x64_init_cost  */
>   1 /* scatter_store_elt_cost  */
> };
> 
> 
> --
> <rb18671.patch>