Storing one element of a vector is costed as a vec_to_scalar followed by a scalar_store. However, vec_to_scalar is also used for reductions and for vector-to-GPR moves, which makes it difficult to pick one cost for them all.
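As a rough illustration (these loops are not from the patch, and the exact code generation depends on target and options), both of the following can be costed with vec_to_scalar when vectorized, even though one is a cross-lane reduction and the other only moves lanes out for scalar stores:

/* Illustrative only: two quite different sources of vec_to_scalar.  */

/* A reduction: the vector accumulator is collapsed to a scalar,
   e.g. with ADDV on AArch64.  */
int
sum (int *x, int n)
{
  int s = 0;
  for (int i = 0; i < n; ++i)
    s += x[i];
  return s;
}

/* A strided store: each lane may be extracted to a GPR (e.g. with UMOV)
   and written back with a scalar STR.  This is the extract-for-store
   case that the new field below separates out.  */
void
scatter (int *restrict d, int *restrict s, int n)
{
  for (int i = 0; i < n; ++i)
    d[3 * i] = s[i];
}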
This patch therefore adds a cost for extracting one element of a
vector in preparation for storing it out.  The store itself is still
costed separately.

As with the previous patches, this one only becomes active if a CPU
selects use_new_vector_costs.  It should therefore have a very low
impact on other CPUs.

gcc/
	* config/aarch64/aarch64-protos.h
	(simd_vec_cost::store_elt_extra_cost): New member variable.
	* config/aarch64/aarch64.c (generic_advsimd_vector_cost): Update
	accordingly, using the vec_to_scalar cost for the new field.
	(generic_sve_vector_cost, a64fx_advsimd_vector_cost): Likewise.
	(a64fx_sve_vector_cost, qdf24xx_advsimd_vector_cost): Likewise.
	(thunderx_advsimd_vector_cost, tsv110_advsimd_vector_cost): Likewise.
	(cortexa57_advsimd_vector_cost, exynosm1_advsimd_vector_cost)
	(xgene1_advsimd_vector_cost, thunderx2t99_advsimd_vector_cost)
	(thunderx3t110_advsimd_vector_cost): Likewise.
	(aarch64_detect_vector_stmt_subtype): Detect single-element stores.
---
 gcc/config/aarch64/aarch64-protos.h |  4 ++++
 gcc/config/aarch64/aarch64.c        | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 3d152754981..fabe3df7071 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -224,6 +224,10 @@ struct simd_vec_cost
   const int reduc_f32_cost;
   const int reduc_f64_cost;
 
+  /* Additional cost of storing a single vector element, on top of the
+     normal cost of a scalar store.  */
+  const int store_elt_extra_cost;
+
   /* Cost of a vector-to-scalar operation.  */
   const int vec_to_scalar_cost;
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 8fb723dabd2..20bb75bd56c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -601,6 +601,7 @@ static const advsimd_vec_cost generic_advsimd_vector_cost =
   2, /* reduc_f16_cost  */
   2, /* reduc_f32_cost  */
   2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
   2, /* vec_to_scalar_cost  */
   1, /* scalar_to_vec_cost  */
   1, /* align_load_cost  */
@@ -626,6 +627,7 @@ static const sve_vec_cost generic_sve_vector_cost =
   2, /* reduc_f16_cost  */
   2, /* reduc_f32_cost  */
   2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
   2, /* vec_to_scalar_cost  */
   1, /* scalar_to_vec_cost  */
   1, /* align_load_cost  */
@@ -667,6 +669,7 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost =
   13, /* reduc_f16_cost  */
   13, /* reduc_f32_cost  */
   13, /* reduc_f64_cost  */
+  13, /* store_elt_extra_cost  */
   13, /* vec_to_scalar_cost  */
   4, /* scalar_to_vec_cost  */
   6, /* align_load_cost  */
@@ -691,6 +694,7 @@ static const sve_vec_cost a64fx_sve_vector_cost =
   13, /* reduc_f16_cost  */
   13, /* reduc_f32_cost  */
   13, /* reduc_f64_cost  */
+  13, /* store_elt_extra_cost  */
   13, /* vec_to_scalar_cost  */
   4, /* scalar_to_vec_cost  */
   6, /* align_load_cost  */
@@ -731,6 +735,7 @@ static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
   1, /* reduc_f16_cost  */
   1, /* reduc_f32_cost  */
   1, /* reduc_f64_cost  */
+  1, /* store_elt_extra_cost  */
   1, /* vec_to_scalar_cost  */
   1, /* scalar_to_vec_cost  */
   1, /* align_load_cost  */
@@ -768,6 +773,7 @@ static const advsimd_vec_cost thunderx_advsimd_vector_cost =
   2, /* reduc_f16_cost  */
   2, /* reduc_f32_cost  */
   2, /* reduc_f64_cost  */
+  2, /* store_elt_extra_cost  */
   2, /* vec_to_scalar_cost  */
   2, /* scalar_to_vec_cost  */
   3, /* align_load_cost  */
@@ -804,6 +810,7 @@ static const advsimd_vec_cost tsv110_advsimd_vector_cost =
   3, /* reduc_f16_cost  */
   3, /* reduc_f32_cost  */
   3, /* reduc_f64_cost  */
+  3, /* store_elt_extra_cost  */
   3, /* vec_to_scalar_cost  */
   2, /* scalar_to_vec_cost  */
   5, /* align_load_cost  */
@@ -839,6 +846,7 @@ static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
   8, /* reduc_f16_cost  */
   8, /* reduc_f32_cost  */
   8, /* reduc_f64_cost  */
+  8, /* store_elt_extra_cost  */
   8, /* vec_to_scalar_cost  */
   8, /* scalar_to_vec_cost  */
   4, /* align_load_cost  */
@@ -875,6 +883,7 @@ static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
   3, /* reduc_f16_cost  */
   3, /* reduc_f32_cost  */
   3, /* reduc_f64_cost  */
+  3, /* store_elt_extra_cost  */
   3, /* vec_to_scalar_cost  */
   3, /* scalar_to_vec_cost  */
   5, /* align_load_cost  */
@@ -910,6 +919,7 @@ static const advsimd_vec_cost xgene1_advsimd_vector_cost =
   4, /* reduc_f16_cost  */
   4, /* reduc_f32_cost  */
   4, /* reduc_f64_cost  */
+  4, /* store_elt_extra_cost  */
   4, /* vec_to_scalar_cost  */
   4, /* scalar_to_vec_cost  */
   10, /* align_load_cost  */
@@ -946,6 +956,7 @@ static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
   6, /* reduc_f16_cost  */
   6, /* reduc_f32_cost  */
   6, /* reduc_f64_cost  */
+  6, /* store_elt_extra_cost  */
   6, /* vec_to_scalar_cost  */
   5, /* scalar_to_vec_cost  */
   4, /* align_load_cost  */
@@ -982,6 +993,7 @@ static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
   5, /* reduc_f16_cost  */
   5, /* reduc_f32_cost  */
   5, /* reduc_f64_cost  */
+  5, /* store_elt_extra_cost  */
   5, /* vec_to_scalar_cost  */
   5, /* scalar_to_vec_cost  */
   4, /* align_load_cost  */
@@ -14259,6 +14271,14 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
   if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
     sve_costs = aarch64_tune_params.vec_costs->sve;
 
+  /* Detect cases in which vec_to_scalar is describing the extraction of a
+     vector element in preparation for a scalar store.  The store itself is
+     costed separately.  */
+  if (kind == vec_to_scalar
+      && STMT_VINFO_DATA_REF (stmt_info)
+      && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
+    return simd_costs->store_elt_extra_cost;
+
   /* Detect cases in which vec_to_scalar represents an in-loop reduction.  */
   if (kind == vec_to_scalar
       && where == vect_body
-- 
2.17.1
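For reference, here is a minimal standalone sketch (plain C, not GCC internals; the names simd_costs, vec_to_scalar_stmt_cost and is_data_write are invented for illustration) of the costing split the patch implements: a vec_to_scalar statement that feeds a data-ref write is charged store_elt_extra_cost, while every other vec_to_scalar keeps the generic cost.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical, cut-down analogue of the two simd_vec_cost fields
   involved in the patch.  */
struct simd_costs
{
  int vec_to_scalar_cost;
  int store_elt_extra_cost;
};

/* Mirrors the patch's check: an extract that feeds a write is charged
   store_elt_extra_cost; the scalar store itself would be costed
   separately, as in the patch.  */
static int
vec_to_scalar_stmt_cost (const struct simd_costs *costs, bool is_data_write)
{
  if (is_data_write)
    return costs->store_elt_extra_cost;
  return costs->vec_to_scalar_cost;
}

int
main (void)
{
  /* Invented numbers: a tuning that, unlike the tables above, makes
     extract-for-store much cheaper than a cross-lane reduction --
     a distinction a single vec_to_scalar_cost could not express.  */
  struct simd_costs tune = { 8, 2 };
  printf ("reduction-style vec_to_scalar cost: %d\n",
          vec_to_scalar_stmt_cost (&tune, false));
  printf ("extract-for-store vec_to_scalar cost: %d\n",
          vec_to_scalar_stmt_cost (&tune, true));
  return 0;
}

Note that all the tables in this patch simply reuse the existing vec_to_scalar cost for the new field, so behaviour is unchanged until a tuning that sets use_new_vector_costs chooses different values.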