The adjusted vector costs give Falkor a reasonable boost in performance for FP benchmarks (both CPU2017 and CPU2006), about 0.7% for CPU2017 FP and 1.54% for CPU2006 FP, and don't change INT benchmarks that much.
OK for trunk? gcc/ChangeLog: 2018-07-25 Luis Machado <luis.mach...@linaro.org> * config/aarch64/aarch64.c (qdf24xx_vector_cost): New. (qdf24xx_tunings) <vec_costs>: Set to qdf24xx_vector_cost. --- gcc/config/aarch64/aarch64.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index fa01475..d443aee 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -430,6 +430,26 @@ static const struct cpu_vector_cost generic_vector_cost = 1 /* cond_not_taken_branch_cost */ }; +/* Qualcomm QDF24xx costs for vector insn classes. */ +static const struct cpu_vector_cost qdf24xx_vector_cost = +{ + 1, /* scalar_int_stmt_cost */ + 1, /* scalar_fp_stmt_cost */ + 1, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 1, /* vec_int_stmt_cost */ + 3, /* vec_fp_stmt_cost */ + 2, /* vec_permute_cost */ + 1, /* vec_to_scalar_cost */ + 1, /* scalar_to_vec_cost */ + 1, /* vec_align_load_cost */ + 1, /* vec_unalign_load_cost */ + 1, /* vec_unalign_store_cost */ + 1, /* vec_store_cost */ + 3, /* cond_taken_branch_cost */ + 1 /* cond_not_taken_branch_cost */ +}; + /* ThunderX costs for vector insn classes. */ static const struct cpu_vector_cost thunderx_vector_cost = { @@ -890,7 +910,7 @@ static const struct tune_params qdf24xx_tunings = &qdf24xx_extra_costs, &qdf24xx_addrcost_table, &qdf24xx_regmove_cost, - &generic_vector_cost, + &qdf24xx_vector_cost, &generic_branch_cost, &generic_approx_modes, 4, /* memmov_cost */ -- 2.7.4