This patch changes some register-move and vector costs for TX2 so that more of the vectorizations that are beneficial for the TX2 chip can happen.
The new cost model makes the x264 benchmark of CPU2017 7% faster, with no negative performance impact on other benchmarks. Bootstrapped on linux-aarch64. 2020-07-06 Anton Youdkevitch <anton.youdkevi...@bell-sw.com> gcc/ * config/aarch64/aarch64.c (thunderx2t99_regmove_cost): Change instruction costs. (thunderx2t99_vector_cost): Likewise.
From 3440e019c05fe5b565041cad549c6eefa2004a2b Mon Sep 17 00:00:00 2001 From: Anton Youdkevitch <anton.youdkevi...@bell-sw.com> Date: Tue, 26 May 2020 04:23:04 -0700 Subject: [PATCH] Change costs for TX2 to expose more vectorization opportunities Make the costs such that they do not exactly reflect the actual instruction costs from the manual but make the codegen emit the code we want it to. --- gcc/config/aarch64/aarch64.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index e92c7e6..18c2251 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -535,9 +535,9 @@ static const struct cpu_regmove_cost thunderx2t99_regmove_cost = { 1, /* GP2GP */ /* Avoid the use of int<->fp moves for spilling. */ - 8, /* GP2FP */ - 8, /* FP2GP */ - 4 /* FP2FP */ + 5, /* GP2FP */ + 6, /* FP2GP */ + 3, /* FP2FP */ }; static const struct cpu_regmove_cost thunderx3t110_regmove_cost = @@ -704,15 +704,15 @@ static const struct cpu_vector_cost thunderx2t99_vector_cost = 6, /* scalar_fp_stmt_cost */ 4, /* scalar_load_cost */ 1, /* scalar_store_cost */ - 5, /* vec_int_stmt_cost */ - 6, /* vec_fp_stmt_cost */ + 4, /* vec_int_stmt_cost */ + 5, /* vec_fp_stmt_cost */ 10, /* vec_permute_cost */ 6, /* vec_to_scalar_cost */ 5, /* scalar_to_vec_cost */ - 8, /* vec_align_load_cost */ - 8, /* vec_unalign_load_cost */ - 4, /* vec_unalign_store_cost */ - 4, /* vec_store_cost */ + 4, /* vec_align_load_cost */ + 4, /* vec_unalign_load_cost */ + 1, /* vec_unalign_store_cost */ + 1, /* vec_store_cost */ 2, /* cond_taken_branch_cost */ 1 /* cond_not_taken_branch_cost */ }; -- 2.7.4