This patch changes some vector costs for TX2 so that
more vectorizations that are beneficial for the TX2 chip can happen.

The new cost model makes the x264 benchmark of SPEC CPU2017
7% faster, with no negative performance impact on the other
benchmarks.
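
As a rough back-of-the-envelope sketch of why the lowered entries
matter (this is not the vectorizer's exact accounting, and it ignores
prologue/epilogue and versioning overheads), here is a hypothetical
FP loop costed by hand using only the thunderx2t99_vector_cost
entries visible in the hunk below:

/* Illustration only: hypothetical loop, costed by hand with the
   entries touched by this patch.  */
void
vadd (float *restrict c, const float *restrict a,
      const float *restrict b, int n)
{
  for (int i = 0; i < n; i++)
    c[i] = a[i] + b[i];
}

/* Per 4 scalar iterations: 4 * (2 * scalar_load (4)
   + scalar_fp_stmt (6) + scalar_store (1)) = 60.
   Per 128-bit vector iteration, old costs: 2 * vec_align_load (8)
   + vec_fp_stmt (6) + vec_store (4) = 26.
   Per 128-bit vector iteration, new costs: 2 * vec_align_load (4)
   + vec_fp_stmt (5) + vec_store (1) = 14.
   The vector body becomes roughly half as expensive relative to the
   scalar loop, so vectorization stays profitable even after
   permute/scalar_to_vec and prologue/epilogue overheads are added.  */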

Bootstrapped on aarch64-linux.

        2020-07-06  Anton Youdkevitch  <anton.youdkevi...@bell-sw.com>

gcc/
	* config/aarch64/aarch64.c (thunderx2t99_regmove_cost): Change
	register move costs.
	(thunderx2t99_vector_cost): Change vector costs.
From 3440e019c05fe5b565041cad549c6eefa2004a2b Mon Sep 17 00:00:00 2001
From: Anton Youdkevitch <anton.youdkevi...@bell-sw.com>
Date: Tue, 26 May 2020 04:23:04 -0700
Subject: [PATCH] Change costs for TX2 to expose more vectorization opportunities

Make the costs such that they do not exactly reflect
the actual instruction costs from the manual, but instead
make the codegen emit the code we want it to.
---
 gcc/config/aarch64/aarch64.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e92c7e6..18c2251 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -535,9 +535,9 @@ static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 {
   1, /* GP2GP  */
   /* Avoid the use of int<->fp moves for spilling.  */
-  8, /* GP2FP  */
-  8, /* FP2GP  */
-  4  /* FP2FP  */
+  5, /* GP2FP  */
+  6, /* FP2GP  */
+  3, /* FP2FP  */
 };
 
 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
@@ -704,15 +704,15 @@ static const struct cpu_vector_cost thunderx2t99_vector_cost =
   6, /* scalar_fp_stmt_cost  */
   4, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  5, /* vec_int_stmt_cost  */
-  6, /* vec_fp_stmt_cost  */
+  4, /* vec_int_stmt_cost  */
+  5, /* vec_fp_stmt_cost  */
   10, /* vec_permute_cost  */
   6, /* vec_to_scalar_cost  */
   5, /* scalar_to_vec_cost  */
-  8, /* vec_align_load_cost  */
-  8, /* vec_unalign_load_cost  */
-  4, /* vec_unalign_store_cost  */
-  4, /* vec_store_cost  */
+  4, /* vec_align_load_cost  */
+  4, /* vec_unalign_load_cost  */
+  1, /* vec_unalign_store_cost  */
+  1, /* vec_store_cost  */
   2, /* cond_taken_branch_cost  */
   1  /* cond_not_taken_branch_cost  */
 };
-- 
2.7.4
