Hi,

On some (most) AArch64 cores, it is not always profitable to vectorize
some integer loops.  This patch does two things (I can split it into
separate patches if needed):

1) It splits the aarch64 back-end's vector cost model's scalar and
   vector statement costs into separate int and fp fields.
1a) For thunderx2t99, it models the integer vector/scalar costs
    correctly.
2) It fixes/improves a few calls to record_stmt_cost in
   tree-vect-loop.c where stmt_info was not being passed.
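To make the motivation concrete, here is a hypothetical example (the
function and loop are illustrative only, not taken from libquantum or
SPEC) of the kind of integer loop this affects:

/* Hypothetical example: a cheap integer loop.  With the new
   thunderx2t99 costs below, each scalar integer statement is modeled
   at cost 1 and each vector integer statement at cost 5, so the
   vectorizer can now decide that loops like this are better left
   scalar.  */
void
toggle_bits (unsigned long *state, unsigned long mask, long n)
{
  for (long i = 0; i < n; i++)
    state[i] ^= mask;
}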
OK?  Bootstrapped and tested on aarch64-linux-gnu.  This gives a 20%
improvement on libquantum and ~1% overall on SPEC CPU 2006 int.

Thanks,
Andrew Pinski

ChangeLog:
	* tree-vect-loop.c (vect_compute_single_scalar_iteration_cost):
	Pass stmt_info to record_stmt_cost.
	(vect_get_known_peeling_cost): Pass stmt_info if known to
	record_stmt_cost.
	* config/aarch64/aarch64-protos.h (cpu_vector_cost): Split
	scalar_stmt_cost field into scalar_int_stmt_cost and
	scalar_fp_stmt_cost.  Split vec_stmt_cost field into
	vec_int_stmt_cost and vec_fp_stmt_cost.
	* config/aarch64/aarch64.c (generic_vector_cost): Update for the
	splitting of scalar_stmt_cost and vec_stmt_cost.
	(thunderx_vector_cost): Likewise.
	(cortexa57_vector_cost): Likewise.
	(exynosm1_vector_cost): Likewise.
	(xgene1_vector_cost): Likewise.
	(thunderx2t99_vector_cost): Improve after the splitting of the
	two fields.
	(aarch64_builtin_vectorization_cost): Update for the splitting
	of scalar_stmt_cost and vec_stmt_cost.
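As a rough illustration of where the libquantum win comes from (my
arithmetic, assuming a vectorization factor of 2 for 64-bit integer
lanes and ignoring the unchanged load/store costs; this is not actual
vectorizer output):

  old thunderx2t99 costs: 2 scalar stmts * 6 = 12  vs.  1 vector stmt * 6 = 6  -> vectorize
  new thunderx2t99 costs: 2 scalar stmts * 1 =  2  vs.  1 vector stmt * 5 = 5  -> stay scalar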
Index: config/aarch64/aarch64-protos.h
===================================================================
--- config/aarch64/aarch64-protos.h	(revision 245002)
+++ config/aarch64/aarch64-protos.h	(working copy)
@@ -151,11 +151,17 @@ struct cpu_regmove_cost
 /* Cost for vector insn classes.  */
 struct cpu_vector_cost
 {
-  const int scalar_stmt_cost;		/* Cost of any scalar operation,
+  const int scalar_int_stmt_cost;	/* Cost of any int scalar operation,
+					   excluding load and store.  */
+  const int scalar_fp_stmt_cost;	/* Cost of any fp scalar operation,
 					   excluding load and store.  */
   const int scalar_load_cost;		/* Cost of scalar load.  */
   const int scalar_store_cost;		/* Cost of scalar store.  */
-  const int vec_stmt_cost;		/* Cost of any vector operation,
+  const int vec_int_stmt_cost;		/* Cost of any int vector operation,
+					   excluding load, store, permute,
+					   vector-to-scalar and
+					   scalar-to-vector operation.  */
+  const int vec_fp_stmt_cost;		/* Cost of any fp vector operation,
 					   excluding load, store, permute,
 					   vector-to-scalar and
 					   scalar-to-vector operation.  */
Index: config/aarch64/aarch64.c
===================================================================
--- config/aarch64/aarch64.c	(revision 245002)
+++ config/aarch64/aarch64.c	(working copy)
@@ -365,10 +365,12 @@ static const struct cpu_regmove_cost thu
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost generic_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   1, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  1, /* vec_stmt_cost  */
+  1, /* vec_int_stmt_cost  */
+  1, /* vec_fp_stmt_cost  */
   2, /* vec_permute_cost  */
   1, /* vec_to_scalar_cost  */
   1, /* scalar_to_vec_cost  */
@@ -383,10 +385,12 @@ static const struct cpu_vector_cost gene
 /* ThunderX costs for vector insn classes.  */
 static const struct cpu_vector_cost thunderx_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   3, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  4, /* vec_stmt_cost  */
+  4, /* vec_int_stmt_cost  */
+  4, /* vec_fp_stmt_cost  */
   4, /* vec_permute_cost  */
   2, /* vec_to_scalar_cost  */
   2, /* scalar_to_vec_cost  */
@@ -401,10 +405,12 @@ static const struct cpu_vector_cost thun
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost cortexa57_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   4, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  2, /* vec_stmt_cost  */
+  2, /* vec_int_stmt_cost  */
+  2, /* vec_fp_stmt_cost  */
   3, /* vec_permute_cost  */
   8, /* vec_to_scalar_cost  */
   8, /* scalar_to_vec_cost  */
@@ -418,10 +424,12 @@ static const struct cpu_vector_cost cort
 
 static const struct cpu_vector_cost exynosm1_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   5, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  3, /* vec_stmt_cost  */
+  3, /* vec_int_stmt_cost  */
+  3, /* vec_fp_stmt_cost  */
   3, /* vec_permute_cost  */
   3, /* vec_to_scalar_cost  */
   3, /* scalar_to_vec_cost  */
@@ -436,10 +444,12 @@ static const struct cpu_vector_cost exyn
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost xgene1_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   5, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  2, /* vec_stmt_cost  */
+  2, /* vec_int_stmt_cost  */
+  2, /* vec_fp_stmt_cost  */
   2, /* vec_permute_cost  */
   4, /* vec_to_scalar_cost  */
   4, /* scalar_to_vec_cost  */
@@ -454,10 +464,12 @@ static const struct cpu_vector_cost xgen
 /* Costs for vector insn classes for Vulcan.  */
 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 {
-  6, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  6, /* scalar_fp_stmt_cost  */
   4, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  6, /* vec_stmt_cost  */
+  5, /* vec_int_stmt_cost  */
+  6, /* vec_fp_stmt_cost  */
   3, /* vec_permute_cost  */
   6, /* vec_to_scalar_cost  */
   5, /* scalar_to_vec_cost  */
@@ -8119,50 +8131,55 @@ aarch64_builtin_vectorization_cost (enum
 				    int misalign ATTRIBUTE_UNUSED)
 {
   unsigned elements;
+  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
+  bool fp = true;
+
+  if (vectype != NULL)
+    fp = FLOAT_TYPE_P (vectype);
 
   switch (type_of_cost)
     {
      case scalar_stmt:
-	return aarch64_tune_params.vec_costs->scalar_stmt_cost;
+	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
 
      case scalar_load:
-	return aarch64_tune_params.vec_costs->scalar_load_cost;
+	return costs->scalar_load_cost;
 
      case scalar_store:
-	return aarch64_tune_params.vec_costs->scalar_store_cost;
+	return costs->scalar_store_cost;
 
      case vector_stmt:
-	return aarch64_tune_params.vec_costs->vec_stmt_cost;
+	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
 
      case vector_load:
-	return aarch64_tune_params.vec_costs->vec_align_load_cost;
+	return costs->vec_align_load_cost;
 
      case vector_store:
-	return aarch64_tune_params.vec_costs->vec_store_cost;
+	return costs->vec_store_cost;
 
      case vec_to_scalar:
-	return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
+	return costs->vec_to_scalar_cost;
 
      case scalar_to_vec:
-	return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
+	return costs->scalar_to_vec_cost;
 
      case unaligned_load:
-	return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
+	return costs->vec_unalign_load_cost;
 
      case unaligned_store:
-	return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
+	return costs->vec_unalign_store_cost;
 
      case cond_branch_taken:
-	return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
+	return costs->cond_taken_branch_cost;
 
      case cond_branch_not_taken:
-	return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
+	return costs->cond_not_taken_branch_cost;
 
      case vec_perm:
-	return aarch64_tune_params.vec_costs->vec_permute_cost;
+	return costs->vec_permute_cost;
 
      case vec_promote_demote:
-	return aarch64_tune_params.vec_costs->vec_stmt_cost;
+	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
 
      case vec_construct:
	elements = TYPE_VECTOR_SUBPARTS (vectype);
Index: tree-vect-loop.c
===================================================================
--- tree-vect-loop.c	(revision 245002)
+++ tree-vect-loop.c	(working copy)
@@ -1329,9 +1329,9 @@ vect_compute_single_scalar_iteration_cos
	    continue;
 
	  vect_cost_for_stmt kind;
-	  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
+	  if (STMT_VINFO_DATA_REF (stmt_info))
	    {
-	      if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
+	      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	       kind = scalar_load;
	      else
	       kind = scalar_store;
@@ -1341,7 +1341,7 @@ vect_compute_single_scalar_iteration_cos
 
	  scalar_single_iter_cost
	    += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
-				 factor, kind, NULL, 0, vect_prologue);
+				 factor, kind, stmt_info, 0, vect_prologue);
	}
    }
  LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
@@ -3178,16 +3178,24 @@ vect_get_known_peeling_cost (loop_vec_in
   int j;
   if (peel_iters_prologue)
     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
-      retval += record_stmt_cost (prologue_cost_vec,
-				  si->count * peel_iters_prologue,
-				  si->kind, NULL, si->misalign,
-				  vect_prologue);
+      {
+	struct _stmt_vec_info *stmt_info
+	  = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
+	retval += record_stmt_cost (prologue_cost_vec,
+				    si->count * peel_iters_prologue,
+				    si->kind, stmt_info, si->misalign,
+				    vect_prologue);
+      }
   if (*peel_iters_epilogue)
     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
-      retval += record_stmt_cost (epilogue_cost_vec,
-				  si->count * *peel_iters_epilogue,
-				  si->kind, NULL, si->misalign,
-				  vect_epilogue);
+      {
+	struct _stmt_vec_info *stmt_info
+	  = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
+	retval += record_stmt_cost (epilogue_cost_vec,
+				    si->count * *peel_iters_epilogue,
+				    si->kind, stmt_info, si->misalign,
+				    vect_epilogue);
+      }
 
   return retval;
 }