Hi,

On some (most) AArch64 cores, it is not always profitable to vectorize
some integer loops.  This patch does two things (I can split it into
separate patches if needed):

1) It splits the aarch64 back-end's vector cost model's scalar and
   vector statement costs into separate int and fp fields.
1a) For thunderx2t99, it models the integer vector/scalar costs
    correctly.
2) It fixes/improves a few calls to record_stmt_cost in
   tree-vect-loop.c where stmt_info was not being passed.
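To make the motivation concrete, here is a hypothetical example (the
function and loop are illustrative only, not taken from libquantum or
SPEC) of the kind of integer loop this affects:

/* Hypothetical example: a cheap integer loop.  With the new
   thunderx2t99 costs below, each scalar integer statement is modeled
   at cost 1 and each vector integer statement at cost 5, so the
   vectorizer can now decide that loops like this are better left
   scalar.  */
void
toggle_bits (unsigned long *state, unsigned long mask, long n)
{
  for (long i = 0; i < n; i++)
    state[i] ^= mask;
}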
OK?  Bootstrapped and tested on aarch64-linux-gnu.  This gives a 20%
improvement on libquantum and ~1% overall on SPEC CPU 2006 int.

Thanks,
Andrew Pinski

ChangeLog:
	* tree-vect-loop.c (vect_compute_single_scalar_iteration_cost):
	Pass stmt_info to record_stmt_cost.
	(vect_get_known_peeling_cost): Pass stmt_info if known to
	record_stmt_cost.
	* config/aarch64/aarch64-protos.h (cpu_vector_cost): Split
	scalar_stmt_cost field into scalar_int_stmt_cost and
	scalar_fp_stmt_cost.  Split vec_stmt_cost field into
	vec_int_stmt_cost and vec_fp_stmt_cost.
	* config/aarch64/aarch64.c (generic_vector_cost): Update for the
	splitting of scalar_stmt_cost and vec_stmt_cost.
	(thunderx_vector_cost): Likewise.
	(cortexa57_vector_cost): Likewise.
	(exynosm1_vector_cost): Likewise.
	(xgene1_vector_cost): Likewise.
	(thunderx2t99_vector_cost): Improve after the splitting of the
	two fields.
	(aarch64_builtin_vectorization_cost): Update for the splitting
	of scalar_stmt_cost and vec_stmt_cost.
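As a rough illustration of where the libquantum win comes from (my
arithmetic, assuming a vectorization factor of 2 for 64-bit integer
lanes and ignoring the unchanged load/store costs; this is not actual
vectorizer output):

  old thunderx2t99 costs: 2 scalar stmts * 6 = 12  vs.  1 vector stmt * 6 = 6  -> vectorize
  new thunderx2t99 costs: 2 scalar stmts * 1 =  2  vs.  1 vector stmt * 5 = 5  -> stay scalar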
Index: config/aarch64/aarch64-protos.h
===================================================================
--- config/aarch64/aarch64-protos.h	(revision 245002)
+++ config/aarch64/aarch64-protos.h	(working copy)
@@ -151,11 +151,17 @@ struct cpu_regmove_cost
 /* Cost for vector insn classes.  */
 struct cpu_vector_cost
 {
-  const int scalar_stmt_cost;		/* Cost of any scalar operation,
+  const int scalar_int_stmt_cost;	/* Cost of any int scalar operation,
+					   excluding load and store.  */
+  const int scalar_fp_stmt_cost;	/* Cost of any fp scalar operation,
 					   excluding load and store.  */
   const int scalar_load_cost;		/* Cost of scalar load.  */
   const int scalar_store_cost;		/* Cost of scalar store.  */
-  const int vec_stmt_cost;		/* Cost of any vector operation,
+  const int vec_int_stmt_cost;		/* Cost of any int vector operation,
+					   excluding load, store, permute,
+					   vector-to-scalar and
+					   scalar-to-vector operation.  */
+  const int vec_fp_stmt_cost;		/* Cost of any fp vector operation,
 					   excluding load, store, permute,
 					   vector-to-scalar and
 					   scalar-to-vector operation.  */
Index: config/aarch64/aarch64.c
===================================================================
--- config/aarch64/aarch64.c	(revision 245002)
+++ config/aarch64/aarch64.c	(working copy)
@@ -365,10 +365,12 @@ static const struct cpu_regmove_cost thu
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost generic_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   1, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  1, /* vec_stmt_cost  */
+  1, /* vec_int_stmt_cost  */
+  1, /* vec_fp_stmt_cost  */
   2, /* vec_permute_cost  */
   1, /* vec_to_scalar_cost  */
   1, /* scalar_to_vec_cost  */
@@ -383,10 +385,12 @@ static const struct cpu_vector_cost gene
 /* ThunderX costs for vector insn classes.  */
 static const struct cpu_vector_cost thunderx_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   3, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  4, /* vec_stmt_cost  */
+  4, /* vec_int_stmt_cost  */
+  4, /* vec_fp_stmt_cost  */
   4, /* vec_permute_cost  */
   2, /* vec_to_scalar_cost  */
   2, /* scalar_to_vec_cost  */
@@ -401,10 +405,12 @@ static const struct cpu_vector_cost thun
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost cortexa57_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   4, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  2, /* vec_stmt_cost  */
+  2, /* vec_int_stmt_cost  */
+  2, /* vec_fp_stmt_cost  */
   3, /* vec_permute_cost  */
   8, /* vec_to_scalar_cost  */
   8, /* scalar_to_vec_cost  */
@@ -418,10 +424,12 @@ static const struct cpu_vector_cost cort
 
 static const struct cpu_vector_cost exynosm1_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   5, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  3, /* vec_stmt_cost  */
+  3, /* vec_int_stmt_cost  */
+  3, /* vec_fp_stmt_cost  */
   3, /* vec_permute_cost  */
   3, /* vec_to_scalar_cost  */
   3, /* scalar_to_vec_cost  */
@@ -436,10 +444,12 @@ static const struct cpu_vector_cost exyn
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost xgene1_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   5, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  2, /* vec_stmt_cost  */
+  2, /* vec_int_stmt_cost  */
+  2, /* vec_fp_stmt_cost  */
   2, /* vec_permute_cost  */
   4, /* vec_to_scalar_cost  */
   4, /* scalar_to_vec_cost  */
@@ -454,10 +464,12 @@ static const struct cpu_vector_cost xgen
 /* Costs for vector insn classes for Vulcan.  */
 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 {
-  6, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  6, /* scalar_fp_stmt_cost  */
   4, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  6, /* vec_stmt_cost  */
+  5, /* vec_int_stmt_cost  */
+  6, /* vec_fp_stmt_cost  */
   3, /* vec_permute_cost  */
   6, /* vec_to_scalar_cost  */
   5, /* scalar_to_vec_cost  */
@@ -8119,50 +8131,55 @@ aarch64_builtin_vectorization_cost (enum
 				    int misalign ATTRIBUTE_UNUSED)
 {
   unsigned elements;
+  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
+  bool fp = true;
+
+  if (vectype != NULL)
+    fp = FLOAT_TYPE_P (vectype);
 
   switch (type_of_cost)
     {
      case scalar_stmt:
-	return aarch64_tune_params.vec_costs->scalar_stmt_cost;
+	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
 
      case scalar_load:
-	return aarch64_tune_params.vec_costs->scalar_load_cost;
+	return costs->scalar_load_cost;
 
      case scalar_store:
-	return aarch64_tune_params.vec_costs->scalar_store_cost;
+	return costs->scalar_store_cost;
 
      case vector_stmt:
-	return aarch64_tune_params.vec_costs->vec_stmt_cost;
+	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
 
      case vector_load:
-	return aarch64_tune_params.vec_costs->vec_align_load_cost;
+	return costs->vec_align_load_cost;
 
      case vector_store:
-	return aarch64_tune_params.vec_costs->vec_store_cost;
+	return costs->vec_store_cost;
 
      case vec_to_scalar:
-	return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
+	return costs->vec_to_scalar_cost;
 
      case scalar_to_vec:
-	return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
+	return costs->scalar_to_vec_cost;
 
      case unaligned_load:
-	return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
+	return costs->vec_unalign_load_cost;
 
      case unaligned_store:
-	return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
+	return costs->vec_unalign_store_cost;
 
      case cond_branch_taken:
-	return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
+	return costs->cond_taken_branch_cost;
 
      case cond_branch_not_taken:
-	return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
+	return costs->cond_not_taken_branch_cost;
 
      case vec_perm:
-	return aarch64_tune_params.vec_costs->vec_permute_cost;
+	return costs->vec_permute_cost;
 
      case vec_promote_demote:
-	return aarch64_tune_params.vec_costs->vec_stmt_cost;
+	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
 
      case vec_construct:
	elements = TYPE_VECTOR_SUBPARTS (vectype);
Index: tree-vect-loop.c
===================================================================
--- tree-vect-loop.c	(revision 245002)
+++ tree-vect-loop.c	(working copy)
@@ -1329,9 +1329,9 @@ vect_compute_single_scalar_iteration_cos
	    continue;
 
	  vect_cost_for_stmt kind;
-	  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
+	  if (STMT_VINFO_DATA_REF (stmt_info))
	    {
-	      if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
+	      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	       kind = scalar_load;
	      else
	       kind = scalar_store;
@@ -1341,7 +1341,7 @@ vect_compute_single_scalar_iteration_cos
 
	  scalar_single_iter_cost
	    += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
-				 factor, kind, NULL, 0, vect_prologue);
+				 factor, kind, stmt_info, 0, vect_prologue);
	}
    }
  LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
@@ -3178,16 +3178,24 @@ vect_get_known_peeling_cost (loop_vec_in
   int j;
   if (peel_iters_prologue)
     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
-      retval += record_stmt_cost (prologue_cost_vec,
-				  si->count * peel_iters_prologue,
-				  si->kind, NULL, si->misalign,
-				  vect_prologue);
+      {
+	struct _stmt_vec_info *stmt_info
+	  = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
+	retval += record_stmt_cost (prologue_cost_vec,
+				    si->count * peel_iters_prologue,
+				    si->kind, stmt_info, si->misalign,
+				    vect_prologue);
+      }
   if (*peel_iters_epilogue)
     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
-      retval += record_stmt_cost (epilogue_cost_vec,
-				  si->count * *peel_iters_epilogue,
-				  si->kind, NULL, si->misalign,
-				  vect_epilogue);
+      {
+	struct _stmt_vec_info *stmt_info
+	  = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
+	retval += record_stmt_cost (epilogue_cost_vec,
+				    si->count * *peel_iters_epilogue,
+				    si->kind, stmt_info, si->misalign,
+				    vect_epilogue);
+      }
 
   return retval;
 }