Hi,
It is suspected that for some large loops with too many vector instructions
after vectorization, performance regresses for various reasons.
Two possible reasons are:
1) As suggested by the powerpc backend, large loops with too many vector
instructions may jam the vector unit's pipeline.
2) The vectorizer may generate bad code, especially if the loop has many
different memory accesses.  These memory accesses could refer to different
memory objects, or to the same object but with different constant offsets.
This may result in more base registers being live from the loop preheader,
as well as higher register pressure.
This patch penalizes the vector cost for such loops so that vectorization is
skipped.  The idea is borrowed directly from the powerpc backend; I adjusted
the parameters a bit to get better performance on AArch64.  It improves
several cases in SPEC, especially 459.GemsFDTD by ~15%.
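For reference, a tiny stand-alone sketch of the arithmetic the density test
performs (the cost numbers below are made up purely for illustration; the
thresholds are the ones used in the patch):

  /* Sketch of aarch64_density_test's heuristic with hypothetical costs.  */
  #include <stdio.h>

  int
  main (void)
  {
    const int pct_threshold = 85;    /* DENSITY_PCT_THRESHOLD */
    const int size_threshold = 128;  /* DENSITY_SIZE_THRESHOLD */
    const int penalty = 10;          /* DENSITY_PENALTY */
    int vec_cost = 180;              /* hypothetical vect_body cost */
    int not_vec_cost = 20;           /* hypothetical non-vectorized stmts */
    int density_pct = (vec_cost * 100) / (vec_cost + not_vec_cost);

    /* 90% density and 200 total cost exceed both thresholds, so the
       body cost is bumped by 10%: 180 -> 198.  */
    if (density_pct > pct_threshold
        && vec_cost + not_vec_cost > size_threshold)
      vec_cost = vec_cost * (100 + penalty) / 100;

    printf ("density %d%%, penalized body cost %d\n", density_pct, vec_cost);
    return 0;
  }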
Bootstrapped and tested on AArch64.  Is it OK?
2016-10-13 Bin Cheng <bin.ch...@arm.com>
* config/aarch64/aarch64.c (struct aarch64_vect_loop_cost_data): New.
(aarch64_add_stmt_cost): Use struct aarch64_vect_loop_cost_data.
(aarch64_init_cost, aarch64_density_test): New functions.
(aarch64_finish_cost, aarch64_destroy_cost_data): New functions.
(TARGET_VECTORIZE_INIT_COST): New macro.
(TARGET_VECTORIZE_FINISH_COST): New macro.
(TARGET_VECTORIZE_DESTROY_COST_DATA): New macro.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 6078b16..be5e56e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7774,15 +7774,22 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
}
}
+struct aarch64_vect_loop_cost_data
+{
+ struct loop *loop_info;
+ unsigned cost[3];
+};
+
/* Implement targetm.vectorize.add_stmt_cost. */
static unsigned
aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
struct _stmt_vec_info *stmt_info, int misalign,
enum vect_cost_model_location where)
{
- unsigned *cost = (unsigned *) data;
+ struct aarch64_vect_loop_cost_data *cost_data;
unsigned retval = 0;
+ cost_data = (struct aarch64_vect_loop_cost_data *) data;
if (flag_vect_cost_model)
{
tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
@@ -7796,7 +7803,7 @@ aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
count *= 50; /* FIXME */
retval = (unsigned) (count * stmt_cost);
- cost[where] += retval;
+ cost_data->cost[where] += retval;
}
return retval;
@@ -14054,6 +14061,95 @@ aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
}
}
+/* Implement targetm.vectorize.init_cost. */
+
+static void *
+aarch64_init_cost (struct loop *loop_info)
+{
+ struct aarch64_vect_loop_cost_data *data;
+
+ data = XNEW (struct aarch64_vect_loop_cost_data);
+ data->loop_info = loop_info;
+ data->cost[vect_prologue] = 0;
+ data->cost[vect_body] = 0;
+ data->cost[vect_epilogue] = 0;
+ return data;
+}
+
+/* Test for likely overcommitment of vector hardware resources. If a
+ loop iteration is relatively large, and too large a percentage of
+ instructions in the loop are vectorized, the cost model may not
+ adequately reflect delays from unavailable vector resources.
+ Penalize the loop body cost for this case. */
+
+static void
+aarch64_density_test (struct aarch64_vect_loop_cost_data *data)
+{
+ const int DENSITY_PCT_THRESHOLD = 85;
+ const int DENSITY_SIZE_THRESHOLD = 128;
+ const int DENSITY_PENALTY = 10;
+ struct loop *loop = data->loop_info;
+ basic_block *bbs = get_loop_body (loop);
+ int nbbs = loop->num_nodes;
+ int vec_cost = data->cost[vect_body], not_vec_cost = 0;
+ int i, density_pct;
+
+ for (i = 0; i < nbbs; i++)
+ {
+ basic_block bb = bbs[i];
+ gimple_stmt_iterator gsi;
+
+ for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+
+ if (!STMT_VINFO_RELEVANT_P (stmt_info)
+ && !STMT_VINFO_IN_PATTERN_P (stmt_info))
+ not_vec_cost++;
+ }
+ }
+
+ free (bbs);
+ density_pct = (vec_cost * 100) / (vec_cost + not_vec_cost);
+
+ if (density_pct > DENSITY_PCT_THRESHOLD
+ && vec_cost + not_vec_cost > DENSITY_SIZE_THRESHOLD)
+ {
+ data->cost[vect_body] = vec_cost * (100 + DENSITY_PENALTY) / 100;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "density %d%%, cost %d exceeds threshold, penalizing "
+ "loop body cost by %d%%", density_pct,
+ vec_cost + not_vec_cost, DENSITY_PENALTY);
+ }
+}
+
+/* Implement targetm.vectorize.finish_cost. */
+
+static void
+aarch64_finish_cost (void *data, unsigned *prologue_cost,
+ unsigned *body_cost, unsigned *epilogue_cost)
+{
+ struct aarch64_vect_loop_cost_data *cost_data;
+
+ cost_data = (struct aarch64_vect_loop_cost_data *) data;
+ if (cost_data->loop_info)
+ aarch64_density_test (cost_data);
+
+ *prologue_cost = cost_data->cost[vect_prologue];
+ *body_cost = cost_data->cost[vect_body];
+ *epilogue_cost = cost_data->cost[vect_epilogue];
+}
+
+/* Implement targetm.vectorize.destroy_cost_data. */
+
+static void
+aarch64_destroy_cost_data (void *data)
+{
+ free (data);
+}
+
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost
@@ -14287,9 +14383,18 @@ aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
+#undef TARGET_VECTORIZE_INIT_COST
+#define TARGET_VECTORIZE_INIT_COST aarch64_init_cost
+
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
+#undef TARGET_VECTORIZE_FINISH_COST
+#define TARGET_VECTORIZE_FINISH_COST aarch64_finish_cost
+
+#undef TARGET_VECTORIZE_DESTROY_COST_DATA
+#define TARGET_VECTORIZE_DESTROY_COST_DATA aarch64_destroy_cost_data
+
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
aarch64_builtin_vectorization_cost