Hi,
So this is my second attempt at finding a way to improve how we generate
the vector IV's and teach the vectorizer to share them between main loop
and epilogues. On IRC we discussed my idea to use the loop's control_iv,
but that was a terrible idea and I quickly threw it in the bin. The main
problem, that for some reason I failed to see, was that the control_iv
increases by 's' and the datarefs by 's' * NELEMENTS where 's' is
usually 1 and NELEMENTs the amount of elements we handle per iteration.
That means the epilogue loops would have to start from the last loop's
IV * the last loop's NELEMENT's and that would just cause a mess.
Instead I started to think about creating IV's for the datarefs and what
I thought worked best was to create these in scalar before peeling. That
way the peeling mechanisms takes care of the duplication of these for
the vector and scalar epilogues and it also takes care of adding
phi-nodes for the skip_vector paths.
These new IV's have two functions:
1) 'vect_create_data_ref_ptr' can use them to:
a) if it's the main loop: replace the values of the 'initial' value of
the main loop's IV and the initial values in the skip_vector phi-nodes
b) Update the the skip_vector phi-nodes argument for the non-skip path
with the updated vector ptr.
2) They are used for the scalar epilogue ensuring they share the same
datareference ptr.
There are still a variety of 'hacky' elements here and a lot of testing
to be done, but I hope to be able to clean them away. One of the main
issues I had was that I had to skip a couple of checks and things for
the added phi-nodes and update statements as these do not have
stmt_vec_info representation. Though I'm not sure adding this
representation at their creation was much cleaner... It is something I
could play around with but I thought this was a good moment to ask you
for input. For instance, maybe we could do this transformation before
analysis?
Also be aware that because I create a IV for each dataref this leads to
regressions with SVE codegen for instance. NEON is able to use the
post-index addressing mode to increase each dr IV at access time, but
SVE can't do this. For this I don't know if maybe we could try to be
smart and create shared IV's. So rather than make them based on the
actual vector ptr, use a shared sizetype IV that can be shared among dr
IV's with the same step. Or maybe this is something for IVOPTs?
Let me know what ya think!
Kind regards,
Andre
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index
8001cc54f518d9d9d1a0fcfe5790d22dae109fb2..939c0a7fefd4355dd75d7646ac2ae63ce23a0e14
100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -174,6 +174,8 @@ struct data_reference
/* Alias information for the data reference. */
struct dr_alias alias;
+
+ hash_map<loop_p, tree> *iv_bases;
};
#define DR_STMT(DR) (DR)->stmt
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index
124a7bea6a94161556a6622fa7b113b3cef98bcf..f638bb3e0aa007e0bf7ad8f75fb767d3484b02ce
100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -1475,6 +1475,7 @@ void
free_data_ref (data_reference_p dr)
{
DR_ACCESS_FNS (dr).release ();
+ delete dr->iv_bases;
free (dr);
}
@@ -1506,6 +1507,7 @@ create_data_ref (edge nest, loop_p loop, tree memref,
gimple *stmt,
DR_REF (dr) = memref;
DR_IS_READ (dr) = is_read;
DR_IS_CONDITIONAL_IN_STMT (dr) = is_conditional_in_stmt;
+ dr->iv_bases = new hash_map<loop_p, tree> ();
dr_analyze_innermost (&DR_INNERMOST (dr), memref,
nest != NULL ? loop : NULL, stmt);
diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
index
86fc118b6befb06233e5e86a01454fd7075075e1..93e14d09763da5034ba97d09b07c94c20fe25a28
100644
--- a/gcc/tree-ssa-loop-manip.h
+++ b/gcc/tree-ssa-loop-manip.h
@@ -24,6 +24,8 @@ typedef void (*transform_callback)(class loop *, void *);
extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
bool, tree *, tree *);
+extern void create_or_update_iv (tree, tree, tree, class loop *,
gimple_stmt_iterator *,
+ bool, tree *, tree *, gphi *, bool);
extern void rewrite_into_loop_closed_ssa_1 (bitmap, unsigned, int,
class loop *);
extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
diff --git a/gcc/tree-ssa-loop-manip.c b/gcc/tree-ssa-loop-manip.c
index
28ae1316fa0eb6939a45d15e893b7386622ba60c..1709e175c382ef5d74c2f628a61c9fffe26f726d
100644
--- a/gcc/tree-ssa-loop-manip.c
+++ b/gcc/tree-ssa-loop-manip.c
@@ -57,9 +57,10 @@ static bitmap_obstack loop_renamer_obstack;
VAR_AFTER (unless they are NULL). */
void
-create_iv (tree base, tree step, tree var, class loop *loop,
- gimple_stmt_iterator *incr_pos, bool after,
- tree *var_before, tree *var_after)
+create_or_update_iv (tree base, tree step, tree var, class loop *loop,
+ gimple_stmt_iterator *incr_pos, bool after,
+ tree *var_before, tree *var_after, gphi *update_phi,
+ bool epilogue)
{
gassign *stmt;
gphi *phi;
@@ -153,11 +154,64 @@ create_iv (tree base, tree step, tree var, class loop
*loop,
if (stmts)
gsi_insert_seq_on_edge_immediate (pe, stmts);
+ if (update_phi) {
+ gimple_seq stmts;
+ gimple_stmt_iterator gsi;
+ imm_use_iterator imm_iter;
+ gimple *stmt;
+ tree old_va = PHI_ARG_DEF_FROM_EDGE (update_phi, loop_latch_edge (loop));
+ tree old_init = PHI_ARG_DEF_FROM_EDGE (update_phi, loop_preheader_edge
(loop));
+ tree type = TREE_TYPE (old_va);
+ tree va_conv = fold_convert (type, va);
+
+ gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (va));
+ va_conv = force_gimple_operand (va_conv, &stmts, true, NULL_TREE);
+
+ if (stmts)
+ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
+
+
+ FOR_EACH_IMM_USE_STMT (stmt, imm_iter, old_va) {
+ if (!flow_bb_inside_loop_p (loop, stmt->bb))
+ {
+ /* We added this IV, so the outside uses should all be in PHI's
+ added for skip_vector handling. */
+ gcc_assert (gimple_code (stmt) == GIMPLE_PHI);
+ use_operand_p use_p;
+ FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+ SET_USE (use_p, va_conv);
+
+ }
+ }
+ if (epilogue)
+ initial = old_init;
+ else {
+ tree initial_conv = fold_convert (type, initial);
+ stmts = NULL;
+ initial_conv = force_gimple_operand (initial_conv, &stmts, true,
+ NULL_TREE);
+ if (stmts)
+ gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
+ stmts);
+ replace_uses_by (old_init, initial_conv);
+ release_ssa_name_fn (cfun, old_init);
+ }
+ }
phi = create_phi_node (vb, loop->header);
add_phi_arg (phi, initial, loop_preheader_edge (loop), UNKNOWN_LOCATION);
add_phi_arg (phi, va, loop_latch_edge (loop), UNKNOWN_LOCATION);
}
+void
+create_iv (tree base, tree step, tree var, class loop *loop,
+ gimple_stmt_iterator *incr_pos, bool after,
+ tree *var_before, tree *var_after)
+{
+ create_or_update_iv (base, step, var, loop, incr_pos, after, var_before,
+ var_after, NULL, false);
+}
+
+
/* Return the innermost superloop LOOP of USE_LOOP that is a superloop of
both DEF_LOOP and USE_LOOP. */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index
97c8577ebe720cf857d811c70eccf60c7655509c..87ca4891df792af73f226ebfb1ce6753818ec87e
100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -4941,11 +4941,14 @@ vect_create_data_ref_ptr (vec_info *vinfo,
stmt_vec_info stmt_info,
}
standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+ gphi *update_phi
+ = as_a<gphi *> (SSA_NAME_DEF_STMT (*dr->iv_bases->get (loop)));
- create_iv (aggr_ptr_init,
+ create_or_update_iv (aggr_ptr_init,
fold_convert (aggr_ptr_type, iv_step),
aggr_ptr, loop, &incr_gsi, insert_after,
- &indx_before_incr, &indx_after_incr);
+ &indx_before_incr, &indx_after_incr, update_phi,
+ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) != NULL);
incr = gsi_stmt (incr_gsi);
/* Copy the points-to information if it exists. */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index
012f48bd4870125c820049b4fc70db0ef0759bdf..89e0db5d8adcd9afa709dd37a1842ec36dfa13e6
100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -1402,6 +1402,10 @@ find_loop_location (class loop *loop)
static bool
iv_phi_p (stmt_vec_info stmt_info)
{
+ /* This is to make sure we don't try to vectorize the IV's added by
+ the vectorizer for datarefs. */
+ if (stmt_info == NULL)
+ return false;
gphi *phi = as_a <gphi *> (stmt_info->stmt);
if (virtual_operand_p (PHI_RESULT (phi)))
return false;
@@ -1439,6 +1443,10 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
gphi *phi = gsi.phi ();
stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
+
+ if (!phi_info)
+ continue;
+
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
phi_info->stmt);
@@ -2280,7 +2288,8 @@ slpeel_update_phi_nodes_for_loops (loop_vec_info
loop_vinfo,
static void
slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
class loop *update_loop,
- edge guard_edge, edge merge_edge)
+ edge guard_edge, edge merge_edge,
+ vec<data_reference_p> *datarefs)
{
location_t merge_loc, guard_loc;
edge orig_e = loop_preheader_edge (skip_loop);
@@ -2310,6 +2319,25 @@ slpeel_update_phi_nodes_for_guard1 (class loop
*skip_loop,
/* Update phi in UPDATE_PHI. */
adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
+
+ if (datarefs) {
+ struct data_reference *dr;
+ unsigned int i;
+
+ FOR_EACH_VEC_ELT (*datarefs, i, dr) {
+ if (*(dr->iv_bases->get (skip_loop)) == PHI_RESULT (orig_phi)) {
+ tree &iv_base = dr->iv_bases->get_or_insert (update_loop);
+ iv_base = PHI_RESULT (update_phi);
+ /*
+ tree new_dummy_var = make_ssa_name (TREE_TYPE (iv_base));
+ SET_PHI_ARG_DEF (update_phi,
+ loop_latch_edge (update_loop)->dest_idx,
+ new_dummy_var);
+ */
+ }
+ }
+ }
+
}
}
@@ -2831,7 +2859,10 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters,
tree nitersm1,
irred_flag);
e = EDGE_PRED (guard_to, 0);
e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
- slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
+ slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e,
+ NULL);
+ /* TODO: create dummy phi-nodes and iv-bases for prolog. Update
+ 'loop's iv-bases. */
scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
scale_loop_profile (prolog, prob_prolog, bound_prolog);
@@ -2931,7 +2962,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters,
tree nitersm1,
skip_e = guard_e;
e = EDGE_PRED (guard_to, 0);
e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
- slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
+ slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e,
+ &LOOP_VINFO_DATAREFS
(loop_vinfo));
/* Simply propagate profile info from guard_bb to guard_to which is
a merge point of control flow. */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index
3e973e774af8f9205be893e01ad9263281116885..638f620db96438b67512aa0d364a4dde57e1ea42
100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -9264,11 +9264,6 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree
advance)
free (LOOP_VINFO_BBS (epilogue_vinfo));
LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
- /* Advance data_reference's with the number of iterations of the previous
- loop and its prologue. */
- vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
-
-
/* The EPILOGUE loop is a copy of the original loop so they share the same
gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
point to the copied statements. We also create a mapping of all LHS' in
@@ -9281,7 +9276,9 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree
advance)
{
new_stmt = epilogue_phi_gsi.phi ();
- gcc_assert (gimple_uid (new_stmt) > 0);
+ if (gimple_uid (new_stmt) == 0)
+ continue;
+
stmt_vinfo
= epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
@@ -9299,10 +9296,9 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree
advance)
!gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
{
new_stmt = gsi_stmt (epilogue_gsi);
- if (is_gimple_debug (new_stmt))
+ if (is_gimple_debug (new_stmt) || gimple_uid (new_stmt) == 0)
continue;
- gcc_assert (gimple_uid (new_stmt) > 0);
stmt_vinfo
= epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
@@ -9484,6 +9480,51 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple
*loop_vectorized_call)
tree advance;
drs_init_vec orig_drs_init;
+ if (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) == NULL)
+ {
+ struct data_reference *dr;
+ unsigned int i;
+ gimple_seq stmts;
+ gimple_stmt_iterator gsi = gsi_for_stmt (get_loop_exit_condition (loop));
+
+ FOR_EACH_VEC_ELT (LOOP_VINFO_DATAREFS (loop_vinfo), i, dr) {
+ tree &iv_base = dr->iv_bases->get_or_insert (loop);
+ tree type = TREE_TYPE (DR_BASE_ADDRESS (dr));
+ gphi *phi = create_phi_node (type, loop->header);
+
+ /* Create an update. */
+ int step_i = int_cst_value (DR_STEP (dr));
+ step_i = step_i / int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF
(dr))));
+ tree step = build_int_cst (sizetype, step_i);
+ tree update = fold_build2 (POINTER_PLUS_EXPR, type,
+ PHI_RESULT (phi),
+ step);
+ stmts = NULL;
+ update = force_gimple_operand (update, &stmts, true, NULL_TREE);
+ gsi_insert_seq_before (&gsi, stmts, GSI_NEW_STMT);
+
+ /* Replace IV in ref. */
+ tree mem = fold_build2 (MEM_REF, TREE_TYPE (DR_REF (dr)),
+ PHI_RESULT (phi),
+ build_zero_cst (type));
+ for (int j = 0; j < gimple_num_ops (DR_STMT (dr)); ++j) {
+ tree op = gimple_op (DR_STMT (dr), j);
+ if (op == DR_REF (dr))
+ gimple_set_op (DR_STMT (dr), j, mem);
+ else
+ simplify_replace_tree (op, DR_REF (dr), mem, NULL, NULL, false);
+ }
+ update_stmt (DR_STMT (dr));
+
+ tree dummy_base = make_ssa_name (type);
+ add_phi_arg (phi, update, single_succ_edge (loop->latch),
+ UNKNOWN_LOCATION);
+ add_phi_arg (phi, dummy_base, loop_preheader_edge (loop),
UNKNOWN_LOCATION);
+ iv_base = PHI_RESULT (phi);
+ }
+ }
+
+
epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
&step_vector, &niters_vector_mult_vf, th,
check_profitability, niters_no_overflow,