Hi Richard,
As I mentioned in the IRC channel, this is my current work in progress
patch. It currently ICE's when vectorizing
gcc/testsuite/gcc.c-torture/execute/nestfunc-2.c with '-O3' and '--param
vect-epilogues-nomask=1'.
It ICE's because the epilogue loop (after if conversion) and main loop
(before vectorization) are not the same, there are a bunch of extra BBs
and the epilogue loop seems to need some cleaning up too.
Let me know if you see a way around this issue.
Cheers,
Andre
diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 0b0154ffd7bf031a005de993b101d9db6dd98c43..d01512ea46467f1cf77793bdc75b48e71b0b9641 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -21,6 +21,7 @@ along with GCC; see the file COPYING3. If not see
#define GCC_CFGLOOP_H
#include "cfgloopmanip.h"
+#include "target.h"
/* Structure to hold decision about unrolling/peeling. */
enum lpt_dec
@@ -268,6 +269,9 @@ public:
the basic-block from being collected but its index can still be
reused. */
basic_block former_header;
+
+ /* Keep track of vector sizes we know we can vectorize the epilogue with. */
+ vector_sizes epilogue_vsizes;
};
/* Set if the loop is known to be infinite. */
diff --git a/gcc/cfgloop.c b/gcc/cfgloop.c
index 4ad1f658708f83dbd8789666c26d4bd056837bc6..f3e81bcd00b3f125389aa15b12dc5201b3578d20 100644
--- a/gcc/cfgloop.c
+++ b/gcc/cfgloop.c
@@ -198,6 +198,7 @@ flow_loop_free (class loop *loop)
exit->prev = exit;
}
+ loop->epilogue_vsizes.release();
ggc_free (loop->exits);
ggc_free (loop);
}
@@ -355,6 +356,7 @@ alloc_loop (void)
loop->nb_iterations_upper_bound = 0;
loop->nb_iterations_likely_upper_bound = 0;
loop->nb_iterations_estimate = 0;
+ loop->epilogue_vsizes.create(8);
return loop;
}
diff --git a/gcc/gengtype.c b/gcc/gengtype.c
index 53317337cf8c8e8caefd6b819d28b3bba301e755..80fb6ef71465b24e034fa45d69fec56be6b2e7f8 100644
--- a/gcc/gengtype.c
+++ b/gcc/gengtype.c
@@ -5197,6 +5197,7 @@ main (int argc, char **argv)
POS_HERE (do_scalar_typedef ("widest_int", &pos));
POS_HERE (do_scalar_typedef ("int64_t", &pos));
POS_HERE (do_scalar_typedef ("poly_int64", &pos));
+ POS_HERE (do_scalar_typedef ("poly_uint64", &pos));
POS_HERE (do_scalar_typedef ("uint64_t", &pos));
POS_HERE (do_scalar_typedef ("uint8", &pos));
POS_HERE (do_scalar_typedef ("uintptr_t", &pos));
@@ -5206,6 +5207,7 @@ main (int argc, char **argv)
POS_HERE (do_scalar_typedef ("machine_mode", &pos));
POS_HERE (do_scalar_typedef ("fixed_size_mode", &pos));
POS_HERE (do_scalar_typedef ("CONSTEXPR", &pos));
+ POS_HERE (do_scalar_typedef ("vector_sizes", &pos));
POS_HERE (do_typedef ("PTR",
create_pointer (resolve_typedef ("void", &pos)),
&pos));
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 5c25441c70a271f04730486e513437fffa75b7e3..b1c13dafdeeec8d95f00c232822d3ab9b11f4046 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -26,6 +26,7 @@ along with GCC; see the file COPYING3. If not see
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
+#include "tree-if-conv.h"
#include "tree-pass.h"
#include "ssa.h"
#include "fold-const.h"
@@ -1730,6 +1731,7 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
{
unsigned int i;
vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
+ vec<data_reference> datarefs_copy = loop_vinfo->shared->datarefs_copy;
struct data_reference *dr;
DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
@@ -1756,6 +1758,12 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
vect_update_init_of_dr (dr, niters, code);
}
+ FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
+ {
+ dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
+ if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
+ vect_update_init_of_dr (dr, niters, code);
+ }
}
/* For the information recorded in LOOP_VINFO prepare the loop for peeling
@@ -2409,6 +2417,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
profile_probability prob_prolog, prob_vector, prob_epilog;
int estimated_vf;
int prolog_peeling = 0;
+ bool vect_epilogues
+ = loop_vinfo->epilogue_vinfos.length () > 0;
/* We currently do not support prolog peeling if the target alignment is not
known at compile time. 'vect_gen_prolog_loop_niters' depends on the
target alignment being constant. */
@@ -2469,12 +2479,12 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
/* Prolog loop may be skipped. */
bool skip_prolog = (prolog_peeling != 0);
/* Skip to epilog if scalar loop may be preferred. It's only needed
- when we peel for epilog loop and when it hasn't been checked with
- loop versioning. */
+ when we peel for epilog loop or when we loop version. */
bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
bound_prolog + bound_epilog)
- : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
+ : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || vect_epilogues));
/* Epilog loop must be executed if the number of iterations for epilog
loop is known at compile time, otherwise we need to add a check at
the end of vector loop and skip to the end of epilog loop. */
@@ -2586,6 +2596,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
}
/* Peel epilog and put it on exit edge of loop. */
epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
+
if (!epilog)
{
dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
@@ -2966,9 +2977,7 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
*COND_EXPR_STMT_LIST. */
class loop *
-vect_loop_versioning (loop_vec_info loop_vinfo,
- unsigned int th, bool check_profitability,
- poly_uint64 versioning_threshold)
+vect_loop_versioning (loop_vec_info loop_vinfo)
{
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
@@ -2988,10 +2997,15 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
+ poly_uint64 versioning_threshold
+ = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
tree version_simd_if_cond
= LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
+ unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
- if (check_profitability)
+ if (th >= vect_vf_for_cost (loop_vinfo)
+ && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && !ordered_p (th, versioning_threshold))
cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
build_int_cst (TREE_TYPE (scalar_loop_iters),
th - 1));
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b0cbbac0cb5ba1ffce706715d3dbb9139063803d..3c355eccc5bef6668456fddf485b4996f2d2fb38 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -885,6 +885,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
}
}
}
+
+ epilogue_vinfos.create (6);
}
/* Free all levels of MASKS. */
@@ -960,6 +962,7 @@ _loop_vec_info::~_loop_vec_info ()
release_vec_loop_masks (&masks);
delete ivexpr_map;
delete scan_map;
+ epilogue_vinfos.release ();
loop->aux = NULL;
}
@@ -1726,7 +1729,13 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
return 0;
}
- HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
+ HOST_WIDE_INT estimated_niter = -1;
+
+ if (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+ estimated_niter
+ = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
+ if (estimated_niter == -1)
+ estimated_niter = estimated_stmt_executions_int (loop);
if (estimated_niter == -1)
estimated_niter = likely_max_stmt_executions_int (loop);
if (estimated_niter != -1
@@ -1864,6 +1873,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
int res;
unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
poly_uint64 min_vf = 2;
+ loop_vec_info orig_loop_vinfo = NULL;
/* The first group of checks is independent of the vector size. */
fatal = true;
@@ -2183,9 +2193,12 @@ start_over:
enough for both peeled prolog loop and vector loop. This check
can be merged along with threshold check of loop versioning, so
increase threshold for this case if necessary. */
- if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
+ if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || ((orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+ && LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)))
{
poly_uint64 niters_th = 0;
+ unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
{
@@ -2206,6 +2219,14 @@ start_over:
/* One additional iteration because of peeling for gap. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
niters_th += 1;
+
+ /* Use the same condition as vect_transform_loop to decide when to use
+ the cost to determine a versioning threshold. */
+ if (th >= vect_vf_for_cost (loop_vinfo)
+ && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && ordered_p (th, niters_th))
+ niters_th = ordered_max (poly_uint64 (th), niters_th);
+
LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
}
@@ -2329,14 +2350,8 @@ again:
be vectorized. */
opt_loop_vec_info
vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
- vec_info_shared *shared)
+ vec_info_shared *shared, vector_sizes vector_sizes)
{
- auto_vector_sizes vector_sizes;
-
- /* Autodetect first vector size we try. */
- current_vector_size = 0;
- targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
- loop->simdlen != 0);
unsigned int next_size = 0;
DUMP_VECT_SCOPE ("analyze_loop_nest");
@@ -2357,6 +2372,9 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
poly_uint64 autodetected_vector_size = 0;
opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
poly_uint64 first_vector_size = 0;
+ poly_uint64 lowest_th = 0;
+ unsigned vectorized_loops = 0;
+ bool vect_epilogues = !loop->simdlen && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK);
while (1)
{
/* Check the CFG characteristics of the loop (nesting, entry/exit). */
@@ -2375,24 +2393,53 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
if (orig_loop_vinfo)
LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
+ else if (vect_epilogues && first_loop_vinfo)
+ {
+ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
+ }
opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
if (res)
{
LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
+ vectorized_loops++;
- if (loop->simdlen
- && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
- (unsigned HOST_WIDE_INT) loop->simdlen))
+ if ((loop->simdlen
+ && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+ (unsigned HOST_WIDE_INT) loop->simdlen))
+ || vect_epilogues)
{
if (first_loop_vinfo == NULL)
{
first_loop_vinfo = loop_vinfo;
+ lowest_th
+ = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
first_vector_size = current_vector_size;
loop->aux = NULL;
}
else
- delete loop_vinfo;
+ {
+ /* Keep track of vector sizes that we know we can vectorize
+ the epilogue with. */
+ if (vect_epilogues)
+ {
+ loop->epilogue_vsizes.reserve (1);
+ loop->epilogue_vsizes.quick_push (current_vector_size);
+ first_loop_vinfo->epilogue_vinfos.reserve (1);
+ first_loop_vinfo->epilogue_vinfos.quick_push (loop_vinfo);
+ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
+ poly_uint64 th
+ = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
+ gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || maybe_ne (lowest_th, 0U));
+ /* Keep track of the known smallest versioning threshold.
+ */
+ if (ordered_p (lowest_th, th))
+ lowest_th = ordered_min (lowest_th, th);
+ }
+ else
+ delete loop_vinfo;
+ }
}
else
{
@@ -2430,6 +2477,8 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
dump_dec (MSG_NOTE, current_vector_size);
dump_printf (MSG_NOTE, "\n");
}
+ LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
+
return first_loop_vinfo;
}
else
@@ -8483,6 +8532,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
gimple *stmt;
bool check_profitability = false;
unsigned int th;
+ auto_vec<gimple *> orig_stmts;
DUMP_VECT_SCOPE ("vec_transform_loop");
@@ -8497,11 +8547,11 @@ vect_transform_loop (loop_vec_info loop_vinfo)
if (th >= vect_vf_for_cost (loop_vinfo)
&& !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "Profitability threshold is %d loop iterations.\n",
- th);
- check_profitability = true;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Profitability threshold is %d loop iterations.\n",
+ th);
+ check_profitability = true;
}
/* Make sure there exists a single-predecessor exit bb. Do this before
@@ -8519,18 +8569,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
{
- poly_uint64 versioning_threshold
- = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
- if (check_profitability
- && ordered_p (poly_uint64 (th), versioning_threshold))
- {
- versioning_threshold = ordered_max (poly_uint64 (th),
- versioning_threshold);
- check_profitability = false;
- }
class loop *sloop
- = vect_loop_versioning (loop_vinfo, th, check_profitability,
- versioning_threshold);
+ = vect_loop_versioning (loop_vinfo);
sloop->force_vectorize = false;
check_profitability = false;
}
@@ -8558,6 +8598,66 @@ vect_transform_loop (loop_vec_info loop_vinfo)
epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
&step_vector, &niters_vector_mult_vf, th,
check_profitability, niters_no_overflow);
+ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+ epilogue = NULL;
+
+ if (loop_vinfo->epilogue_vinfos.length () == 0)
+ epilogue = NULL;
+
+ /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
+ on niters already adjusted for the iterations of the prologue. */
+ if (epilogue && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && known_eq (vf, lowest_vf))
+ {
+ vector_sizes vector_sizes = loop->epilogue_vsizes;
+ unsigned int next_size = 0;
+ unsigned HOST_WIDE_INT eiters
+ = (LOOP_VINFO_INT_NITERS (loop_vinfo)
+ - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
+ eiters
+ = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
+ epilogue->nb_iterations_upper_bound = eiters - 1;
+ epilogue->any_upper_bound = true;
+
+ unsigned int ratio;
+ while (next_size < vector_sizes.length ()
+ && !(constant_multiple_p (current_vector_size,
+ vector_sizes[next_size], &ratio)
+ && eiters >= lowest_vf / ratio))
+ next_size += 1;
+
+ if (next_size == vector_sizes.length ())
+ epilogue = NULL;
+ }
+
+ if (epilogue)
+ {
+ loop_vec_info epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
+ loop_vinfo->epilogue_vinfos.ordered_remove (0);
+ epilogue->aux = epilogue_vinfo;
+ LOOP_VINFO_LOOP (epilogue_vinfo) = epilogue;
+ epilogue->simduid = loop->simduid;
+
+ epilogue->force_vectorize = loop->force_vectorize;
+ epilogue->safelen = loop->safelen;
+ epilogue->dont_vectorize = false;
+
+ /* update stmts in stmt_vec_info for epilog */
+ gimple_stmt_iterator gsi;
+ gphi_iterator phi_gsi;
+ basic_block *bbs = get_loop_body (loop);
+
+ for (unsigned i = 0; i < loop->num_nodes; ++i)
+ {
+ for (phi_gsi = gsi_start_phis (bbs[i]); !gsi_end_p (phi_gsi);
+ gsi_next (&phi_gsi))
+ orig_stmts.safe_push (phi_gsi.phi ());
+
+ for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
+ orig_stmts.safe_push (gsi_stmt (gsi));
+ }
+ }
+
if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
&& LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
@@ -8814,57 +8914,86 @@ vect_transform_loop (loop_vec_info loop_vinfo)
since vectorized loop can have loop-carried dependencies. */
loop->safelen = 0;
- /* Don't vectorize epilogue for epilogue. */
- if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
- epilogue = NULL;
-
- if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
- epilogue = NULL;
if (epilogue)
{
- auto_vector_sizes vector_sizes;
- targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
- unsigned int next_size = 0;
+ if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
+ tree_if_conversion (epilogue);
- /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
- on niters already ajusted for the iterations of the prologue. */
- if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && known_eq (vf, lowest_vf))
+ loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
+ hash_map<tree,tree> mapping;
+ auto_vec<stmt_vec_info> worklist;
+ basic_block *bbs = get_loop_body (epilogue);
+ gimple_stmt_iterator gsi;
+ gphi_iterator phi_gsi;
+ gimple * orig_stmt, * new_stmt;
+ stmt_vec_info stmt_vinfo = NULL;
+
+ LOOP_VINFO_BBS (epilogue_vinfo) = bbs;
+ for (unsigned i = 0; i < epilogue->num_nodes; ++i)
{
- unsigned HOST_WIDE_INT eiters
- = (LOOP_VINFO_INT_NITERS (loop_vinfo)
- - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
- eiters
- = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
- epilogue->nb_iterations_upper_bound = eiters - 1;
- epilogue->any_upper_bound = true;
-
- unsigned int ratio;
- while (next_size < vector_sizes.length ()
- && !(constant_multiple_p (current_vector_size,
- vector_sizes[next_size], &ratio)
- && eiters >= lowest_vf / ratio))
- next_size += 1;
+ for (phi_gsi = gsi_start_phis (bbs[i]); !gsi_end_p (phi_gsi);
+ gsi_next (&phi_gsi))
+ {
+ gcc_assert (orig_stmts.length () > 0);
+ orig_stmt = orig_stmts[0];
+ orig_stmts.ordered_remove (0);
+ new_stmt = phi_gsi.phi ();
+
+ stmt_vinfo
+ = epilogue_vinfo->lookup_stmt (orig_stmt);
+
+ STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
+ gimple_set_uid (new_stmt, gimple_uid (orig_stmt));
+
+ mapping.put (gimple_phi_result (orig_stmt),
+ gimple_phi_result (new_stmt));
+
+ if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
+ worklist.safe_push (stmt_vinfo);
+ }
+
+ for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gcc_assert (orig_stmts.length () > 0);
+ orig_stmt = orig_stmts[0];
+ orig_stmts.ordered_remove (0);
+ new_stmt = gsi_stmt (gsi);
+
+ stmt_vinfo
+ = epilogue_vinfo->lookup_stmt (orig_stmt);
+
+ STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
+ gimple_set_uid (new_stmt, gimple_uid (orig_stmt));
+
+ if (is_gimple_assign (orig_stmt))
+ {
+ gcc_assert (is_gimple_assign (new_stmt));
+ mapping.put (gimple_assign_lhs (orig_stmt),
+ gimple_assign_lhs (new_stmt));
+ }
+
+ if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
+ worklist.safe_push (stmt_vinfo);
+ }
+ gcc_assert (orig_stmts.is_empty ());
}
- else
- while (next_size < vector_sizes.length ()
- && maybe_lt (current_vector_size, vector_sizes[next_size]))
- next_size += 1;
- if (next_size == vector_sizes.length ())
- epilogue = NULL;
- }
+ for (unsigned i = 0; i < worklist.length (); ++i)
+ {
+ tree *new_t;
+ gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (worklist[i]);
- if (epilogue)
- {
- epilogue->force_vectorize = loop->force_vectorize;
- epilogue->safelen = loop->safelen;
- epilogue->dont_vectorize = false;
+ while (seq)
+ {
+ for (unsigned j = 1; j < gimple_num_ops (seq); ++j)
+ if ((new_t = mapping.get(gimple_op (seq, j))))
+ gimple_set_op (seq, j, *new_t);
+ seq = seq->next;
+ }
+ }
- /* We may need to if-convert epilogue to vectorize it. */
- if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
- tree_if_conversion (epilogue);
+ vect_analyze_scalar_cycles (epilogue_vinfo);
}
return epilogue;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1456cde4c2c2dec7244c504d2c496248894a4f1e..6e453d39190b362b6d5a01bc2167e10a617f91f9 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -564,6 +564,8 @@ public:
this points to the original vectorized loop. Otherwise NULL. */
_loop_vec_info *orig_loop_info;
+ vec<_loop_vec_info *> epilogue_vinfos;
+
} *loop_vec_info;
/* Access Functions. */
@@ -1480,10 +1482,9 @@ extern void vect_set_loop_condition (class loop *, loop_vec_info,
extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge);
class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *,
class loop *, edge);
-class loop *vect_loop_versioning (loop_vec_info, unsigned int, bool,
- poly_uint64);
+class loop *vect_loop_versioning (loop_vec_info);
extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
- tree *, tree *, tree *, int, bool, bool);
+ tree *, tree *, tree *, int, bool, bool);
extern void vect_prepare_for_masked_peels (loop_vec_info);
extern dump_user_location_t find_loop_location (class loop *);
extern bool vect_can_advance_ivs_p (loop_vec_info);
@@ -1610,7 +1611,8 @@ extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
/* Drive for loop analysis stage. */
extern opt_loop_vec_info vect_analyze_loop (class loop *,
loop_vec_info,
- vec_info_shared *);
+ vec_info_shared *,
+ vector_sizes);
extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
tree *, bool);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 173e6b51652fd023893b38da786ff28f827553b5..71bbf4fdf8dc7588c45a0e8feef9272b52c0c04c 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -875,6 +875,10 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
vec_info_shared shared;
auto_purge_vect_location sentinel;
vect_location = find_loop_location (loop);
+ auto_vector_sizes auto_vector_sizes;
+ vector_sizes vector_sizes;
+ bool assert_versioning = false;
+
if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
&& dump_enabled_p ())
dump_printf (MSG_NOTE | MSG_PRIORITY_INTERNALS,
@@ -882,10 +886,35 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
LOCATION_FILE (vect_location.get_location_t ()),
LOCATION_LINE (vect_location.get_location_t ()));
+ /* If this is an epilogue, we already know what vector sizes we will use for
+ vectorization as the analysis was part of the main vectorized loop. Use
+ these instead of going through all vector sizes again. */
+ if (orig_loop_vinfo
+ && !LOOP_VINFO_LOOP (orig_loop_vinfo)->epilogue_vsizes.is_empty ())
+ {
+ vector_sizes = LOOP_VINFO_LOOP (orig_loop_vinfo)->epilogue_vsizes;
+ assert_versioning = LOOP_REQUIRES_VERSIONING (orig_loop_vinfo);
+ current_vector_size = vector_sizes[0];
+ }
+ else
+ {
+ /* Autodetect first vector size we try. */
+ current_vector_size = 0;
+
+ targetm.vectorize.autovectorize_vector_sizes (&auto_vector_sizes,
+ loop->simdlen != 0);
+ vector_sizes = auto_vector_sizes;
+ }
+
/* Try to analyze the loop, retaining an opt_problem if dump_enabled_p. */
- opt_loop_vec_info loop_vinfo
- = vect_analyze_loop (loop, orig_loop_vinfo, &shared);
- loop->aux = loop_vinfo;
+ opt_loop_vec_info loop_vinfo = opt_loop_vec_info::success (NULL);
+ if (loop_vec_info_for_loop (loop))
+ loop_vinfo = opt_loop_vec_info::success (loop_vec_info_for_loop (loop));
+ else
+ {
+ loop_vinfo = vect_analyze_loop (loop, orig_loop_vinfo, &shared, vector_sizes);
+ loop->aux = loop_vinfo;
+ }
if (!loop_vinfo)
if (dump_enabled_p ())
@@ -898,6 +927,10 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
{
+ /* If this loop requires versioning, make sure the analysis done on the
+ epilogue loops succeeds. */
+ gcc_assert (!assert_versioning);
+
/* Free existing information if loop is analyzed with some
assumptions. */
if (loop_constraint_set_p (loop, LOOP_C_FINITE))
@@ -1013,8 +1046,13 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
/* Epilogue of vectorized loop must be vectorized too. */
if (new_loop)
- ret |= try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops,
- new_loop, loop_vinfo, NULL, NULL);
+ {
+ /* Don't include vectorized epilogues in the "vectorized loops" count.
+ */
+ unsigned dont_count = *num_vectorized_loops;
+ ret |= try_vectorize_loop_1 (simduid_to_vf_htab, &dont_count,
+ new_loop, loop_vinfo, NULL, NULL);
+ }
return ret;
}