Hi Richard,
As I mentioned in the IRC channel, I managed to get "most" of the
regression testsuite working for x86_64 (avx512) and aarch64.
On x86_64 I get a failure that I can't explain; I was hoping you might be
able to have a look with me:
"PASS->FAIL: gcc.target/i386/vect-perm-odd-1.c execution test"
vect-perm-odd-1.exe segfaults, and when I run it under gdb the crash seems
to be in the first iteration of the main loop. The tree dumps look alright,
but I do notice the stack usage seems to change between --param
vect-epilogues-nomask={0,1}.
Am I forgetting to update some field that later determines the amount of
stack being used? I am confused; it could very well be that I am missing
something obvious, as I am not too familiar with x86's ISA. I will try to
investigate further.
This patch needs further clean-up and more comments (or comment
updates), but I thought I'd share the current state to see if you can help
unblock me.
Cheers,
Andre
diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 0b0154ffd7bf031a005de993b101d9db6dd98c43..d01512ea46467f1cf77793bdc75b48e71b0b9641 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -21,6 +21,7 @@ along with GCC; see the file COPYING3. If not see
#define GCC_CFGLOOP_H
#include "cfgloopmanip.h"
+#include "target.h"
/* Structure to hold decision about unrolling/peeling. */
enum lpt_dec
@@ -268,6 +269,9 @@ public:
the basic-block from being collected but its index can still be
reused. */
basic_block former_header;
+
+ /* Keep track of vector sizes we know we can vectorize the epilogue with. */
+ vector_sizes epilogue_vsizes;
};
/* Set if the loop is known to be infinite. */
diff --git a/gcc/cfgloop.c b/gcc/cfgloop.c
index 4ad1f658708f83dbd8789666c26d4bd056837bc6..f3e81bcd00b3f125389aa15b12dc5201b3578d20 100644
--- a/gcc/cfgloop.c
+++ b/gcc/cfgloop.c
@@ -198,6 +198,7 @@ flow_loop_free (class loop *loop)
exit->prev = exit;
}
+ loop->epilogue_vsizes.release();
ggc_free (loop->exits);
ggc_free (loop);
}
@@ -355,6 +356,7 @@ alloc_loop (void)
loop->nb_iterations_upper_bound = 0;
loop->nb_iterations_likely_upper_bound = 0;
loop->nb_iterations_estimate = 0;
+ loop->epilogue_vsizes.create(8);
return loop;
}
diff --git a/gcc/gengtype.c b/gcc/gengtype.c
index 53317337cf8c8e8caefd6b819d28b3bba301e755..80fb6ef71465b24e034fa45d69fec56be6b2e7f8 100644
--- a/gcc/gengtype.c
+++ b/gcc/gengtype.c
@@ -5197,6 +5197,7 @@ main (int argc, char **argv)
POS_HERE (do_scalar_typedef ("widest_int", &pos));
POS_HERE (do_scalar_typedef ("int64_t", &pos));
POS_HERE (do_scalar_typedef ("poly_int64", &pos));
+ POS_HERE (do_scalar_typedef ("poly_uint64", &pos));
POS_HERE (do_scalar_typedef ("uint64_t", &pos));
POS_HERE (do_scalar_typedef ("uint8", &pos));
POS_HERE (do_scalar_typedef ("uintptr_t", &pos));
@@ -5206,6 +5207,7 @@ main (int argc, char **argv)
POS_HERE (do_scalar_typedef ("machine_mode", &pos));
POS_HERE (do_scalar_typedef ("fixed_size_mode", &pos));
POS_HERE (do_scalar_typedef ("CONSTEXPR", &pos));
+ POS_HERE (do_scalar_typedef ("vector_sizes", &pos));
POS_HERE (do_typedef ("PTR",
create_pointer (resolve_typedef ("void", &pos)),
&pos));
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 5c25441c70a271f04730486e513437fffa75b7e3..189f7458b1b20be06a9a20d3ee05e74bc176434c 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -26,6 +26,7 @@ along with GCC; see the file COPYING3. If not see
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
+#include "tree-if-conv.h"
#include "tree-pass.h"
#include "ssa.h"
#include "fold-const.h"
@@ -1724,7 +1725,7 @@ vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
CODE and NITERS are as for vect_update_inits_of_dr. */
-static void
+void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
tree_code code)
{
@@ -1736,19 +1737,7 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
/* Adjust niters to sizetype and insert stmts on loop preheader edge. */
if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
- {
- gimple_seq seq;
- edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
- tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
-
- niters = fold_convert (sizetype, niters);
- niters = force_gimple_operand (niters, &seq, false, var);
- if (seq)
- {
- basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
- gcc_assert (!new_bb);
- }
- }
+ niters = fold_convert (sizetype, niters);
FOR_EACH_VEC_ELT (datarefs, i, dr)
{
@@ -2401,14 +2390,18 @@ class loop *
vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
tree *niters_vector, tree *step_vector,
tree *niters_vector_mult_vf_var, int th,
- bool check_profitability, bool niters_no_overflow)
+ bool check_profitability, bool niters_no_overflow,
+ tree *advance)
{
edge e, guard_e;
- tree type = TREE_TYPE (niters), guard_cond;
+ tree type = TREE_TYPE (niters), guard_cond, advance_guard = NULL;
basic_block guard_bb, guard_to;
profile_probability prob_prolog, prob_vector, prob_epilog;
int estimated_vf;
int prolog_peeling = 0;
+ bool vect_epilogues
+ = loop_vinfo->epilogue_vinfos.length () > 0
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo);
/* We currently do not support prolog peeling if the target alignment is not
known at compile time. 'vect_gen_prolog_loop_niters' depends on the
target alignment being constant. */
@@ -2466,15 +2459,61 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
else
niters_prolog = build_int_cst (type, 0);
+ loop_vec_info epilogue_vinfo = NULL;
+ if (vect_epilogues)
+ {
+ epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
+ loop_vinfo->epilogue_vinfos.ordered_remove (0);
+
+ /* Don't vectorize epilogues if this is not the innermost loop or if the
+ epilogue loop may need to be peeled for alignment. */
+ if (loop->inner != NULL
+ || LOOP_VINFO_PEELING_FOR_ALIGNMENT (epilogue_vinfo))
+ vect_epilogues = false;
+
+ }
+
+ unsigned int lowest_vf = constant_lower_bound (vf);
+ bool epilogue_any_upper_bound = false;
+ unsigned HOST_WIDE_INT eiters = 0;
+ tree niters_vector_mult_vf;
+
+ /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
+ on niters already adjusted for the iterations of the prologue. */
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && known_eq (vf, lowest_vf))
+ {
+ vector_sizes vector_sizes = loop->epilogue_vsizes;
+ unsigned next_size = 0;
+ eiters = (LOOP_VINFO_INT_NITERS (loop_vinfo)
+ - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
+
+ if (prolog_peeling > 0)
+ eiters -= prolog_peeling;
+ eiters
+ = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
+ epilogue_any_upper_bound = true;
+
+ unsigned int ratio;
+ while (next_size < vector_sizes.length ()
+ && !(constant_multiple_p (current_vector_size,
+ vector_sizes[next_size], &ratio)
+ && eiters >= lowest_vf / ratio))
+ next_size += 1;
+
+ if (next_size == vector_sizes.length ())
+ vect_epilogues = false;
+ }
+
/* Prolog loop may be skipped. */
bool skip_prolog = (prolog_peeling != 0);
/* Skip to epilog if scalar loop may be preferred. It's only needed
- when we peel for epilog loop and when it hasn't been checked with
- loop versioning. */
+ when we peel for epilog loop or when we loop version. */
bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
bound_prolog + bound_epilog)
- : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
+ : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || vect_epilogues));
/* Epilog loop must be executed if the number of iterations for epilog
loop is known at compile time, otherwise we need to add a check at
the end of vector loop and skip to the end of epilog loop. */
@@ -2503,7 +2542,17 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
}
dump_user_location_t loop_loc = find_loop_location (loop);
- class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
+ class loop *scalar_loop;
+ if (vect_epilogues)
+ {
+ scalar_loop = get_loop_copy (loop);
+ LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
+ = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
+ LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = NULL;
+ }
+ else
+ scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
+
if (prolog_peeling)
{
e = loop_preheader_edge (loop);
@@ -2586,12 +2635,24 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
}
/* Peel epilog and put it on exit edge of loop. */
epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
+
if (!epilog)
{
dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
"slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
gcc_unreachable ();
}
+
+ if (epilogue_any_upper_bound && prolog_peeling >= 0)
+ {
+ epilog->any_upper_bound = true;
+ epilog->nb_iterations_upper_bound = eiters + 1;
+ }
+ else if (prolog_peeling < 0)
+ {
+ epilog->any_upper_bound = false;
+ }
+
epilog->force_vectorize = false;
slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
@@ -2608,6 +2669,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
check_profitability);
/* Build guard against NITERSM1 since NITERS may overflow. */
guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
+ advance_guard = guard_cond;
guard_bb = anchor;
guard_to = split_edge (loop_preheader_edge (epilog));
guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
@@ -2635,7 +2697,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
}
basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
- tree niters_vector_mult_vf;
/* If loop is peeled for non-zero constant times, now niters refers to
orig_niters - prolog_peeling, it won't overflow even the orig_niters
overflows. */
@@ -2699,10 +2760,105 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
adjust_vec_debug_stmts ();
scev_reset ();
}
+
+ if (vect_epilogues)
+ {
+ epilog->aux = epilogue_vinfo;
+ LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
+
+ loop_constraint_clear (epilog, LOOP_C_INFINITE);
+
+ /* We now must calculate the number of iterations for our epilogue. */
+ tree cond_niters, niters;
+
+ /* Depending on whether we peel for gaps we take niters or niters - 1,
+ we will refer to this as N - G, where both N and G are the NITERS and
+ GAP for the original loop. */
+ niters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+ ? LOOP_VINFO_NITERSM1 (loop_vinfo)
+ : LOOP_VINFO_NITERS (loop_vinfo);
+
+ /* Here we build a vector factorization mask:
+ vf_mask = ~(VF - 1), where VF is the Vectorization Factor. */
+ tree vf_mask = build_int_cst (TREE_TYPE (niters),
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+ vf_mask = fold_build2 (MINUS_EXPR, TREE_TYPE (vf_mask),
+ vf_mask,
+ build_one_cst (TREE_TYPE (vf_mask)));
+ vf_mask = fold_build1 (BIT_NOT_EXPR, TREE_TYPE (niters), vf_mask);
+
+ /* Here we calculate:
+ niters = N - ((N-G) & ~(VF -1)) */
+ niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
+ LOOP_VINFO_NITERS (loop_vinfo),
+ fold_build2 (BIT_AND_EXPR, TREE_TYPE (niters),
+ niters,
+ vf_mask));
+
+ if (skip_vector)
+ {
+ /* We do this by constructing:
+ cond_niters = !do_we_enter_main_loop ? N + niters_prolog : niters
+ We add niters_prolog, the number of iterations peeled for alignment,
+ to N in case we don't enter the main loop, as these have already been
+ subtracted from N (the number of iterations of the main loop).
+ Since the prolog peeling is also skipped if we skip the
+ vectorization we must add them back. */
+ cond_niters
+ = fold_build3 (COND_EXPR, TREE_TYPE (niters),
+ advance_guard,
+ fold_build2 (PLUS_EXPR, TREE_TYPE (niters),
+ LOOP_VINFO_NITERS (loop_vinfo),
+ fold_convert (TREE_TYPE (niters),
+ niters_prolog)),
+ niters);
+ }
+ else
+ cond_niters = niters;
+
+ LOOP_VINFO_NITERS (epilogue_vinfo) = cond_niters;
+ LOOP_VINFO_NITERSM1 (epilogue_vinfo)
+ = fold_build2 (MINUS_EXPR, TREE_TYPE (cond_niters),
+ cond_niters, build_one_cst (TREE_TYPE (cond_niters)));
+
+ /* We now calculate the amount of iterations we must advance our
+ epilogue's data references by.
+ Make sure to use sizetype here as we might use a negative constant
+ if the loop peels for alignment. If the target is 64-bit this can go
+ wrong if the computation is not done in sizetype. */
+ *advance = fold_convert (sizetype, niters);
+
+ *advance = fold_build2 (MINUS_EXPR, TREE_TYPE (*advance),
+ *advance,
+ fold_convert (sizetype,
+ LOOP_VINFO_NITERS (loop_vinfo)));
+ *advance = fold_build2 (MINUS_EXPR, TREE_TYPE (*advance),
+ build_zero_cst (TREE_TYPE (*advance)),
+ *advance);
+
+ if (skip_vector)
+ {
+ *advance
+ = fold_build3 (COND_EXPR, TREE_TYPE (*advance),
+ advance_guard,
+ fold_build2 (MINUS_EXPR, TREE_TYPE (*advance),
+ build_zero_cst (TREE_TYPE (*advance)),
+ fold_convert (TREE_TYPE (*advance),
+ niters_prolog)),
+ *advance);
+ }
+
+ /* Redo the peeling for niter analysis as the NITERs and need for
+ alignment have been updated to take the main loop into
+ account. */
+ LOOP_VINFO_PEELING_FOR_NITER (epilogue_vinfo) = false;
+ determine_peel_for_niter (epilogue_vinfo);
+ }
+
adjust_vec.release ();
free_original_copy_tables ();
- return epilog;
+ return vect_epilogues ? epilog : NULL;
}
/* Function vect_create_cond_for_niters_checks.
@@ -2966,9 +3122,7 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
*COND_EXPR_STMT_LIST. */
class loop *
-vect_loop_versioning (loop_vec_info loop_vinfo,
- unsigned int th, bool check_profitability,
- poly_uint64 versioning_threshold)
+vect_loop_versioning (loop_vec_info loop_vinfo)
{
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
@@ -2988,10 +3142,15 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
+ poly_uint64 versioning_threshold
+ = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
tree version_simd_if_cond
= LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
+ unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
- if (check_profitability)
+ if (th >= vect_vf_for_cost (loop_vinfo)
+ && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && !ordered_p (th, versioning_threshold))
cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
build_int_cst (TREE_TYPE (scalar_loop_iters),
th - 1));
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b0cbbac0cb5ba1ffce706715d3dbb9139063803d..6dbde0fe35c29d0357cf5c6e7ab5599957a8242a 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -885,6 +885,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
}
}
}
+
+ epilogue_vinfos.create (6);
}
/* Free all levels of MASKS. */
@@ -960,6 +962,7 @@ _loop_vec_info::~_loop_vec_info ()
release_vec_loop_masks (&masks);
delete ivexpr_map;
delete scan_map;
+ epilogue_vinfos.release ();
loop->aux = NULL;
}
@@ -1726,7 +1729,13 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
return 0;
}
- HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
+ HOST_WIDE_INT estimated_niter = -1;
+
+ if (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+ estimated_niter
+ = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
+ if (estimated_niter == -1)
+ estimated_niter = estimated_stmt_executions_int (loop);
if (estimated_niter == -1)
estimated_niter = likely_max_stmt_executions_int (loop);
if (estimated_niter != -1
@@ -1852,6 +1861,56 @@ vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
}
}
+
+/* Decide whether LOOP_VINFO needs an epilogue loop to handle the remaining
+ scalar iterations and record that in LOOP_VINFO_PEELING_FOR_NITER. */
+
+void
+determine_peel_for_niter (loop_vec_info loop_vinfo)
+{
+ /* Only the fully-masked case clears the flag; the others can only set it. */
+ unsigned HOST_WIDE_INT const_vf;
+ HOST_WIDE_INT max_niter
+ = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+ unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
+ if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+ th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
+ (loop_vinfo));
+
+ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ /* The main loop handles all iterations. */
+ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+ else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+ {
+ /* Work out the (constant) number of iterations that need to be
+ peeled for reasons other than niters. */
+ unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+ peel_niter += 1;
+ if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+ }
+ else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+ /* ??? When peeling for gaps but not alignment, we could
+ try to check whether the (variable) niters is known to be
+ VF * N + 1. That's something of a niche case though. */
+ || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+ || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
+ || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+ < (unsigned) exact_log2 (const_vf))
+ /* In case of versioning, check if the maximum number of
+ iterations is greater than th. If they are identical,
+ the epilogue is unnecessary. */
+ && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || ((unsigned HOST_WIDE_INT) max_niter
+ > (th / const_vf) * const_vf))))
+ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+}
+
+
/* Function vect_analyze_loop_2.
Apply a set of analyses on LOOP, and create a loop_vec_info struct
@@ -1864,6 +1923,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
int res;
unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
poly_uint64 min_vf = 2;
+ loop_vec_info orig_loop_vinfo = NULL;
/* The first group of checks is independent of the vector size. */
fatal = true;
@@ -1979,7 +2039,6 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
vect_compute_single_scalar_iteration_cost (loop_vinfo);
poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
- unsigned th;
/* Check the SLP opportunities in the loop, analyze and build SLP trees. */
ok = vect_analyze_slp (loop_vinfo, *n_stmts);
@@ -2019,9 +2078,6 @@ start_over:
LOOP_VINFO_INT_NITERS (loop_vinfo));
}
- HOST_WIDE_INT max_niter
- = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
-
/* Analyze the alignment of the data-refs in the loop.
Fail if a data reference is found that cannot be vectorized. */
@@ -2125,42 +2181,7 @@ start_over:
return opt_result::failure_at (vect_location,
"Loop costings not worthwhile.\n");
- /* Decide whether we need to create an epilogue loop to handle
- remaining scalar iterations. */
- th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
-
- unsigned HOST_WIDE_INT const_vf;
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
- /* The main loop handles all iterations. */
- LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
- else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
- {
- /* Work out the (constant) number of iterations that need to be
- peeled for reasons other than niters. */
- unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
- if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
- peel_niter += 1;
- if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
- LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
- LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
- }
- else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
- /* ??? When peeling for gaps but not alignment, we could
- try to check whether the (variable) niters is known to be
- VF * N + 1. That's something of a niche case though. */
- || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
- || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
- || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
- < (unsigned) exact_log2 (const_vf))
- /* In case of versioning, check if the maximum number of
- iterations is greater than th. If they are identical,
- the epilogue is unnecessary. */
- && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
- || ((unsigned HOST_WIDE_INT) max_niter
- > (th / const_vf) * const_vf))))
- LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
-
+ determine_peel_for_niter (loop_vinfo);
/* If an epilogue loop is required make sure we can create one. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
|| LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
@@ -2183,9 +2204,12 @@ start_over:
enough for both peeled prolog loop and vector loop. This check
can be merged along with threshold check of loop versioning, so
increase threshold for this case if necessary. */
- if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
+ if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || ((orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+ && LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)))
{
poly_uint64 niters_th = 0;
+ unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
{
@@ -2206,6 +2230,14 @@ start_over:
/* One additional iteration because of peeling for gap. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
niters_th += 1;
+
+ /* Use the same condition as vect_transform_loop to decide when to use
+ the cost to determine a versioning threshold. */
+ if (th >= vect_vf_for_cost (loop_vinfo)
+ && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && ordered_p (th, niters_th))
+ niters_th = ordered_max (poly_uint64 (th), niters_th);
+
LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
}
@@ -2329,14 +2361,8 @@ again:
be vectorized. */
opt_loop_vec_info
vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
- vec_info_shared *shared)
+ vec_info_shared *shared, vector_sizes vector_sizes)
{
- auto_vector_sizes vector_sizes;
-
- /* Autodetect first vector size we try. */
- current_vector_size = 0;
- targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
- loop->simdlen != 0);
unsigned int next_size = 0;
DUMP_VECT_SCOPE ("analyze_loop_nest");
@@ -2357,6 +2383,9 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
poly_uint64 autodetected_vector_size = 0;
opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
poly_uint64 first_vector_size = 0;
+ poly_uint64 lowest_th = 0;
+ unsigned vectorized_loops = 0;
+ bool vect_epilogues = !loop->simdlen && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK);
while (1)
{
/* Check the CFG characteristics of the loop (nesting, entry/exit). */
@@ -2375,24 +2404,54 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
if (orig_loop_vinfo)
LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
+ else if (vect_epilogues && first_loop_vinfo)
+ {
+ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
+ }
opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
if (res)
{
LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
+ vectorized_loops++;
- if (loop->simdlen
- && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
- (unsigned HOST_WIDE_INT) loop->simdlen))
+ if ((loop->simdlen
+ && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+ (unsigned HOST_WIDE_INT) loop->simdlen))
+ || vect_epilogues)
{
if (first_loop_vinfo == NULL)
{
first_loop_vinfo = loop_vinfo;
+ lowest_th
+ = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
first_vector_size = current_vector_size;
loop->aux = NULL;
}
else
- delete loop_vinfo;
+ {
+ /* Keep track of vector sizes that we know we can vectorize
+ the epilogue with. */
+ if (vect_epilogues)
+ {
+ loop->aux = NULL;
+ loop->epilogue_vsizes.reserve (1);
+ loop->epilogue_vsizes.quick_push (current_vector_size);
+ first_loop_vinfo->epilogue_vinfos.reserve (1);
+ first_loop_vinfo->epilogue_vinfos.quick_push (loop_vinfo);
+ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
+ poly_uint64 th
+ = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
+ gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || maybe_ne (lowest_th, 0U));
+ /* Keep track of the known smallest versioning
+ threshold. */
+ if (ordered_p (lowest_th, th))
+ lowest_th = ordered_min (lowest_th, th);
+ }
+ else
+ delete loop_vinfo;
+ }
}
else
{
@@ -2430,6 +2489,8 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
dump_dec (MSG_NOTE, current_vector_size);
dump_printf (MSG_NOTE, "\n");
}
+ LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
+
return first_loop_vinfo;
}
else
@@ -8460,6 +8521,33 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
*seen_store = stmt_info;
}
+/* Walk OP's operands depth-first, substituting those found in MAPPING; stop at
+ and return the first non-NULL substitute, or NULL if there is none. */
+static tree
+replace_ops (tree op, hash_map<tree, tree> &mapping)
+{
+ if (!op)
+ return NULL;
+
+ tree *new_op;
+ tree ret = NULL;
+ for (int j = 0; j < TREE_OPERAND_LENGTH (op); ++j)
+ {
+ if ((new_op = mapping.get (TREE_OPERAND (op, j))))
+ {
+ TREE_OPERAND (op, j) = *new_op;
+ ret = *new_op;
+ }
+ else
+ ret = replace_ops (TREE_OPERAND (op, j), mapping);
+
+ if (ret)
+ return ret;
+ }
+
+ return NULL;
+}
+
/* Function vect_transform_loop.
The analysis phase has determined that the loop is vectorizable.
@@ -8483,6 +8571,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
gimple *stmt;
bool check_profitability = false;
unsigned int th;
+ auto_vec<gimple *> orig_stmts;
+ auto_vec<dr_vec_info *> gather_scatter_drs;
+ auto_vec<gimple *> gather_scatter_stmts;
DUMP_VECT_SCOPE ("vec_transform_loop");
@@ -8497,11 +8588,11 @@ vect_transform_loop (loop_vec_info loop_vinfo)
if (th >= vect_vf_for_cost (loop_vinfo)
&& !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "Profitability threshold is %d loop iterations.\n",
- th);
- check_profitability = true;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Profitability threshold is %d loop iterations.\n",
+ th);
+ check_profitability = true;
}
/* Make sure there exists a single-predecessor exit bb. Do this before
@@ -8519,18 +8610,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
{
- poly_uint64 versioning_threshold
- = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
- if (check_profitability
- && ordered_p (poly_uint64 (th), versioning_threshold))
- {
- versioning_threshold = ordered_max (poly_uint64 (th),
- versioning_threshold);
- check_profitability = false;
- }
class loop *sloop
- = vect_loop_versioning (loop_vinfo, th, check_profitability,
- versioning_threshold);
+ = vect_loop_versioning (loop_vinfo);
sloop->force_vectorize = false;
check_profitability = false;
}
@@ -8555,9 +8636,58 @@ vect_transform_loop (loop_vec_info loop_vinfo)
LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
+ tree advance;
epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
&step_vector, &niters_vector_mult_vf, th,
- check_profitability, niters_no_overflow);
+ check_profitability, niters_no_overflow,
+ &advance);
+
+ if (epilogue)
+ {
+ basic_block *orig_bbs = get_loop_body (loop);
+ loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
+
+ gimple_stmt_iterator orig_gsi;
+ gphi_iterator orig_phi_gsi;
+ gimple *stmt;
+ stmt_vec_info stmt_vinfo;
+
+ /* The stmt_vec_info's of the epilogue were constructed for the main loop
+ and need to be updated to refer to the cloned variables used in the
+ epilogue loop. We do this by assuming the original main loop and the
+ epilogue loop are identical (aside the different SSA names). This
+ means we assume we can go through each BB in the loop and each STMT in
+ each BB and map them 1:1, replacing the STMT_VINFO_STMT of each
+ stmt_vec_info in the epilogue's loop_vec_info. Here we only keep
+ track of the original state of the main loop, before vectorization.
+ After vectorization we proceed to update the epilogue's stmt_vec_infos
+ information. We also update the references in PATTERN_DEF_SEQ's,
+ RELATED_STMT's and data_references. Mainly the latter has to be
+ updated after we are done vectorizing the main loop, as the
+ data_references are shared between main and epilogue. */
+ for (unsigned i = 0; i < loop->num_nodes; ++i)
+ {
+ for (orig_phi_gsi = gsi_start_phis (orig_bbs[i]);
+ !gsi_end_p (orig_phi_gsi); gsi_next (&orig_phi_gsi))
+ orig_stmts.safe_push (orig_phi_gsi.phi ());
+ for (orig_gsi = gsi_start_bb (orig_bbs[i]);
+ !gsi_end_p (orig_gsi); gsi_next (&orig_gsi))
+ {
+ stmt = gsi_stmt (orig_gsi);
+ orig_stmts.safe_push (stmt);
+ stmt_vinfo = epilogue_vinfo->lookup_stmt (stmt);
+ /* Data references pointing to gather loads and scatter stores
+ require special treatment because the address computation
+ happens in a different gimple node, pointed to by DR_REF. In
+ contrast to normal loads and stores where we only need to
+ update the offset of the data reference. */
+ if (stmt_vinfo
+ && STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo))
+ gather_scatter_drs.safe_push (STMT_VINFO_DR_INFO (stmt_vinfo));
+ }
+ }
+ }
+
if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
&& LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
@@ -8814,57 +8944,157 @@ vect_transform_loop (loop_vec_info loop_vinfo)
since vectorized loop can have loop-carried dependencies. */
loop->safelen = 0;
- /* Don't vectorize epilogue for epilogue. */
- if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
- epilogue = NULL;
-
- if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
- epilogue = NULL;
-
if (epilogue)
{
- auto_vector_sizes vector_sizes;
- targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
- unsigned int next_size = 0;
- /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
- on niters already ajusted for the iterations of the prologue. */
- if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && known_eq (vf, lowest_vf))
- {
- unsigned HOST_WIDE_INT eiters
- = (LOOP_VINFO_INT_NITERS (loop_vinfo)
- - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
- eiters
- = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
- epilogue->nb_iterations_upper_bound = eiters - 1;
- epilogue->any_upper_bound = true;
-
- unsigned int ratio;
- while (next_size < vector_sizes.length ()
- && !(constant_multiple_p (current_vector_size,
- vector_sizes[next_size], &ratio)
- && eiters >= lowest_vf / ratio))
- next_size += 1;
- }
- else
- while (next_size < vector_sizes.length ()
- && maybe_lt (current_vector_size, vector_sizes[next_size]))
- next_size += 1;
+ loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
+ vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
- if (next_size == vector_sizes.length ())
- epilogue = NULL;
- }
+ auto_vec<stmt_vec_info> pattern_worklist, related_worklist;
+ hash_map<tree,tree> mapping;
+ gimple * orig_stmt, * new_stmt;
+ gimple_stmt_iterator epilogue_gsi;
+ gphi_iterator epilogue_phi_gsi;
+ stmt_vec_info stmt_vinfo = NULL, related_vinfo;
+ basic_block *epilogue_bbs = get_loop_body (epilogue);
- if (epilogue)
- {
+ epilogue->simduid = loop->simduid;
epilogue->force_vectorize = loop->force_vectorize;
epilogue->safelen = loop->safelen;
epilogue->dont_vectorize = false;
+ LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
+
+ /* We are done vectorizing the main loop, so now we update the epilogue's
+ stmt_vec_info's. At the same time we set the gimple UID of each
+ statement in the epilogue, as these are used to look them up in the
+ epilogue's loop_vec_info later. We also keep track of what
+ stmt_vec_info's have PATTERN_DEF_SEQ's and RELATED_STMT's that might
+ need updating and we construct a mapping between variables defined in
+ the main loop and their corresponding names in epilogue. */
+ for (unsigned i = 0; i < loop->num_nodes; ++i)
+ {
+ for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
+ !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
+ {
+ orig_stmt = orig_stmts[0];
+ orig_stmts.ordered_remove (0);
+ new_stmt = epilogue_phi_gsi.phi ();
+
+ stmt_vinfo
+ = epilogue_vinfo->lookup_stmt (orig_stmt);
+
+ STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
+ gimple_set_uid (new_stmt, gimple_uid (orig_stmt));
+
+ mapping.put (gimple_phi_result (orig_stmt),
+ gimple_phi_result (new_stmt));
+
+ if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
+ pattern_worklist.safe_push (stmt_vinfo);
+
+ related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+ while (related_vinfo && related_vinfo != stmt_vinfo)
+ {
+ related_worklist.safe_push (related_vinfo);
+ /* Set BB such that the assert in
+ 'get_initial_def_for_reduction' is able to determine that
+ the BB of the related stmt is inside this loop. */
+ gimple_set_bb (STMT_VINFO_STMT (related_vinfo),
+ gimple_bb (new_stmt));
+ related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
+ }
+ }
+
+ for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
+ !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
+ {
+ orig_stmt = orig_stmts[0];
+ orig_stmts.ordered_remove (0);
+ new_stmt = gsi_stmt (epilogue_gsi);
+
+ stmt_vinfo
+ = epilogue_vinfo->lookup_stmt (orig_stmt);
+
+ STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
+ gimple_set_uid (new_stmt, gimple_uid (orig_stmt));
+
+ if (is_gimple_assign (orig_stmt))
+ {
+ gcc_assert (is_gimple_assign (new_stmt));
+ mapping.put (gimple_assign_lhs (orig_stmt),
+ gimple_assign_lhs (new_stmt));
+ }
+
+ if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
+ pattern_worklist.safe_push (stmt_vinfo);
+
+ related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+ related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+ while (related_vinfo && related_vinfo != stmt_vinfo)
+ {
+ related_worklist.safe_push (related_vinfo);
+ /* Set BB such that the assert in
+ 'get_initial_def_for_reduction' is able to determine that
+ the BB of the related stmt is inside this loop. */
+ gimple_set_bb (STMT_VINFO_STMT (related_vinfo),
+ gimple_bb (new_stmt));
+ related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
+ }
+ }
+ gcc_assert (orig_stmts.length () == 0);
+ }
+
+ /* The PATTERN_DEF_SEQ's in the epilogue were constructed using the
+ original main loop and thus need to be updated to refer to the cloned
+ variables used in the epilogue. */
+ for (unsigned i = 0; i < pattern_worklist.length (); ++i)
+ {
+ gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (pattern_worklist[i]);
+ tree *new_op;
+
+ while (seq)
+ {
+ for (unsigned j = 1; j < gimple_num_ops (seq); ++j)
+ {
+ tree op = gimple_op (seq, j);
+ if ((new_op = mapping.get(op)))
+ gimple_set_op (seq, j, *new_op);
+ else
+ {
+ op = unshare_expr (op);
+ replace_ops (op, mapping);
+ gimple_set_op (seq, j, op);
+ }
+ }
+ seq = seq->next;
+ }
+ }
+
+ /* Just like the PATTERN_DEF_SEQ's the RELATED_STMT's also need to be
+ updated. */
+ for (unsigned i = 0; i < related_worklist.length (); ++i)
+ {
+ tree *new_t;
+ gimple * stmt = STMT_VINFO_STMT (related_worklist[i]);
+ for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
+ if ((new_t = mapping.get(gimple_op (stmt, j))))
+ gimple_set_op (stmt, j, *new_t);
+ }
+
+ tree new_op;
+ for (unsigned i = 0; i < gather_scatter_drs.length (); ++i)
+ {
+ dr_vec_info *dr_info = gather_scatter_drs[i];
+ data_reference *dr = dr_info->dr;
+ gcc_assert (dr);
+ DR_REF (dr) = unshare_expr (DR_REF (dr));
+ new_op = replace_ops (DR_REF (dr), mapping);
+ if (new_op)
+ DR_STMT (dr_info->dr) = SSA_NAME_DEF_STMT (new_op);
+ }
- /* We may need to if-convert epilogue to vectorize it. */
- if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
- tree_if_conversion (epilogue);
+ epilogue_vinfo->shared->datarefs_copy.release ();
+ epilogue_vinfo->shared->save_datarefs ();
}
return epilogue;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1456cde4c2c2dec7244c504d2c496248894a4f1e..9788c02535999e2e08cb03d1f20ddd80ff448d51 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -564,6 +564,8 @@ public:
this points to the original vectorized loop. Otherwise NULL. */
_loop_vec_info *orig_loop_info;
+ vec<_loop_vec_info *> epilogue_vinfos;
+
} *loop_vec_info;
/* Access Functions. */
@@ -1480,13 +1482,15 @@ extern void vect_set_loop_condition (class loop *, loop_vec_info,
extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge);
class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *,
class loop *, edge);
-class loop *vect_loop_versioning (loop_vec_info, unsigned int, bool,
- poly_uint64);
+class loop *vect_loop_versioning (loop_vec_info);
extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
- tree *, tree *, tree *, int, bool, bool);
+ tree *, tree *, tree *, int, bool, bool,
+ tree *);
extern void vect_prepare_for_masked_peels (loop_vec_info);
extern dump_user_location_t find_loop_location (class loop *);
extern bool vect_can_advance_ivs_p (loop_vec_info);
+extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
+
/* In tree-vect-stmts.c. */
extern poly_uint64 current_vector_size;
@@ -1600,6 +1604,8 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *,
tree, tree = NULL_TREE);
/* In tree-vect-loop.c. */
+/* Used in tree-vect-loop-manip.c */
+extern void determine_peel_for_niter (loop_vec_info);
/* FORNOW: Used in tree-parloops.c. */
extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info,
bool *, bool);
@@ -1610,7 +1616,8 @@ extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
/* Drive for loop analysis stage. */
extern opt_loop_vec_info vect_analyze_loop (class loop *,
loop_vec_info,
- vec_info_shared *);
+ vec_info_shared *,
+ vector_sizes);
extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
tree *, bool);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 173e6b51652fd023893b38da786ff28f827553b5..71bbf4fdf8dc7588c45a0e8feef9272b52c0c04c 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -875,6 +875,10 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
vec_info_shared shared;
auto_purge_vect_location sentinel;
vect_location = find_loop_location (loop);
+ auto_vector_sizes auto_vector_sizes;
+ vector_sizes vector_sizes;
+ bool assert_versioning = false;
+
if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
&& dump_enabled_p ())
dump_printf (MSG_NOTE | MSG_PRIORITY_INTERNALS,
@@ -882,10 +886,35 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
LOCATION_FILE (vect_location.get_location_t ()),
LOCATION_LINE (vect_location.get_location_t ()));
+ /* If this is an epilogue, we already know what vector sizes we will use for
+ vectorization as the analysis was part of the main vectorized loop. Use
+ these instead of going through all vector sizes again. */
+ if (orig_loop_vinfo
+ && !LOOP_VINFO_LOOP (orig_loop_vinfo)->epilogue_vsizes.is_empty ())
+ {
+ vector_sizes = LOOP_VINFO_LOOP (orig_loop_vinfo)->epilogue_vsizes;
+ assert_versioning = LOOP_REQUIRES_VERSIONING (orig_loop_vinfo);
+ current_vector_size = vector_sizes[0];
+ }
+ else
+ {
+ /* Autodetect first vector size we try. */
+ current_vector_size = 0;
+
+ targetm.vectorize.autovectorize_vector_sizes (&auto_vector_sizes,
+ loop->simdlen != 0);
+ vector_sizes = auto_vector_sizes;
+ }
+
/* Try to analyze the loop, retaining an opt_problem if dump_enabled_p. */
- opt_loop_vec_info loop_vinfo
- = vect_analyze_loop (loop, orig_loop_vinfo, &shared);
- loop->aux = loop_vinfo;
+ opt_loop_vec_info loop_vinfo = opt_loop_vec_info::success (NULL);
+ if (loop_vec_info_for_loop (loop))
+ loop_vinfo = opt_loop_vec_info::success (loop_vec_info_for_loop (loop));
+ else
+ {
+ loop_vinfo = vect_analyze_loop (loop, orig_loop_vinfo, &shared, vector_sizes);
+ loop->aux = loop_vinfo;
+ }
if (!loop_vinfo)
if (dump_enabled_p ())
@@ -898,6 +927,10 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
{
+ /* If this loop requires versioning, make sure the analysis done on the
+ epilogue loops succeeds. */
+ gcc_assert (!assert_versioning);
+
/* Free existing information if loop is analyzed with some
assumptions. */
if (loop_constraint_set_p (loop, LOOP_C_FINITE))
@@ -1013,8 +1046,13 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
/* Epilogue of vectorized loop must be vectorized too. */
if (new_loop)
- ret |= try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops,
- new_loop, loop_vinfo, NULL, NULL);
+ {
+ /* Don't include vectorized epilogues in the "vectorized loops" count.
+ */
+ unsigned dont_count = *num_vectorized_loops;
+ ret |= try_vectorize_loop_1 (simduid_to_vf_htab, &dont_count,
+ new_loop, loop_vinfo, NULL, NULL);
+ }
return ret;
}