Hi Richard,

As I mentioned in the IRC channel, I managed to get "most" of the regression testsuite working for x86_64 (avx512) and aarch64.

On x86_64 I get a failure that I can't explain, was hoping you might be able to have a look with me:
"PASS->FAIL: gcc.target/i386/vect-perm-odd-1.c execution test"

vect-perm-odd-1.exe segfaults, and when I debug it in gdb the crash seems to happen in the first iteration of the main loop. The tree dumps look alright, but I do notice the stack usage seems to change between --param vect-epilogue-nomask={0,1}.

Am I forgetting to update some field that may later affect the amount of stack being used? I am confused; it could very well be that I am missing something obvious, as I am not too familiar with x86's ISA. I will try to investigate further.

This patch needs further clean-up and more comments (or comment updates), but I thought I'd share current state to see if you can help me unblock.

Cheers,
Andre
diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 0b0154ffd7bf031a005de993b101d9db6dd98c43..d01512ea46467f1cf77793bdc75b48e71b0b9641 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -21,6 +21,7 @@ along with GCC; see the file COPYING3.  If not see
 #define GCC_CFGLOOP_H
 
 #include "cfgloopmanip.h"
+#include "target.h"
 
 /* Structure to hold decision about unrolling/peeling.  */
 enum lpt_dec
@@ -268,6 +269,9 @@ public:
      the basic-block from being collected but its index can still be
      reused.  */
   basic_block former_header;
+
+  /* Keep track of vector sizes we know we can vectorize the epilogue with.  */
+  vector_sizes epilogue_vsizes;
 };
 
 /* Set if the loop is known to be infinite.  */
diff --git a/gcc/cfgloop.c b/gcc/cfgloop.c
index 4ad1f658708f83dbd8789666c26d4bd056837bc6..f3e81bcd00b3f125389aa15b12dc5201b3578d20 100644
--- a/gcc/cfgloop.c
+++ b/gcc/cfgloop.c
@@ -198,6 +198,7 @@ flow_loop_free (class loop *loop)
       exit->prev = exit;
     }
 
+  loop->epilogue_vsizes.release();
   ggc_free (loop->exits);
   ggc_free (loop);
 }
@@ -355,6 +356,7 @@ alloc_loop (void)
   loop->nb_iterations_upper_bound = 0;
   loop->nb_iterations_likely_upper_bound = 0;
   loop->nb_iterations_estimate = 0;
+  loop->epilogue_vsizes.create(8);
   return loop;
 }
 
diff --git a/gcc/gengtype.c b/gcc/gengtype.c
index 53317337cf8c8e8caefd6b819d28b3bba301e755..80fb6ef71465b24e034fa45d69fec56be6b2e7f8 100644
--- a/gcc/gengtype.c
+++ b/gcc/gengtype.c
@@ -5197,6 +5197,7 @@ main (int argc, char **argv)
       POS_HERE (do_scalar_typedef ("widest_int", &pos));
       POS_HERE (do_scalar_typedef ("int64_t", &pos));
       POS_HERE (do_scalar_typedef ("poly_int64", &pos));
+      POS_HERE (do_scalar_typedef ("poly_uint64", &pos));
       POS_HERE (do_scalar_typedef ("uint64_t", &pos));
       POS_HERE (do_scalar_typedef ("uint8", &pos));
       POS_HERE (do_scalar_typedef ("uintptr_t", &pos));
@@ -5206,6 +5207,7 @@ main (int argc, char **argv)
       POS_HERE (do_scalar_typedef ("machine_mode", &pos));
       POS_HERE (do_scalar_typedef ("fixed_size_mode", &pos));
       POS_HERE (do_scalar_typedef ("CONSTEXPR", &pos));
+      POS_HERE (do_scalar_typedef ("vector_sizes", &pos));
       POS_HERE (do_typedef ("PTR", 
 			    create_pointer (resolve_typedef ("void", &pos)),
 			    &pos));
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 5c25441c70a271f04730486e513437fffa75b7e3..189f7458b1b20be06a9a20d3ee05e74bc176434c 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -26,6 +26,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree.h"
 #include "gimple.h"
 #include "cfghooks.h"
+#include "tree-if-conv.h"
 #include "tree-pass.h"
 #include "ssa.h"
 #include "fold-const.h"
@@ -1724,7 +1725,7 @@ vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
    CODE and NITERS are as for vect_update_inits_of_dr.  */
 
-static void
+void
 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
 			  tree_code code)
 {
@@ -1736,19 +1737,7 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
 
   /* Adjust niters to sizetype and insert stmts on loop preheader edge.  */
   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
-    {
-      gimple_seq seq;
-      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
-      tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
-
-      niters = fold_convert (sizetype, niters);
-      niters = force_gimple_operand (niters, &seq, false, var);
-      if (seq)
-	{
-	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
-	  gcc_assert (!new_bb);
-	}
-    }
+    niters = fold_convert (sizetype, niters);
 
   FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
@@ -2401,14 +2390,18 @@ class loop *
 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 		 tree *niters_vector, tree *step_vector,
 		 tree *niters_vector_mult_vf_var, int th,
-		 bool check_profitability, bool niters_no_overflow)
+		 bool check_profitability, bool niters_no_overflow,
+		 tree *advance)
 {
   edge e, guard_e;
-  tree type = TREE_TYPE (niters), guard_cond;
+  tree type = TREE_TYPE (niters), guard_cond, advance_guard = NULL;
   basic_block guard_bb, guard_to;
   profile_probability prob_prolog, prob_vector, prob_epilog;
   int estimated_vf;
   int prolog_peeling = 0;
+  bool vect_epilogues
+    = loop_vinfo->epilogue_vinfos.length () > 0
+    && !LOOP_VINFO_EPILOGUE_P (loop_vinfo);
   /* We currently do not support prolog peeling if the target alignment is not
      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
      target alignment being constant.  */
@@ -2466,15 +2459,61 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
   else
     niters_prolog = build_int_cst (type, 0);
 
+  loop_vec_info epilogue_vinfo = NULL;
+  if (vect_epilogues)
+    {
+      epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
+      loop_vinfo->epilogue_vinfos.ordered_remove (0);
+
+      /* Don't vectorize the epilogue if this is not the innermost loop or if
+	 the epilogue loop may need to be peeled for alignment.  */
+      if (loop->inner != NULL
+	  || LOOP_VINFO_PEELING_FOR_ALIGNMENT (epilogue_vinfo))
+	vect_epilogues = false;
+
+    }
+
+  unsigned int lowest_vf = constant_lower_bound (vf);
+  bool epilogue_any_upper_bound = false;
+  unsigned HOST_WIDE_INT eiters = 0;
+  tree niters_vector_mult_vf;
+
+  /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
+     on niters already adjusted for the iterations of the prologue.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && known_eq (vf, lowest_vf))
+    {
+      vector_sizes vector_sizes = loop->epilogue_vsizes;
+      unsigned next_size = 0;
+      eiters = (LOOP_VINFO_INT_NITERS (loop_vinfo)
+	   - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
+
+      if (prolog_peeling > 0)
+	eiters -= prolog_peeling;
+      eiters
+	= eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
+      epilogue_any_upper_bound = true;
+
+      unsigned int ratio;
+      while (next_size < vector_sizes.length ()
+	     && !(constant_multiple_p (current_vector_size,
+				       vector_sizes[next_size], &ratio)
+		  && eiters >= lowest_vf / ratio))
+	next_size += 1;
+
+      if (next_size == vector_sizes.length ())
+	vect_epilogues = false;
+    }
+
   /* Prolog loop may be skipped.  */
   bool skip_prolog = (prolog_peeling != 0);
   /* Skip to epilog if scalar loop may be preferred.  It's only needed
-     when we peel for epilog loop and when it hasn't been checked with
-     loop versioning.  */
+     when we peel for epilog loop or when we loop version.  */
   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
 		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
 				  bound_prolog + bound_epilog)
-		      : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
+		      : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+			 || vect_epilogues));
   /* Epilog loop must be executed if the number of iterations for epilog
      loop is known at compile time, otherwise we need to add a check at
      the end of vector loop and skip to the end of epilog loop.  */
@@ -2503,7 +2542,17 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
     }
 
   dump_user_location_t loop_loc = find_loop_location (loop);
-  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
+  class loop *scalar_loop;
+  if (vect_epilogues)
+    {
+      scalar_loop = get_loop_copy (loop);
+      LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
+	= LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
+      LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = NULL;
+    }
+  else
+   scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
+
   if (prolog_peeling)
     {
       e = loop_preheader_edge (loop);
@@ -2586,12 +2635,24 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	}
       /* Peel epilog and put it on exit edge of loop.  */
       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
+
       if (!epilog)
 	{
 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
 			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
 	  gcc_unreachable ();
 	}
+
+      if (epilogue_any_upper_bound && prolog_peeling >= 0)
+	{
+	  epilog->any_upper_bound = true;
+	  epilog->nb_iterations_upper_bound = eiters + 1;
+	}
+      else if (prolog_peeling < 0)
+	{
+	  epilog->any_upper_bound = false;
+	}
+
       epilog->force_vectorize = false;
       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
 
@@ -2608,6 +2669,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 						check_profitability);
 	  /* Build guard against NITERSM1 since NITERS may overflow.  */
 	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
+	  advance_guard = guard_cond;
 	  guard_bb = anchor;
 	  guard_to = split_edge (loop_preheader_edge (epilog));
 	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
@@ -2635,7 +2697,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	}
 
       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
-      tree niters_vector_mult_vf;
       /* If loop is peeled for non-zero constant times, now niters refers to
 	 orig_niters - prolog_peeling, it won't overflow even the orig_niters
 	 overflows.  */
@@ -2699,10 +2760,105 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
       adjust_vec_debug_stmts ();
       scev_reset ();
     }
+
+  if (vect_epilogues)
+    {
+      epilog->aux = epilogue_vinfo;
+      LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
+
+      loop_constraint_clear (epilog, LOOP_C_INFINITE);
+
+      /* We now must calculate the number of iterations for our epilogue.  */
+      tree cond_niters, niters;
+
+      /* Depending on whether we peel for gaps we take niters or niters - 1,
+	 we will refer to this as N - G, where both N and G are the NITERS and
+	 GAP for the original loop.  */
+      niters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+	? LOOP_VINFO_NITERSM1 (loop_vinfo)
+	: LOOP_VINFO_NITERS (loop_vinfo);
+
+      /* Here we build a vector factorization mask:
+	 vf_mask = ~(VF - 1), where VF is the Vectorization Factor.  */
+      tree vf_mask = build_int_cst (TREE_TYPE (niters),
+				    LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+      vf_mask = fold_build2 (MINUS_EXPR, TREE_TYPE (vf_mask),
+			     vf_mask,
+			     build_one_cst (TREE_TYPE (vf_mask)));
+      vf_mask = fold_build1 (BIT_NOT_EXPR, TREE_TYPE (niters), vf_mask);
+
+      /* Here we calculate:
+	 niters = N - ((N-G) & ~(VF -1)) */
+      niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
+			    LOOP_VINFO_NITERS (loop_vinfo),
+			    fold_build2 (BIT_AND_EXPR, TREE_TYPE (niters),
+					 niters,
+					 vf_mask));
+
+      if (skip_vector)
+	{
+	  /* We do this by constructing:
+	     cond_niters = !do_we_enter_main_loop ? N + niters_prolog : niters
+	     we add npeel, the number of peeled iterations for alignment, to N
+	     in case we don't enter the main loop, as these have already been
+	     subtracted from N (the number of iterations of the main loop).
+	     Since the prolog peeling is also skipped if we skip the
+	     vectorization we must add them back.  */
+	  cond_niters
+	    = fold_build3 (COND_EXPR, TREE_TYPE (niters),
+			   advance_guard,
+			   fold_build2 (PLUS_EXPR, TREE_TYPE (niters),
+					LOOP_VINFO_NITERS (loop_vinfo),
+					fold_convert (TREE_TYPE (niters),
+						      niters_prolog)),
+			   niters);
+	}
+      else
+	cond_niters = niters;
+
+      LOOP_VINFO_NITERS (epilogue_vinfo) = cond_niters;
+      LOOP_VINFO_NITERSM1 (epilogue_vinfo)
+	= fold_build2 (MINUS_EXPR, TREE_TYPE (cond_niters),
+		       cond_niters, build_one_cst (TREE_TYPE (cond_niters)));
+
+      /* We now calculate the amount of iterations we must advance our
+         epilogue's data references by.
+	 Make sure to use sizetype here as we might use a negative constant
+	 if the loop peels for alignment.  If the target is 64-bit this can go
+	 wrong if the computation is not done in sizetype.  */
+      *advance = fold_convert (sizetype, niters);
+
+      *advance = fold_build2 (MINUS_EXPR, TREE_TYPE (*advance),
+			      *advance,
+			      fold_convert (sizetype,
+					    LOOP_VINFO_NITERS (loop_vinfo)));
+      *advance = fold_build2 (MINUS_EXPR, TREE_TYPE (*advance),
+			      build_zero_cst (TREE_TYPE (*advance)),
+			      *advance);
+
+      if (skip_vector)
+	{
+	  *advance
+	    = fold_build3 (COND_EXPR, TREE_TYPE (*advance),
+			   advance_guard,
+			   fold_build2 (MINUS_EXPR, TREE_TYPE (*advance),
+					build_zero_cst (TREE_TYPE (*advance)),
+					fold_convert (TREE_TYPE (*advance),
+						      niters_prolog)),
+			   *advance);
+	}
+
+      /* Redo the peeling for niter analysis as the NITERs and need for
+	 alignment have been updated to take the main loop into
+	 account.  */
+      LOOP_VINFO_PEELING_FOR_NITER (epilogue_vinfo) = false;
+      determine_peel_for_niter (epilogue_vinfo);
+    }
+
   adjust_vec.release ();
   free_original_copy_tables ();
 
-  return epilog;
+  return vect_epilogues ? epilog : NULL;
 }
 
 /* Function vect_create_cond_for_niters_checks.
@@ -2966,9 +3122,7 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
    *COND_EXPR_STMT_LIST.  */
 
 class loop *
-vect_loop_versioning (loop_vec_info loop_vinfo,
-		      unsigned int th, bool check_profitability,
-		      poly_uint64 versioning_threshold)
+vect_loop_versioning (loop_vec_info loop_vinfo)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
   class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
@@ -2988,10 +3142,15 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
+  poly_uint64 versioning_threshold
+    = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
   tree version_simd_if_cond
     = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
+  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
 
-  if (check_profitability)
+  if (th >= vect_vf_for_cost (loop_vinfo)
+      && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && !ordered_p (th, versioning_threshold))
     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
 			     build_int_cst (TREE_TYPE (scalar_loop_iters),
 					    th - 1));
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b0cbbac0cb5ba1ffce706715d3dbb9139063803d..6dbde0fe35c29d0357cf5c6e7ab5599957a8242a 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -885,6 +885,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 	    }
 	}
     }
+
+  epilogue_vinfos.create (6);
 }
 
 /* Free all levels of MASKS.  */
@@ -960,6 +962,7 @@ _loop_vec_info::~_loop_vec_info ()
   release_vec_loop_masks (&masks);
   delete ivexpr_map;
   delete scan_map;
+  epilogue_vinfos.release ();
 
   loop->aux = NULL;
 }
@@ -1726,7 +1729,13 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
       return 0;
     }
 
-  HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
+  HOST_WIDE_INT estimated_niter = -1;
+
+  if (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+    estimated_niter
+      = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
+  if (estimated_niter == -1)
+    estimated_niter = estimated_stmt_executions_int (loop);
   if (estimated_niter == -1)
     estimated_niter = likely_max_stmt_executions_int (loop);
   if (estimated_niter != -1
@@ -1852,6 +1861,56 @@ vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
     }
 }
 
+
+/* Decides whether we need to create an epilogue loop to handle
+   remaining scalar iterations and sets PEELING_FOR_NITERS accordingly.  */
+
+void
+determine_peel_for_niter (loop_vec_info loop_vinfo)
+{
+
+  unsigned HOST_WIDE_INT const_vf;
+  HOST_WIDE_INT max_niter
+    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
+  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
+					  (loop_vinfo));
+
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    /* The main loop handles all iterations.  */
+    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+    {
+      /* Work out the (constant) number of iterations that need to be
+	 peeled for reasons other than niters.  */
+      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+	peel_niter += 1;
+      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
+		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+    }
+  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+	   /* ??? When peeling for gaps but not alignment, we could
+	      try to check whether the (variable) niters is known to be
+	      VF * N + 1.  That's something of a niche case though.  */
+	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
+	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+		< (unsigned) exact_log2 (const_vf))
+	       /* In case of versioning, check if the maximum number of
+		  iterations is greater than th.  If they are identical,
+		  the epilogue is unnecessary.  */
+	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+		   || ((unsigned HOST_WIDE_INT) max_niter
+		       > (th / const_vf) * const_vf))))
+    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+}
+
+
 /* Function vect_analyze_loop_2.
 
    Apply a set of analyses on LOOP, and create a loop_vec_info struct
@@ -1864,6 +1923,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
   int res;
   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
   poly_uint64 min_vf = 2;
+  loop_vec_info orig_loop_vinfo = NULL;
 
   /* The first group of checks is independent of the vector size.  */
   fatal = true;
@@ -1979,7 +2039,6 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
   vect_compute_single_scalar_iteration_cost (loop_vinfo);
 
   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  unsigned th;
 
   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
@@ -2019,9 +2078,6 @@ start_over:
 		   LOOP_VINFO_INT_NITERS (loop_vinfo));
     }
 
-  HOST_WIDE_INT max_niter
-    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
-
   /* Analyze the alignment of the data-refs in the loop.
      Fail if a data reference is found that cannot be vectorized.  */
 
@@ -2125,42 +2181,7 @@ start_over:
     return opt_result::failure_at (vect_location,
 				   "Loop costings not worthwhile.\n");
 
-  /* Decide whether we need to create an epilogue loop to handle
-     remaining scalar iterations.  */
-  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
-
-  unsigned HOST_WIDE_INT const_vf;
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-    /* The main loop handles all iterations.  */
-    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
-  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
-    {
-      /* Work out the (constant) number of iterations that need to be
-	 peeled for reasons other than niters.  */
-      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-	peel_niter += 1;
-      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
-		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
-	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
-    }
-  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
-	   /* ??? When peeling for gaps but not alignment, we could
-	      try to check whether the (variable) niters is known to be
-	      VF * N + 1.  That's something of a niche case though.  */
-	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
-	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
-		< (unsigned) exact_log2 (const_vf))
-	       /* In case of versioning, check if the maximum number of
-		  iterations is greater than th.  If they are identical,
-		  the epilogue is unnecessary.  */
-	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
-		   || ((unsigned HOST_WIDE_INT) max_niter
-		       > (th / const_vf) * const_vf))))
-    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
-
+  determine_peel_for_niter (loop_vinfo);
   /* If an epilogue loop is required make sure we can create one.  */
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
@@ -2183,9 +2204,12 @@ start_over:
      enough for both peeled prolog loop and vector loop.  This check
      can be merged along with threshold check of loop versioning, so
      increase threshold for this case if necessary.  */
-  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
+  if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
+      || ((orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+	  && LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)))
     {
       poly_uint64 niters_th = 0;
+      unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
 
       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
 	{
@@ -2206,6 +2230,14 @@ start_over:
       /* One additional iteration because of peeling for gap.  */
       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
 	niters_th += 1;
+
+      /*  Use the same condition as vect_transform_loop to decide when to use
+	  the cost to determine a versioning threshold.  */
+      if (th >= vect_vf_for_cost (loop_vinfo)
+	  && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+	  && ordered_p (th, niters_th))
+	niters_th = ordered_max (poly_uint64 (th), niters_th);
+
       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
     }
 
@@ -2329,14 +2361,8 @@ again:
    be vectorized.  */
 opt_loop_vec_info
 vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
-		   vec_info_shared *shared)
+		   vec_info_shared *shared, vector_sizes vector_sizes)
 {
-  auto_vector_sizes vector_sizes;
-
-  /* Autodetect first vector size we try.  */
-  current_vector_size = 0;
-  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
-						loop->simdlen != 0);
   unsigned int next_size = 0;
 
   DUMP_VECT_SCOPE ("analyze_loop_nest");
@@ -2357,6 +2383,9 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
   poly_uint64 autodetected_vector_size = 0;
   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
   poly_uint64 first_vector_size = 0;
+  poly_uint64 lowest_th = 0;
+  unsigned vectorized_loops = 0;
+  bool vect_epilogues = !loop->simdlen && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK);
   while (1)
     {
       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
@@ -2375,24 +2404,54 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
 
       if (orig_loop_vinfo)
 	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
+      else if (vect_epilogues && first_loop_vinfo)
+	{
+	  LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
+	}
 
       opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
       if (res)
 	{
 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
+	  vectorized_loops++;
 
-	  if (loop->simdlen
-	      && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
-			   (unsigned HOST_WIDE_INT) loop->simdlen))
+	  if ((loop->simdlen
+	       && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+			    (unsigned HOST_WIDE_INT) loop->simdlen))
+	      || vect_epilogues)
 	    {
 	      if (first_loop_vinfo == NULL)
 		{
 		  first_loop_vinfo = loop_vinfo;
+		  lowest_th
+		    = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
 		  first_vector_size = current_vector_size;
 		  loop->aux = NULL;
 		}
 	      else
-		delete loop_vinfo;
+		{
+		  /* Keep track of vector sizes that we know we can vectorize
+		     the epilogue with.  */
+		  if (vect_epilogues)
+		    {
+		      loop->aux = NULL;
+		      loop->epilogue_vsizes.reserve (1);
+		      loop->epilogue_vsizes.quick_push (current_vector_size);
+		      first_loop_vinfo->epilogue_vinfos.reserve (1);
+		      first_loop_vinfo->epilogue_vinfos.quick_push (loop_vinfo);
+		      LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
+		      poly_uint64 th
+			= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
+		      gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+				  || maybe_ne (lowest_th, 0U));
+		      /* Keep track of the known smallest versioning
+			 threshold.  */
+		      if (ordered_p (lowest_th, th))
+			lowest_th = ordered_min (lowest_th, th);
+		    }
+		  else
+		    delete loop_vinfo;
+		}
 	    }
 	  else
 	    {
@@ -2430,6 +2489,8 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
 		  dump_dec (MSG_NOTE, current_vector_size);
 		  dump_printf (MSG_NOTE, "\n");
 		}
+	      LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
+
 	      return first_loop_vinfo;
 	    }
 	  else
@@ -8460,6 +8521,33 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
     *seen_store = stmt_info;
 }
 
+
+
+static tree
+replace_ops (tree op, hash_map<tree, tree> &mapping)
+{
+  if (!op)
+    return NULL;
+
+  tree *new_op;
+  tree ret = NULL;
+  for (int j = 0; j < TREE_OPERAND_LENGTH (op); ++j)
+    {
+      if ((new_op = mapping.get (TREE_OPERAND (op, j))))
+	{
+	  TREE_OPERAND (op, j) = *new_op;
+	  ret = *new_op;
+	}
+      else
+	ret = replace_ops (TREE_OPERAND (op, j), mapping);
+
+      if (ret)
+	return ret;
+    }
+
+  return NULL;
+}
+
 /* Function vect_transform_loop.
 
    The analysis phase has determined that the loop is vectorizable.
@@ -8483,6 +8571,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   gimple *stmt;
   bool check_profitability = false;
   unsigned int th;
+  auto_vec<gimple *> orig_stmts;
+  auto_vec<dr_vec_info *> gather_scatter_drs;
+  auto_vec<gimple *> gather_scatter_stmts;
 
   DUMP_VECT_SCOPE ("vec_transform_loop");
 
@@ -8497,11 +8588,11 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   if (th >= vect_vf_for_cost (loop_vinfo)
       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_NOTE, vect_location,
-			 "Profitability threshold is %d loop iterations.\n",
-                         th);
-      check_profitability = true;
+	if (dump_enabled_p ())
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "Profitability threshold is %d loop iterations.\n",
+			   th);
+	check_profitability = true;
     }
 
   /* Make sure there exists a single-predecessor exit bb.  Do this before 
@@ -8519,18 +8610,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 
   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
     {
-      poly_uint64 versioning_threshold
-	= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
-      if (check_profitability
-	  && ordered_p (poly_uint64 (th), versioning_threshold))
-	{
-	  versioning_threshold = ordered_max (poly_uint64 (th),
-					      versioning_threshold);
-	  check_profitability = false;
-	}
       class loop *sloop
-	= vect_loop_versioning (loop_vinfo, th, check_profitability,
-				versioning_threshold);
+	= vect_loop_versioning (loop_vinfo);
       sloop->force_vectorize = false;
       check_profitability = false;
     }
@@ -8555,9 +8636,58 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
+  tree advance;
   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
 			      &step_vector, &niters_vector_mult_vf, th,
-			      check_profitability, niters_no_overflow);
+			      check_profitability, niters_no_overflow,
+			      &advance);
+
+  if (epilogue)
+    {
+      basic_block *orig_bbs = get_loop_body (loop);
+      loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
+
+      gimple_stmt_iterator orig_gsi;
+      gphi_iterator orig_phi_gsi;
+      gimple *stmt;
+      stmt_vec_info stmt_vinfo;
+
+      /* The stmt_vec_info's of the epilogue were constructed for the main loop
+	 and need to be updated to refer to the cloned variables used in the
+	 epilogue loop.  We do this by assuming the original main loop and the
+	 epilogue loop are identical (aside the different SSA names).  This
+	 means we assume we can go through each BB in the loop and each STMT in
+	 each BB and map them 1:1, replacing the STMT_VINFO_STMT of each
+	 stmt_vec_info in the epilogue's loop_vec_info.  Here we only keep
+	 track of the original state of the main loop, before vectorization.
+	 After vectorization we proceed to update the epilogue's stmt_vec_infos
+	 information.  We also update the references in PATTERN_DEF_SEQ's,
+	 RELATED_STMT's and data_references.  Mainly the latter has to be
+	 updated after we are done vectorizing the main loop, as the
+	 data_references are shared between main and epilogue.  */
+      for (unsigned i = 0; i < loop->num_nodes; ++i)
+	{
+	  for (orig_phi_gsi = gsi_start_phis (orig_bbs[i]);
+	       !gsi_end_p (orig_phi_gsi); gsi_next (&orig_phi_gsi))
+	    orig_stmts.safe_push (orig_phi_gsi.phi ());
+	  for (orig_gsi = gsi_start_bb (orig_bbs[i]);
+	       !gsi_end_p (orig_gsi); gsi_next (&orig_gsi))
+	    {
+	      stmt = gsi_stmt (orig_gsi);
+	      orig_stmts.safe_push (stmt);
+	      stmt_vinfo  = epilogue_vinfo->lookup_stmt (stmt);
+	      /* Data references pointing to gather loads and scatter stores
+		 require special treatment because the address computation
+		 happens in a different gimple node, pointed to by DR_REF.  In
+		 contrast to normal loads and stores where we only need to
+		 update the offset of the data reference.  */
+	      if (stmt_vinfo
+		  && STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo))
+		gather_scatter_drs.safe_push (STMT_VINFO_DR_INFO (stmt_vinfo));
+	    }
+	}
+    }
+
   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
@@ -8814,57 +8944,157 @@ vect_transform_loop (loop_vec_info loop_vinfo)
      since vectorized loop can have loop-carried dependencies.  */
   loop->safelen = 0;
 
-  /* Don't vectorize epilogue for epilogue.  */
-  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
-    epilogue = NULL;
-
-  if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
-    epilogue = NULL;
-
   if (epilogue)
     {
-      auto_vector_sizes vector_sizes;
-      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
-      unsigned int next_size = 0;
 
-      /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
-         on niters already ajusted for the iterations of the prologue.  */
-      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-	  && known_eq (vf, lowest_vf))
-	{
-	  unsigned HOST_WIDE_INT eiters
-	    = (LOOP_VINFO_INT_NITERS (loop_vinfo)
-	       - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
-	  eiters
-	    = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
-	  epilogue->nb_iterations_upper_bound = eiters - 1;
-	  epilogue->any_upper_bound = true;
-
-	  unsigned int ratio;
-	  while (next_size < vector_sizes.length ()
-		 && !(constant_multiple_p (current_vector_size,
-					   vector_sizes[next_size], &ratio)
-		      && eiters >= lowest_vf / ratio))
-	    next_size += 1;
-	}
-      else
-	while (next_size < vector_sizes.length ()
-	       && maybe_lt (current_vector_size, vector_sizes[next_size]))
-	  next_size += 1;
+      loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
+      vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
 
-      if (next_size == vector_sizes.length ())
-	epilogue = NULL;
-    }
+      auto_vec<stmt_vec_info> pattern_worklist, related_worklist;
+      hash_map<tree,tree> mapping;
+      gimple * orig_stmt, * new_stmt;
+      gimple_stmt_iterator epilogue_gsi;
+      gphi_iterator epilogue_phi_gsi;
+      stmt_vec_info stmt_vinfo = NULL, related_vinfo;
+      basic_block *epilogue_bbs = get_loop_body (epilogue);
 
-  if (epilogue)
-    {
+      epilogue->simduid = loop->simduid;
       epilogue->force_vectorize = loop->force_vectorize;
       epilogue->safelen = loop->safelen;
       epilogue->dont_vectorize = false;
+      LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
+
+      /* We are done vectorizing the main loop, so now we update the
+	 epilogue's stmt_vec_infos.  At the same time we set the gimple UID
+	 of each statement in the epilogue, as these are used to look them
+	 up in the epilogue's loop_vec_info later.  We also keep track of
+	 which stmt_vec_infos have PATTERN_DEF_SEQs and RELATED_STMTs that
+	 might need updating and we construct a mapping between variables defined
+	 in the main loop and their corresponding names in the epilogue.  */
+      for (unsigned i = 0; i < loop->num_nodes; ++i)
+	{
+	  for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
+	       !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
+	    {
+	      orig_stmt = orig_stmts[0];
+	      orig_stmts.ordered_remove (0);
+	      new_stmt = epilogue_phi_gsi.phi ();
+
+	      stmt_vinfo
+		= epilogue_vinfo->lookup_stmt (orig_stmt);
+
+	      STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
+	      gimple_set_uid (new_stmt, gimple_uid (orig_stmt));
+
+	      mapping.put (gimple_phi_result (orig_stmt),
+			    gimple_phi_result (new_stmt));
+
+	      if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
+		pattern_worklist.safe_push (stmt_vinfo);
+
+	      related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+	      while (related_vinfo && related_vinfo != stmt_vinfo)
+		{
+		  related_worklist.safe_push (related_vinfo);
+		  /* Set BB such that the assert in
+		    'get_initial_def_for_reduction' is able to determine that
+		    the BB of the related stmt is inside this loop.  */
+		  gimple_set_bb (STMT_VINFO_STMT (related_vinfo),
+				 gimple_bb (new_stmt));
+		  related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
+		}
+	    }
+
+	  for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
+	       !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
+	    {
+	      orig_stmt = orig_stmts[0];
+	      orig_stmts.ordered_remove (0);
+	      new_stmt = gsi_stmt (epilogue_gsi);
+
+	      stmt_vinfo
+		= epilogue_vinfo->lookup_stmt (orig_stmt);
+
+	      STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
+	      gimple_set_uid (new_stmt, gimple_uid (orig_stmt));
+
+	      if (is_gimple_assign (orig_stmt))
+		{
+		  gcc_assert (is_gimple_assign (new_stmt));
+		  mapping.put (gimple_assign_lhs (orig_stmt),
+			      gimple_assign_lhs (new_stmt));
+		}
+
+	      if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
+		pattern_worklist.safe_push (stmt_vinfo);
+
+	      /* Walk and record the chain of related stmt_vec_infos.  */
+	      related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+	      while (related_vinfo && related_vinfo != stmt_vinfo)
+		{
+		  related_worklist.safe_push (related_vinfo);
+		  /* Set BB such that the assert in
+		    'get_initial_def_for_reduction' is able to determine that
+		    the BB of the related stmt is inside this loop.  */
+		  gimple_set_bb (STMT_VINFO_STMT (related_vinfo),
+				 gimple_bb (new_stmt));
+		  related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
+		}
+	    }
+	  gcc_assert (orig_stmts.length () == 0);
+	}
+
+      /* The PATTERN_DEF_SEQ's in the epilogue were constructed using the
+	 original main loop and thus need to be updated to refer to the cloned
+	 variables used in the epilogue.  */
+      for (unsigned i = 0; i < pattern_worklist.length (); ++i)
+	{
+	  gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (pattern_worklist[i]);
+	  tree *new_op;
+
+	  while (seq)
+	    {
+	      for (unsigned j = 1; j < gimple_num_ops (seq); ++j)
+		{
+		  tree op = gimple_op (seq, j);
+		  if ((new_op = mapping.get(op)))
+		    gimple_set_op (seq, j, *new_op);
+		  else
+		    {
+		      op = unshare_expr (op);
+		      replace_ops (op, mapping);
+		      gimple_set_op (seq, j, op);
+		    }
+		}
+	      seq = seq->next;
+	    }
+	}
+
+      /* Just like the PATTERN_DEF_SEQ's the RELATED_STMT's also need to be
+	 updated.  */
+      for (unsigned i = 0; i < related_worklist.length (); ++i)
+	{
+	  tree *new_t;
+	  gimple * stmt = STMT_VINFO_STMT (related_worklist[i]);
+	  for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
+	    if ((new_t = mapping.get(gimple_op (stmt, j))))
+	      gimple_set_op (stmt, j, *new_t);
+	}
+
+      tree new_op;
+      for (unsigned i = 0; i < gather_scatter_drs.length (); ++i)
+	{
+	  dr_vec_info *dr_info = gather_scatter_drs[i];
+	  data_reference *dr = dr_info->dr;
+	  gcc_assert (dr);
+	  DR_REF (dr) = unshare_expr (DR_REF (dr));
+	  new_op = replace_ops (DR_REF (dr), mapping);
+	  if (new_op)
+	    DR_STMT (dr_info->dr) = SSA_NAME_DEF_STMT (new_op);
+	}
 
-      /* We may need to if-convert epilogue to vectorize it.  */
-      if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
-	tree_if_conversion (epilogue);
+      epilogue_vinfo->shared->datarefs_copy.release ();
+      epilogue_vinfo->shared->save_datarefs ();
     }
 
   return epilogue;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1456cde4c2c2dec7244c504d2c496248894a4f1e..9788c02535999e2e08cb03d1f20ddd80ff448d51 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -564,6 +564,8 @@ public:
      this points to the original vectorized loop.  Otherwise NULL.  */
   _loop_vec_info *orig_loop_info;
 
+  vec<_loop_vec_info *> epilogue_vinfos;
+
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -1480,13 +1482,15 @@ extern void vect_set_loop_condition (class loop *, loop_vec_info,
 extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge);
 class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *,
 						     class loop *, edge);
-class loop *vect_loop_versioning (loop_vec_info, unsigned int, bool,
-				   poly_uint64);
+class loop *vect_loop_versioning (loop_vec_info);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
-				     tree *, tree *, tree *, int, bool, bool);
+				    tree *, tree *, tree *, int, bool, bool,
+				    tree *);
 extern void vect_prepare_for_masked_peels (loop_vec_info);
 extern dump_user_location_t find_loop_location (class loop *);
 extern bool vect_can_advance_ivs_p (loop_vec_info);
+extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
+
 
 /* In tree-vect-stmts.c.  */
 extern poly_uint64 current_vector_size;
@@ -1600,6 +1604,8 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *,
 						  tree, tree = NULL_TREE);
 
 /* In tree-vect-loop.c.  */
+/* Used in tree-vect-loop-manip.c */
+extern void determine_peel_for_niter (loop_vec_info);
 /* FORNOW: Used in tree-parloops.c.  */
 extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info,
 						  bool *, bool);
@@ -1610,7 +1616,8 @@ extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
 /* Drive for loop analysis stage.  */
 extern opt_loop_vec_info vect_analyze_loop (class loop *,
 					    loop_vec_info,
-					    vec_info_shared *);
+					    vec_info_shared *,
+					    vector_sizes);
 extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
 extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
 					 tree *, bool);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 173e6b51652fd023893b38da786ff28f827553b5..71bbf4fdf8dc7588c45a0e8feef9272b52c0c04c 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -875,6 +875,10 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
   vec_info_shared shared;
   auto_purge_vect_location sentinel;
   vect_location = find_loop_location (loop);
+  auto_vector_sizes auto_vector_sizes;
+  vector_sizes vector_sizes;
+  bool assert_versioning = false;
+
   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
       && dump_enabled_p ())
     dump_printf (MSG_NOTE | MSG_PRIORITY_INTERNALS,
@@ -882,10 +886,35 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
 		 LOCATION_FILE (vect_location.get_location_t ()),
 		 LOCATION_LINE (vect_location.get_location_t ()));
 
+  /* If this is an epilogue, we already know what vector sizes we will use
+     for vectorization as the analysis was part of the main vectorized loop.
+     Use these instead of going through all vector sizes again.  */
+  if (orig_loop_vinfo
+      && !LOOP_VINFO_LOOP (orig_loop_vinfo)->epilogue_vsizes.is_empty ())
+    {
+      vector_sizes = LOOP_VINFO_LOOP (orig_loop_vinfo)->epilogue_vsizes;
+      assert_versioning = LOOP_REQUIRES_VERSIONING (orig_loop_vinfo);
+      current_vector_size = vector_sizes[0];
+    }
+  else
+    {
+      /* Autodetect first vector size we try.  */
+      current_vector_size = 0;
+
+      targetm.vectorize.autovectorize_vector_sizes (&auto_vector_sizes,
+						    loop->simdlen != 0);
+      vector_sizes = auto_vector_sizes;
+    }
+
   /* Try to analyze the loop, retaining an opt_problem if dump_enabled_p.  */
-  opt_loop_vec_info loop_vinfo
-    = vect_analyze_loop (loop, orig_loop_vinfo, &shared);
-  loop->aux = loop_vinfo;
+  opt_loop_vec_info loop_vinfo = opt_loop_vec_info::success (NULL);
+  if (loop_vec_info_for_loop (loop))
+    loop_vinfo = opt_loop_vec_info::success (loop_vec_info_for_loop (loop));
+  else
+    {
+      loop_vinfo = vect_analyze_loop (loop, orig_loop_vinfo, &shared, vector_sizes);
+      loop->aux = loop_vinfo;
+    }
 
   if (!loop_vinfo)
     if (dump_enabled_p ())
@@ -898,6 +927,10 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
 
   if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
     {
+      /* If this loop requires versioning, make sure the analysis done on
+	 the epilogue loops succeeds.  */
+      gcc_assert (!assert_versioning);
+
       /* Free existing information if loop is analyzed with some
 	 assumptions.  */
       if (loop_constraint_set_p (loop, LOOP_C_FINITE))
@@ -1013,8 +1046,13 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
 
   /* Epilogue of vectorized loop must be vectorized too.  */
   if (new_loop)
-    ret |= try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops,
-				 new_loop, loop_vinfo, NULL, NULL);
+    {
+      /* Don't include vectorized epilogues in the "vectorized loops"
+	 count.  */
+      unsigned dont_count = *num_vectorized_loops;
+      ret |= try_vectorize_loop_1 (simduid_to_vf_htab, &dont_count,
+				   new_loop, loop_vinfo, NULL, NULL);
+    }
 
   return ret;
 }

Reply via email to