Hi,
After all the discussions and respins I now believe this patch is close
to what we envisioned.
This patch achieves two things when vect-epilogues-nomask=1:
1) It analyzes the original loop for each supported vector size and
saves this analysis per loop, as well as the vector sizes we know we can
vectorize the loop for.
2) When loop versioning it uses the 'skip_vector' code path to vectorize
the epilogue, and uses the lowest versioning threshold between the main
and epilogue's.
As side effects of this patch I also changed ensure_base_align to only
update the alignment if the new alignment is lower than the current one.
This function already did that if the object was a symbol, now it
behaves this way for any object.
I bootstrapped this patch with both vect-epilogues-nomask turned on and
off on x86_64 (AVX512) and aarch64. Regression tests looked good.
Is this OK for trunk?
gcc/ChangeLog:
2019-10-10 Andre Vieira <andre.simoesdiasvie...@arm.com>
PR 88915
* cfgloop.h (loop): Add epilogue_vsizes member.
* cfgloop.c (flow_loop_free): Release epilogue_vsizes.
(alloc_loop): Initialize epilogue_vsizes.
* gengtype.c (main): Add poly_uint64 type and vector_sizes to
generator.
* tree-vect-loop.c (vect_get_loop_niters): Make externally visible.
(_loop_vec_info): Initialize epilogue_vinfos.
(~_loop_vec_info): Release epilogue_vinfos.
(vect_analyze_loop_costing): Use knowledge of main VF to estimate
number of iterations of epilogue.
(determine_peel_for_niter): New. Outlined code to re-use in two
places.
(vect_analyze_loop_2): Adapt to analyse main loop for all supported
vector sizes when vect-epilogues-nomask=1. Also keep track of lowest
versioning threshold needed for main loop.
(vect_analyze_loop): Likewise.
(replace_ops): New helper function.
(vect_transform_loop): When vectorizing epilogues re-use analysis done
on main loop and update necessary information.
* tree-vect-loop-manip.c (vect_update_inits_of_drs): No longer insert
stmts on loop preheader edge.
(vect_do_peeling): Enable skip-vectors when doing loop versioning if
we decided to vectorize epilogues. Update epilogues NITERS and
construct ADVANCE to update epilogues data references where needed.
(vect_loop_versioning): Moved decision to check_profitability
based on cost model.
* tree-vect-stmts.c (ensure_base_align): Only update alignment
if new alignment is lower.
* tree-vectorizer.h (_loop_vec_info): Add epilogue_vinfos member.
(vect_loop_versioning, vect_do_peeling, vect_get_loop_niters,
vect_update_inits_of_drs, determine_peel_for_niter,
vect_analyze_loop): Add or update declarations.
* tree-vectorizer.c (try_vectorize_loop_1): Make sure to use already
created loop_vec_info's for epilogues when available. Otherwise analyse
epilogue separately.
Cheers,
Andre
diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 0b0154ffd7bf031a005de993b101d9db6dd98c43..d01512ea46467f1cf77793bdc75b48e71b0b9641 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -21,6 +21,7 @@ along with GCC; see the file COPYING3. If not see
#define GCC_CFGLOOP_H
#include "cfgloopmanip.h"
+#include "target.h"
/* Structure to hold decision about unrolling/peeling. */
enum lpt_dec
@@ -268,6 +269,9 @@ public:
the basic-block from being collected but its index can still be
reused. */
basic_block former_header;
+
+ /* Keep track of vector sizes we know we can vectorize the epilogue with. */
+ vector_sizes epilogue_vsizes;
};
/* Set if the loop is known to be infinite. */
diff --git a/gcc/cfgloop.c b/gcc/cfgloop.c
index 4ad1f658708f83dbd8789666c26d4bd056837bc6..f3e81bcd00b3f125389aa15b12dc5201b3578d20 100644
--- a/gcc/cfgloop.c
+++ b/gcc/cfgloop.c
@@ -198,6 +198,7 @@ flow_loop_free (class loop *loop)
exit->prev = exit;
}
+ loop->epilogue_vsizes.release();
ggc_free (loop->exits);
ggc_free (loop);
}
@@ -355,6 +356,7 @@ alloc_loop (void)
loop->nb_iterations_upper_bound = 0;
loop->nb_iterations_likely_upper_bound = 0;
loop->nb_iterations_estimate = 0;
+ loop->epilogue_vsizes.create(8);
return loop;
}
diff --git a/gcc/gengtype.c b/gcc/gengtype.c
index 53317337cf8c8e8caefd6b819d28b3bba301e755..80fb6ef71465b24e034fa45d69fec56be6b2e7f8 100644
--- a/gcc/gengtype.c
+++ b/gcc/gengtype.c
@@ -5197,6 +5197,7 @@ main (int argc, char **argv)
POS_HERE (do_scalar_typedef ("widest_int", &pos));
POS_HERE (do_scalar_typedef ("int64_t", &pos));
POS_HERE (do_scalar_typedef ("poly_int64", &pos));
+ POS_HERE (do_scalar_typedef ("poly_uint64", &pos));
POS_HERE (do_scalar_typedef ("uint64_t", &pos));
POS_HERE (do_scalar_typedef ("uint8", &pos));
POS_HERE (do_scalar_typedef ("uintptr_t", &pos));
@@ -5206,6 +5207,7 @@ main (int argc, char **argv)
POS_HERE (do_scalar_typedef ("machine_mode", &pos));
POS_HERE (do_scalar_typedef ("fixed_size_mode", &pos));
POS_HERE (do_scalar_typedef ("CONSTEXPR", &pos));
+ POS_HERE (do_scalar_typedef ("vector_sizes", &pos));
POS_HERE (do_typedef ("PTR",
create_pointer (resolve_typedef ("void", &pos)),
&pos));
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 5c25441c70a271f04730486e513437fffa75b7e3..6349e4e808edfc0813ad1d0a1125420d9b0b260c 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -1724,7 +1724,7 @@ vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
CODE and NITERS are as for vect_update_inits_of_dr. */
-static void
+void
vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
tree_code code)
{
@@ -1734,21 +1734,12 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
- /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
+ /* Adjust niters to sizetype. We used to insert the stmts on the loop preheader
+ here, but since we might use these niters to update the epilogues niters
+ and data references we can't insert them here as this definition might not
+ always dominate its uses. */
if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
- {
- gimple_seq seq;
- edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
- tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
-
- niters = fold_convert (sizetype, niters);
- niters = force_gimple_operand (niters, &seq, false, var);
- if (seq)
- {
- basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
- gcc_assert (!new_bb);
- }
- }
+ niters = fold_convert (sizetype, niters);
FOR_EACH_VEC_ELT (datarefs, i, dr)
{
@@ -2401,14 +2392,18 @@ class loop *
vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
tree *niters_vector, tree *step_vector,
tree *niters_vector_mult_vf_var, int th,
- bool check_profitability, bool niters_no_overflow)
+ bool check_profitability, bool niters_no_overflow,
+ tree *advance)
{
edge e, guard_e;
- tree type = TREE_TYPE (niters), guard_cond;
+ tree type = TREE_TYPE (niters), guard_cond, vector_guard = NULL;
basic_block guard_bb, guard_to;
profile_probability prob_prolog, prob_vector, prob_epilog;
int estimated_vf;
int prolog_peeling = 0;
+ bool vect_epilogues
+ = loop_vinfo->epilogue_vinfos.length () > 0
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo);
/* We currently do not support prolog peeling if the target alignment is not
known at compile time. 'vect_gen_prolog_loop_niters' depends on the
target alignment being constant. */
@@ -2466,15 +2461,62 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
else
niters_prolog = build_int_cst (type, 0);
+ loop_vec_info epilogue_vinfo = NULL;
+ if (vect_epilogues)
+ {
+ epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
+ loop_vinfo->epilogue_vinfos.ordered_remove (0);
+
+ /* Don't vectorize epilogues if this is not the innermost loop or if
+ the epilogue may need peeling for alignment as the vectorizer doesn't
+ know how to handle these situations properly yet. */
+ if (loop->inner != NULL
+ || LOOP_VINFO_PEELING_FOR_ALIGNMENT (epilogue_vinfo))
+ vect_epilogues = false;
+
+ }
+
+ unsigned int lowest_vf = constant_lower_bound (vf);
+ bool epilogue_any_upper_bound = false;
+ unsigned HOST_WIDE_INT eiters = 0;
+ tree niters_vector_mult_vf;
+
+ /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
+ on niters already adjusted for the iterations of the prologue. */
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && known_eq (vf, lowest_vf))
+ {
+ vector_sizes vector_sizes = loop->epilogue_vsizes;
+ unsigned next_size = 0;
+ eiters = (LOOP_VINFO_INT_NITERS (loop_vinfo)
+ - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
+
+ if (prolog_peeling > 0)
+ eiters -= prolog_peeling;
+ eiters
+ = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
+ epilogue_any_upper_bound = true;
+
+ unsigned int ratio;
+ while (next_size < vector_sizes.length ()
+ && !(constant_multiple_p (current_vector_size,
+ vector_sizes[next_size], &ratio)
+ && eiters >= lowest_vf / ratio))
+ next_size += 1;
+
+ if (next_size == vector_sizes.length ())
+ vect_epilogues = false;
+ }
+
/* Prolog loop may be skipped. */
bool skip_prolog = (prolog_peeling != 0);
/* Skip to epilog if scalar loop may be preferred. It's only needed
- when we peel for epilog loop and when it hasn't been checked with
- loop versioning. */
+ when we peel for epilog loop or when we loop version. */
bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
bound_prolog + bound_epilog)
- : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
+ : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || vect_epilogues));
/* Epilog loop must be executed if the number of iterations for epilog
loop is known at compile time, otherwise we need to add a check at
the end of vector loop and skip to the end of epilog loop. */
@@ -2503,7 +2545,25 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
}
dump_user_location_t loop_loc = find_loop_location (loop);
- class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
+ class loop *scalar_loop;
+ /* If we are vectorizing the epilogue then we should use a copy of the
+ original main loop to vectorize. This copy has already been if-converted
+ and is identical to the loop on which the analysis was done, making it
+ easier to update loop_vec_info, stmt_vec_info and dr_vec_info references
+ where needed. */
+ if (vect_epilogues)
+ {
+ scalar_loop = get_loop_copy (loop);
+ /* Make sure to set the epilogue's scalar loop, such that we can
+ use the original scalar loop as remaining epilogue if
+ necessary. */
+ LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
+ = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
+ LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = NULL;
+ }
+ else
+ scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
+
if (prolog_peeling)
{
e = loop_preheader_edge (loop);
@@ -2592,6 +2652,13 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
"slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
gcc_unreachable ();
}
+
+ if (epilogue_any_upper_bound && prolog_peeling >= 0)
+ {
+ epilog->any_upper_bound = true;
+ epilog->nb_iterations_upper_bound = eiters + 1;
+ }
+
epilog->force_vectorize = false;
slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
@@ -2608,6 +2675,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
check_profitability);
/* Build guard against NITERSM1 since NITERS may overflow. */
guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
+ vector_guard = guard_cond;
guard_bb = anchor;
guard_to = split_edge (loop_preheader_edge (epilog));
guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
@@ -2635,7 +2703,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
}
basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
- tree niters_vector_mult_vf;
/* If loop is peeled for non-zero constant times, now niters refers to
orig_niters - prolog_peeling, it won't overflow even the orig_niters
overflows. */
@@ -2699,10 +2766,108 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
adjust_vec_debug_stmts ();
scev_reset ();
}
+
+ if (vect_epilogues)
+ {
+ epilog->aux = epilogue_vinfo;
+ LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
+
+ loop_constraint_clear (epilog, LOOP_C_INFINITE);
+
+ /* We now must calculate the number of iterations for our epilogue. */
+ tree cond_niters, niters;
+
+ /* Depending on whether we peel for gaps we take niters or niters - 1,
+ we will refer to this as N - G, where N and G are the NITERS and
+ GAP for the original loop. */
+ niters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+ ? LOOP_VINFO_NITERSM1 (loop_vinfo)
+ : LOOP_VINFO_NITERS (loop_vinfo);
+
+ /* Here we build a vector factorization mask:
+ vf_mask = ~(VF - 1), where VF is the Vectorization Factor. */
+ tree vf_mask = build_int_cst (TREE_TYPE (niters),
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+ vf_mask = fold_build2 (MINUS_EXPR, TREE_TYPE (vf_mask),
+ vf_mask,
+ build_one_cst (TREE_TYPE (vf_mask)));
+ vf_mask = fold_build1 (BIT_NOT_EXPR, TREE_TYPE (niters), vf_mask);
+
+ /* Here we calculate:
+ niters = N - ((N-G) & ~(VF -1)) */
+ niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
+ LOOP_VINFO_NITERS (loop_vinfo),
+ fold_build2 (BIT_AND_EXPR, TREE_TYPE (niters),
+ niters,
+ vf_mask));
+
+ if (skip_vector)
+ {
+ /* If it is not guaranteed we enter the main loop we need to
+ make the niters of the epilogue conditional on entering the main
+ loop. We do this by constructing:
+ cond_niters = !do_we_enter_main_loop ? N + niters_prolog : niters
+ we add niters_prolog, the number of peeled iterations in the
+ prolog, to N in case we don't enter the main loop, as these have
+ already been subtracted from N (the number of iterations of the
+ main loop). Since the prolog peeling is also skipped if we skip the
+ main loop we must add those iterations back. */
+ cond_niters
+ = fold_build3 (COND_EXPR, TREE_TYPE (niters),
+ vector_guard,
+ fold_build2 (PLUS_EXPR, TREE_TYPE (niters),
+ LOOP_VINFO_NITERS (loop_vinfo),
+ fold_convert (TREE_TYPE (niters),
+ niters_prolog)),
+ niters);
+ }
+ else
+ cond_niters = niters;
+
+ LOOP_VINFO_NITERS (epilogue_vinfo) = cond_niters;
+ LOOP_VINFO_NITERSM1 (epilogue_vinfo)
+ = fold_build2 (MINUS_EXPR, TREE_TYPE (cond_niters),
+ cond_niters, build_one_cst (TREE_TYPE (cond_niters)));
+
+ /* We now calculate the amount of iterations we must advance our
+ epilogue's data references by. Make sure to use sizetype here as
+ otherwise the pointer computation may go wrong on targets with
+ different pointer sizes to the used niters type. */
+ *advance = fold_convert (sizetype, niters);
+
+ *advance = fold_build2 (MINUS_EXPR, TREE_TYPE (*advance),
+ *advance,
+ fold_convert (sizetype,
+ LOOP_VINFO_NITERS (loop_vinfo)));
+ *advance = fold_build2 (MINUS_EXPR, TREE_TYPE (*advance),
+ build_zero_cst (TREE_TYPE (*advance)),
+ *advance);
+
+ if (skip_vector)
+ {
+ /* If we are skipping the vectorized loop then we must roll back the
+ data references by the amount we might have expected to peel in
+ the, also skipped, prolog. */
+ *advance
+ = fold_build3 (COND_EXPR, TREE_TYPE (*advance),
+ vector_guard,
+ fold_build2 (MINUS_EXPR, TREE_TYPE (*advance),
+ build_zero_cst (TREE_TYPE (*advance)),
+ fold_convert (TREE_TYPE (*advance),
+ niters_prolog)),
+ *advance);
+ }
+
+ /* Redo the peeling for niter analysis as the NITERs and alignment
+ may have been updated to take the main loop into account. */
+ LOOP_VINFO_PEELING_FOR_NITER (epilogue_vinfo) = false;
+ determine_peel_for_niter (epilogue_vinfo);
+ }
+
adjust_vec.release ();
free_original_copy_tables ();
- return epilog;
+ return vect_epilogues ? epilog : NULL;
}
/* Function vect_create_cond_for_niters_checks.
@@ -2966,9 +3131,7 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
*COND_EXPR_STMT_LIST. */
class loop *
-vect_loop_versioning (loop_vec_info loop_vinfo,
- unsigned int th, bool check_profitability,
- poly_uint64 versioning_threshold)
+vect_loop_versioning (loop_vec_info loop_vinfo)
{
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
@@ -2988,10 +3151,15 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
+ poly_uint64 versioning_threshold
+ = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
tree version_simd_if_cond
= LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
+ unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
- if (check_profitability)
+ if (th >= vect_vf_for_cost (loop_vinfo)
+ && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && !ordered_p (th, versioning_threshold))
cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
build_int_cst (TREE_TYPE (scalar_loop_iters),
th - 1));
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b0cbbac0cb5ba1ffce706715d3dbb9139063803d..5cba0bcf9df93bb25dcd37c8deeff601d3e64c8f 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -713,7 +713,7 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
Return the loop exit condition. */
-static gcond *
+gcond *
vect_get_loop_niters (class loop *loop, tree *assumptions,
tree *number_of_iterations, tree *number_of_iterationsm1)
{
@@ -885,6 +885,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
}
}
}
+
+ epilogue_vinfos.create (6);
}
/* Free all levels of MASKS. */
@@ -960,6 +962,7 @@ _loop_vec_info::~_loop_vec_info ()
release_vec_loop_masks (&masks);
delete ivexpr_map;
delete scan_map;
+ epilogue_vinfos.release ();
loop->aux = NULL;
}
@@ -1726,7 +1729,13 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
return 0;
}
- HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
+ HOST_WIDE_INT estimated_niter = -1;
+
+ if (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+ estimated_niter
+ = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
+ if (estimated_niter == -1)
+ estimated_niter = estimated_stmt_executions_int (loop);
if (estimated_niter == -1)
estimated_niter = likely_max_stmt_executions_int (loop);
if (estimated_niter != -1
@@ -1852,6 +1861,56 @@ vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
}
}
+
+/* Decides whether we need to create an epilogue loop to handle
+ remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
+
+void
+determine_peel_for_niter (loop_vec_info loop_vinfo)
+{
+
+ unsigned HOST_WIDE_INT const_vf;
+ HOST_WIDE_INT max_niter
+ = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+ unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
+ if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+ th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
+ (loop_vinfo));
+
+ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ /* The main loop handles all iterations. */
+ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+ else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+ {
+ /* Work out the (constant) number of iterations that need to be
+ peeled for reasons other than niters. */
+ unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+ peel_niter += 1;
+ if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+ }
+ else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+ /* ??? When peeling for gaps but not alignment, we could
+ try to check whether the (variable) niters is known to be
+ VF * N + 1. That's something of a niche case though. */
+ || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+ || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
+ || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+ < (unsigned) exact_log2 (const_vf))
+ /* In case of versioning, check if the maximum number of
+ iterations is greater than th. If they are identical,
+ the epilogue is unnecessary. */
+ && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || ((unsigned HOST_WIDE_INT) max_niter
+ > (th / const_vf) * const_vf))))
+ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+}
+
+
/* Function vect_analyze_loop_2.
Apply a set of analyses on LOOP, and create a loop_vec_info struct
@@ -1864,6 +1923,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
int res;
unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
poly_uint64 min_vf = 2;
+ loop_vec_info orig_loop_vinfo = NULL;
/* The first group of checks is independent of the vector size. */
fatal = true;
@@ -1979,7 +2039,6 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
vect_compute_single_scalar_iteration_cost (loop_vinfo);
poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
- unsigned th;
/* Check the SLP opportunities in the loop, analyze and build SLP trees. */
ok = vect_analyze_slp (loop_vinfo, *n_stmts);
@@ -2019,9 +2078,6 @@ start_over:
LOOP_VINFO_INT_NITERS (loop_vinfo));
}
- HOST_WIDE_INT max_niter
- = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
-
/* Analyze the alignment of the data-refs in the loop.
Fail if a data reference is found that cannot be vectorized. */
@@ -2125,42 +2181,7 @@ start_over:
return opt_result::failure_at (vect_location,
"Loop costings not worthwhile.\n");
- /* Decide whether we need to create an epilogue loop to handle
- remaining scalar iterations. */
- th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
-
- unsigned HOST_WIDE_INT const_vf;
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
- /* The main loop handles all iterations. */
- LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
- else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
- {
- /* Work out the (constant) number of iterations that need to be
- peeled for reasons other than niters. */
- unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
- if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
- peel_niter += 1;
- if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
- LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
- LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
- }
- else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
- /* ??? When peeling for gaps but not alignment, we could
- try to check whether the (variable) niters is known to be
- VF * N + 1. That's something of a niche case though. */
- || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
- || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
- || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
- < (unsigned) exact_log2 (const_vf))
- /* In case of versioning, check if the maximum number of
- iterations is greater than th. If they are identical,
- the epilogue is unnecessary. */
- && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
- || ((unsigned HOST_WIDE_INT) max_niter
- > (th / const_vf) * const_vf))))
- LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
-
+ determine_peel_for_niter (loop_vinfo);
/* If an epilogue loop is required make sure we can create one. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
|| LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
@@ -2183,9 +2204,12 @@ start_over:
enough for both peeled prolog loop and vector loop. This check
can be merged along with threshold check of loop versioning, so
increase threshold for this case if necessary. */
- if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
+ if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || ((orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+ && LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)))
{
poly_uint64 niters_th = 0;
+ unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
{
@@ -2206,6 +2230,14 @@ start_over:
/* One additional iteration because of peeling for gap. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
niters_th += 1;
+
+ /* Use the same condition as vect_transform_loop to decide when to use
+ the cost to determine a versioning threshold. */
+ if (th >= vect_vf_for_cost (loop_vinfo)
+ && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && ordered_p (th, niters_th))
+ niters_th = ordered_max (poly_uint64 (th), niters_th);
+
LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
}
@@ -2329,14 +2361,8 @@ again:
be vectorized. */
opt_loop_vec_info
vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
- vec_info_shared *shared)
+ vec_info_shared *shared, vector_sizes vector_sizes)
{
- auto_vector_sizes vector_sizes;
-
- /* Autodetect first vector size we try. */
- current_vector_size = 0;
- targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
- loop->simdlen != 0);
unsigned int next_size = 0;
DUMP_VECT_SCOPE ("analyze_loop_nest");
@@ -2357,6 +2383,9 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
poly_uint64 autodetected_vector_size = 0;
opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
poly_uint64 first_vector_size = 0;
+ poly_uint64 lowest_th = 0;
+ unsigned vectorized_loops = 0;
+ bool vect_epilogues = !loop->simdlen && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK);
while (1)
{
/* Check the CFG characteristics of the loop (nesting, entry/exit). */
@@ -2375,24 +2404,52 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
if (orig_loop_vinfo)
LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
+ else if (vect_epilogues && first_loop_vinfo)
+ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
if (res)
{
LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
+ vectorized_loops++;
- if (loop->simdlen
- && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
- (unsigned HOST_WIDE_INT) loop->simdlen))
+ if ((loop->simdlen
+ && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+ (unsigned HOST_WIDE_INT) loop->simdlen))
+ || vect_epilogues)
{
if (first_loop_vinfo == NULL)
{
first_loop_vinfo = loop_vinfo;
+ lowest_th
+ = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
first_vector_size = current_vector_size;
loop->aux = NULL;
}
else
- delete loop_vinfo;
+ {
+ /* Keep track of vector sizes that we know we can vectorize
+ the epilogue with. */
+ if (vect_epilogues)
+ {
+ loop->aux = NULL;
+ loop->epilogue_vsizes.reserve (1);
+ loop->epilogue_vsizes.quick_push (current_vector_size);
+ first_loop_vinfo->epilogue_vinfos.reserve (1);
+ first_loop_vinfo->epilogue_vinfos.quick_push (loop_vinfo);
+ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
+ poly_uint64 th
+ = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
+ gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || maybe_ne (lowest_th, 0U));
+ /* Keep track of the known smallest versioning
+ threshold. */
+ if (ordered_p (lowest_th, th))
+ lowest_th = ordered_min (lowest_th, th);
+ }
+ else
+ delete loop_vinfo;
+ }
}
else
{
@@ -2430,6 +2487,8 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
dump_dec (MSG_NOTE, current_vector_size);
dump_printf (MSG_NOTE, "\n");
}
+ LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
+
return first_loop_vinfo;
}
else
@@ -8460,6 +8519,34 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
*seen_store = stmt_info;
}
+/* Helper function to replace a SSA name in OP with its equivalent SSA name in
+ MAPPING. */
+
+static tree
+replace_ops (tree op, hash_map<tree, tree> &mapping)
+{
+ if (!op)
+ return NULL;
+
+ tree *new_op;
+ tree ret = NULL;
+ for (int j = 0; j < TREE_OPERAND_LENGTH (op); ++j)
+ {
+ if ((new_op = mapping.get (TREE_OPERAND (op, j))))
+ {
+ TREE_OPERAND (op, j) = *new_op;
+ ret = *new_op;
+ }
+ else
+ ret = replace_ops (TREE_OPERAND (op, j), mapping);
+
+ if (ret)
+ return ret;
+ }
+
+ return NULL;
+}
+
/* Function vect_transform_loop.
The analysis phase has determined that the loop is vectorizable.
@@ -8483,6 +8570,10 @@ vect_transform_loop (loop_vec_info loop_vinfo)
gimple *stmt;
bool check_profitability = false;
unsigned int th;
+ auto_vec<gimple *> orig_stmts;
+ auto_vec<dr_vec_info *> gather_scatter_drs;
+ auto_vec<dr_vec_info *> drs;
+ auto_vec<gimple *> gather_scatter_stmts;
DUMP_VECT_SCOPE ("vec_transform_loop");
@@ -8497,11 +8588,11 @@ vect_transform_loop (loop_vec_info loop_vinfo)
if (th >= vect_vf_for_cost (loop_vinfo)
&& !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "Profitability threshold is %d loop iterations.\n",
- th);
- check_profitability = true;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Profitability threshold is %d loop iterations.\n",
+ th);
+ check_profitability = true;
}
/* Make sure there exists a single-predecessor exit bb. Do this before
@@ -8519,18 +8610,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
{
- poly_uint64 versioning_threshold
- = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
- if (check_profitability
- && ordered_p (poly_uint64 (th), versioning_threshold))
- {
- versioning_threshold = ordered_max (poly_uint64 (th),
- versioning_threshold);
- check_profitability = false;
- }
class loop *sloop
- = vect_loop_versioning (loop_vinfo, th, check_profitability,
- versioning_threshold);
+ = vect_loop_versioning (loop_vinfo);
sloop->force_vectorize = false;
check_profitability = false;
}
@@ -8555,9 +8636,64 @@ vect_transform_loop (loop_vec_info loop_vinfo)
LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
+ tree advance;
epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
&step_vector, &niters_vector_mult_vf, th,
- check_profitability, niters_no_overflow);
+ check_profitability, niters_no_overflow,
+ &advance);
+
+ if (epilogue)
+ {
+ basic_block *orig_bbs = get_loop_body (loop);
+ loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
+
+ gimple_stmt_iterator orig_gsi;
+ gphi_iterator orig_phi_gsi;
+ gimple *stmt;
+ stmt_vec_info stmt_vinfo;
+ dr_vec_info *dr_vinfo;
+
+ /* The stmt_vec_info's of the epilogue were constructed for the main loop
+ and need to be updated to refer to the cloned variables used in the
+ epilogue loop. We do this by assuming the original main loop and the
+ epilogue loop are identical (aside the different SSA names). This
+ means we assume we can go through each BB in the loop and each STMT in
+ each BB and map them 1:1, replacing the STMT_VINFO_STMT of each
+ stmt_vec_info in the epilogue's loop_vec_info. Here we only keep
+ track of the original state of the main loop, before vectorization.
+ After vectorization we proceed to update the epilogue's stmt_vec_infos
+ information. We also update the references in PATTERN_DEF_SEQ's,
+ RELATED_STMT's and data_references. Mainly the latter has to be
+ updated after we are done vectorizing the main loop, as the
+ data_references are shared between main and epilogue. */
+ for (unsigned i = 0; i < loop->num_nodes; ++i)
+ {
+ for (orig_phi_gsi = gsi_start_phis (orig_bbs[i]);
+ !gsi_end_p (orig_phi_gsi); gsi_next (&orig_phi_gsi))
+ orig_stmts.safe_push (orig_phi_gsi.phi ());
+ for (orig_gsi = gsi_start_bb (orig_bbs[i]);
+ !gsi_end_p (orig_gsi); gsi_next (&orig_gsi))
+ {
+ stmt = gsi_stmt (orig_gsi);
+ orig_stmts.safe_push (stmt);
+ stmt_vinfo = epilogue_vinfo->lookup_stmt (stmt);
+ /* Data references pointing to gather loads and scatter stores
+ require special treatment because the address computation
+ happens in a different gimple node, pointed to by DR_REF. In
+ contrast to normal loads and stores where we only need to
+ update the offset of the data reference. */
+ if (stmt_vinfo != NULL
+ && stmt_vinfo->dr_aux.stmt == stmt_vinfo)
+ {
+ dr_vinfo = STMT_VINFO_DR_INFO (stmt_vinfo);
+ if (STMT_VINFO_GATHER_SCATTER_P (dr_vinfo->stmt))
+ gather_scatter_drs.safe_push (dr_vinfo);
+ drs.safe_push (dr_vinfo);
+ }
+ }
+ }
+ }
+
if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
&& LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
@@ -8814,58 +8950,168 @@ vect_transform_loop (loop_vec_info loop_vinfo)
since vectorized loop can have loop-carried dependencies. */
loop->safelen = 0;
- /* Don't vectorize epilogue for epilogue. */
- if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
- epilogue = NULL;
-
- if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
- epilogue = NULL;
-
if (epilogue)
{
- auto_vector_sizes vector_sizes;
- targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
- unsigned int next_size = 0;
- /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
- on niters already ajusted for the iterations of the prologue. */
- if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && known_eq (vf, lowest_vf))
- {
- unsigned HOST_WIDE_INT eiters
- = (LOOP_VINFO_INT_NITERS (loop_vinfo)
- - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
- eiters
- = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
- epilogue->nb_iterations_upper_bound = eiters - 1;
- epilogue->any_upper_bound = true;
-
- unsigned int ratio;
- while (next_size < vector_sizes.length ()
- && !(constant_multiple_p (current_vector_size,
- vector_sizes[next_size], &ratio)
- && eiters >= lowest_vf / ratio))
- next_size += 1;
- }
- else
- while (next_size < vector_sizes.length ()
- && maybe_lt (current_vector_size, vector_sizes[next_size]))
- next_size += 1;
+ loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
+ vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
- if (next_size == vector_sizes.length ())
- epilogue = NULL;
- }
+ auto_vec<stmt_vec_info> pattern_worklist, related_worklist;
+ hash_map<tree,tree> mapping;
+ gimple * orig_stmt, * new_stmt;
+ gimple_stmt_iterator epilogue_gsi;
+ gphi_iterator epilogue_phi_gsi;
+ stmt_vec_info stmt_vinfo = NULL, related_vinfo;
+ basic_block *epilogue_bbs = get_loop_body (epilogue);
- if (epilogue)
- {
+ epilogue->simduid = loop->simduid;
epilogue->force_vectorize = loop->force_vectorize;
epilogue->safelen = loop->safelen;
epilogue->dont_vectorize = false;
+ LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
+
+ /* We are done vectorizing the main loop, so now we update the epilogue's
+ stmt_vec_info's. At the same time we set the gimple UID of each
+ statement in the epilogue, as these are used to look them up in the
+ epilogue's loop_vec_info later. We also keep track of which
+ stmt_vec_info's have PATTERN_DEF_SEQ's and RELATED_STMT's that might
+ need updating, and we construct a mapping between variables defined in
+ the main loop and their corresponding names in the epilogue. */
+ for (unsigned i = 0; i < loop->num_nodes; ++i)
+ {
+ for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
+ !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
+ {
+ orig_stmt = orig_stmts[0];
+ orig_stmts.ordered_remove (0);
+ new_stmt = epilogue_phi_gsi.phi ();
- /* We may need to if-convert epilogue to vectorize it. */
- if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
- tree_if_conversion (epilogue);
- }
+ stmt_vinfo
+ = epilogue_vinfo->lookup_stmt (orig_stmt);
+
+ STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
+ gimple_set_uid (new_stmt, gimple_uid (orig_stmt));
+
+ mapping.put (gimple_phi_result (orig_stmt),
+ gimple_phi_result (new_stmt));
+
+ if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
+ pattern_worklist.safe_push (stmt_vinfo);
+
+ related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+ while (related_vinfo && related_vinfo != stmt_vinfo)
+ {
+ related_worklist.safe_push (related_vinfo);
+ /* Set BB such that the assert in
+ 'get_initial_def_for_reduction' is able to determine that
+ the BB of the related stmt is inside this loop. */
+ gimple_set_bb (STMT_VINFO_STMT (related_vinfo),
+ gimple_bb (new_stmt));
+ related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
+ }
+ }
+
+ for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
+ !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
+ {
+ orig_stmt = orig_stmts[0];
+ orig_stmts.ordered_remove (0);
+ new_stmt = gsi_stmt (epilogue_gsi);
+
+ stmt_vinfo
+ = epilogue_vinfo->lookup_stmt (orig_stmt);
+
+ STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
+ gimple_set_uid (new_stmt, gimple_uid (orig_stmt));
+
+ if (is_gimple_assign (orig_stmt))
+ {
+ gcc_assert (is_gimple_assign (new_stmt));
+ mapping.put (gimple_assign_lhs (orig_stmt),
+ gimple_assign_lhs (new_stmt));
+ }
+
+ if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
+ pattern_worklist.safe_push (stmt_vinfo);
+
+ related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+
+ while (related_vinfo && related_vinfo != stmt_vinfo)
+ {
+ related_worklist.safe_push (related_vinfo);
+ /* Set BB such that the assert in
+ 'get_initial_def_for_reduction' is able to determine that
+ the BB of the related stmt is inside this loop. */
+ gimple_set_bb (STMT_VINFO_STMT (related_vinfo),
+ gimple_bb (new_stmt));
+ related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
+ }
+ }
+ gcc_assert (orig_stmts.length () == 0);
+ }
+
+ /* The PATTERN_DEF_SEQ's in the epilogue were constructed using the
+ original main loop and thus need to be updated to refer to the cloned
+ variables used in the epilogue. */
+ for (unsigned i = 0; i < pattern_worklist.length (); ++i)
+ {
+ gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (pattern_worklist[i]);
+ tree *new_op;
+
+ while (seq)
+ {
+ for (unsigned j = 1; j < gimple_num_ops (seq); ++j)
+ {
+ tree op = gimple_op (seq, j);
+ if ((new_op = mapping.get (op)))
+ gimple_set_op (seq, j, *new_op);
+ else
+ {
+ op = unshare_expr (op);
+ replace_ops (op, mapping);
+ gimple_set_op (seq, j, op);
+ }
+ }
+ seq = seq->next;
+ }
+ }
+
+ /* Just like the PATTERN_DEF_SEQ's the RELATED_STMT's also need to be
+ updated. */
+ for (unsigned i = 0; i < related_worklist.length (); ++i)
+ {
+ tree *new_t;
+ gimple * stmt = STMT_VINFO_STMT (related_worklist[i]);
+ for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
+ if ((new_t = mapping.get (gimple_op (stmt, j))))
+ gimple_set_op (stmt, j, *new_t);
+ }
+
+ tree new_op;
+ /* Data references for gather loads and scatter stores do not use the
+ updated offset we set using ADVANCE. Instead we have to make sure the
+ references in the data references point to the corresponding copies of
+ the originals in the epilogue. */
+ for (unsigned i = 0; i < gather_scatter_drs.length (); ++i)
+ {
+ dr_vec_info *dr_vinfo = gather_scatter_drs[i];
+ data_reference *dr = dr_vinfo->dr;
+ gcc_assert (dr);
+ DR_REF (dr) = unshare_expr (DR_REF (dr));
+ new_op = replace_ops (DR_REF (dr), mapping);
+ if (new_op)
+ DR_STMT (dr_vinfo->dr) = SSA_NAME_DEF_STMT (new_op);
+ }
+
+ /* The vector size of the epilogue is smaller than that of the main loop,
+ so the required alignment is either the same or lower. This means the
+ data references will by definition already be aligned. */
+ for (unsigned i = 0; i < drs.length (); ++i)
+ drs[i]->base_misaligned = false;
+
+ epilogue_vinfo->shared->datarefs_copy.release ();
+ epilogue_vinfo->shared->save_datarefs ();
+ }
return epilogue;
}
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 601a6f55fbff388c89f88d994e790aebf2bf960e..201549da6c0cbae0797a23ae1b8967b9895505e9 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -6288,7 +6288,7 @@ ensure_base_align (dr_vec_info *dr_info)
if (decl_in_symtab_p (base_decl))
symtab_node::get (base_decl)->increase_alignment (align_base_to);
- else
+ else if (DECL_ALIGN (base_decl) < align_base_to)
{
SET_DECL_ALIGN (base_decl, align_base_to);
DECL_USER_ALIGN (base_decl) = 1;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1456cde4c2c2dec7244c504d2c496248894a4f1e..00ab80544f6a7ffac8f62f09f2b2ba099b24d83e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -564,6 +564,8 @@ public:
this points to the original vectorized loop. Otherwise NULL. */
_loop_vec_info *orig_loop_info;
+ vec<_loop_vec_info *> epilogue_vinfos;
+
} *loop_vec_info;
/* Access Functions. */
@@ -1480,13 +1482,16 @@ extern void vect_set_loop_condition (class loop *, loop_vec_info,
extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge);
class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *,
class loop *, edge);
-class loop *vect_loop_versioning (loop_vec_info, unsigned int, bool,
- poly_uint64);
+class loop *vect_loop_versioning (loop_vec_info);
extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
- tree *, tree *, tree *, int, bool, bool);
+ tree *, tree *, tree *, int, bool, bool,
+ tree *);
extern void vect_prepare_for_masked_peels (loop_vec_info);
extern dump_user_location_t find_loop_location (class loop *);
extern bool vect_can_advance_ivs_p (loop_vec_info);
+extern gcond * vect_get_loop_niters (class loop *, tree *, tree *, tree *);
+extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
+
/* In tree-vect-stmts.c. */
extern poly_uint64 current_vector_size;
@@ -1600,6 +1605,8 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *,
tree, tree = NULL_TREE);
/* In tree-vect-loop.c. */
+/* Used in tree-vect-loop-manip.c */
+extern void determine_peel_for_niter (loop_vec_info);
/* FORNOW: Used in tree-parloops.c. */
extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info,
bool *, bool);
@@ -1610,7 +1617,8 @@ extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
/* Drive for loop analysis stage. */
extern opt_loop_vec_info vect_analyze_loop (class loop *,
loop_vec_info,
- vec_info_shared *);
+ vec_info_shared *,
+ vector_sizes);
extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
tree *, bool);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 173e6b51652fd023893b38da786ff28f827553b5..71bbf4fdf8dc7588c45a0e8feef9272b52c0c04c 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -875,6 +875,10 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
vec_info_shared shared;
auto_purge_vect_location sentinel;
vect_location = find_loop_location (loop);
+ auto_vector_sizes auto_vector_sizes;
+ vector_sizes vector_sizes;
+ bool assert_versioning = false;
+
if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
&& dump_enabled_p ())
dump_printf (MSG_NOTE | MSG_PRIORITY_INTERNALS,
@@ -882,10 +886,35 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
LOCATION_FILE (vect_location.get_location_t ()),
LOCATION_LINE (vect_location.get_location_t ()));
+ /* If this is an epilogue, we already know what vector sizes we will use for
+ vectorization, as the analysis was part of the main vectorized loop. Use
+ these instead of going through all vector sizes again. */
+ if (orig_loop_vinfo
+ && !LOOP_VINFO_LOOP (orig_loop_vinfo)->epilogue_vsizes.is_empty ())
+ {
+ vector_sizes = LOOP_VINFO_LOOP (orig_loop_vinfo)->epilogue_vsizes;
+ assert_versioning = LOOP_REQUIRES_VERSIONING (orig_loop_vinfo);
+ current_vector_size = vector_sizes[0];
+ }
+ else
+ {
+ /* Autodetect first vector size we try. */
+ current_vector_size = 0;
+
+ targetm.vectorize.autovectorize_vector_sizes (&auto_vector_sizes,
+ loop->simdlen != 0);
+ vector_sizes = auto_vector_sizes;
+ }
+
/* Try to analyze the loop, retaining an opt_problem if dump_enabled_p. */
- opt_loop_vec_info loop_vinfo
- = vect_analyze_loop (loop, orig_loop_vinfo, &shared);
- loop->aux = loop_vinfo;
+ opt_loop_vec_info loop_vinfo = opt_loop_vec_info::success (NULL);
+ if (loop_vec_info_for_loop (loop))
+ loop_vinfo = opt_loop_vec_info::success (loop_vec_info_for_loop (loop));
+ else
+ {
+ loop_vinfo = vect_analyze_loop (loop, orig_loop_vinfo, &shared, vector_sizes);
+ loop->aux = loop_vinfo;
+ }
if (!loop_vinfo)
if (dump_enabled_p ())
@@ -898,6 +927,10 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
{
+ /* If this loop requires versioning, make sure the analysis done on the
+ epilogue loops succeeds. */
+ gcc_assert (!assert_versioning);
+
/* Free existing information if loop is analyzed with some
assumptions. */
if (loop_constraint_set_p (loop, LOOP_C_FINITE))
@@ -1013,8 +1046,13 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
/* Epilogue of vectorized loop must be vectorized too. */
if (new_loop)
- ret |= try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops,
- new_loop, loop_vinfo, NULL, NULL);
+ {
+ /* Don't include vectorized epilogues in the "vectorized loops" count.
+ */
+ unsigned dont_count = *num_vectorized_loops;
+ ret |= try_vectorize_loop_1 (simduid_to_vf_htab, &dont_count,
+ new_loop, loop_vinfo, NULL, NULL);
+ }
return ret;
}