This allows us to vectorize more loops with early exits: by forcing
peeling for alignment, we guarantee that an entire vector iteration
can be read safely without crossing a page boundary.
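For example, a strlen-style loop like the one below (an illustrative
sketch, not taken from the patch or its testsuite) is bounded only by
the data itself, so the scalar code never reads past the terminator:

    unsigned my_strlen (const char *s)
    {
      unsigned i = 0;
      while (s[i] != 0)  /* Early exit: a vector load for this
                            comparison may read past the terminator.  */
        i++;
      return i;
    }

The vectorized loop reads a whole vector of s[i..i+VF-1] before testing
the exit condition.  If s is first aligned (by peeling) to the number
of bytes read per vector iteration, each such load stays within a
single page, so the speculative part of the read cannot fault.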

To make this work for VLA architectures, we have to allow compile-time
non-constant target alignments.  We also have to override the result of
the target's preferred_vector_alignment hook when it isn't a
power-of-two multiple of the TYPE_SIZE of the chosen vector type.
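As a concrete (hypothetical, not target-specific) example: if the
chosen vector type occupies 16*VL bytes for a runtime VL, but the hook
returns a constant 16-byte alignment, then the hook's result is not a
compile-time power-of-two multiple of the per-iteration read size, and
we instead use the read size itself as the target alignment.  The
override condition matches the vect_compute_data_ref_alignment hunk
below:

    /* safe_align is TYPE_SIZE in bytes times the unroll factor;
       vector_alignment is the hook's preferred alignment in bytes.  */
    unsigned HOST_WIDE_INT multiple;
    if (!constant_multiple_p (vector_alignment, safe_align, &multiple)
        || !pow2p_hwi (multiple))
      vector_alignment = safe_align;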

There is currently an implicit assumption that the TYPE_SIZE of the
vector type is itself a power of two.  For non-VLA types this
could be checked directly in the vectorizer.  For VLA types, I
discussed offline with Richard S the idea of adding a target hook that
would allow the vectorizer to query the backend to confirm that a given
VLA type is known to have a power-of-two size at runtime.  I thought we
might be able to do this check in vector_alignment_reachable_p.  Any
thoughts on that, richi?
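For the non-VLA case, the check could be as simple as something along
these lines (an untested sketch, not part of the patch):

    /* In vector_alignment_reachable_p: the page-crossing argument
       relies on the vector size being a power of two.  */
    unsigned HOST_WIDE_INT size;
    if (tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype)).is_constant (&size)
        && !pow2p_hwi (size))
      return false;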

gcc/ChangeLog:

        * tree-vect-data-refs.cc (vect_analyze_early_break_dependences):
        Set need_peeling_for_alignment flag on read DRs instead of
        failing vectorization.  Punt on gathers.
        (dr_misalignment): Handle non-constant target alignments.
        (vect_compute_data_ref_alignment): If need_peeling_for_alignment
        flag is set on the DR, then override the target alignment chosen
        by the preferred_vector_alignment hook to choose a safe
        alignment.
        (vect_supportable_dr_alignment): Override
        support_vector_misalignment hook if need_peeling_for_alignment
        is set on the DR: in this case we must return
        dr_unaligned_unsupported in order to force peeling.
        * tree-vect-loop-manip.cc (vect_do_peeling): Allow prolog
        peeling by a compile-time non-constant amount.
        * tree-vectorizer.h (dr_vec_info): Add new flag
        need_peeling_for_alignment.
---
 gcc/tree-vect-data-refs.cc  | 77 ++++++++++++++++++++++++++++++-------
 gcc/tree-vect-loop-manip.cc |  6 ---
 gcc/tree-vectorizer.h       |  5 +++
 3 files changed, 68 insertions(+), 20 deletions(-)

diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 202af7a8952..4e49d8403df 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -739,15 +739,22 @@ vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
 	  if (DR_IS_READ (dr_ref)
 	      && !ref_within_array_bound (stmt, DR_REF (dr_ref)))
 	    {
+	      if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo))
+		{
+		  const char *msg
+		    = "early break not supported: cannot peel gather "
+		      "for alignment, vectorization would read out of "
+		      "bounds at %G";
+		  return opt_result::failure_at (stmt, msg, stmt);
+		}
+
+	      dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_vinfo);
+	      dr_info->need_peeling_for_alignment = true;
+
 	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "early breaks not supported: vectorization "
-				 "would %s beyond size of obj.\n",
-				 DR_IS_READ (dr_ref) ? "read" : "write");
-	      return opt_result::failure_at (stmt,
-				 "can't safely apply code motion to "
-				 "dependencies of %G to vectorize "
-				 "the early exit.\n", stmt);
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "marking DR (read) as needing peeling for "
+				 "alignment\n");
 	    }
 
 	  if (DR_IS_READ (dr_ref))
@@ -1230,11 +1237,15 @@ dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
      offset which can for example result from a negative stride access.  */
   poly_int64 misalignment = misalign + diff + offset;
 
-  /* vect_compute_data_ref_alignment will have ensured that target_alignment
-     is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
-  unsigned HOST_WIDE_INT target_alignment_c
-    = dr_info->target_alignment.to_constant ();
-  if (!known_misalignment (misalignment, target_alignment_c, &misalign))
+  /* Below we reject compile-time non-constant target alignments, but if
+     our misalignment is zero, then we are known to already be aligned
+     w.r.t. any such possible target alignment.  */
+  if (known_eq (misalignment, 0))
+    return 0;
+
+  unsigned HOST_WIDE_INT target_alignment_c;
+  if (!dr_info->target_alignment.is_constant (&target_alignment_c)
+      || !known_misalignment (misalignment, target_alignment_c, &misalign))
     return DR_MISALIGNMENT_UNKNOWN;
   return misalign;
 }
@@ -1337,6 +1348,43 @@ vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
   poly_uint64 vector_alignment
     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
 		 BITS_PER_UNIT);
+
+  /* If this DR needs peeling for alignment for correctness, we must
+     ensure the target alignment is a constant power-of-two multiple of the
+     amount read per vector iteration (overriding the above hook where
+     necessary).  */
+  if (dr_info->need_peeling_for_alignment)
+    {
+      /* Vector size in bytes.  */
+      poly_uint64 safe_align
+	= exact_div (tree_to_poly_uint64 (TYPE_SIZE (vectype)), BITS_PER_UNIT);
+
+      /* Multiply by the unroll factor to get the number of bytes read
+	 per vector iteration.  */
+      if (loop_vinfo)
+	{
+	  auto num_copies = vect_get_num_copies (loop_vinfo, vectype);
+	  gcc_checking_assert (pow2p_hwi (num_copies));
+	  safe_align *= num_copies;
+	}
+
+      unsigned HOST_WIDE_INT multiple;
+      if (!constant_multiple_p (vector_alignment, safe_align, &multiple)
+	  || !pow2p_hwi (multiple))
+	{
+	  if (dump_enabled_p ())
+	    {
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "forcing alignment for DR from preferred (");
+	      dump_dec (MSG_NOTE, vector_alignment);
+	      dump_printf (MSG_NOTE, ") to safe align (");
+	      dump_dec (MSG_NOTE, safe_align);
+	      dump_printf (MSG_NOTE, ") for stmt %G\n", stmt_info->stmt);
+	    }
+	  vector_alignment = safe_align;
+	}
+    }
+
   SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
 
   /* If the main loop has peeled for alignment we have no way of knowing
@@ -7194,7 +7242,8 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
   if (misalignment == DR_MISALIGNMENT_UNKNOWN)
     is_packed = not_size_aligned (DR_REF (dr));
   if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
-						     is_packed))
+						     is_packed)
+      && !dr_info->need_peeling_for_alignment)
     return dr_unaligned_supported;
 
   /* Unsupported.  */
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 5bbeeddd854..718652f9bd8 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -3129,12 +3129,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
   int estimated_vf;
   int prolog_peeling = 0;
   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
-  /* We currently do not support prolog peeling if the target alignment is not
-     known at compile time.  'vect_gen_prolog_loop_niters' depends on the
-     target alignment being constant.  */
-  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
-  if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
-    return NULL;
 
   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b51771f836c..7650ca2dee5 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1267,6 +1267,11 @@ public:
   poly_uint64 target_alignment;
   /* If true the alignment of base_decl needs to be increased.  */
   bool base_misaligned;
+
+  /* Set by early break vectorization when this DR needs peeling for alignment
+     for correctness.  */
+  bool need_peeling_for_alignment;
+
   tree base_decl;
 
   /* Stores current vectorized loop's offset.  To be added to the DR's
