On Mon, 3 Mar 2025, Tamar Christina wrote:
> > > /* For now assume all conditional loads/stores support unaligned
> > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > index 6bbb16beff2c627fca11a7403ba5ee3a5faa21c1..b661deeeed400e5826fc1c4f70957b335d1741fa 100644
> > > --- a/gcc/tree-vect-stmts.cc
> > > +++ b/gcc/tree-vect-stmts.cc
> > > @@ -2597,6 +2597,128 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> > > return false;
> > > }
> > >
> > > + /* If this DR needs alignment for correctness, we must ensure the target
> > > + alignment is a constant power-of-two multiple of the amount read per
> > > + vector iteration or force masking. */
> > > + if (dr_safe_speculative_read_required (stmt_info))
> > > + {
> > > + /* We can only peel for loops, of course. */
> > > + gcc_checking_assert (loop_vinfo);
> > > +
> > > + /* Check if we support the operation if early breaks are needed.  Here we
> > > + must ensure that we don't access any more than the scalar code would
> > > + have. A masked operation would ensure this, so for these load types
> > > + force masking. */
> > > + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > > + && (*memory_access_type == VMAT_GATHER_SCATTER
> > > + || *memory_access_type == VMAT_STRIDED_SLP))
> > > + {
> > > + if (dump_enabled_p ())
> > > + dump_printf_loc (MSG_NOTE, vect_location,
> > > + "early break not supported: cannot peel for "
> > > + "alignment. With non-contiguous memory
> > vectorization"
> > > + " could read out of bounds at %G ",
> > > + STMT_VINFO_STMT (stmt_info));
> > > + LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> > > + }
> > > +
> > > + auto target_alignment
> > > + = DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
> > > + unsigned HOST_WIDE_INT target_align;
> > > + bool inbounds
> > > + = DR_SCALAR_KNOWN_BOUNDS (STMT_VINFO_DR_INFO (stmt_info));
> > > +
> > > + /* If the scalar loop is known to be in bounds, and we're using scalar
> > > + accesses then there's no need to check further. */
> > > + if (inbounds
> > > + && *memory_access_type == VMAT_ELEMENTWISE)
> > > + {
> > > + *alignment_support_scheme = dr_aligned;
> >
> > Nothing should look at *alignment_support_scheme for VMAT_ELEMENTWISE.
> > Did you actually need this adjustment?
> >
>
> Yes, a few lines up bitfields are relaxed from contiguous access to this:
>
> if (SLP_TREE_LANES (slp_node) == 1)
> {
> *memory_access_type = VMAT_ELEMENTWISE;
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> "single-element interleaving not supported
> "
> "for not adjacent vector loads, using "
> "elementwise access\n");
> }
>
> This means we then reach:
> if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
>
> where we bail out because the permutes still exist.  The code relaxed the load
> to elementwise but never removed the permutes or any associated information.
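>
> (For reference, an illustrative sketch of the general shape that triggers this
> relaxation.  This isn't the actual bitfield testcase and I haven't verified this
> exact form; it just shows the shape: a single-lane load with a gap in its group
> inside an early-break loop.
>
> struct s { int a; int b; };
>
> int
> find (struct s *p, int n)
> {
>   for (int i = 0; i < n; i++)
>     /* Only field 'a' is read, so this should be a single-element
>        interleaving group with a gap; with single-lane SLP it would get
>        relaxed to VMAT_ELEMENTWISE.  */
>     if (p[i].a == 42)
>       return i;   /* Early break.  */
>   return -1;
> }
>
> As noted above, the relaxation leaves the permutes on the SLP node in place,
> which is what we then trip over.)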
>
> If the permutes are removed or some other workaround, you then hit
>
> if (!group_aligned && inbounds)
> LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
>
> This is because these aren't group loads.  Since the original loads didn't have
> any misalignment, they never needed peeling and as such are
> dr_unaligned_supported.
>
> So the only way to avoid checking the elementwise case is by guarding the top
> level with
>
> if (dr_safe_speculative_read_required (stmt_info)
> && *alignment_support_scheme == dr_aligned)
> {
>
> Instead of just
>
> if (dr_safe_speculative_read_required (stmt_info))
> {
>
> Which I wasn't sure was the right thing to do...  Anyway, if I do that I can
> remove...
>
> > > + return true;
> > > + }
> > > +
> > > + bool group_aligned = false;
> > > + if (*alignment_support_scheme == dr_aligned
> > > + && target_alignment.is_constant (&target_align)
> > > + && nunits.is_constant ())
> > > + {
> > > + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > + auto vectype_size
> > > + = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
> > > + poly_uint64 required_alignment = vf * vectype_size;
> > > + /* If we have a grouped access we require that the alignment be N * elem.  */
> > > + if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > > + required_alignment *=
> > > + DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
> > > + if (!multiple_p (target_alignment, required_alignment))
> > > + {
> > > + if (dump_enabled_p ())
> > > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > + "desired alignment %wu not met. Instead got %wu "
> > > + "for DR alignment at %G",
> > > + required_alignment.to_constant (),
> > > + target_align, STMT_VINFO_STMT (stmt_info));
> > > + return false;
> > > + }
> > > +
> > > + if (!pow2p_hwi (target_align))
> > > + {
> > > + if (dump_enabled_p ())
> > > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > + "non-power-of-two vector alignment %wd "
> > > + "for DR alignment at %G",
> > > + target_align, STMT_VINFO_STMT (stmt_info));
> > > + return false;
> > > + }
> > > +
> > > + /* For VLA we have to insert a runtime check that the vector loads
> > > + per iteration don't exceed a page size. For now we can use
> > > + POLY_VALUE_MAX as a proxy as we can't peel for VLA. */
> > > + if (known_gt (required_alignment, (unsigned)param_min_pagesize))
> > > + {
> > > + if (dump_enabled_p ())
> > > + {
> > > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > + "alignment required for correctness (");
> > > + dump_dec (MSG_MISSED_OPTIMIZATION, required_alignment);
> > > + dump_printf (MSG_NOTE, ") may exceed page size\n");
> > > + }
> > > + return false;
> > > + }
> > > +
> > > + group_aligned = true;
> > > + }
> > > +
> > > + /* There are multiple loads that have a misalignment that we couldn't
> > > + align. We would need LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P to
> > > + vectorize. */
> > > + if (!group_aligned)
> > > + LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> >
> > I think we need to fail here unless scalar-access-in-bounds.
> >
> > > +
> > > + /* When using a group access the first element may be aligned but the
> > > + subsequent loads may not be. For LOAD_LANES since the loads are based
> > > + on the first DR then all loads in the group are aligned. For
> > > + non-LOAD_LANES this is not the case. In particular a load + blend when
> > > + there are gaps can have the non first loads issued unaligned, even
> > > + partially overlapping the memory of the first load in order to simplify
> > > + the blend. This is what the x86_64 backend does for instance. As
> > > + such only the first load in the group is aligned, the rest are not.
> > > + Because of this the permutes may break the alignment requirements that
> > > + have been set, and as such we should for now, reject them. */
> > > + if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> > > + {
> > > + if (dump_enabled_p ())
> > > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > + "loads with load permutations not supported for "
> > > + "speculative early break loads without partial "
> > > + "vectors for %G",
> > > + STMT_VINFO_STMT (stmt_info));
> > > + LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> >
> > again, I think this doesn't save us. Specifically ...
> >
> > > + }
> > > +
> > > + *alignment_support_scheme = dr_aligned;
> >
> > ... we must not simply claim the access is aligned when it wasn't
> > analyzed as such. If we committed to try peeling for a high
> > target alignment we can't simply walk back here either.
> >
>
> ...This.
>
> That also addresses your other comment that once we commit to peeling we can't
> back out.
>
> Are those changes ok?
Yes, that sounds good to me.
Richard.
> Thanks,
> Tamar
>
> > Richard.
> >
> > > + }
> > > +
> > > if (*alignment_support_scheme == dr_unaligned_unsupported)
> > > {
> > > if (dump_enabled_p ())
> > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > index b0cb081cba0ae8b11fbfcfcb8c6d440ec451ccb5..97caf61b345735d297ec49fd6ca64797435b46fc 100644
> > > --- a/gcc/tree-vectorizer.h
> > > +++ b/gcc/tree-vectorizer.h
> > > @@ -1281,7 +1281,11 @@ public:
> > >
> > > /* Set by early break vectorization when this DR needs peeling for alignment
> > > for correctness. */
> > > - bool need_peeling_for_alignment;
> > > + bool safe_speculative_read_required;
> > > +
> > > + /* Set by early break vectorization when this DR's scalar accesses are known
> > > + to be in bounds of a loop with known bounds. */
> > > + bool scalar_access_known_in_bounds;
> > >
> > > tree base_decl;
> > >
> > > @@ -1997,6 +2001,35 @@ dr_target_alignment (dr_vec_info *dr_info)
> > > return dr_info->target_alignment;
> > > }
> > > #define DR_TARGET_ALIGNMENT(DR) dr_target_alignment (DR)
> > > +#define DR_SCALAR_KNOWN_BOUNDS(DR) (DR)->scalar_access_known_in_bounds
> > > +
> > > +/* Return true if the stmt_vec_info requires peeling for alignment. */
> > > +inline bool
> > > +dr_safe_speculative_read_required (stmt_vec_info stmt_info)
> > > +{
> > > + dr_vec_info *dr_info;
> > > + if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > > + dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (stmt_info));
> > > + else
> > > + dr_info = STMT_VINFO_DR_INFO (stmt_info);
> > > +
> > > + return dr_info->safe_speculative_read_required;
> > > +}
> > > +
> > > +/* Set the safe_speculative_read_required for the stmt_vec_info.  If this is
> > > + a group access then set it on the first element, otherwise set it on the DR
> > > + directly. */
> > > +inline void
> > > +dr_set_safe_speculative_read_required (stmt_vec_info stmt_info,
> > > + bool requires_alignment)
> > > +{
> > > + dr_vec_info *dr_info;
> > > + if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > > + dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (stmt_info));
> > > + else
> > > + dr_info = STMT_VINFO_DR_INFO (stmt_info);
> > > +
> > > + dr_info->safe_speculative_read_required = requires_alignment;
> > > +}
> > >
> > > inline void
> > > set_dr_target_alignment (dr_vec_info *dr_info, poly_uint64 val)
> > >
> > >
> > >
> >
> > --
> > Richard Biener <[email protected]>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)