Richard Biener <rguent...@suse.de> writes: > The following ensures that peeling a single iteration for gaps is > sufficient by enforcing niter masking (partial vector use) given > we cannot (always) statically decide when the vector size isn't known. > The condition guarding this and thus statically giving a pass in > some cases for VL vectors is questionable, the patch doesn't address > this. > > This fixes a set of known fallout from enabling > --param vect-force-slp=1 by default. > > Bootstrapped and tested on x86_64-unknown-linux-gnu. > > PR tree-optimization/117558 > * tree-vectorizer.h (_loop_vec_info::must_use_partial_vectors_p): New. > (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P): Likewise. > * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize > must_use_partial_vectors_p. > (vect_determine_partial_vectors_and_peeling): Enforce it. > (vect_analyze_loop_2): Reset before restarting. > * tree-vect-stmts.cc (get_group_load_store_type): When peeling > a single gap iteration cannot be determined safe statically > enforce the use of partial vectors.
LGTM. Just to make sure I understand... > --- > gcc/tree-vect-loop.cc | 13 ++++++++++++- > gcc/tree-vect-stmts.cc | 24 +++++++++++++++++++----- > gcc/tree-vectorizer.h | 4 ++++ > 3 files changed, 35 insertions(+), 6 deletions(-) > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index c67248e851d..18c4fa1d000 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -1059,6 +1059,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, > vec_info_shared *shared) > inner_loop_cost_factor (param_vect_inner_loop_cost_factor), > vectorizable (false), > can_use_partial_vectors_p (param_vect_partial_vector_usage != 0), > + must_use_partial_vectors_p (false), > using_partial_vectors_p (false), > using_decrementing_iv_p (false), > using_select_vl_p (false), > @@ -2679,7 +2680,10 @@ vect_determine_partial_vectors_and_peeling > (loop_vec_info loop_vinfo) > LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; > LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; > if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) > - && need_peeling_or_partial_vectors_p) > + && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)) > + LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true; > + else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) > + && need_peeling_or_partial_vectors_p) > { > /* For partial-vector-usage=1, try to push the handling of partial > vectors to the epilogue, with the main loop continuing to operate > @@ -2702,6 +2706,12 @@ vect_determine_partial_vectors_and_peeling > (loop_vec_info loop_vinfo) > LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true; > } > > + if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) > + && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) > + return opt_result::failure_at (vect_location, > + "not vectorized: loop needs but cannot " > + "use partial vectors\n"); > + > if (dump_enabled_p ()) > dump_printf_loc (MSG_NOTE, vect_location, > "operating on %s vectors%s.\n", > @@ -3387,6 
+3397,7 @@ again: > LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0; > LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) > = saved_can_use_partial_vectors_p; > + LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false; > if (loop_vinfo->scan_map) > loop_vinfo->scan_map->empty (); > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index 458056dd13d..f4a4d5a554c 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -2202,11 +2202,25 @@ get_group_load_store_type (vec_info *vinfo, > stmt_vec_info stmt_info, > (vectype, cnunits / cpart_size, > &half_vtype) == NULL_TREE))) > { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "peeling for gaps insufficient for " > - "access\n"); > - return false; > + /* If all fails we can still resort to niter masking, so > + enforce the use of partial vectors. */ > + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "peeling for gaps insufficient for " > + "access unless using partial " > + "vectors\n"); > + LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true; > + } > + else > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "peeling for gaps insufficient for " > + "access\n"); > + return false; > + } ...is this a compile-time optimisation? I.e. CAN_USE_PARTIAL_VECTORS_P mustn't ever go from false to true, so if it's already false, there's no point continuing? Richard > } > } > } > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index 273e8c644e7..d85dd594094 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -913,6 +913,9 @@ public: > fewer than VF scalars. */ > bool can_use_partial_vectors_p; > > + /* Records whether we must use niter masking for correctness reasons. 
*/ > + bool must_use_partial_vectors_p; > + > /* True if we've decided to use partially-populated vectors, so that > the vector loop can handle fewer than VF scalars. */ > bool using_partial_vectors_p; > @@ -1051,6 +1054,7 @@ public: > #define LOOP_VINFO_VERSIONING_THRESHOLD(L) (L)->versioning_threshold > #define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable > #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) > (L)->can_use_partial_vectors_p > +#define LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P(L) > (L)->can_use_partial_vectors_p > #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p > #define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p > #define LOOP_VINFO_USING_SELECT_VL_P(L) (L)->using_select_vl_p