Richard Biener <rguent...@suse.de> writes:
> The following ensures that peeling a single iteration for gaps is
> sufficient by enforcing niter masking (partial vector use) given
> we cannot (always) statically decide when the vector size isn't known.
> The condition guarding this and thus statically giving a pass in
> some cases for VL vectors is questionable, the patch doesn't address
> this.
>
> This fixes a set of known fallout from enabling
> --param vect-force-slp=1 by default.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
>       PR tree-optimization/117558
>       * tree-vectorizer.h (_loop_vec_info::must_use_partial_vectors_p): New.
>       (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P): Likewise.
>       * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize
>       must_use_partial_vectors_p.
>       (vect_determine_partial_vectors_and_peeling): Enforce it.
>       (vect_analyze_loop_2): Reset before restarting.
>       * tree-vect-stmts.cc (get_group_load_store_type): When peeling
>       a single gap iteration cannot be determined safe statically
>       enforce the use of partial vectors.

LGTM.  Just to make sure I understand...

> ---
>  gcc/tree-vect-loop.cc  | 13 ++++++++++++-
>  gcc/tree-vect-stmts.cc | 24 +++++++++++++++++++-----
>  gcc/tree-vectorizer.h  |  4 ++++
>  3 files changed, 35 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index c67248e851d..18c4fa1d000 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -1059,6 +1059,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
> vec_info_shared *shared)
>      inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
>      vectorizable (false),
>      can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
> +    must_use_partial_vectors_p (false),
>      using_partial_vectors_p (false),
>      using_decrementing_iv_p (false),
>      using_select_vl_p (false),
> @@ -2679,7 +2680,10 @@ vect_determine_partial_vectors_and_peeling 
> (loop_vec_info loop_vinfo)
>    LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
>    LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
>    if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> -      && need_peeling_or_partial_vectors_p)
> +      && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
> +    LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
> +  else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> +        && need_peeling_or_partial_vectors_p)
>      {
>        /* For partial-vector-usage=1, try to push the handling of partial
>        vectors to the epilogue, with the main loop continuing to operate
> @@ -2702,6 +2706,12 @@ vect_determine_partial_vectors_and_peeling 
> (loop_vec_info loop_vinfo)
>       LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
>      }
>  
> +  if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
> +      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
> +    return opt_result::failure_at (vect_location,
> +                                "not vectorized: loop needs but cannot "
> +                                "use partial vectors\n");
> +
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
>                    "operating on %s vectors%s.\n",
> @@ -3387,6 +3397,7 @@ again:
>    LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
>    LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
>      = saved_can_use_partial_vectors_p;
> +  LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>    LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
>    if (loop_vinfo->scan_map)
>      loop_vinfo->scan_map->empty ();
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 458056dd13d..f4a4d5a554c 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2202,11 +2202,25 @@ get_group_load_store_type (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>                              (vectype, cnunits / cpart_size,
>                               &half_vtype) == NULL_TREE)))
>               {
> -               if (dump_enabled_p ())
> -                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                                  "peeling for gaps insufficient for "
> -                                  "access\n");
> -               return false;
> +               /* If all fails we can still resort to niter masking, so
> +                  enforce the use of partial vectors.  */
> +               if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> +                 {
> +                   if (dump_enabled_p ())
> +                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                      "peeling for gaps insufficient for "
> +                                      "access unless using partial "
> +                                      "vectors\n");
> +                   LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> +                 }
> +               else
> +                 {
> +                   if (dump_enabled_p ())
> +                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                      "peeling for gaps insufficient for "
> +                                      "access\n");
> +                   return false;
> +                 }

...is this a compile-time optimisation?  I.e. CAN_USE_PARTIAL_VECTORS_P
mustn't ever go from false to true, so if it's already false, there's no
point continuing?

Richard

>               }
>           }
>       }
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 273e8c644e7..d85dd594094 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -913,6 +913,9 @@ public:
>       fewer than VF scalars.  */
>    bool can_use_partial_vectors_p;
>  
> +  /* Records whether we must use niter masking for correctness reasons.  */
> +  bool must_use_partial_vectors_p;
> +
>    /* True if we've decided to use partially-populated vectors, so that
>       the vector loop can handle fewer than VF scalars.  */
>    bool using_partial_vectors_p;
> @@ -1051,6 +1054,7 @@ public:
>  #define LOOP_VINFO_VERSIONING_THRESHOLD(L) (L)->versioning_threshold
>  #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
>  #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) 
> (L)->can_use_partial_vectors_p
> +#define LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P(L) 
> (L)->must_use_partial_vectors_p
>  #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
>  #define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
>  #define LOOP_VINFO_USING_SELECT_VL_P(L) (L)->using_select_vl_p

Reply via email to