"Andre Vieira (lists)" <andre.simoesdiasvie...@arm.com> writes:
> Made the suggested changes.
>
> Regarding the name change to partial vectors, I agree in the name change 
> since that is the terminology we are using in the loop_vinfo members 
> too, but is there an actual difference between predication/masking and 
> partial vectors that I am missing?

“Predication/masking” refers to the ability to enable and disable
operations on a lane-by-lane basis.  E.g. it means that patterns
like 10100010 are possible.

“Operating on partial vectors” is the ability to operate on just the
first N lanes of a vector, for some given N.  It means that patterns
like 11111100 are possible but patterns like 10100010 might not be.

At the moment, “operating on partial vectors” also requires either
direct support for loading and storing the first N lanes, or a way of
generating a loop predicate/mask from N and using predication/masking.

So the two concepts overlap, but support for one doesn't directly
imply support for the other.

> OK for trunk?
>
> gcc/ChangeLog:
>
>          * tree-vect-loop.c (vect_better_loop_vinfo_p): Round factors up 
> for epilogue costing.
>          (vect_analyze_loop): Re-analyze all modes for epilogues, unless 
> we are guaranteed that we can't
>          have partial vectors.
>          (genopinit.c) (partial_vectors_supported): Generate new function.
>
> gcc/testsuite/ChangeLog:
>
>          * gcc.target/aarch64/masked_epilogue.c: New test.

OK, thanks.

Richard

> diff --git a/gcc/genopinit.c b/gcc/genopinit.c
> index 
> 195ddf74fa2b7d89760622073dcec9d5d339a097..2bc7cdbf53337beae181afd7bb05b366ab068c6a
>  100644
> --- a/gcc/genopinit.c
> +++ b/gcc/genopinit.c
> @@ -321,6 +321,7 @@ main (int argc, const char **argv)
>          "  bool supports_vec_scatter_store_cached;\n"
>          "};\n"
>          "extern void init_all_optabs (struct target_optabs *);\n"
> +        "extern bool partial_vectors_supported_p (void);\n"
>          "\n"
>          "extern struct target_optabs default_target_optabs;\n"
>          "extern struct target_optabs *this_fn_optabs;\n"
> @@ -373,6 +374,33 @@ main (int argc, const char **argv)
>      fprintf (s_file, "  ena[%u] = HAVE_%s;\n", i, p->name);
>    fprintf (s_file, "}\n\n");
>  
> +  fprintf (s_file,
> +        "/* Returns TRUE if the target supports any of the partial vector\n"
> +        "   optabs: while_ult_optab, len_load_optab or len_store_optab,\n"
> +        "   for any mode.  */\n"
> +        "bool\npartial_vectors_supported_p (void)\n{\n");
> +  bool any_match = false;
> +  fprintf (s_file, "\treturn");
> +  bool first = true;
> +  for (i = 0; patterns.iterate (i, &p); ++i)
> +    {
> +#define CMP_NAME(N) !strncmp (p->name, (N), strlen ((N)))
> +      if (CMP_NAME("while_ult") || CMP_NAME ("len_load")
> +       || CMP_NAME ("len_store"))
> +     {
> +       if (first)
> +         fprintf (s_file, " HAVE_%s", p->name);
> +       else
> +         fprintf (s_file, " || HAVE_%s", p->name);
> +       first = false;
> +       any_match = true;
> +     }
> +    }
> +  if (!any_match)
> +    fprintf (s_file, " false");
> +  fprintf (s_file, ";\n}\n");
> +
> +
>    /* Perform a binary search on a pre-encoded optab+mode*2.  */
>    /* ??? Perhaps even better to generate a minimal perfect hash.
>       Using gperf directly is awkward since it's so geared to working
> diff --git a/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c 
> b/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..286a7be236f337fee4c4650f42da72000855c5e6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details 
> -march=armv8-a+sve -msve-vector-bits=scalable" } */
> +
> +void f(unsigned char y[restrict],
> +       unsigned char x[restrict], int n) {
> +  for (int i = 0; i < n; ++i)
> +    y[i] = (y[i] + x[i] + 1) >> 1;
> +}
> +
> +/* { dg-final { scan-tree-dump {LOOP EPILOGUE VECTORIZED \(MODE=VNx} "vect" 
> } } */
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 
> a28bb6321d76b8222bc8cfdade151ca9b4dca406..5af98a36678ae61e99f93beb90920e2d0940c53a
>  100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -2824,11 +2824,13 @@ vect_better_loop_vinfo_p (loop_vec_info 
> new_loop_vinfo,
>       {
>         unsigned HOST_WIDE_INT main_vf_max
>           = estimated_poly_value (main_poly_vf, POLY_VALUE_MAX);
> +       unsigned HOST_WIDE_INT old_vf_max
> +         = estimated_poly_value (old_vf, POLY_VALUE_MAX);
> +       unsigned HOST_WIDE_INT new_vf_max
> +         = estimated_poly_value (new_vf, POLY_VALUE_MAX);
>  
> -       old_factor = main_vf_max / estimated_poly_value (old_vf,
> -                                                        POLY_VALUE_MAX);
> -       new_factor = main_vf_max / estimated_poly_value (new_vf,
> -                                                        POLY_VALUE_MAX);
> +       old_factor = CEIL (main_vf_max, old_vf_max);
> +       new_factor = CEIL (main_vf_max, new_vf_max);
>  
>         /* If the loop is not using partial vectors then it will iterate one
>            time less than one that does.  It is safe to subtract one here,
> @@ -3069,8 +3071,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
> *shared)
>    machine_mode autodetected_vector_mode = VOIDmode;
>    opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
>    unsigned int mode_i = 0;
> -  unsigned int first_loop_i = 0;
> -  unsigned int first_loop_next_i = 0;
>    unsigned HOST_WIDE_INT simdlen = loop->simdlen;
>  
>    /* First determine the main loop vectorization mode, either the first
> @@ -3079,7 +3079,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
> *shared)
>       lowest cost if pick_lowest_cost_p.  */
>    while (1)
>      {
> -      unsigned int loop_vinfo_i = mode_i;
>        bool fatal;
>        opt_loop_vec_info loop_vinfo
>       = vect_analyze_loop_1 (loop, shared, &loop_form_info,
> @@ -3108,11 +3107,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
> *shared)
>             first_loop_vinfo = opt_loop_vec_info::success (NULL);
>           }
>         if (first_loop_vinfo == NULL)
> -         {
> -           first_loop_vinfo = loop_vinfo;
> -           first_loop_i = loop_vinfo_i;
> -           first_loop_next_i = mode_i;
> -         }
> +         first_loop_vinfo = loop_vinfo;
>         else
>           {
>             delete loop_vinfo;
> @@ -3158,32 +3153,37 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
> *shared)
>    /* Now analyze first_loop_vinfo for epilogue vectorization.  */
>    poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
>  
> -  /* Handle the case that the original loop can use partial
> -     vectorization, but want to only adopt it for the epilogue.
> -     The retry should be in the same mode as original.  */
> -  if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
> +  /* For epilogues start the analysis from the first mode.  The motivation
> +     behind starting from the beginning comes from cases where the 
> VECTOR_MODES
> +     array may contain length-agnostic and length-specific modes.  Their
> +     ordering is not guaranteed, so we could end up picking a mode for the 
> main
> +     loop that is after the epilogue's optimal mode.  */
> +  mode_i = 1;
> +  bool supports_partial_vectors = partial_vectors_supported_p ();
> +  poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
> +
> +  while (1)
>      {
> -      gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
> -               && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
> +      /* If the target does not support partial vectors we can shorten the
> +      number of modes to analyze for the epilogue as we know we can't pick a
> +      mode that has at least as many NUNITS as the main loop's vectorization
> +      factor, since that would imply the epilogue's vectorization factor
> +      would be at least as high as the main loop's and we would be
> +      vectorizing for more scalar iterations than there would be left.  */
> +      if (!supports_partial_vectors
> +       && maybe_ge (GET_MODE_NUNITS (vector_modes[mode_i]), first_vinfo_vf))
> +     {
> +       mode_i++;
> +       if (mode_i == vector_modes.length ())
> +         break;
> +       continue;
> +     }
> +
>        if (dump_enabled_p ())
>       dump_printf_loc (MSG_NOTE, vect_location,
> -                      "***** Re-trying analysis with same vector mode"
> -                      " %s for epilogue with partial vectors.\n",
> -                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
> -      mode_i = first_loop_i;
> -    }
> -  else
> -    {
> -      mode_i = first_loop_next_i;
> -      if (mode_i == vector_modes.length ())
> -     return first_loop_vinfo;
> -    }
> -
> -  /* ???  If first_loop_vinfo was using VOIDmode then we probably
> -     want to instead search for the corresponding mode in vector_modes[].  */
> +                      "***** Re-trying epilogue analysis with vector "
> +                      "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
>  
> -  while (1)
> -    {
>        bool fatal;
>        opt_loop_vec_info loop_vinfo
>       = vect_analyze_loop_1 (loop, shared, &loop_form_info,
> @@ -3235,11 +3235,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
> *shared)
>        if (mode_i == vector_modes.length ())
>       break;
>  
> -      /* Try the next biggest vector size.  */
> -      if (dump_enabled_p ())
> -     dump_printf_loc (MSG_NOTE, vect_location,
> -                      "***** Re-trying epilogue analysis with vector "
> -                      "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
>      }
>  
>    if (!first_loop_vinfo->epilogue_vinfos.is_empty ())

Reply via email to