On Fri, 17 May 2019, Jakub Jelinek wrote:

> Hi!
> 
> When a simdlen clause is specified on a simd loop, it specifies the preferred
> vectorization factor.  It is only a preference, so if there is no possibility
> of satisfying it, we can do something else, but we shouldn't simply ignore it,
> as we have been doing until now.
> 
> Unfortunately, we iterate over vectorization sizes rather than over
> vectorization factors, so in order to determine the vectorization factor, we
> need to analyze.
> 
> With the following patch, when the vectorizer sees a possible vectorization
> which doesn't have the requested vectorization factor, it remembers the first
> such vectorization and continues searching; if no vectorization size with the
> right vectorization factor is found, it just uses the first one.
> 
> Another thing is that on x86 with -mprefer-vector-width={256,128} (the
> former is the default), we don't actually push all the possible
> vectorization sizes.  IMHO when one uses the simd clause and specifies, say,
> simdlen(16) for a loop which just uses ints, then the user wants to use %zmmN
> operations even if the default is -mprefer-vector-width=256 or even if that
> option is used explicitly.  Perhaps one option would be to push the
> 64 size to the vector always, just when it is not preferred put it last, but
> then even for normal loops if 32 and 16 byte vectorization is unsuccessful,
> we'd either waste compile time or in rare corner cases could in theory
> vectorize using that vectorization size even when it is not preferred.
> So, the patch adds an argument and does that only when the simdlen clause
> is used.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK with the nits below.

Thanks,
Richard.

> 2019-05-17  Jakub Jelinek  <ja...@redhat.com>
> 
>       * cfgloop.h (struct loop): Add simdlen member.
>       * omp-expand.c (expand_omp_simd): Set it if simdlen clause is present.
>       * tree-vect-loop.c (vect_analyze_loop): Pass loop->simdlen != 0
>       as new argument to autovectorize_vector_sizes target hook.  If
>       loop->simdlen, pick up vector size where the vectorization factor
>       is equal to loop->simdlen, and if there is none, fall back to the first
>       successful one.
>       (vect_transform_loop): Adjust autovectorize_vector_sizes target hook
>       caller.
>       * omp-low.c (omp_clause_aligned_alignment): Likewise.
>       * omp-general.c (omp_max_vf): Likewise.
>       * optabs-query.c (can_vec_mask_load_store_p): Likewise.
>       * tree-vect-slp.c (vect_slp_bb): Likewise.
>       * target.def (autovectorize_vector_sizes): Add ALL argument and
>       document it.
>       * doc/tm.texi: Adjust documentation.
>       * targhooks.c (default_autovectorize_vector_sizes): Add bool argument.
>       * targhooks.h (default_autovectorize_vector_sizes): Likewise.
>       * config/aarch64/aarch64.c (aarch64_autovectorize_vector_sizes): Add
>       bool argument.
>       * config/arc/arc.c (arc_autovectorize_vector_sizes): Likewise.
>       * config/arm/arm.c (arm_autovectorize_vector_sizes): Likewise.
>       * config/mips/mips.c (mips_autovectorize_vector_sizes): Likewise.
>       * config/i386/i386.c (ix86_autovectorize_vector_sizes): Likewise.  If
>       true and TARGET_AVX512F or TARGET_AVX, push 3 or 2 sizes even if
>       preferred vector size is not 512-bit or 256-bit, just put those
>       unpreferred ones last.
> 
>       * gcc.target/i386/avx512f-simd-1.c: New test.
> 
> --- gcc/cfgloop.h.jj  2019-03-08 11:43:35.063317726 +0100
> +++ gcc/cfgloop.h     2019-05-16 15:52:05.974315760 +0200
> @@ -174,6 +174,9 @@ struct GTY ((chain_next ("%h.next"))) lo
>       of the loop can be safely evaluated concurrently.  */
>    int safelen;
>  
> +  /* Preferred vectorization factor for the loop if non-zero.  */
> +  int simdlen;
> +

You probably want to copy this in copy_loop_info?

>    /* Constraints are generally set by consumers and affect certain
>       semantics of niter analyzer APIs.  Currently the APIs affected are
>       number_of_iterations_exit* functions and their callers.  One typical
> --- gcc/omp-expand.c.jj       2019-05-15 23:42:16.049859907 +0200
> +++ gcc/omp-expand.c  2019-05-16 16:10:46.093932348 +0200
> @@ -4974,6 +4974,13 @@ expand_omp_simd (struct omp_region *regi
>         && loop->safelen > 1)
>       {
>         loop->force_vectorize = true;
> +       if (simdlen && tree_fits_uhwi_p (OMP_CLAUSE_SIMDLEN_EXPR (simdlen)))
> +         {
> +           unsigned HOST_WIDE_INT v
> +             = tree_to_uhwi (OMP_CLAUSE_SIMDLEN_EXPR (simdlen));
> +           if (v < INT_MAX && v <= (unsigned HOST_WIDE_INT) loop->safelen)
> +             loop->simdlen = v;
> +         }
>         cfun->has_force_vectorize_loops = true;
>       }
>        else if (dont_vectorize)
> --- gcc/tree-vect-loop.c.jj   2019-05-16 15:25:17.826832201 +0200
> +++ gcc/tree-vect-loop.c      2019-05-16 19:00:33.999540073 +0200
> @@ -2254,7 +2254,8 @@ vect_analyze_loop (struct loop *loop, lo
>  
>    /* Autodetect first vector size we try.  */
>    current_vector_size = 0;
> -  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
> +  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
> +                                             loop->simdlen != 0);
>    unsigned int next_size = 0;
>  
>    DUMP_VECT_SCOPE ("analyze_loop_nest");
> @@ -2273,6 +2274,8 @@ vect_analyze_loop (struct loop *loop, lo
>  
>    unsigned n_stmts = 0;
>    poly_uint64 autodetected_vector_size = 0;
> +  opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
> +  poly_uint64 first_vector_size = 0;
>    while (1)
>      {
>        /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
> @@ -2283,6 +2286,7 @@ vect_analyze_loop (struct loop *loop, lo
>         if (dump_enabled_p ())
>           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>                            "bad loop form.\n");
> +       gcc_checking_assert (first_loop_vinfo == NULL);
>         return loop_vinfo;
>       }
>  
> @@ -2296,10 +2300,27 @@ vect_analyze_loop (struct loop *loop, lo
>       {
>         LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
>  
> -       return loop_vinfo;
> +       if (loop->simdlen
> +           && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
> +                        (unsigned HOST_WIDE_INT) loop->simdlen))
> +         {
> +           if (first_loop_vinfo == NULL)
> +             {
> +               first_loop_vinfo = loop_vinfo;
> +               first_vector_size = current_vector_size;
> +               loop->aux = NULL;
> +             }
> +           else
> +             delete loop_vinfo;
> +         }
> +       else
> +         {
> +           delete first_loop_vinfo;
> +           return loop_vinfo;
> +         }
>       }
> -
> -      delete loop_vinfo;
> +      else
> +     delete loop_vinfo;
>  
>        if (next_size == 0)
>       autodetected_vector_size = current_vector_size;
> @@ -2308,10 +2329,31 @@ vect_analyze_loop (struct loop *loop, lo
>         && known_eq (vector_sizes[next_size], autodetected_vector_size))
>       next_size += 1;
>  
> -      if (fatal
> -       || next_size == vector_sizes.length ()
> +      if (fatal)
> +     {
> +       gcc_checking_assert (first_loop_vinfo == NULL);
> +       return opt_loop_vec_info::propagate_failure (res);
> +     }
> +
> +      if (next_size == vector_sizes.length ()
>         || known_eq (current_vector_size, 0U))
> -     return opt_loop_vec_info::propagate_failure (res);
> +     {
> +       if (first_loop_vinfo)
> +         {
> +           current_vector_size = first_vector_size;
> +           loop->aux = (loop_vec_info) first_loop_vinfo;
> +           if (dump_enabled_p ())
> +             {
> +               dump_printf_loc (MSG_NOTE, vect_location,
> +                                "***** Choosing vector size ");
> +               dump_dec (MSG_NOTE, current_vector_size);
> +               dump_printf (MSG_NOTE, "\n");
> +             }
> +           return first_loop_vinfo;
> +         }
> +       else
> +         return opt_loop_vec_info::propagate_failure (res);
> +     }
>  
>        /* Try the next biggest vector size.  */
>        current_vector_size = vector_sizes[next_size++];
> @@ -8670,7 +8712,8 @@ vect_transform_loop (loop_vec_info loop_
>    if (epilogue)
>      {
>        auto_vector_sizes vector_sizes;
> -      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
> +      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
> +                                                 loop->simdlen != 0);

For epilogue vectorization loop->simdlen shouldn't apply, so I'd pass
false here.

>        unsigned int next_size = 0;
>  
>        /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
> --- gcc/tree-vect-slp.c.jj    2019-05-14 21:37:33.653388439 +0200
> +++ gcc/tree-vect-slp.c       2019-05-16 18:59:12.825873858 +0200
> @@ -2983,7 +2983,7 @@ vect_slp_bb (basic_block bb)
>  
>    /* Autodetect first vector size we try.  */
>    current_vector_size = 0;
> -  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
> +  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
>    unsigned int next_size = 0;
>  
>    gsi = gsi_start_bb (bb);
> --- gcc/target.def.jj 2019-02-18 20:48:35.742681472 +0100
> +++ gcc/target.def    2019-05-16 18:55:50.373200394 +0200
> @@ -1899,12 +1899,14 @@ DEFHOOK
>  the only one that is worth considering, this hook should add all suitable\n\
>  vector sizes to @var{sizes}, in order of decreasing preference.  The first\n\
>  one should be the size of @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}.\n\
> +If @var{all} is true, add suitable vector sizes even when they are generally\n\
> +not expected to be worthwhile.\n\
>  \n\
>  The hook does not need to do anything if the vector returned by\n\
>  @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant\n\
>  for autovectorization.  The default implementation does nothing.",
>   void,
> - (vector_sizes *sizes),
> + (vector_sizes *sizes, bool all),
>   default_autovectorize_vector_sizes)
>  
>  /* Function to get a target mode for a vector mask.  */
> --- gcc/doc/tm.texi.jj        2019-02-18 20:48:34.132707883 +0100
> +++ gcc/doc/tm.texi   2019-05-16 19:08:05.975113214 +0200
> @@ -6016,11 +6016,13 @@ against lower halves of vectors recursiv
>  reached.  The default is @var{mode} which means no splitting.
>  @end deftypefn
>  
> -@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (vector_sizes *@var{sizes})
> +@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (vector_sizes *@var{sizes}, bool @var{all})
>  If the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is not
>  the only one that is worth considering, this hook should add all suitable
>  vector sizes to @var{sizes}, in order of decreasing preference.  The first
>  one should be the size of @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}.
> +If @var{all} is true, add suitable vector sizes even when they are generally
> +not expected to be worthwhile.
>  
>  The hook does not need to do anything if the vector returned by
>  @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant
> --- gcc/targhooks.c.jj        2019-04-17 21:21:40.918117115 +0200
> +++ gcc/targhooks.c   2019-05-16 18:56:38.586408190 +0200
> @@ -1316,7 +1316,7 @@ default_split_reduction (machine_mode mo
>     is tried.  */
>  
>  void
> -default_autovectorize_vector_sizes (vector_sizes *)
> +default_autovectorize_vector_sizes (vector_sizes *, bool)
>  {
>  }
>  
> --- gcc/targhooks.h.jj        2019-01-16 09:35:04.563323106 +0100
> +++ gcc/targhooks.h   2019-05-16 18:56:27.002598531 +0200
> @@ -110,7 +110,7 @@ default_builtin_support_vector_misalignm
>                                            int, bool);
>  extern machine_mode default_preferred_simd_mode (scalar_mode mode);
>  extern machine_mode default_split_reduction (machine_mode);
> -extern void default_autovectorize_vector_sizes (vector_sizes *);
> +extern void default_autovectorize_vector_sizes (vector_sizes *, bool);
>  extern opt_machine_mode default_get_mask_mode (poly_uint64, poly_uint64);
>  extern bool default_empty_mask_is_expensive (unsigned);
>  extern void *default_init_cost (struct loop *);
> --- gcc/omp-low.c.jj  2019-05-16 15:04:41.785179634 +0200
> +++ gcc/omp-low.c     2019-05-16 18:58:07.253951283 +0200
> @@ -3600,7 +3600,7 @@ omp_clause_aligned_alignment (tree claus
>    unsigned int al = 1;
>    opt_scalar_mode mode_iter;
>    auto_vector_sizes sizes;
> -  targetm.vectorize.autovectorize_vector_sizes (&sizes);
> +  targetm.vectorize.autovectorize_vector_sizes (&sizes, true);
>    poly_uint64 vs = 0;
>    for (unsigned int i = 0; i < sizes.length (); ++i)
>      vs = ordered_max (vs, sizes[i]);
> --- gcc/omp-general.c.jj      2019-02-22 15:22:20.880919652 +0100
> +++ gcc/omp-general.c 2019-05-16 18:57:05.254969995 +0200
> @@ -469,7 +469,7 @@ omp_max_vf (void)
>      return 1;
>  
>    auto_vector_sizes sizes;
> -  targetm.vectorize.autovectorize_vector_sizes (&sizes);
> +  targetm.vectorize.autovectorize_vector_sizes (&sizes, true);
>    if (!sizes.is_empty ())
>      {
>        poly_uint64 vf = 0;
> --- gcc/optabs-query.c.jj     2019-02-11 11:38:08.177618415 +0100
> +++ gcc/optabs-query.c        2019-05-16 18:58:48.830268128 +0200
> @@ -593,7 +593,7 @@ can_vec_mask_load_store_p (machine_mode
>      return true;
>  
>    auto_vector_sizes vector_sizes;
> -  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
> +  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, true);
>    for (unsigned int i = 0; i < vector_sizes.length (); ++i)
>      {
>        poly_uint64 cur = vector_sizes[i];
> --- gcc/config/aarch64/aarch64.c.jj   2019-05-11 11:32:58.229357774 +0200
> +++ gcc/config/aarch64/aarch64.c      2019-05-16 19:04:18.269854907 +0200
> @@ -14105,7 +14105,7 @@ aarch64_preferred_simd_mode (scalar_mode
>  /* Return a list of possible vector sizes for the vectorizer
>     to iterate over.  */
>  static void
> -aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
> +aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
>  {
>    if (TARGET_SVE)
>      sizes->safe_push (BYTES_PER_SVE_VECTOR);
> --- gcc/config/arc/arc.c.jj   2019-04-24 17:44:44.280019376 +0200
> +++ gcc/config/arc/arc.c      2019-05-16 19:04:31.934630363 +0200
> @@ -480,7 +480,7 @@ arc_preferred_simd_mode (scalar_mode mod
>     TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES.  */
>  
>  static void
> -arc_autovectorize_vector_sizes (vector_sizes *sizes)
> +arc_autovectorize_vector_sizes (vector_sizes *sizes, bool)
>  {
>    if (TARGET_PLUS_QMACW)
>      {
> --- gcc/config/arm/arm.c.jj   2019-05-10 09:31:31.113119373 +0200
> +++ gcc/config/arm/arm.c      2019-05-16 19:04:51.586307442 +0200
> @@ -288,7 +288,7 @@ static bool arm_builtin_support_vector_m
>  static void arm_conditional_register_usage (void);
>  static enum flt_eval_method arm_excess_precision (enum excess_precision_type);
>  static reg_class_t arm_preferred_rename_class (reg_class_t rclass);
> -static void arm_autovectorize_vector_sizes (vector_sizes *);
> +static void arm_autovectorize_vector_sizes (vector_sizes *, bool);
>  static int arm_default_branch_cost (bool, bool);
>  static int arm_cortex_a5_branch_cost (bool, bool);
>  static int arm_cortex_m_branch_cost (bool, bool);
> @@ -28347,7 +28347,7 @@ arm_vector_alignment (const_tree type)
>  }
>  
>  static void
> -arm_autovectorize_vector_sizes (vector_sizes *sizes)
> +arm_autovectorize_vector_sizes (vector_sizes *sizes, bool)
>  {
>    if (!TARGET_NEON_VECTORIZE_DOUBLE)
>      {
> --- gcc/config/i386/i386.c.jj 2019-05-15 23:36:47.920060787 +0200
> +++ gcc/config/i386/i386.c    2019-05-16 19:03:16.217874556 +0200
> @@ -21328,7 +21328,7 @@ ix86_preferred_simd_mode (scalar_mode mo
>     256bit and 128bit vectors.  */
>  
>  static void
> -ix86_autovectorize_vector_sizes (vector_sizes *sizes)
> +ix86_autovectorize_vector_sizes (vector_sizes *sizes, bool all)
>  {
>    if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
>      {
> @@ -21336,11 +21336,22 @@ ix86_autovectorize_vector_sizes (vector_
>        sizes->safe_push (32);
>        sizes->safe_push (16);
>      }
> +  else if (TARGET_AVX512F && all)
> +    {
> +      sizes->safe_push (32);
> +      sizes->safe_push (16);
> +      sizes->safe_push (64);
> +    }
>    else if (TARGET_AVX && !TARGET_PREFER_AVX128)
>      {
>        sizes->safe_push (32);
>        sizes->safe_push (16);
>      }
> +  else if (TARGET_AVX && all)
> +    {
> +      sizes->safe_push (16);
> +      sizes->safe_push (32);
> +    }
>  }
>  
>  /* Implemenation of targetm.vectorize.get_mask_mode.  */
> --- gcc/config/mips/mips.c.jj 2019-05-14 21:37:20.166613524 +0200
> +++ gcc/config/mips/mips.c    2019-05-16 19:05:29.124690606 +0200
> @@ -13460,7 +13460,7 @@ mips_preferred_simd_mode (scalar_mode mo
>  /* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES.  */
>  
>  static void
> -mips_autovectorize_vector_sizes (vector_sizes *sizes)
> +mips_autovectorize_vector_sizes (vector_sizes *sizes, bool)
>  {
>    if (ISA_HAS_MSA)
>      sizes->safe_push (16);
> --- gcc/testsuite/gcc.target/i386/avx512f-simd-1.c.jj 2019-05-16 19:29:17.556218761 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512f-simd-1.c    2019-05-16 19:23:50.508592664 +0200
> @@ -0,0 +1,35 @@
> +/* { dg-do compile } */
> +/* { dg-options "-fopenmp-simd -O2 -mavx512f -masm=att" } */
> +/* { dg-final { scan-assembler "vpadd\[^\n\r]*%xmm" } } */
> +/* { dg-final { scan-assembler "vpadd\[^\n\r]*%ymm" } } */
> +/* { dg-final { scan-assembler "vpadd\[^\n\r]*%zmm" } } */
> +
> +#define N 1024
> +int a[N];
> +
> +void
> +f1 (void)
> +{
> +  int i;
> +  #pragma omp simd simdlen (4)
> +  for (i = 0; i < N; ++i)
> +    a[i] = a[i] + 1;
> +}
> +
> +void
> +f2 (void)
> +{
> +  int i;
> +  #pragma omp simd simdlen (8)
> +  for (i = 0; i < N; ++i)
> +    a[i] = a[i] + 2;
> +}
> +
> +void
> +f3 (void)
> +{
> +  int i;
> +  #pragma omp simd simdlen (16)
> +  for (i = 0; i < N; ++i)
> +    a[i] = a[i] + 3;
> +}
> 
>       Jakub
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Linux GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany;
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah; HRB 21284 (AG Nürnberg)

Reply via email to