On Wed, 15 May 2019, Jakub Jelinek wrote:

> Hi!
> 
> The following patch implements what I've failed to do in time for 9.x,
> in particular, if #pragma omp simd has if clause and the argument is
> not constant zero (handled by the previous patch), or constant non-zero
> (ignored, we want normal vectorization in that case as before), this patch
> arranges for that information to be preserved until vectorization in the IL
> through the .GOMP_SIMD_LANE ifn (whether it is used for some data
> privatization or not) argument and during vectorization makes sure we
> version such loop with that runtime condition (either solely or together
> with say runtime checks for aliasing, alignment etc.).
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux.  Richard, is this
> approach ok with you?
> 
> 2019-05-15  Jakub Jelinek  <ja...@redhat.com>
> 
>       * omp-low.c (lower_rec_input_clauses): If OMP_CLAUSE_IF
>       has non-constant expression, force sctx.lane and use two
>       argument IFN_GOMP_SIMD_LANE instead of single argument.
>       * tree-ssa-dce.c (eliminate_unnecessary_stmts): Don't DCE
>       two argument IFN_GOMP_SIMD_LANE without lhs.
>       * tree-vectorizer.h (struct _loop_vec_info): Add simd_if_cond
>       member.
>       (LOOP_VINFO_SIMD_IF_COND, LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND):
>       Define.
>       (LOOP_REQUIRES_VERSIONING): Or in
>       LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND.
>       * tree-vect-loop.c (vect_determine_vectorization_factor): Punt
>       for IFN_GOMP_SIMD_LANE with zero second argument.  Initialize
>       LOOP_VINFO_SIMD_IF_COND from IFN_GOMP_SIMD_LANE second argument.
>       (_loop_vec_info::_loop_vec_info): Clear simd_if_cond.
>       * tree-vect-loop-manip.c (vect_loop_versioning): Add runtime check
>       from simd if clause if needed.
> 
>       * gcc.dg/vect/vect-simd-1.c: New test.
>       * gcc.dg/vect/vect-simd-2.c: New test.
>       * gcc.dg/vect/vect-simd-3.c: New test.
>       * gcc.dg/vect/vect-simd-4.c: New test.
> 
> --- gcc/omp-low.c.jj  2019-05-15 14:56:19.564136934 +0200
> +++ gcc/omp-low.c     2019-05-15 16:59:38.686639583 +0200
> @@ -3783,6 +3783,7 @@ lower_rec_input_clauses (tree clauses, g
>    tree simt_lane = NULL_TREE, simtrec = NULL_TREE;
>    tree ivar = NULL_TREE, lvar = NULL_TREE, uid = NULL_TREE;
>    gimple_seq llist[3] = { };
> +  tree nonconst_simd_if = NULL_TREE;
>  
>    copyin_seq = NULL;
>    sctx.is_simt = is_simd && omp_find_clause (clauses, OMP_CLAUSE__SIMT_);
> @@ -3814,6 +3815,8 @@ lower_rec_input_clauses (tree clauses, g
>       case OMP_CLAUSE_IF:
>         if (integer_zerop (OMP_CLAUSE_IF_EXPR (c)))
>           sctx.max_vf = 1;
> +       else if (TREE_CODE (OMP_CLAUSE_IF_EXPR (c)) != INTEGER_CST)
> +         nonconst_simd_if = OMP_CLAUSE_IF_EXPR (c);
>         break;
>          case OMP_CLAUSE_SIMDLEN:
>         if (integer_onep (OMP_CLAUSE_SIMDLEN_EXPR (c)))
> @@ -5190,6 +5193,17 @@ lower_rec_input_clauses (tree clauses, g
>    if (known_eq (sctx.max_vf, 1U))
>      sctx.is_simt = false;
>  
> +  if (nonconst_simd_if)
> +    {
> +      if (sctx.lane == NULL_TREE)
> +     {
> +       sctx.idx = create_tmp_var (unsigned_type_node);
> +       sctx.lane = create_tmp_var (unsigned_type_node);
> +     }

Does forcing a SIMD_LANE possibly pessimize things?  Just looking
whether a separate IFN might be better here or even doing the
versioning in omp expansion/lowering?  But see question below...

> +      /* FIXME: For now.  */
> +      sctx.is_simt = false;
> +    }
> +
>    if (sctx.lane || sctx.is_simt)
>      {
>        uid = create_tmp_var (ptr_type_node, "simduid");
> @@ -5219,8 +5233,9 @@ lower_rec_input_clauses (tree clauses, g
>      }
>    if (sctx.lane)
>      {
> -      gimple *g
> -     = gimple_build_call_internal (IFN_GOMP_SIMD_LANE, 1, uid);
> +      gimple *g = gimple_build_call_internal (IFN_GOMP_SIMD_LANE,
> +                                           1 + (nonconst_simd_if != NULL),
> +                                           uid, nonconst_simd_if);
>        gimple_call_set_lhs (g, sctx.lane);
>        gimple_stmt_iterator gsi = gsi_start_1 (gimple_omp_body_ptr 
> (ctx->stmt));
>        gsi_insert_before_without_update (&gsi, g, GSI_SAME_STMT);
> --- gcc/tree-ssa-dce.c.jj     2019-05-03 09:27:00.236840002 +0200
> +++ gcc/tree-ssa-dce.c        2019-05-15 17:22:31.176822544 +0200
> @@ -1328,12 +1328,16 @@ eliminate_unnecessary_stmts (void)
>                 update_stmt (stmt);
>                 release_ssa_name (name);
>  
> -               /* GOMP_SIMD_LANE or ASAN_POISON without lhs is not
> -                  needed.  */
> +               /* GOMP_SIMD_LANE (unless two argument) or ASAN_POISON
> +                  without lhs is not needed.  */
>                 if (gimple_call_internal_p (stmt))
>                   switch (gimple_call_internal_fn (stmt))
>                     {
>                     case IFN_GOMP_SIMD_LANE:
> +                     if (gimple_call_num_args (stmt) >= 2
> +                         && !integer_nonzerop (gimple_call_arg (stmt, 1)))
> +                       break;
> +                     /* FALLTHRU */

GOMP_SIMD_LANE has ECF_NOVOPS, how do you prevent code motion of it
if the result is unused?  Even with VOPs the vectorized loop may be
a reduction not involving any memory operations.

>                     case IFN_ASAN_POISON:
>                       remove_dead_stmt (&gsi, bb, to_remove_edges);
>                       break;
> --- gcc/tree-vectorizer.h.jj  2019-04-10 14:19:14.946896353 +0200
> +++ gcc/tree-vectorizer.h     2019-05-15 18:26:16.176917741 +0200
> @@ -428,6 +428,13 @@ typedef struct _loop_vec_info : public v
>       loops.  */
>    tree mask_compare_type;
>  
> +  /* For #pragma omp simd if (x) loops the x expression.  If constant 0,
> +     the loop should not be vectorized, if constant non-zero, simd_if_cond
> +     shouldn't be set and loop vectorized normally, if SSA_NAME, the loop
> +     should be versioned on that condition, using scalar loop if the 
> condition
> +     is false and vectorized loop otherwise.  */
> +  tree simd_if_cond;
> +
>    /* Unknown DRs according to which loop was peeled.  */
>    struct dr_vec_info *unaligned_dr;
>  
> @@ -591,6 +598,7 @@ typedef struct _loop_vec_info : public v
>  #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec
>  #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) 
> (L)->single_scalar_iteration_cost
>  #define LOOP_VINFO_ORIG_LOOP_INFO(L)       (L)->orig_loop_info
> +#define LOOP_VINFO_SIMD_IF_COND(L)         (L)->simd_if_cond
>  
>  #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L)    \
>    ((L)->may_misalign_stmts.length () > 0)
> @@ -600,10 +608,13 @@ typedef struct _loop_vec_info : public v
>     || (L)->lower_bounds.length () > 0)
>  #define LOOP_REQUIRES_VERSIONING_FOR_NITERS(L)               \
>    (LOOP_VINFO_NITERS_ASSUMPTIONS (L))
> +#define LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND(L) \
> +  (LOOP_VINFO_SIMD_IF_COND (L))
>  #define LOOP_REQUIRES_VERSIONING(L)                  \
>    (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (L)                \
>     || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (L)         \
> -   || LOOP_REQUIRES_VERSIONING_FOR_NITERS (L))
> +   || LOOP_REQUIRES_VERSIONING_FOR_NITERS (L)                \
> +   || LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (L))
>  
>  #define LOOP_VINFO_NITERS_KNOWN_P(L)          \
>    (tree_fits_shwi_p ((L)->num_iters) && tree_to_shwi ((L)->num_iters) > 0)
> --- gcc/tree-vect-loop.c.jj   2019-05-08 09:18:31.924685516 +0200
> +++ gcc/tree-vect-loop.c      2019-05-15 18:53:59.257068299 +0200
> @@ -357,6 +357,32 @@ vect_determine_vectorization_factor (loo
>                                         &mask_producers);
>         if (!res)
>           return res;
> +       if (loop->simduid)
> +         {
> +           gimple *g = gsi_stmt (si);
> +           /* If .GOMP_SIMD_LANE call for the current loop has 2 arguments,
> +              the second argument is the #pragma omp simd if (x) condition,
> +              when 0, loop shouldn't be vectorized, when non-zero constant,
> +              it should be vectorized normally, otherwise versioned with
> +              vectorized loop done if the condition is non-zero at
> +              runtime.  */
> +           if (is_gimple_call (g)
> +               && gimple_call_internal_p (g)
> +               && gimple_call_internal_fn (g) == IFN_GOMP_SIMD_LANE
> +               && gimple_call_num_args (g) >= 2
> +               && TREE_CODE (gimple_call_arg (g, 0)) == SSA_NAME
> +               && (loop->simduid
> +                   == SSA_NAME_VAR (gimple_call_arg (g, 0))))
> +             {
> +               tree arg = gimple_call_arg (g, 1);
> +               if (integer_zerop (arg))
> +                 return opt_result::failure_at (g,
> +                                                "not vectorized: "
> +                                                "simd if(0)\n");
> +               if (TREE_CODE (arg) == SSA_NAME)
> +                 LOOP_VINFO_SIMD_IF_COND (loop_vinfo) = arg;
> +             }
> +         }

This looks like a quite arbitrary place to do this, it wastes
compile-time in case of a zero arg.  Can't you do this in
vect_analyze_loop before the loop over vector sizes?  Maybe
re-using what note_simd_array_uses computes?

Also I wonder what happens if arg is not SSA name - I guess
you omitted the

   else
     gcc_assert (integer_onep (arg));

?  Otherwise just dropping the condition looks wrong.

Is a zero argument really a "force no vectorization" or should
it merely reset force_vectorize to zero?


>          }
>      }
>  
> @@ -819,6 +845,7 @@ _loop_vec_info::_loop_vec_info (struct l
>      max_vectorization_factor (0),
>      mask_skip_niters (NULL_TREE),
>      mask_compare_type (NULL_TREE),
> +    simd_if_cond (NULL_TREE),
>      unaligned_dr (NULL),
>      peeling_for_alignment (0),
>      ptr_mask (0),
> --- gcc/tree-vect-loop-manip.c.jj     2019-03-11 13:43:47.361455501 +0100
> +++ gcc/tree-vect-loop-manip.c        2019-05-15 18:36:21.068725974 +0200
> @@ -3009,6 +3009,8 @@ vect_loop_versioning (loop_vec_info loop
>    bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
>    bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
>    bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
> +  tree version_simd_if_cond
> +    = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
>  
>    if (check_profitability)
>      cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
> @@ -3044,6 +3046,32 @@ vect_loop_versioning (loop_vec_info loop
>        vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
>      }
>  
> +  if (version_simd_if_cond)
> +    {
> +      gcc_assert (TREE_CODE (version_simd_if_cond) == SSA_NAME);
> +      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
> +      if (basic_block bb
> +       = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
> +     {
> +       if (!dominated_by_p (CDI_DOMINATORS, loop->header, bb)
> +           || (scalar_loop
> +               && !dominated_by_p (CDI_DOMINATORS, scalar_loop->header,
> +                                   bb)))
> +         version_simd_if_cond = boolean_false_node;

How can this ever happen?  A loop has a single entry and I hope
omp lowering places the condition computation on that entry.

dominators are available since loops are up-to-date so the DOM
assertion is not necessary.  Likewise the SSA_NAME assert
is covered by tree checking on the SSA_NAME_DEF_STMT.

> +     }
> +      tree zero = build_int_cst (TREE_TYPE (version_simd_if_cond), 0);

build_zero_cst (TREE_TYPE (version_simd_if_cond))

> +      tree c = fold_build2 (NE_EXPR, boolean_type_node,
> +                         version_simd_if_cond, zero);
> +      if (cond_expr)
> +        cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
> +                              c, cond_expr);
> +      else
> +        cond_expr = c;
> +      if (dump_enabled_p ())
> +     dump_printf_loc (MSG_NOTE, vect_location,
> +                      "created versioning for simd if condition check.\n");
> +    }
> +

Otherwise the approach looks OK to me.

Richard.

>    cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
>                                     &gimplify_stmt_list,
>                                     is_gimple_condexpr, NULL_TREE);
> --- gcc/testsuite/gcc.dg/vect/vect-simd-1.c.jj        2019-05-15 
> 18:45:09.779910995 +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-simd-1.c   2019-05-15 18:55:49.006235391 
> +0200
> @@ -0,0 +1,64 @@
> +/* { dg-additional-options "-fopenmp-simd" } */
> +/* { dg-additional-options "-mavx" { target avx_runtime } } */
> +
> +#include "tree-vect.h"
> +
> +#define N 1024
> +int a[N];
> +int x;
> +
> +__attribute__((noipa)) int
> +bar (void)
> +{
> +  return x;
> +}
> +
> +__attribute__((noipa)) void
> +foo (void)
> +{
> +  #pragma omp simd if (bar ())
> +  for (int i = 0; i < N; ++i)
> +    a[i] = a[i] + 1;
> +}
> +
> +__attribute__((noipa)) void
> +baz (void)
> +{
> +  int c = 0;
> +  #pragma omp simd if (c)
> +  for (int i = 0; i < N; ++i)
> +    a[i] = a[i] + 1;
> +}
> +
> +__attribute__((noipa)) void
> +qux (void)
> +{
> +  int c = 1;
> +  #pragma omp simd if (c)
> +  for (int i = 0; i < N; ++i)
> +    a[i] = a[i] + 1;
> +}
> +
> +int
> +main ()
> +{
> +  check_vect ();
> +  foo ();
> +  for (int i = 0; i < N; ++i)
> +    if (a[i] != 1)
> +      abort ();
> +  x = 1;
> +  foo ();
> +  for (int i = 0; i < N; ++i)
> +    if (a[i] != 2)
> +      abort ();
> +  baz ();
> +  for (int i = 0; i < N; ++i)
> +    if (a[i] != 3)
> +      abort ();
> +  qux ();
> +  for (int i = 0; i < N; ++i)
> +    if (a[i] != 4)
> +      abort ();
> +  return 0;
> +}
> --- gcc/testsuite/gcc.dg/vect/vect-simd-2.c.jj        2019-05-15 
> 18:56:43.000333651 +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-simd-2.c   2019-05-15 18:59:13.692816968 
> +0200
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fopenmp-simd" } */
> +
> +#define N 1024
> +int a[N];
> +int bar (void);
> +
> +void
> +foo (void)
> +{
> +  #pragma omp simd if (bar ())
> +  for (int i = 0; i < N; ++i)
> +    a[i] = a[i] + 1;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times "created versioning for simd if 
> condition check" 1 "vect" } } */
> --- gcc/testsuite/gcc.dg/vect/vect-simd-3.c.jj        2019-05-15 
> 18:59:21.865680474 +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-simd-3.c   2019-05-15 18:59:56.381104037 
> +0200
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fopenmp-simd" } */
> +
> +#define N 1024
> +int a[N];
> +
> +void
> +foo (void)
> +{
> +  int c = 0;
> +  #pragma omp simd if (c)
> +  for (int i = 0; i < N; ++i)
> +    a[i] = a[i] + 1;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
> --- gcc/testsuite/gcc.dg/vect/vect-simd-4.c.jj        2019-05-15 
> 18:59:51.550184714 +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-simd-4.c   2019-05-15 19:00:35.752446509 
> +0200
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fopenmp-simd" } */
> +
> +#define N 1024
> +int a[N];
> +
> +void
> +foo (void)
> +{
> +  int c = 1;
> +  #pragma omp simd if (c)
> +  for (int i = 0; i < N; ++i)
> +    a[i] = a[i] + 1;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-not "created versioning for simd if condition 
> check" "vect" } } */
> 
>       Jakub
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Linux GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany;
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah; HRB 21284 (AG Nürnberg)

Reply via email to