On Wed, 15 May 2019, Jakub Jelinek wrote: > Hi! > > The following patch implements what I've failed to do in time for 9.x, > in particular, if #pragma omp simd has if clause and the argument is > not constant zero (handled by the previous patch), or constant non-zero > (ignored, we want normal vectorization in that case as before), this patch > arranges for that information to be preserved until vectorization in the IL > through the .GOMP_SIMD_LANE ifn (whether it is used for some data > privatization or not) argument and during vectorization makes sure we > version such loop with that runtime condition (either solely or together > with say runtime checks for aliasing, alignment etc.). > > Bootstrapped/regtested on x86_64-linux and i686-linux. Richard, is this > approach ok with you? > > 2019-05-15 Jakub Jelinek <ja...@redhat.com> > > * omp-low.c (lower_rec_input_clauses): If OMP_CLAUSE_IF > has non-constant expression, force sctx.lane and use two > argument IFN_GOMP_SIMD_LANE instead of single argument. > * tree-ssa-dce.c (eliminate_unnecessary_stmts): Don't DCE > two argument IFN_GOMP_SIMD_LANE without lhs. > * tree-vectorizer.h (struct _loop_vec_info): Add simd_if_cond > member. > (LOOP_VINFO_SIMD_IF_COND, LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND): > Define. > (LOOP_REQUIRES_VERSIONING): Or in > LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND. > * tree-vect-loop.c (vect_determine_vectorization_factor): Punt > for IFN_GOMP_SIMD_LANE with zero second argument. Initialize > LOOP_VINFO_SIMD_IF_COND from IFN_GOMP_SIMD_LANE second argument. > (_loop_vec_info::_loop_vec_info): Clear simd_if_cond. > * tree-vect-loop-manip.c (vect_loop_versioning): Add runtime check > from simd if clause if needed. > > * gcc.dg/vect/vect-simd-1.c: New test. > * gcc.dg/vect/vect-simd-2.c: New test. > * gcc.dg/vect/vect-simd-3.c: New test. > * gcc.dg/vect/vect-simd-4.c: New test. > > --- gcc/omp-low.c.jj 2019-05-15 14:56:19.564136934 +0200 > +++ gcc/omp-low.c 2019-05-15 16:59:38.686639583 +0200 > @@ -3783,6 +3783,7 @@ lower_rec_input_clauses (tree clauses, g > tree simt_lane = NULL_TREE, simtrec = NULL_TREE; > tree ivar = NULL_TREE, lvar = NULL_TREE, uid = NULL_TREE; > gimple_seq llist[3] = { }; > + tree nonconst_simd_if = NULL_TREE; > > copyin_seq = NULL; > sctx.is_simt = is_simd && omp_find_clause (clauses, OMP_CLAUSE__SIMT_); > @@ -3814,6 +3815,8 @@ lower_rec_input_clauses (tree clauses, g > case OMP_CLAUSE_IF: > if (integer_zerop (OMP_CLAUSE_IF_EXPR (c))) > sctx.max_vf = 1; > + else if (TREE_CODE (OMP_CLAUSE_IF_EXPR (c)) != INTEGER_CST) > + nonconst_simd_if = OMP_CLAUSE_IF_EXPR (c); > break; > case OMP_CLAUSE_SIMDLEN: > if (integer_onep (OMP_CLAUSE_SIMDLEN_EXPR (c))) > @@ -5190,6 +5193,17 @@ lower_rec_input_clauses (tree clauses, g > if (known_eq (sctx.max_vf, 1U)) > sctx.is_simt = false; > > + if (nonconst_simd_if) > + { > + if (sctx.lane == NULL_TREE) > + { > + sctx.idx = create_tmp_var (unsigned_type_node); > + sctx.lane = create_tmp_var (unsigned_type_node); > + }
Does forcing a SIMD_LANE possibly pessimize things? Just looking whether a separate IFN might be better here or even doing the versioning in omp expansion/lowering? But see question below... > + /* FIXME: For now. */ > + sctx.is_simt = false; > + } > + > if (sctx.lane || sctx.is_simt) > { > uid = create_tmp_var (ptr_type_node, "simduid"); > @@ -5219,8 +5233,9 @@ lower_rec_input_clauses (tree clauses, g > } > if (sctx.lane) > { > - gimple *g > - = gimple_build_call_internal (IFN_GOMP_SIMD_LANE, 1, uid); > + gimple *g = gimple_build_call_internal (IFN_GOMP_SIMD_LANE, > + 1 + (nonconst_simd_if != NULL), > + uid, nonconst_simd_if); > gimple_call_set_lhs (g, sctx.lane); > gimple_stmt_iterator gsi = gsi_start_1 (gimple_omp_body_ptr > (ctx->stmt)); > gsi_insert_before_without_update (&gsi, g, GSI_SAME_STMT); > --- gcc/tree-ssa-dce.c.jj 2019-05-03 09:27:00.236840002 +0200 > +++ gcc/tree-ssa-dce.c 2019-05-15 17:22:31.176822544 +0200 > @@ -1328,12 +1328,16 @@ eliminate_unnecessary_stmts (void) > update_stmt (stmt); > release_ssa_name (name); > > - /* GOMP_SIMD_LANE or ASAN_POISON without lhs is not > - needed. */ > + /* GOMP_SIMD_LANE (unless two argument) or ASAN_POISON > + without lhs is not needed. */ > if (gimple_call_internal_p (stmt)) > switch (gimple_call_internal_fn (stmt)) > { > case IFN_GOMP_SIMD_LANE: > + if (gimple_call_num_args (stmt) >= 2 > + && !integer_nonzerop (gimple_call_arg (stmt, 1))) > + break; > + /* FALLTHRU */ GOMP_SIMD_LANE has ECF_NOVOPS, how do you prevent code motion of it if the result is unused? Even with VOPs the vectorized loop may be a reduction not involving any memory operations. > case IFN_ASAN_POISON: > remove_dead_stmt (&gsi, bb, to_remove_edges); > break; > --- gcc/tree-vectorizer.h.jj 2019-04-10 14:19:14.946896353 +0200 > +++ gcc/tree-vectorizer.h 2019-05-15 18:26:16.176917741 +0200 > @@ -428,6 +428,13 @@ typedef struct _loop_vec_info : public v > loops. */ > tree mask_compare_type; > > + /* For #pragma omp simd if (x) loops the x expression. If constant 0, > + the loop should not be vectorized, if constant non-zero, simd_if_cond > + shouldn't be set and loop vectorized normally, if SSA_NAME, the loop > + should be versioned on that condition, using scalar loop if the > condition > + is false and vectorized loop otherwise. */ > + tree simd_if_cond; > + > /* Unknown DRs according to which loop was peeled. */ > struct dr_vec_info *unaligned_dr; > > @@ -591,6 +598,7 @@ typedef struct _loop_vec_info : public v > #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec > #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) > (L)->single_scalar_iteration_cost > #define LOOP_VINFO_ORIG_LOOP_INFO(L) (L)->orig_loop_info > +#define LOOP_VINFO_SIMD_IF_COND(L) (L)->simd_if_cond > > #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \ > ((L)->may_misalign_stmts.length () > 0) > @@ -600,10 +608,13 @@ typedef struct _loop_vec_info : public v > || (L)->lower_bounds.length () > 0) > #define LOOP_REQUIRES_VERSIONING_FOR_NITERS(L) \ > (LOOP_VINFO_NITERS_ASSUMPTIONS (L)) > +#define LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND(L) \ > + (LOOP_VINFO_SIMD_IF_COND (L)) > #define LOOP_REQUIRES_VERSIONING(L) \ > (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (L) \ > || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (L) \ > - || LOOP_REQUIRES_VERSIONING_FOR_NITERS (L)) > + || LOOP_REQUIRES_VERSIONING_FOR_NITERS (L) \ > + || LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (L)) > > #define LOOP_VINFO_NITERS_KNOWN_P(L) \ > (tree_fits_shwi_p ((L)->num_iters) && tree_to_shwi ((L)->num_iters) > 0) > --- gcc/tree-vect-loop.c.jj 2019-05-08 09:18:31.924685516 +0200 > +++ gcc/tree-vect-loop.c 2019-05-15 18:53:59.257068299 +0200 > @@ -357,6 +357,32 @@ vect_determine_vectorization_factor (loo > &mask_producers); > if (!res) > return res; > + if (loop->simduid) > + { > + gimple *g = gsi_stmt (si); > + /* If .GOMP_SIMD_LANE call for the current loop has 2 arguments, > + the second argument is the #pragma omp simd if (x) condition, > + when 0, loop shouldn't be vectorized, when non-zero constant, > + it should be vectorized normally, otherwise versioned with > + vectorized loop done if the condition is non-zero at > + runtime. */ > + if (is_gimple_call (g) > + && gimple_call_internal_p (g) > + && gimple_call_internal_fn (g) == IFN_GOMP_SIMD_LANE > + && gimple_call_num_args (g) >= 2 > + && TREE_CODE (gimple_call_arg (g, 0)) == SSA_NAME > + && (loop->simduid > + == SSA_NAME_VAR (gimple_call_arg (g, 0)))) > + { > + tree arg = gimple_call_arg (g, 1); > + if (integer_zerop (arg)) > + return opt_result::failure_at (g, > + "not vectorized: " > + "simd if(0)\n"); > + if (TREE_CODE (arg) == SSA_NAME) > + LOOP_VINFO_SIMD_IF_COND (loop_vinfo) = arg; > + } > + } This looks like a quite arbitrary place to do this, it wastes compile-time in case of a zero arg. Can't you do this in vect_analyze_loop before the loop over vector sizes? Maybe re-using what note_simd_array_uses computes? Also I wonder what happens if arg is not SSA name - I guess you omitted the else gcc_assert (integer_onep (arg)); ? Otherwise just dropping the condition looks wrong. Is a zero argument really a "force no vectorization" or should it merely reset force_vectorize to zero? > } > } > > @@ -819,6 +845,7 @@ _loop_vec_info::_loop_vec_info (struct l > max_vectorization_factor (0), > mask_skip_niters (NULL_TREE), > mask_compare_type (NULL_TREE), > + simd_if_cond (NULL_TREE), > unaligned_dr (NULL), > peeling_for_alignment (0), > ptr_mask (0), > --- gcc/tree-vect-loop-manip.c.jj 2019-03-11 13:43:47.361455501 +0100 > +++ gcc/tree-vect-loop-manip.c 2019-05-15 18:36:21.068725974 +0200 > @@ -3009,6 +3009,8 @@ vect_loop_versioning (loop_vec_info loop > bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo); > bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo); > bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo); > + tree version_simd_if_cond > + = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo); > > if (check_profitability) > cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, > @@ -3044,6 +3046,32 @@ vect_loop_versioning (loop_vec_info loop > vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr); > } > > + if (version_simd_if_cond) > + { > + gcc_assert (TREE_CODE (version_simd_if_cond) == SSA_NAME); > + gcc_assert (dom_info_available_p (CDI_DOMINATORS)); > + if (basic_block bb > + = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond))) > + { > + if (!dominated_by_p (CDI_DOMINATORS, loop->header, bb) > + || (scalar_loop > + && !dominated_by_p (CDI_DOMINATORS, scalar_loop->header, > + bb))) > + version_simd_if_cond = boolean_false_node; How can this ever happen? A loop has a single entry and I hope omp lowering places the condition computation on that entry. dominators are available since loops are up-to-date so the DOM assertion is not necessary. Likewise the SSA_NAME assert is covered by tree checking on the SSA_NAME_DEF_STMT. > + } > + tree zero = build_int_cst (TREE_TYPE (version_simd_if_cond), 0); build_zero_cst (TREE_TYPE (version_simd_if_cond)) > + tree c = fold_build2 (NE_EXPR, boolean_type_node, > + version_simd_if_cond, zero); > + if (cond_expr) > + cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, > + c, cond_expr); > + else > + cond_expr = c; > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "created versioning for simd if condition check.\n"); > + } > + Otherwise the approach looks OK to me. Richard. > cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), > &gimplify_stmt_list, > is_gimple_condexpr, NULL_TREE); > --- gcc/testsuite/gcc.dg/vect/vect-simd-1.c.jj 2019-05-15 > 18:45:09.779910995 +0200 > +++ gcc/testsuite/gcc.dg/vect/vect-simd-1.c 2019-05-15 18:55:49.006235391 > +0200 > @@ -0,0 +1,64 @@ > +/* { dg-additional-options "-fopenmp-simd" } */ > +/* { dg-additional-options "-mavx" { target avx_runtime } } */ > + > +#include "tree-vect.h" > + > +#define N 1024 > +int a[N]; > +int x; > + > +__attribute__((noipa)) int > +bar (void) > +{ > + return x; > +} > + > +__attribute__((noipa)) void > +foo (void) > +{ > + #pragma omp simd if (bar ()) > + for (int i = 0; i < N; ++i) > + a[i] = a[i] + 1; > +} > + > +__attribute__((noipa)) void > +baz (void) > +{ > + int c = 0; > + #pragma omp simd if (c) > + for (int i = 0; i < N; ++i) > + a[i] = a[i] + 1; > +} > + > +__attribute__((noipa)) void > +qux (void) > +{ > + int c = 1; > + #pragma omp simd if (c) > + for (int i = 0; i < N; ++i) > + a[i] = a[i] + 1; > +} > + > +int > +main () > +{ > + check_vect (); > + foo (); > + for (int i = 0; i < N; ++i) > + if (a[i] != 1) > + abort (); > + x = 1; > + foo (); > + for (int i = 0; i < N; ++i) > + if (a[i] != 2) > + abort (); > + baz (); > + for (int i = 0; i < N; ++i) > + if (a[i] != 3) > + abort (); > + qux (); > + for (int i = 0; i < N; ++i) > + if (a[i] != 4) > + abort (); > + return 0; > +} > --- gcc/testsuite/gcc.dg/vect/vect-simd-2.c.jj 2019-05-15 > 18:56:43.000333651 +0200 > +++ gcc/testsuite/gcc.dg/vect/vect-simd-2.c 2019-05-15 18:59:13.692816968 > +0200 > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-additional-options "-fopenmp-simd" } */ > + > +#define N 1024 > +int a[N]; > +int bar (void); > + > +void > +foo (void) > +{ > + #pragma omp simd if (bar ()) > + for (int i = 0; i < N; ++i) > + a[i] = a[i] + 1; > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "created versioning for simd if > condition check" 1 "vect" } } */ > --- gcc/testsuite/gcc.dg/vect/vect-simd-3.c.jj 2019-05-15 > 18:59:21.865680474 +0200 > +++ gcc/testsuite/gcc.dg/vect/vect-simd-3.c 2019-05-15 18:59:56.381104037 > +0200 > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-additional-options "-fopenmp-simd" } */ > + > +#define N 1024 > +int a[N]; > + > +void > +foo (void) > +{ > + int c = 0; > + #pragma omp simd if (c) > + for (int i = 0; i < N; ++i) > + a[i] = a[i] + 1; > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */ > --- gcc/testsuite/gcc.dg/vect/vect-simd-4.c.jj 2019-05-15 > 18:59:51.550184714 +0200 > +++ gcc/testsuite/gcc.dg/vect/vect-simd-4.c 2019-05-15 19:00:35.752446509 > +0200 > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-additional-options "-fopenmp-simd" } */ > + > +#define N 1024 > +int a[N]; > + > +void > +foo (void) > +{ > + int c = 1; > + #pragma omp simd if (c) > + for (int i = 0; i < N; ++i) > + a[i] = a[i] + 1; > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ > +/* { dg-final { scan-tree-dump-not "created versioning for simd if condition > check" "vect" } } */ > > Jakub > -- Richard Biener <rguent...@suse.de> SUSE Linux GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany; GF: Felix Imendörffer, Mary Higgins, Sri Rasiah; HRB 21284 (AG Nürnberg)