> -----Original Message----- > From: Richard Biener <rguent...@suse.de> > Sent: Friday, September 6, 2024 2:09 PM > To: Tamar Christina <tamar.christ...@arm.com> > Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; j...@ventanamicro.com > Subject: Re: [PATCH 2/4]middle-end: lower COND_EXPR into gimple form in > vect_recog_bool_pattern > > On Tue, 3 Sep 2024, Tamar Christina wrote: > > > Hi All, > > > > Currently the vectorizer cheats when lowering COND_EXPR during bool recog. > > In the cases where the conditonal is loop invariant or non-boolean it > > instead > > converts the operation back into GENERIC and hides much of the operation > > from > > the analysis part of the vectorizer. > > > > i.e. > > > > a ? b : c > > > > is transformed into: > > > > a != 0 ? b : c > > > > however by doing so we can't perform any optimization on the mask as they > aren't > > explicit until quite late during codegen. > > > > To fix this this patch lowers booleans earlier and so ensures that we are > > always > > in GIMPLE. > > > > For when the value is a loop invariant boolean we have to generate an > > additional > > conversion from bool to the integer mask form. > > > > This is done by creating a loop invariant a ? -1 : 0 with the target mask > > precision and then doing a normal != 0 comparison on that. > > > > To support this the patch also adds the ability to during pattern matching > > create a loop invariant pattern that won't be seen by the vectorizer and > > will > > instead me materialized inside the loop preheader in the case of loops, or > > in > > the case of BB vectorization it materializes it in the first BB in the > > region. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu, arm-none-linux-gnueabihf, > > x86_64-pc-linux-gnu -m32, -m64 and no issues. > > > > Ok for master? > > OK, but can you clarify a question below? > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > * tree-vect-patterns.cc (append_inv_pattern_def_seq): New. > > (vect_recog_bool_pattern): Lower COND_EXPRs. > > * tree-vect-slp.cc (vect_schedule_slp): Materialize loop invariant > > statements. > > * tree-vect-loop.cc (vect_transform_loop): Likewise. > > * tree-vect-stmts.cc (vectorizable_comparison_1): Remove > > VECT_SCALAR_BOOLEAN_TYPE_P handling for vectype. > > * tree-vectorizer.cc (vec_info::vec_info): Initialize > > inv_pattern_def_seq. > > * tree-vectorizer.h (LOOP_VINFO_INV_PATTERN_DEF_SEQ): New. > > (class vec_info): Add inv_pattern_def_seq. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.dg/vect/bb-slp-conditional_store_1.c: New test. > > * gcc.dg/vect/vect-conditional_store_5.c: New test. > > * gcc.dg/vect/vect-conditional_store_6.c: New test. > > > > --- > > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-conditional_store_1.c > b/gcc/testsuite/gcc.dg/vect/bb-slp-conditional_store_1.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..650a3bfbfb1dd44afc2d58b > be85f75f1d28b9bd0 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-conditional_store_1.c > > @@ -0,0 +1,15 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target vect_int } */ > > +/* { dg-require-effective-target vect_float } */ > > + > > +/* { dg-additional-options "-mavx2" { target avx2 } } */ > > +/* { dg-additional-options "-march=armv9-a" { target aarch64-*-* } } */ > > + > > +void foo3 (float *restrict a, int *restrict c) > > +{ > > +#pragma GCC unroll 8 > > + for (int i = 0; i < 8; i++) > > + c[i] = a[i] > 1.0; > > +} > > + > > +/* { dg-final { scan-tree-dump "vectorized using SLP" "slp1" } } */ > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-conditional_store_5.c > b/gcc/testsuite/gcc.dg/vect/vect-conditional_store_5.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..37d60fa76351c13980427 > 751be4450c14617a9a9 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/vect-conditional_store_5.c > > @@ -0,0 +1,28 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target vect_int } */ > > +/* { dg-require-effective-target vect_masked_store } */ > > + > > +/* { dg-additional-options "-mavx2" { target avx2 } } */ > > +/* { dg-additional-options "-march=armv9-a" { target aarch64-*-* } } */ > > + > > +#include <stdbool.h> > > + > > +void foo3 (float *restrict a, int *restrict b, int *restrict c, int n, int > > stride) > > +{ > > + if (stride <= 1) > > + return; > > + > > + bool ai = a[0]; > > + > > + for (int i = 0; i < n; i++) > > + { > > + int res = c[i]; > > + int t = b[i+stride]; > > + if (ai) > > + t = res; > > + c[i] = t; > > + } > > +} > > + > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > > +/* { dg-final { scan-tree-dump-not "VEC_COND_EXPR " "vect" { target > > aarch64- > *-* } } } */ > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-conditional_store_6.c > b/gcc/testsuite/gcc.dg/vect/vect-conditional_store_6.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..5e1aedf3726b073c132bb6 > 4a9b474592ceb8e9b9 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/vect-conditional_store_6.c > > @@ -0,0 +1,24 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target vect_int } */ > > +/* { dg-require-effective-target vect_masked_store } */ > > + > > +/* { dg-additional-options "-mavx2" { target avx2 } } */ > > +/* { dg-additional-options "-march=armv9-a" { target aarch64-*-* } } */ > > + > > +void foo3 (unsigned long long *restrict a, int *restrict b, int *restrict > > c, int n, int > stride) > > +{ > > + if (stride <= 1) > > + return; > > + > > + for (int i = 0; i < n; i++) > > + { > > + int res = c[i]; > > + int t = b[i+stride]; > > + if (a[i]) > > + t = res; > > + c[i] = t; > > + } > > +} > > + > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > > +/* { dg-final { scan-tree-dump-not "VEC_COND_EXPR " "vect" { target > > aarch64- > *-* } } } */ > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > > index > 6456220cdc9bb0ba35baf04c555060ea38d13bbc..e4bed1f88435cb6ebad5651a > 266a9c74106500c0 100644 > > --- a/gcc/tree-vect-loop.cc > > +++ b/gcc/tree-vect-loop.cc > > @@ -12404,6 +12404,18 @@ vect_transform_loop (loop_vec_info loop_vinfo, > gimple *loop_vectorized_call) > > vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES > > (loop_vinfo)); > > } > > > > + /* Generate the loop invariant statements. */ > > + if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ > (loop_vinfo))) > > + { > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_NOTE, vect_location, > > + "------>generating loop invariant statements\n"); > > + gimple_stmt_iterator gsi; > > + gsi = gsi_after_labels (loop_preheader_edge (loop)->src); > > + gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ > (loop_vinfo), > > + GSI_CONTINUE_LINKING); > > + } > > + > > So this is one instance ... > > > /* FORNOW: the vectorizer supports only loops which body consist > > of one basic block (header + empty latch). When the vectorizer will > > support more involved loop forms, the order by which the BBs are > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > > index > 4b112910df357e9f2783f7173b71812085126389..3aae6a066954d9e1e0d4180 > 3dd43307fa297af67 100644 > > --- a/gcc/tree-vect-patterns.cc > > +++ b/gcc/tree-vect-patterns.cc > > @@ -182,6 +182,17 @@ append_pattern_def_seq (vec_info *vinfo, > > new_stmt); > > } > > > > + > > +/* Add NEW_STMT to VINFO's invariant pattern definition statements. These > > + statements are not vectorized but are materialized as scalar in the loop > > + preheader. */ > > + > > +static inline void > > +append_inv_pattern_def_seq (vec_info *vinfo, gimple *new_stmt) > > +{ > > + gimple_seq_add_stmt_without_update (&vinfo->inv_pattern_def_seq, > new_stmt); > > +} > > + > > /* The caller wants to perform new operations on vect_external variable > > VAR, so that the result of the operations would also be vect_external. > > Return the edge on which the operations can be performed, if one exists. > > @@ -5983,12 +5994,34 @@ vect_recog_bool_pattern (vec_info *vinfo, > > var = adjust_bool_stmts (vinfo, bool_stmts, type, stmt_vinfo); > > else if (integer_type_for_mask (var, vinfo)) > > return NULL; > > + else if (TREE_CODE (TREE_TYPE (var)) == BOOLEAN_TYPE > > + && !vect_get_internal_def (vinfo, var)) > > + { > > + /* If the condition is already a boolean then manually convert it to a > > + mask of the given integer type but don't set a vectype. */ > > + tree lhs_ivar = vect_recog_temp_ssa_var (type, NULL); > > + pattern_stmt = gimple_build_assign (lhs_ivar, COND_EXPR, var, > > + build_all_ones_cst (type), > > + build_zero_cst (type)); > > + append_inv_pattern_def_seq (vinfo, pattern_stmt); > > + var = lhs_ivar; > > + } > > + > > + tree lhs_var = vect_recog_temp_ssa_var (boolean_type_node, NULL); > > + pattern_stmt = gimple_build_assign (lhs_var, NE_EXPR, var, > > + build_zero_cst (TREE_TYPE (var))); > > + > > + tree new_vectype = get_mask_type_for_scalar_type (vinfo, TREE_TYPE > (var)); > > + if (!new_vectype) > > + return NULL; > > + > > + new_vectype = truth_type_for (new_vectype); > > + append_pattern_def_seq (vinfo, stmt_vinfo, pattern_stmt, new_vectype, > > + TREE_TYPE (var)); > > > > lhs = vect_recog_temp_ssa_var (TREE_TYPE (lhs), NULL); > > pattern_stmt > > - = gimple_build_assign (lhs, COND_EXPR, > > - build2 (NE_EXPR, boolean_type_node, > > - var, build_int_cst (TREE_TYPE (var), 0)), > > + = gimple_build_assign (lhs, COND_EXPR, lhs_var, > > gimple_assign_rhs2 (last_stmt), > > gimple_assign_rhs3 (last_stmt)); > > *type_out = vectype; > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > index > 43ecd2689701451b706b41d73ba60773af4cf8a5..1ea836ca8fe1d4867504c6da > 42a40c22b6638385 100644 > > --- a/gcc/tree-vect-slp.cc > > +++ b/gcc/tree-vect-slp.cc > > @@ -10393,4 +10393,23 @@ vect_schedule_slp (vec_info *vinfo, const > vec<slp_instance> &slp_instances) > > SLP_TREE_REPRESENTATIVE (root) = NULL; > > } > > } > > + > > + /* Generate the invariant statements. */ > > + if (!gimple_seq_empty_p (vinfo->inv_pattern_def_seq)) > > + { > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_NOTE, vect_location, > > + "------>generating invariant statements\n"); > > + gimple_stmt_iterator gsi; > > + basic_block bb_inv = vinfo->bbs[0]; > > + loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo); > > + if (loop_vinfo) > > + bb_inv = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo))->src; > > + > > + gsi = gsi_after_labels (bb_inv); > > + gsi_insert_seq_before (&gsi, vinfo->inv_pattern_def_seq, > > + GSI_CONTINUE_LINKING); > > + vinfo->inv_pattern_def_seq = NULL; > > + } > > ... and this another. This one is only required for BB vectorization? > In that context it should probably best be done from vect_slp_region > before the vect_schedule_slp call or at least guarded with > BB vect?
Yeah indeed, I was searching for a place to put it only for BB vectorization But couldn't find a proper one. It's clearing the list at the end to prevent a double emission. But indeed it's cleaner to just guard it with a BB vect check. I'll try moving it first. > > I suspect you ran into this only for BB vectorization from loop > vect as that runs on if-converted code? Yeah, pure BB vectorization I couldn't trigger, because of the obvious issue that for it to work it needs if-cvt. So I can just assert and reject it, but the reason I kept it was I will need it for the early break BB vectorization which doesn't need Ifcvt and it seems trivial that the only place we can insert them is at the top of the first BB in the SLP region. But would you prefer an assert for now? Cheers, Tamar > > Thanks for clarifying. > Richard. > > > + > > } > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > index > 385e63163c2450b1cee9f3c2e5df11636116ec2d..c761360d2e80bbf25f55c00ee > 69bbadfcc9128e1 100644 > > --- a/gcc/tree-vect-stmts.cc > > +++ b/gcc/tree-vect-stmts.cc > > @@ -12652,11 +12652,7 @@ vectorizable_comparison_1 (vec_info *vinfo, tree > vectype, > > /* Invariant comparison. */ > > if (!vectype) > > { > > - if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (rhs1))) > > - vectype = mask_type; > > - else > > - vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1), > > - slp_node); > > + vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1), > > slp_node); > > if (!vectype || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype), nunits)) > > return false; > > } > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > > index > df6c8ada2f7814ac1ea89913e881dd659bd2da62..d3389767325229e95321223 > 6c248c30697bbce0d 100644 > > --- a/gcc/tree-vectorizer.h > > +++ b/gcc/tree-vectorizer.h > > @@ -509,6 +509,12 @@ public: > > /* The count of the basic blocks in the vectorization region. */ > > unsigned int nbbs; > > > > + /* Used to keep a sequence of def stmts of a pattern stmt that are loop > > + invariant if they exists. > > + The sequence is emitted in the loop preheader should the loop be > > vectorized > > + and are reset when undoing patterns. */ > > + gimple_seq inv_pattern_def_seq; > > + > > private: > > stmt_vec_info new_stmt_vec_info (gimple *stmt); > > void set_vinfo_for_stmt (gimple *, stmt_vec_info, bool = true); > > @@ -1039,6 +1045,7 @@ public: > > #define LOOP_VINFO_ORIG_LOOP_INFO(L) (L)->orig_loop_info > > #define LOOP_VINFO_SIMD_IF_COND(L) (L)->simd_if_cond > > #define LOOP_VINFO_INNER_LOOP_COST_FACTOR(L) (L)- > >inner_loop_cost_factor > > +#define LOOP_VINFO_INV_PATTERN_DEF_SEQ(L) (L)->inv_pattern_def_seq > > > > #define LOOP_VINFO_FULLY_MASKED_P(L) \ > > (LOOP_VINFO_USING_PARTIAL_VECTORS_P (L) \ > > diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc > > index > 1fb4fb36ed44d11baea2d3db8eedf9685be344e8..70b83720fe28b8c8c567f68df > 1ceb308d49043dc 100644 > > --- a/gcc/tree-vectorizer.cc > > +++ b/gcc/tree-vectorizer.cc > > @@ -465,7 +465,8 @@ vec_info::vec_info (vec_info::vec_kind kind_in, > vec_info_shared *shared_) > > shared (shared_), > > stmt_vec_info_ro (false), > > bbs (NULL), > > - nbbs (0) > > + nbbs (0), > > + inv_pattern_def_seq (NULL) > > { > > stmt_vec_infos.create (50); > > } > > > > > > > > > > > > -- > Richard Biener <rguent...@suse.de> > SUSE Software Solutions Germany GmbH, > Frankenstrasse 146, 90461 Nuernberg, Germany; > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)