The following patch adds support for vectorizing basic-block (BB) reductions whose operands include a constant or an invariant.
Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed. * tree-vectorizer.h (_slp_instance::remain_stmts): Change to ... (_slp_instance::remain_defs): ... this. (SLP_INSTANCE_REMAIN_STMTS): Rename to ... (SLP_INSTANCE_REMAIN_DEFS): ... this. (slp_root::remain): New. (slp_root::slp_root): Adjust. * tree-vect-slp.cc (vect_free_slp_instance): Adjust. (vect_build_slp_instance): Get extra remain parameter, adjust former handling of a cut off stmt. (vect_analyze_slp_instance): Adjust. (vect_analyze_slp): Likewise. (_bb_vec_info::~_bb_vec_info): Likewise. (vectorizable_bb_reduc_epilogue): Dump something if we fail. (vect_slp_check_for_constructors): Handle non-internal defs as remain defs of a reduction. (vectorize_slp_instance_root_stmt): Adjust. * gcc.dg/vect/bb-slp-75.c: New testcase. --- gcc/testsuite/gcc.dg/vect/bb-slp-75.c | 25 +++++++++++ gcc/tree-vect-slp.cc | 60 ++++++++++++++++++--------- gcc/tree-vectorizer.h | 9 ++-- 3 files changed, 71 insertions(+), 23 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-75.c diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-75.c b/gcc/testsuite/gcc.dg/vect/bb-slp-75.c new file mode 100644 index 00000000000..1abac136f72 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-75.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_float } */ +/* { dg-additional-options "-ffast-math" } */ +/* { dg-additional-options "-msse2 -mfpmath=sse" { target { x86_64-*-* i?86-*-* } } } */ + +float x[4]; + +float test1 (float a) +{ + return x[0] + x[2] + x[1] + x[3] + a; +} + +float test2 (void) +{ + return x[3] + x[2] + x[1] + 1.f + x[0]; +} + +float test3 (float a) +{ + return x[0] + a + x[2] + x[1] + x[3] + 1.f; +} + +/* We currently require a .REDUC_PLUS direct internal function but do not + have a dejagnu target for this. 
*/ +/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using SLP" 3 "slp2" { target { x86_64-*-* i?86-*-* } } } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 41997d5a546..cf91b21cf7d 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -209,7 +209,7 @@ vect_free_slp_instance (slp_instance instance) vect_free_slp_tree (SLP_INSTANCE_TREE (instance)); SLP_INSTANCE_LOADS (instance).release (); SLP_INSTANCE_ROOT_STMTS (instance).release (); - SLP_INSTANCE_REMAIN_STMTS (instance).release (); + SLP_INSTANCE_REMAIN_DEFS (instance).release (); instance->subgraph_entries.release (); instance->cost_vec.release (); free (instance); @@ -3115,6 +3115,7 @@ vect_build_slp_instance (vec_info *vinfo, slp_instance_kind kind, vec<stmt_vec_info> &scalar_stmts, vec<stmt_vec_info> &root_stmt_infos, + vec<tree> &remain, unsigned max_tree_size, unsigned *limit, scalar_stmts_to_slp_tree_map_t *bst_map, /* ??? We need stmt_info for group splitting. */ @@ -3134,10 +3135,9 @@ vect_build_slp_instance (vec_info *vinfo, ??? Selecting the optimal set of lanes to vectorize would be nice but SLP build for all lanes will fail quickly because we think we're going to need unrolling. */ - auto_vec<stmt_vec_info> remain; if (kind == slp_inst_kind_bb_reduc && (scalar_stmts.length () & 1)) - remain.safe_push (scalar_stmts.pop ()); + remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt)); /* Build the tree for the SLP instance. 
*/ unsigned int group_size = scalar_stmts.length (); @@ -3186,10 +3186,7 @@ vect_build_slp_instance (vec_info *vinfo, SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; SLP_INSTANCE_LOADS (new_instance) = vNULL; SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos; - if (!remain.is_empty ()) - SLP_INSTANCE_REMAIN_STMTS (new_instance) = remain.copy (); - else - SLP_INSTANCE_REMAIN_STMTS (new_instance) = vNULL; + SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain; SLP_INSTANCE_KIND (new_instance) = kind; new_instance->reduc_phis = NULL; new_instance->cost_vec = vNULL; @@ -3469,6 +3466,7 @@ vect_analyze_slp_instance (vec_info *vinfo, gcc_unreachable (); vec<stmt_vec_info> roots = vNULL; + vec<tree> remain = vNULL; if (kind == slp_inst_kind_ctor) { roots.create (1); @@ -3476,7 +3474,7 @@ vect_analyze_slp_instance (vec_info *vinfo, } /* Build the tree for the SLP instance. */ bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts, - roots, + roots, remain, max_tree_size, limit, bst_map, kind == slp_inst_kind_store ? 
stmt_info : NULL); @@ -3521,10 +3519,12 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind, bb_vinfo->roots[i].stmts, bb_vinfo->roots[i].roots, + bb_vinfo->roots[i].remain, max_tree_size, &limit, bst_map, NULL)) { bb_vinfo->roots[i].stmts = vNULL; bb_vinfo->roots[i].roots = vNULL; + bb_vinfo->roots[i].remain = vNULL; } } } @@ -5955,6 +5955,7 @@ _bb_vec_info::~_bb_vec_info () { roots[i].stmts.release (); roots[i].roots.release (); + roots[i].remain.release (); } roots.release (); } @@ -6405,7 +6406,13 @@ vectorizable_bb_reduc_epilogue (slp_instance instance, || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH) || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)), TREE_TYPE (vectype))) - return false; + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: basic block reduction epilogue " + "operation unsupported.\n"); + return false; + } /* There's no way to cost a horizontal vector reduction via REDUC_FN so cost log2 vector operations plus shuffles and one extraction. */ @@ -7262,22 +7269,37 @@ vect_slp_check_for_constructors (bb_vec_info bb_vinfo) but record those to be handled in the epilogue. */ /* ??? For now do not allow mixing ops or externs/constants. 
*/ bool invalid = false; + unsigned remain_cnt = 0; for (unsigned i = 0; i < chain.length (); ++i) - if (chain[i].dt != vect_internal_def - || chain[i].code != code) - invalid = true; - if (!invalid) + { + if (chain[i].code != code) + { + invalid = true; + break; + } + if (chain[i].dt != vect_internal_def) + remain_cnt++; + } + if (!invalid && chain.length () - remain_cnt > 1) { vec<stmt_vec_info> stmts; + vec<tree> remain = vNULL; stmts.create (chain.length ()); + if (remain_cnt > 0) + remain.create (remain_cnt); for (unsigned i = 0; i < chain.length (); ++i) - stmts.quick_push (bb_vinfo->lookup_def (chain[i].op)); + { + if (chain[i].dt == vect_internal_def) + stmts.quick_push (bb_vinfo->lookup_def (chain[i].op)); + else + remain.quick_push (chain[i].op); + } vec<stmt_vec_info> roots; roots.create (chain_stmts.length ()); for (unsigned i = 0; i < chain_stmts.length (); ++i) roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i])); bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc, - stmts, roots)); + stmts, roots, remain)); } } } @@ -9160,16 +9182,16 @@ vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance) gcc_unreachable (); tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn), TREE_TYPE (TREE_TYPE (vec_def)), vec_def); - if (!SLP_INSTANCE_REMAIN_STMTS (instance).is_empty ()) + if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ()) { tree rem_def = NULL_TREE; - for (auto rem : SLP_INSTANCE_REMAIN_STMTS (instance)) + for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance)) if (!rem_def) - rem_def = gimple_get_lhs (rem->stmt); + rem_def = def; else rem_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (scalar_def), - rem_def, gimple_get_lhs (rem->stmt)); + rem_def, def); scalar_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (scalar_def), scalar_def, rem_def); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 5987a327332..1de144988c8 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ 
-259,7 +259,7 @@ public: /* For slp_inst_kind_bb_reduc the defs that were not vectorized, NULL otherwise. */ - vec<stmt_vec_info> remain_stmts; + vec<tree> remain_defs; /* The unrolling factor required to vectorized this SLP instance. */ poly_uint64 unrolling_factor; @@ -289,7 +289,7 @@ public: #define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor #define SLP_INSTANCE_LOADS(S) (S)->loads #define SLP_INSTANCE_ROOT_STMTS(S) (S)->root_stmts -#define SLP_INSTANCE_REMAIN_STMTS(S) (S)->remain_stmts +#define SLP_INSTANCE_REMAIN_DEFS(S) (S)->remain_defs #define SLP_INSTANCE_KIND(S) (S)->kind #define SLP_TREE_CHILDREN(S) (S)->children @@ -1027,11 +1027,12 @@ loop_vec_info_for_loop (class loop *loop) struct slp_root { slp_root (slp_instance_kind kind_, vec<stmt_vec_info> stmts_, - vec<stmt_vec_info> roots_) - : kind(kind_), stmts(stmts_), roots(roots_) {} + vec<stmt_vec_info> roots_, vec<tree> remain_ = vNULL) + : kind(kind_), stmts(stmts_), roots(roots_), remain(remain_) {} slp_instance_kind kind; vec<stmt_vec_info> stmts; vec<stmt_vec_info> roots; + vec<tree> remain; }; typedef class _bb_vec_info : public vec_info -- 2.35.3