When we scrap the last def of an odd-lane-numbered BB reduction we can end up recording a pattern def, which will later wreck code generation. The following puts this logic where it better belongs, avoiding this issue.
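To illustrate the problem (a sketch of the old placement with an added explanatory comment, not part of the patch itself): by the time vect_build_slp_instance runs, the scalar stmts can already refer to pattern stmts, so the lhs popped here may be a pattern def with no definition in the IL:

  /* Old placement, during SLP instance build: */
  if (kind == slp_inst_kind_bb_reduc
      && (scalar_stmts.length () & 1))
    /* scalar_stmts.pop ()->stmt may be a pattern stmt at this point;
       recording its lhs in 'remain' leaks a pattern def into the
       scalar code, which later wrecks code generation.  */
    remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));

Doing the even-lane adjustment in vect_slp_check_for_roots instead works on the original defs, which is how the patch avoids leaking pattern defs.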
Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

	PR tree-optimization/114249
	* tree-vect-slp.cc (vect_build_slp_instance): Move making
	a BB reduction lane number even ...
	(vect_slp_check_for_roots): ... here to avoid leaking pattern defs.

	* gcc.dg/vect/bb-slp-pr114249.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/bb-slp-pr114249.c | 20 ++++++++++++++++++++
 gcc/tree-vect-slp.cc                        | 20 ++++++++++----------
 2 files changed, 30 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-pr114249.c

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr114249.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr114249.c
new file mode 100644
index 00000000000..64c93cd9a2d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr114249.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+
+enum { SEG_THIN_POOL } read_only;
+struct {
+  unsigned skip_block_zeroing;
+  unsigned ignore_discard;
+  unsigned no_discard_passdown;
+  unsigned error_if_no_space;
+} _thin_pool_emit_segment_line_seg;
+void dm_snprintf();
+void _emit_segment()
+{
+  int features =
+      (_thin_pool_emit_segment_line_seg.error_if_no_space ? 1 : 0) +
+      (read_only ? 1 : 0) +
+      (_thin_pool_emit_segment_line_seg.ignore_discard ? 1 : 0) +
+      (_thin_pool_emit_segment_line_seg.no_discard_passdown ? 1 : 0) +
+      (_thin_pool_emit_segment_line_seg.skip_block_zeroing ? 1 : 0);
+  dm_snprintf(features);
+}
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 324400db19e..527b06c9f9c 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3288,15 +3288,6 @@ vect_build_slp_instance (vec_info *vinfo,
 			     " %G", scalar_stmts[i]->stmt);
 	}
 
-      /* When a BB reduction doesn't have an even number of lanes
-	 strip it down, treating the remaining lane as scalar.
-	 ???  Selecting the optimal set of lanes to vectorize would be nice
-	 but SLP build for all lanes will fail quickly because we think
-	 we're going to need unrolling.  */
-      if (kind == slp_inst_kind_bb_reduc
-	  && (scalar_stmts.length () & 1))
-	remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
-
       /* Build the tree for the SLP instance.  */
       unsigned int group_size = scalar_stmts.length ();
       bool *matches = XALLOCAVEC (bool, group_size);
@@ -7549,6 +7540,7 @@ vect_slp_check_for_roots (bb_vec_info bb_vinfo)
 	  /* ???  For now do not allow mixing ops or externs/constants.  */
 	  bool invalid = false;
 	  unsigned remain_cnt = 0;
+	  unsigned last_idx = 0;
 	  for (unsigned i = 0; i < chain.length (); ++i)
 	    {
 	      if (chain[i].code != code)
@@ -7563,7 +7555,13 @@ vect_slp_check_for_roots (bb_vec_info bb_vinfo)
 					(chain[i].op)->stmt)
 		      != chain[i].op))
 		remain_cnt++;
+	      else
+		last_idx = i;
 	    }
+	  /* Make sure to have an even number of lanes as we later do
+	     all-or-nothing discovery, not trying to split further.  */
+	  if ((chain.length () - remain_cnt) & 1)
+	    remain_cnt++;
 	  if (!invalid && chain.length () - remain_cnt > 1)
 	    {
 	      vec<stmt_vec_info> stmts;
@@ -7576,7 +7574,9 @@ vect_slp_check_for_roots (bb_vec_info bb_vinfo)
 	      stmt_vec_info stmt_info;
 	      if (chain[i].dt == vect_internal_def
 		  && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
-		      gimple_get_lhs (stmt_info->stmt) == chain[i].op))
+		      gimple_get_lhs (stmt_info->stmt) == chain[i].op)
+		  && (i != last_idx
+		      || (stmts.length () & 1)))
 		stmts.quick_push (stmt_info);
 	      else
 		remain.quick_push (chain[i].op);
-- 
2.35.3