https://gcc.gnu.org/g:e8a51144c02e1cf210db5763e435802ac6fa6ad9
commit r16-2601-ge8a51144c02e1cf210db5763e435802ac6fa6ad9
Author: Richard Biener <rguent...@suse.de>
Date: Tue Jul 29 10:05:32 2025 +0200

    tree-optimization/120687 - avoid disturbing reduction chains in reassoc

    Reassoc carefully ranks operands to form reduction chains for
    vectorization so we are careful to not apply any width related
    changes in the early pass. Unfortunately we are not careful
    enough. The following gates fma related re-ordering and also
    the >= 3 ops tail "optimization" which is the culprit here.

    This does not fix the reported inefficient vectorization when
    using signed integer reductions yet.

            PR tree-optimization/120687
            * tree-ssa-reassoc.cc (reassociate_bb): Do not disturb
            the sorted operand order in the early pass.
            * tree-vect-slp.cc (vect_analyze_slp): Dump when a
            detected reduction chain fails SLP discovery.

            * gcc.dg/vect/pr120687-1.c: New testcase.
            * gcc.dg/vect/pr120687-2.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr120687-1.c | 16 ++++++++++++++++
 gcc/testsuite/gcc.dg/vect/pr120687-2.c | 17 +++++++++++++++++
 gcc/tree-ssa-reassoc.cc                | 10 ++++++----
 gcc/tree-vect-slp.cc                   |  3 +++
 4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr120687-1.c b/gcc/testsuite/gcc.dg/vect/pr120687-1.c
new file mode 100644
index 000000000000..ce9cf6301ceb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr120687-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
+unsigned
+frd (unsigned *p, unsigned *lastone)
+{
+  unsigned sum = 0;
+  for (; p <= lastone; p += 16)
+    sum += p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7]
+      + p[8] + p[9] + p[10] + p[11] + p[12] + p[13] + p[14] + p[15];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "reduction: detected reduction chain" "vect" } } */
+/* { dg-final { scan-tree-dump-not "SLP discovery of reduction chain failed" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr120687-2.c b/gcc/testsuite/gcc.dg/vect/pr120687-2.c
new file mode 100644
index 000000000000..dfc6dc726e9f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr120687-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-ffast-math" } */
+
+float
+frd (float *p, float *lastone)
+{
+  float sum = 0;
+  for (; p <= lastone; p += 16)
+    sum += p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7]
+      + p[8] + p[9] + p[10] + p[11] + p[12] + p[13] + p[14] + p[15];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "reduction: detected reduction chain" "vect" } } */
+/* { dg-final { scan-tree-dump-not "SLP discovery of reduction chain failed" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */
diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
index 3c38f3d7a19f..c140f76766eb 100644
--- a/gcc/tree-ssa-reassoc.cc
+++ b/gcc/tree-ssa-reassoc.cc
@@ -7167,9 +7167,10 @@ reassociate_bb (basic_block bb)
 
                   /* If the target support FMA, rank_ops_for_fma will detect if
                      the chain has fmas and rearrange the ops if so. */
-                  if (direct_internal_fn_supported_p (IFN_FMA,
-                                                      TREE_TYPE (lhs),
-                                                      opt_type)
+                  if (!reassoc_insert_powi_p
+                      && direct_internal_fn_supported_p (IFN_FMA,
+                                                         TREE_TYPE (lhs),
+                                                         opt_type)
                       && (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR))
                     {
                       mult_num = rank_ops_for_fma (&ops);
@@ -7200,7 +7201,8 @@ reassociate_bb (basic_block bb)
                      to make sure the ones that get the double
                      binary op are chosen wisely. */
                   int len = ops.length ();
-                  if (len >= 3
+                  if (!reassoc_insert_powi_p
+                      && len >= 3
                       && (!has_fma
                           /* width > 1 means ranking ops results in better
                              parallelism. Check current value to avoid
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index cb27d166c553..a9c7105f47e6 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4950,6 +4950,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
                                        max_tree_size, &limit,
                                        force_single_lane))
             {
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                 "SLP discovery of reduction chain failed\n");
               /* Dissolve reduction chain group. */
               stmt_vec_info vinfo = first_element;
               stmt_vec_info last = NULL;