[gcc r16-2601] tree-optimization/120687 - avoid disturbing reduction chains in reassoc

Richard Biener via Gcc-cvs Tue, 29 Jul 2025 03:13:27 -0700

https://gcc.gnu.org/g:e8a51144c02e1cf210db5763e435802ac6fa6ad9


commit r16-2601-ge8a51144c02e1cf210db5763e435802ac6fa6ad9
Author: Richard Biener <rguent...@suse.de>
Date:   Tue Jul 29 10:05:32 2025 +0200

    tree-optimization/120687 - avoid disturbing reduction chains in reassoc
    
    Reassoc carefully ranks operands to form reduction chains for
    vectorization so we are careful to not apply any width related
    changes in the early pass.  Unfortunately we are not careful
    enough.  The following gates fma related re-ordering and also
    the >= 3 ops tail "optimization" which is the culprit here.
    
    This does not fix the reported inefficient vectorization when
    using signed integer reductions yet.
    
            PR tree-optimization/120687
            * tree-ssa-reassoc.cc (reassociate_bb): Do not disturb
            the sorted operand order in the early pass.
            * tree-vect-slp.cc (vect_analyze_slp): Dump when a detected
            reduction chain fails SLP discovery.
    
            * gcc.dg/vect/pr120687-1.c: New testcase.
            * gcc.dg/vect/pr120687-2.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr120687-1.c | 16 ++++++++++++++++
 gcc/testsuite/gcc.dg/vect/pr120687-2.c | 17 +++++++++++++++++
 gcc/tree-ssa-reassoc.cc                | 10 ++++++----
 gcc/tree-vect-slp.cc                   |  3 +++
 4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr120687-1.c b/gcc/testsuite/gcc.dg/vect/pr120687-1.c
new file mode 100644
index 000000000000..ce9cf6301ceb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr120687-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
+unsigned
+frd (unsigned *p, unsigned *lastone)
+{
+  unsigned sum = 0;
+  for (; p <= lastone; p += 16)
+    sum += p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7]
+           + p[8] + p[9] + p[10] + p[11] + p[12] + p[13] + p[14] + p[15];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "reduction: detected reduction chain" "vect" } } */
+/* { dg-final { scan-tree-dump-not "SLP discovery of reduction chain failed" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr120687-2.c b/gcc/testsuite/gcc.dg/vect/pr120687-2.c
new file mode 100644
index 000000000000..dfc6dc726e9f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr120687-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-ffast-math" } */
+
+float
+frd (float *p, float *lastone)
+{
+  float sum = 0;
+  for (; p <= lastone; p += 16)
+    sum += p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7]
+           + p[8] + p[9] + p[10] + p[11] + p[12] + p[13] + p[14] + p[15];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "reduction: detected reduction chain" "vect" } } */
+/* { dg-final { scan-tree-dump-not "SLP discovery of reduction chain failed" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */
diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
index 3c38f3d7a19f..c140f76766eb 100644
--- a/gcc/tree-ssa-reassoc.cc
+++ b/gcc/tree-ssa-reassoc.cc
@@ -7167,9 +7167,10 @@ reassociate_bb (basic_block bb)
 
                  /* If the target support FMA, rank_ops_for_fma will detect if
                     the chain has fmas and rearrange the ops if so.  */
-                 if (direct_internal_fn_supported_p (IFN_FMA,
-                                                     TREE_TYPE (lhs),
-                                                     opt_type)
+                 if (!reassoc_insert_powi_p
+                     && direct_internal_fn_supported_p (IFN_FMA,
+                                                        TREE_TYPE (lhs),
+                                                        opt_type)
                      && (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR))
                    {
                      mult_num = rank_ops_for_fma (&ops);
@@ -7200,7 +7201,8 @@ reassociate_bb (basic_block bb)
                         to make sure the ones that get the double
                         binary op are chosen wisely.  */
                      int len = ops.length ();
-                     if (len >= 3
+                     if (!reassoc_insert_powi_p
+                         && len >= 3
                          && (!has_fma
                              /* width > 1 means ranking ops results in better
                                 parallelism.  Check current value to avoid
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index cb27d166c553..a9c7105f47e6 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4950,6 +4950,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
                                                 max_tree_size, &limit,
                                                 force_single_lane))
          {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "SLP discovery of reduction chain failed\n");
            /* Dissolve reduction chain group.  */
            stmt_vec_info vinfo = first_element;
            stmt_vec_info last = NULL;

[gcc r16-2601] tree-optimization/120687 - avoid disturbing reduction chains in reassoc

Reply via email to