*** Context ***

The Haifa scheduler is able to recognise some cases where a dependence D between two instructions -- a producer P and a consumer C -- can be broken. In particular, if C contains a memory reference M and P is an increment, then M can be replaced with its incremented version M' within C.

For instance, the following sequence:

  P:  a0:DI=a0:DI+0x9         ;; increment
  C:  s1:DI=[a0:DI]           ;; memory ref M

can be replaced with:

  C:  s1:DI=[a0:DI+0x9]       ;; inc mem ref M'

This effectively breaks dependence D.
Fine. But what about P?


*** Issue ***

In most cases, once D is broken, P will be scheduled based on other, remaining dependences. But what if the only dependence left is a write-after-write aka output dependence?

After scheduling, we can end up with the following sequence:

  P:  a0:DI=a0:DI+0x9         ;; dead code
  C': a0:DI=sp:DI+0x7ff

Obviously, a0 being immediately overwritten without any intervening instruction, P has become functionally dead code.


*** Proposed solution ***

The attached patch adds the following logic to apply_replacement. Once D is broken, it examines P's remaining dependences: if there is only one left and it is an output dependence, P is marked for elision. P is actually removed only after apply_replacement has returned, to avoid iterator invalidation.


Is it a sound approach? Any comment regarding the suggested implementation?

Thanks,
--
PA
From 179be3811fd4d8e2ece0b59948dafaeb3c0212e6 Mon Sep 17 00:00:00 2001
From: Paul-Antoine Arras <par...@baylibre.com>
Date: Tue, 17 Jun 2025 13:36:09 +0000
Subject: [PATCH] haifa-sched: Elide leftover instruction after breaking dependency [PR120459]

	PR target/120459

gcc/ChangeLog:

	* haifa-sched.cc (apply_replacement): Check whether the producer of the
	broken dependency can be elided; return it if so.
	(recompute_todo_spec): Elide instruction based on apply_replacement's
	return value.

gcc/testsuite/ChangeLog:

	* g++.dg/pr120459.C: New test.
---
 gcc/haifa-sched.cc              |  40 +++++++-
 gcc/testsuite/g++.dg/pr120459.C | 158 ++++++++++++++++++++++++++++++++
 2 files changed, 194 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/pr120459.C

diff --git gcc/haifa-sched.cc gcc/haifa-sched.cc
index 63eb06b2d..8831b0471 100644
--- gcc/haifa-sched.cc
+++ gcc/haifa-sched.cc
@@ -1186,7 +1186,7 @@ update_insn_after_change (rtx_insn *insn)
 static vec<dep_t> next_cycle_replace_deps;
 static vec<int> next_cycle_apply;
 
-static void apply_replacement (dep_t, bool);
+static rtx_insn *apply_replacement (dep_t, bool);
 static void restore_pattern (dep_t, bool);
 
 /* Look at the remaining dependencies for insn NEXT, and compute and return
@@ -1268,6 +1268,7 @@ recompute_todo_spec (rtx_insn *next, bool for_backtrack)
     {
       if (!dbg_cnt (sched_breakdep))
 	return HARD_DEP;
+      rtx_insn *insn_to_elide = NULL;
       FOR_EACH_DEP (next, SD_LIST_BACK, sd_it, dep)
 	{
 	  struct dep_replacement *desc = DEP_REPLACE (dep);
@@ -1276,11 +1277,22 @@ recompute_todo_spec (rtx_insn *next, bool for_backtrack)
 	      if (desc->insn == next && !for_backtrack)
 		{
 		  gcc_assert (n_replace == 1);
-		  apply_replacement (dep, true);
+		  insn_to_elide = apply_replacement (dep, true);
 		}
 	      DEP_STATUS (dep) |= DEP_CANCELLED;
 	    }
 	}
+      if (insn_to_elide != NULL_RTX)
+	{
+	  for (sd_it
+	       = sd_iterator_start (insn_to_elide, SD_LIST_FORW | SD_LIST_BACK
+						     | SD_LIST_RES_BACK);
+	       sd_iterator_cond (&sd_it, &dep);)
+	    {
+	      sd_delete_dep (sd_it);
+	    }
+	  sched_remove_insn (insn_to_elide);
+	}
       return 0;
     }
 
@@ -4719,9 +4731,10 @@ free_backtrack_queue (void)
    at which point we will be called again with IMMEDIATELY true.  This is
    only done for machines which have instruction packets with explicit
    parallelism however.  */
-static void
+static rtx_insn *
 apply_replacement (dep_t dep, bool immediately)
 {
+  rtx_insn *insn_to_elide = NULL;
   struct dep_replacement *desc = DEP_REPLACE (dep);
   if (!immediately && targetm.sched.exposed_pipeline && reload_completed)
     {
@@ -4733,7 +4746,7 @@ apply_replacement (dep_t dep, bool immediately)
       bool success;
 
       if (QUEUE_INDEX (desc->insn) == QUEUE_SCHEDULED)
-	return;
+	return NULL;
 
       if (sched_verbose >= 5)
 	fprintf (sched_dump, "applying replacement for insn %d\n",
@@ -4744,6 +4757,23 @@ apply_replacement (dep_t dep, bool immediately)
 
       rtx_insn *insn = DEP_PRO (dep);
 
+      /* Elide INSN if there is only one forward dependency left and it is an
+       * output dependency.  */
+      dep_t dep2;
+      unsigned n_deps = 0;
+      bool has_output_dep = false;
+      sd_iterator_def sd_it;
+      FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep2)
+	{
+	  n_deps++;
+	  if (DEP_TYPE (dep2) == REG_DEP_OUTPUT)
+	    has_output_dep = true;
+	}
+      if (n_deps == 2 && has_output_dep)
+	{
+	  insn_to_elide = insn;
+	}
+
       /* Recompute priority since dependent priorities may have changed.  */
       priority (insn, true);
       update_insn_after_change (desc->insn);
@@ -4757,6 +4787,8 @@ apply_replacement (dep_t dep, bool immediately)
 	  backtrack_queue->replace_apply.safe_push (1);
 	}
     }
+
+  return insn_to_elide;
 }
 
 /* We have determined that a pattern involved in DEP must be restored.
diff --git gcc/testsuite/g++.dg/pr120459.C gcc/testsuite/g++.dg/pr120459.C
new file mode 100644
index 000000000..692d82b42
--- /dev/null
+++ gcc/testsuite/g++.dg/pr120459.C
@@ -0,0 +1,158 @@
+/* PR target/120459 */
+/* { dg-do compile { target { riscv*-*-* } } } */
+/* { dg-options "-O2 -std=c++03 -w -fdump-rtl-sched2-slim" } */
+
+/* Reduced from 507.cactuBSSN */
+
+struct a *b(a *, int, int);
+double *d;
+double e;
+int c(a *, int, int, int);
+void f() {
+    int *ab, *aa;
+    a *g;
+    double *h((double *)b);
+    double *l = 0, *o, *t = 0, *ad, *al;
+    int m, p, r, ag, ai, am, ao, aq, at;
+    double *n((double *)b(g, 0, m));
+    double *q((double *)b(g, 0, p));
+    double *s((double *)r);
+    double *af((double *)b(g, 0, e));
+    double *ah((double *)b(g, 0, ag));
+    double *aj((double *)b(g, 0, ai));
+    double *an((double *)b(g, 0, am));
+    double *ap((double *)b(g, 0, ao));
+    double *ar((double *)b(g, 0, aq));
+    long au = c(g, 0, 1, 0), av = c(g, 0, 0, 0), aw = sizeof(double), ax = au,
+         ay = av;
+    double bb, bc = e;
+    double be(01.0 / bb);
+    int bf(aa[1]);
+    int bg(ab[0]);
+    for (int k = 0; k < ab[2]; ++k)
+        for (int j = bf; j < ab[1]; ++j) {
+            int bh = 0;
+            for (int i = bh; i < bg; i++) {
+                long bi = i + au * j + av * k;
+                double bj = l[bi], bk = ((double *)b)[bi], bl = s[bi];
+                double bm = af[bi];
+                double bn = aj[bi];
+                double bo = al[bi];
+                double bp = an[bi];
+                double bq = ap[bi];
+                double br;
+                double bs;
+                double bt;
+                double bu;
+                double bv;
+                double bw;
+                double bx;
+                double by;
+                double bz;
+                double cb;
+                double cc;
+                double cd;
+                double cf;
+                double cg;
+                double ch;
+                double ci;
+                double ck;
+                double cm;
+                double co;
+                double cp;
+                double cq;
+                double cr;
+                switch (at)
+                case 4: {
+                    bs = 21 * ((char *)&h[bi])[ax] + (&h[bi])[ax] +
+                         ((char *)&h[bi])[ax * 2] - ((char *)&h[bi])[3];
+                    bt = *(double *)&(&h)[0] * (&h[bi])[ax] *
+                             ((char *)&h[bi])[ax * -2] +
+                         (&bi)[ax] + ((char *)&h[bi])[aw * ax * 3] +
+                         ((char *)&h[bi])[ax * 3];
+                    bu = (&h[bi])[ay * -1] +
+                         (&h[bi])[ay] * ((&bi)[ay] & ((char *)&h[bi])[ay]) -
+                         (&h[bi])[ax * ay] + (&h[bi])[ay * 3];
+                    bw = *(double *)&((char *)&n[bi])[ax * -1] +
+                         *(double *)&((char *)&n[bi])[ax] +
+                         *(double *)&((char *)&n[bi])[ax * -2] -
+                         *(double *)&((char *)&n[bi])[ax * 2] -
+                         *(double *)&((char *)&n[bi])[ax * -3] +
+                         *(double *)&((char *)&n[bi])[ax * 3];
+                    bx = (&n[bi])[0] + *(double *)&((char *)&n[bi])[ax * -1] +
+                         *(double *)&((char *)&n[bi])[ax] -
+                         *(double *)&((char *)&n[bi])[ax * -2] +
+                         *(double *)&((char *)&n[bi])[ax * 2] +
+                         *(double *)&((char *)&n[bi])[ax * -3] +
+                         *(double *)&((char *)&n[bi])[ax * 3];
+                    by = *(double *)&((char *)&n[bi])[ay * -1] +
+                         *(double *)&((char *)&n[bi])[ay] +
+                         *(double *)&((char *)&n[bi])[ay * -2] *
+                             *(double *)&((char *)&n[bi])[ay * 2] -
+                         *(double *)&((char *)&n[bi])[ay * -3] +
+                         *(double *)&((char *)&n[bi])[ay * 3] *
+                             (0 * (&n[bi])[0] +
+                              5 * *(double *)&((char *)&n[bi])[ay * -1] +
+                              *(double *)&((char *)&n[bi])[ay] -
+                              (*(double *)&((char *)&n[bi])[ay * -2] +
+                               *(double *)&((char *)&n[bi])[ay * 2]) +
+                              *(double *)&((char *)&n[bi])[ay * -3] +
+                              *(double *)&((char *)&n[bi])[ay * 3]) *
+                             be;
+                    bz = *(double *)&((char *)0)[aw] *
+                         ((&q[bi])[2] - ((char *)0)[3]) * bc;
+                    cb = (&q[bi])[aw * ax] + (&q[bi])[ax] * (&q[bi])[ax * 2] +
+                         (&q[bi])[aw * ax * 2] + ((char *)&q[bi])[ax];
+                    cc = ((char *)&q[bi])[ay * -1] +
+                         (double)((char *)&q[bi])[ay] + (&bi)[2] -
+                         ((char *)&q[bi])[ay * 2] - (&q[bi])[ay];
+                    cd = 20 * (&bi)[0] + (&bi)[3];
+                    cf = (&bi)[ax - 1] * ((char *)&t[bi])[0] -
+                         (&t[bi])[ax * 2] - (&t[bi])[ax * -3] +
+                         (&t[bi])[ax * 3];
+                    cg = (&t[bi])[ax] +
+                         *(double *)((char *)&t[bi])[ax] * (&t[bi])[ax * -20] +
+                         ((char *)0)[-3] + *(double *)((char *)0)[ax];
+                    ch = 21 * ((char *)&t[bi])[ay * -1] + ((char *)&t[bi])[ay] +
+                         (&t[bi])[ay * 2] * ((char *)&t[bi])[ay * 2] -
+                         (&t[bi])[ay * 3] + (&bi)[ay * 3];
+                    ci = ((&t[bi])[aw * ax] + (&t[bi])[ay] +
+                          ((char *)0)[ay * 3]) *
+                         be;
+                    ck = ((char *)d)[ax] +
+                         *(double *)((char *)&d[bi])[aw * ax] * ((char *)0)[2] +
+                         (&d[bi])[aw * ax] + *(double *)((char *)&bi)[ax] +
+                         *(double *)((char *)&d[bi])[ax] * (&d[bi])[0] +
+                         (&d[bi])[ay * -1] * (&d[bi])[ay] -
+                         ((&d[bi])[ay * -2] + (&d[bi])[ay * 2]) + *(&d)[ay] +
+                         ((char *)&d[bi])[ay];
+                    cm = (&ah[bi])[0] + *(double *)&((char *)0)[aw];
+                    co = *(double *)((char *)&ah[bi])[ax * -1] +
+                         ((char *)&ah[bi])[ax] * *(double *)((char *)ah)[ax] +
+                         (&ah[bi])[ax * 2] + (&ah[bi])[ax] + (&ah[bi])[ax * 3];
+                    cp =
+                        21 * (&ah[bi])[ay] +
+                        21 * ((char *)&ah[bi])[ay] * (&ah[bi])[ax * -2] *
+                            ((char *)&ah[bi])[ay * 2] -
+                        ((char *)&bi)[ay] +
+                        (&ah[bi])[ax * ay] *
+                            ((&ar[bi])[0] * (&ar[bi])[ax] * (&ar[bi])[aw] +
+                             (&bi)[2] + (&ar[bi])[aw * ax] + (&ar[bi])[ax * 3]);
+                }
+                    bk = bk * bw + by * bx * bl + bz * cc + cb * cd * cf * ch +
+                         cg * ci + ck + cm * bq * cp + co + cq * 0;
+                bj = bj * bo + cr * 0 * bp * bs + bu + br * 0 + bt + bv * 0;
+                l[bi] = bj;
+                o[bi] = bk;
+                ad[bi] = bm;
+                aj[bi] = bn;
+            }
+        }
+}
+
+/* Before this PR was fixed, we got:
+ 2302: a0:DI=a0:DI+0x9
+ 2303: a0:DI=sp:DI+0x7ff
+where 2302 was obviously dead code. So we now check that this pattern is gone.
+*/
+/* { dg-final { scan-rtl-dump-not "\[0-9]+: (\[a-z]\[0-9]+):DI=\\1:DI\\+0x\[0-9]+\n \[0-9]+: \\1:DI=sp:DI\\+0x\[0-9]+" "sched2" } } */
-- 
2.39.5

Reply via email to