This patch was inspired from PR 110137. It reduces the amount of stack spilling 
by ensuring that more values are constant across a pure function call.

It does not add any new flag; rather, it makes the optimizer generate more 
optimal code.

For the added test file, the change is the following. As can be seen, the 
number of memory operations is almost cut in half ("almost" because rbx = rdi 
also needs to be saved in the "after" version).

Before:

        _Z2ggO7MyClass:
        .LFB653:
                .cfi_startproc
                sub     rsp, 72
                .cfi_def_cfa_offset 80
                movdqu  xmm1, XMMWORD PTR [rdi]
                movdqu  xmm0, XMMWORD PTR [rdi+16]
                movaps  XMMWORD PTR [rsp+16], xmm1
                movaps  XMMWORD PTR [rsp], xmm0
                call    _Z1fv
                movdqa  xmm1, XMMWORD PTR [rsp+16]
                movdqa  xmm0, XMMWORD PTR [rsp]
                lea     rdx, [rsp+32]
                movaps  XMMWORD PTR [rsp+32], xmm1
                movaps  XMMWORD PTR [rsp+48], xmm0
                add     rsp, 72
                .cfi_def_cfa_offset 8
                ret
                .cfi_endproc

After:

        _Z2ggO7MyClass:
        .LFB653:
                .cfi_startproc
                push    rbx
                .cfi_def_cfa_offset 16
                .cfi_offset 3, -16
                mov     rbx, rdi
                sub     rsp, 32
                .cfi_def_cfa_offset 48
                call    _Z1fv
                movdqu  xmm0, XMMWORD PTR [rbx]
                movaps  XMMWORD PTR [rsp], xmm0
                movdqu  xmm0, XMMWORD PTR [rbx+16]
                movaps  XMMWORD PTR [rsp+16], xmm0
                add     rsp, 32
                .cfi_def_cfa_offset 16
                pop     rbx
                .cfi_def_cfa_offset 8
                ret
                .cfi_endproc

As explained in PR 110137, the reason I modify the RTL pass instead of the 
GIMPLE pass is that the code that currently handles this optimization is in the 
IRA.

The optimization involved is: rewrite

definition: a = something;
...
use a;

to move the definition statement right before the use statement, provided none 
of the statements in between modifies "something".

The existing code only handles the case where "something" is a memory reference 
with a fixed address. The patch modifies the logic to also allow memory 
references whose address is not changed by the statements in between.

In order to do that the only way I can think of is to modify 
"validate_equiv_mem" to also validate the equivalence of the address, which may 
consist of a pseudo-register.

Nevertheless, reviews and suggestions to improve the code/explain how to 
implement it in GIMPLE phase would be appreciated.

Bootstrapped and regression tested on x86_64-pc-linux-gnu.

I think the test passes, but there are some spurious failures in some 
scan-assembler-* tests; looking through them, the change doesn't appear to 
pessimize the code.

I think this also fixes PR 103541 as a side effect, but I'm not sure whether the 
generated code is optimal (it loads from the global variable twice, but there 
is no readily usable caller-saved register, so an additional memory operation 
is needed anyway).
From 3d8d04373716e031242678821c3e9904b6ac51cb Mon Sep 17 00:00:00 2001
From: user202729 <user202...@protonmail.com>
Date: Mon, 20 May 2024 18:12:58 +0800
Subject: [PATCH] Allow moving init from dereference across pure function call

gcc/ChangeLog:

	* ira.cc (struct equiv_mem_data): Rename to equiv_data.
	(validate_equiv_mem_from_store): Rename to validate_equiv_from_store.
	(uses_any_clobbered_hard_reg_p): New function.
	(validate_equiv_mem): Rename to valid_equiv.
	(equiv_init_movable_p): Relax several conditions.
	(update_equiv_regs): Relax condition that the source must be mem.
	(add_store_equivs): Update changed function name.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avoid-stack-spill-pure-call.C: New test.
---
 gcc/ira.cc                                    | 157 ++++++++++++------
 .../i386/avoid-stack-spill-pure-call.C        |  32 ++++
 2 files changed, 140 insertions(+), 49 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avoid-stack-spill-pure-call.C

diff --git a/gcc/ira.cc b/gcc/ira.cc
index 5642aea3caa..1a1718a18c5 100644
--- a/gcc/ira.cc
+++ b/gcc/ira.cc
@@ -3054,28 +3054,80 @@ struct equivalence
 static struct equivalence *reg_equiv;
 
 /* Used for communication between the following two functions.  */
-struct equiv_mem_data
+struct equiv_data
 {
-  /* A MEM that we wish to ensure remains unchanged.  */
-  rtx equiv_mem;
+  /* A rtx that we wish to ensure remains unchanged.  */
+  rtx equiv;
 
-  /* Set true if EQUIV_MEM is modified.  */
-  bool equiv_mem_modified;
+  /* Set true if EQUIV is modified.  */
+  bool equiv_modified;
 };
 
-/* If EQUIV_MEM is modified by modifying DEST, indicate that it is modified.
+/* If EQUIV is modified by modifying DEST, indicate that it is modified.
    Called via note_stores.  */
 static void
-validate_equiv_mem_from_store (rtx dest, const_rtx set ATTRIBUTE_UNUSED,
-			       void *data)
+validate_equiv_from_store (rtx dest, const_rtx set ATTRIBUTE_UNUSED,
+			   void *data)
 {
-  struct equiv_mem_data *info = (struct equiv_mem_data *) data;
+  struct equiv_data *info = (struct equiv_data *) data;
 
-  if ((REG_P (dest)
-       && reg_overlap_mentioned_p (dest, info->equiv_mem))
-      || (MEM_P (dest)
-	  && anti_dependence (info->equiv_mem, dest)))
-    info->equiv_mem_modified = true;
+  if (REG_P (dest))
+    {
+      if (reg_overlap_mentioned_p (dest, info->equiv))
+	info->equiv_modified = true;
+    }
+  else if (MEM_P (dest))
+    {
+      if (REG_P (info->equiv))
+	{
+	  /* info->equiv won't be modified by writing to a memory address.  */
+	}
+      else if (MEM_P (info->equiv))
+	{
+	  if (anti_dependence (info->equiv, dest))
+	    info->equiv_modified = true;
+	}
+      else
+	info->equiv_modified = true; /* just in case.  */
+    }
+  else
+    info->equiv_modified = true; /* just in case.  */
+}
+
+/* Returns true if x uses any hard register that can be clobbered across
+   a function call.  */
+static bool
+uses_any_clobbered_hard_reg_p (rtx x)
+{
+  int i, j;
+  const char *fmt;
+  enum rtx_code code;
+
+  if (x == NULL_RTX)
+    return false;
+  code = GET_CODE (x);
+
+  if (SUBREG_P (x))
+    return uses_any_clobbered_hard_reg_p (SUBREG_REG (x));
+  if (REG_P (x))
+    return HARD_REGISTER_P (x)
+      && crtl->abi->clobbers_at_least_part_of_reg_p (REGNO (x));
+  fmt = GET_RTX_FORMAT (code);
+  for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
+    {
+      if (fmt[i] == 'e')
+	{
+	  if (uses_any_clobbered_hard_reg_p (XEXP (x, i)))
+	    return true;
+	}
+      else if (fmt[i] == 'E')
+	{
+	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
+	    if (uses_any_clobbered_hard_reg_p (XVECEXP (x, i, j)))
+	      return true;
+	}
+    }
+  return false;
 }
 
 static bool equiv_init_varies_p (rtx x);
@@ -3083,24 +3135,24 @@ static bool equiv_init_varies_p (rtx x);
 enum valid_equiv { valid_none, valid_combine, valid_reload };
 
 /* Verify that no store between START and the death of REG invalidates
-   MEMREF.  MEMREF is invalidated by modifying a register used in MEMREF,
+   X.  X is invalidated by modifying a register used in X,
    by storing into an overlapping memory location, or with a non-const
    CALL_INSN.
 
-   Return VALID_RELOAD if MEMREF remains valid for both reload and
-   combine_and_move insns, VALID_COMBINE if only valid for
+   Return VALID_RELOAD if X remains valid for both reload and
+   combine_and_move_insns, VALID_COMBINE if only valid for
    combine_and_move_insns, and VALID_NONE otherwise.  */
 static enum valid_equiv
-validate_equiv_mem (rtx_insn *start, rtx reg, rtx memref)
+validate_equiv (rtx_insn *start, rtx reg, rtx x)
 {
   rtx_insn *insn;
   rtx note;
-  struct equiv_mem_data info = { memref, false };
+  struct equiv_data info = { x, false };
   enum valid_equiv ret = valid_reload;
 
   /* If the memory reference has side effects or is volatile, it isn't a
      valid equivalence.  */
-  if (side_effects_p (memref))
+  if (side_effects_p (x))
     return valid_none;
 
   for (insn = start; insn; insn = NEXT_INSN (insn))
@@ -3113,35 +3165,43 @@ validate_equiv_mem (rtx_insn *start, rtx reg, rtx memref)
 
       if (CALL_P (insn))
 	{
-	  /* We can combine a reg def from one insn into a reg use in
-	     another over a call if the memory is readonly or the call
-	     const/pure.  However, we can't set reg_equiv notes up for
-	     reload over any call.  The problem is the equivalent form
-	     may reference a pseudo which gets assigned a call
-	     clobbered hard reg.  When we later replace REG with its
-	     equivalent form, the value in the call-clobbered reg has
-	     been changed and all hell breaks loose.  */
-	  ret = valid_combine;
-	  if (!MEM_READONLY_P (memref)
-	      && (!RTL_CONST_OR_PURE_CALL_P (insn)
-		  || equiv_init_varies_p (XEXP (memref, 0))))
-	    return valid_none;
+	  if (RTL_CONST_OR_PURE_CALL_P (insn)
+	      && !uses_any_clobbered_hard_reg_p (x))
+	    {
+	      /* All is good, the call cannot modify x
+		 in the situation explained below.  */
+	    }
+	  else
+	    {
+	      /* We can combine a reg def from one insn into a reg use in
+		 another over a call if the memory is readonly or the call
+		 const/pure.  However, we can't set reg_equiv notes up for
+		 reload over any call.  The problem is the equivalent form
+		 may reference a pseudo which gets assigned a call
+		 clobbered hard reg.  When we later replace REG with its
+		 equivalent form, the value in the call-clobbered reg has
+		 been changed and all hell breaks loose.  */
+	      ret = valid_combine;
+	      if (MEM_P (x) && !MEM_READONLY_P (x)
+		  && (!RTL_CONST_OR_PURE_CALL_P (insn)
+		      || equiv_init_varies_p (XEXP (x, 0))))
+		return valid_none;
+	    }
 	}
 
-      note_stores (insn, validate_equiv_mem_from_store, &info);
-      if (info.equiv_mem_modified)
+      note_stores (insn, validate_equiv_from_store, &info);
+      if (info.equiv_modified)
 	return valid_none;
 
-      /* If a register mentioned in MEMREF is modified via an
+      /* If a register mentioned in X is modified via an
 	 auto-increment, we lose the equivalence.  Do the same if one
 	 dies; although we could extend the life, it doesn't seem worth
 	 the trouble.  */
 
       for (note = REG_NOTES (insn); note; note = XEXP (note, 1))
-	if ((REG_NOTE_KIND (note) == REG_INC
-	     || REG_NOTE_KIND (note) == REG_DEAD)
+	if (REG_NOTE_KIND (note) == REG_INC
 	    && REG_P (XEXP (note, 0))
-	    && reg_overlap_mentioned_p (XEXP (note, 0), memref))
+	    && reg_overlap_mentioned_p (XEXP (note, 0), x))
 	  return valid_none;
     }
 
@@ -3227,7 +3287,7 @@ equiv_init_movable_p (rtx x, int regno)
 
     case REG:
       return ((reg_equiv[REGNO (x)].loop_depth >= reg_equiv[regno].loop_depth
-	       && reg_equiv[REGNO (x)].replace)
+	       && reg_equiv[REGNO (x)].replacement)
 	      || (REG_BASIC_BLOCK (REGNO (x)) < NUM_FIXED_BLOCKS
 		  && ! rtx_varies_p (x, 0)));
 
@@ -3751,8 +3811,8 @@ update_equiv_regs (void)
 	    }
 
 	  /* If this insn introduces a "constant" register, decrease the priority
-	     of that register.  Record this insn if the register is only used once
-	     more and the equivalence value is the same as our source.
+	     of that register.  Record this insn if the register is only used
+	     once more and the equivalence value is the same as our source.
 
 	     The latter condition is checked for two reasons:  First, it is an
 	     indication that it may be more efficient to actually emit the insn
@@ -3761,19 +3821,18 @@ update_equiv_regs (void)
 	     dying in this insn whose death notes would be missed.
 
 	     If we don't have a REG_EQUIV note, see if this insn is loading
-	     a register used only in one basic block from a MEM.  If so, and the
-	     MEM remains unchanged for the life of the register, add a REG_EQUIV
-	     note.  */
+	     a register used only in one basic block.  If so, and the
+	     SET_SRC (set) remains unchanged for the life of the register, add a
+	     REG_EQUIV note.  */
 	  note = find_reg_note (insn, REG_EQUIV, NULL_RTX);
 
 	  rtx replacement = NULL_RTX;
 	  if (note)
 	    replacement = XEXP (note, 0);
-	  else if (REG_BASIC_BLOCK (regno) >= NUM_FIXED_BLOCKS
-		   && MEM_P (SET_SRC (set)))
+	  else if (REG_BASIC_BLOCK (regno) >= NUM_FIXED_BLOCKS)
 	    {
 	      enum valid_equiv validity;
-	      validity = validate_equiv_mem (insn, dest, SET_SRC (set));
+	      validity = validate_equiv (insn, dest, SET_SRC (set));
 	      if (validity != valid_none)
 		{
 		  replacement = copy_rtx (SET_SRC (set));
@@ -3869,7 +3928,7 @@ add_store_equivs (void)
 	  && (init_insn = reg_equiv[regno].init_insns->insn ()) != 0
 	  && bitmap_bit_p (seen_insns, INSN_UID (init_insn))
 	  && ! find_reg_note (init_insn, REG_EQUIV, NULL_RTX)
-	  && validate_equiv_mem (init_insn, src, dest) == valid_reload
+	  && validate_equiv (init_insn, src, dest) == valid_reload
 	  && ! memref_used_between_p (dest, init_insn, insn)
 	  /* Attaching a REG_EQUIV note will fail if INIT_INSN has
 	     multiple sets.  */
diff --git a/gcc/testsuite/gcc.target/i386/avoid-stack-spill-pure-call.C b/gcc/testsuite/gcc.target/i386/avoid-stack-spill-pure-call.C
new file mode 100644
index 00000000000..f8d200cd4fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avoid-stack-spill-pure-call.C
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-fdump-rtl-final -O3" } */
+
+#include <array>
+#include <cstdint>
+
+struct MyClass
+{
+  std::array<uint64_t, 4> arr;
+};
+
+// Prevent optimization
+void sink (void *m) {
+  asm volatile ("" : : "g"(m) : "memory");
+}
+
+int __attribute__((pure))
+f ();
+
+int
+gg (MyClass&& a)
+{
+  MyClass b;
+  MyClass c = a;
+  int result = f ();
+  b = c;
+  sink (&b);
+  return result;
+}
+
+/* Once for rbx, twice for each xmm register stored to the stack.  */
+/* { dg-final { scan-rtl-dump-times {\(set \(mem} 3 "final" } } */
-- 
2.34.1

Reply via email to