https://gcc.gnu.org/g:41a533a85aaabef4e3f02cd2a09db23cf2ce7ac0

commit r16-7870-g41a533a85aaabef4e3f02cd2a09db23cf2ce7ac0
Author: Jakub Jelinek <[email protected]>
Date:   Tue Mar 3 15:47:08 2026 +0100

    i386: Use orb instead of orl/orq for stack probes/clash [PR124336]
    
    This PR is about an inconsistency between AT&T and Intel syntax
    for output_adjust_stack_and_probe/output_probe_stack_range.
    On ia32 they use both orl or or BYTE PTR, i.e. 32-bit or,
    but on x86_64 in AT&T syntax they use orq (i.e. 64-bit or) and
    in Intel syntax they use or DWORD PTR (i.e. 32-bit or).
    These cases are used when probing stack in a loop, for each
    page one probe.  There is also the probe_stack named pattern
    which currently uses word_mode or (i.e. 64-bit or for x86_64)
    for both syntaxes, used when probing only once.
    
    Functionally, I think whether we do an 8-bit or 32-bit or 64-bit
    or with 0 constant doesn't matter, we don't modify any values on the
    stack, just pretend to modify it.  The 8-bit and 32-bit ors
    are 1-byte shorter though than 64-bit one.  How the 3 behave
    performance-wise is unknown, if the particular probed spot on the
    stack hasn't been stored/read for a while and won't be for a while,
    then I'd think it shouldn't matter, dunno if there can be store
    forwarding effects if it has been e.g. written or read very recently
    by some other function as say 32-bit access and now is 8-bit.  The
    access after the probe (if it happens soon enough) should be in valid
    programs a store (and again, dunno if there can be issues if the
    sizes are different).
    
    Now, for consistency reasons, we could just make the Intel
    syntax match the AT&T and use 64-bit or on x86_64, so
    use QWORD PTR instead of DWORD PTR if stack_pointer_rtx is 64-bit
    in those 2 functions and be done with it.
    
    Another possibility is use always 32-bit ors (in both those 2 functions
    and probe_stack*; similar to the posted patch except testsuite changes
    aren't needed and s/{b}/{l}/g;s/QI/SI/g;s/BYTE PTR/DWORD PTR/g) and
    last option is to always use 8-bit ors (which is what the following
    patch does).  Or some other mix, say use 32-bit ors for -Os/-Oz and
    64-bit ors otherwise.
    
    2026-03-03  Jakub Jelinek  <[email protected]>
    
            PR target/124336
            * config/i386/i386.cc (output_adjust_stack_and_probe): Use
            or{b} rather than or%z0 and BYTE PTR rather than DWORD PTR.
            (output_probe_stack_range): Likewise.
            * config/i386/i386.md (probe_stack): Pass just 2 arguments
            to gen_probe_stack_1, first adjust_address to QImode, second
            const0_rtx.
            (@probe_stack_1_<mode>): Remove.
            (probe_stack_1): New define_insn.
    
            * gcc.target/i386/stack-check-11.c: Allow orb next to orl/orq.
            * gcc.target/i386/stack-check-18.c: Likewise.
            * gcc.target/i386/stack-check-19.c: Likewise.

Diff:
---
 gcc/config/i386/i386.cc                        |  4 ++--
 gcc/config/i386/i386.md                        | 16 ++++++++--------
 gcc/testsuite/gcc.target/i386/stack-check-11.c |  3 +--
 gcc/testsuite/gcc.target/i386/stack-check-18.c |  3 +--
 gcc/testsuite/gcc.target/i386/stack-check-19.c |  4 +---
 5 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index dec8ae97bf3e..428b68aff471 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -8438,7 +8438,7 @@ output_adjust_stack_and_probe (rtx reg)
 
   /* Probe at SP.  */
   xops[1] = const0_rtx;
-  output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
+  output_asm_insn ("or{b}\t{%1, (%0)|BYTE PTR [%0], %1}", xops);
 
   /* Test if SP == LAST_ADDR.  */
   xops[0] = stack_pointer_rtx;
@@ -8574,7 +8574,7 @@ output_probe_stack_range (rtx reg, rtx end)
   xops[0] = stack_pointer_rtx;
   xops[1] = reg;
   xops[2] = const0_rtx;
-  output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
+  output_asm_insn ("or{b}\t{%2, (%0,%1)|BYTE PTR [%0+%1], %2}", xops);
 
   /* Test if TEST_ADDR == LAST_ADDR.  */
   xops[0] = reg;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 5c44988112b6..660098429a2a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -27798,23 +27798,23 @@
   [(match_operand 0 "memory_operand")]
   ""
 {
-  emit_insn (gen_probe_stack_1
-            (word_mode, operands[0], const0_rtx));
+  emit_insn (gen_probe_stack_1 (adjust_address (operands[0], QImode, 0),
+                               const0_rtx));
   DONE;
 })
 
 ;; Use OR for stack probes, this is shorter.
-(define_insn "@probe_stack_1_<mode>"
-  [(set (match_operand:W 0 "memory_operand" "=m")
-       (unspec:W [(match_operand:W 1 "const0_operand")]
+(define_insn "probe_stack_1"
+  [(set (match_operand:QI 0 "memory_operand" "=m")
+       (unspec:QI [(match_operand:QI 1 "const0_operand")]
                  UNSPEC_PROBE_STACK))
    (clobber (reg:CC FLAGS_REG))]
   ""
-  "or{<imodesuffix>}\t{%1, %0|%0, %1}"
+  "or{b}\t{%1, %0|%0, %1}"
   [(set_attr "type" "alu1")
-   (set_attr "mode" "<MODE>")
+   (set_attr "mode" "QI")
    (set_attr "length_immediate" "1")])
-  
+
 (define_insn "@adjust_stack_and_probe_<mode>"
   [(set (match_operand:P 0 "register_operand" "=r")
        (unspec_volatile:P [(match_operand:P 1 "register_operand" "0")]
diff --git a/gcc/testsuite/gcc.target/i386/stack-check-11.c 
b/gcc/testsuite/gcc.target/i386/stack-check-11.c
index 48341cedcd4b..aaf11cba6127 100644
--- a/gcc/testsuite/gcc.target/i386/stack-check-11.c
+++ b/gcc/testsuite/gcc.target/i386/stack-check-11.c
@@ -14,5 +14,4 @@ frob ()
 }
 
 /* { dg-final { scan-assembler-times "sub\[ql\]" 4 } } */
-/* { dg-final { scan-assembler-times "or\[ql\]" 3 } } */
-
+/* { dg-final { scan-assembler-times "or\[qlb\]" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/stack-check-18.c 
b/gcc/testsuite/gcc.target/i386/stack-check-18.c
index a0aab4a98238..19564fe0f94f 100644
--- a/gcc/testsuite/gcc.target/i386/stack-check-18.c
+++ b/gcc/testsuite/gcc.target/i386/stack-check-18.c
@@ -20,5 +20,4 @@ f2 (void)
 /* { dg-final { scan-rtl-dump-times "allocation and probing in loop" 1 
"expand" } } */
 /* { dg-final { scan-rtl-dump-times "allocation and probing residuals" 1 
"expand" } } */
 
-/* { dg-final { scan-assembler-times "or\[ql\]" 1 } } */
-
+/* { dg-final { scan-assembler-times "or\[qlb\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/stack-check-19.c 
b/gcc/testsuite/gcc.target/i386/stack-check-19.c
index 94c75dcb47ce..864cd7b16e9f 100644
--- a/gcc/testsuite/gcc.target/i386/stack-check-19.c
+++ b/gcc/testsuite/gcc.target/i386/stack-check-19.c
@@ -17,7 +17,7 @@ f2 (const int size)
    there are two probes in the final assembly code.  */
 /* { dg-final { scan-rtl-dump-times "allocation and probing in loop" 1 
"expand" } } */
 /* { dg-final { scan-rtl-dump-times "allocation and probing residuals" 1 
"expand" } } */
-/* { dg-final { scan-assembler-times "or\[ql\]" 2 } } */
+/* { dg-final { scan-assembler-times "or\[qlb\]" 2 } } */
 
 /* We also want to verify (indirectly) that the residual probe is
    guarded.  We do that by checking the number of conditional
@@ -26,5 +26,3 @@ f2 (const int size)
 
    These will all be equality tests.  */
 /* { dg-final { scan-assembler-times "(\?:je|jne)" 3 } } */
-
-

Reply via email to