https://gcc.gnu.org/g:881df7a0b1e8e8c1454309fe23c0edd026296b8b
commit r16-3460-g881df7a0b1e8e8c1454309fe23c0edd026296b8b
Author: H.J. Lu <hjl.to...@gmail.com>
Date: Thu Aug 28 17:55:46 2025 -0700

    x86: Allow by_pieces op when expanding memcpy/memset epilogue

    Since

    commit 401199377c50045ede560daf3f6e8b51749c2a87
    Author: H.J. Lu <hjl.to...@gmail.com>
    Date: Tue Jun 17 10:17:17 2025 +0800

        x86: Improve vector_loop/unrolled_loop for memset/memcpy

    uses move_by_pieces and store_by_pieces to expand memcpy/memset epilogue
    with vector_loop even when targetm.use_by_pieces_infrastructure_p returns
    false, which triggers

      gcc_assert (targetm.use_by_pieces_infrastructure_p
                    (len, align,
                     memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
                     optimize_insn_for_speed_p ()));

    in store_by_pieces. Fix it by:

    1. Add by_pieces_in_use to machine_function to indicate that by_pieces op
       is currently in use.
    2. Set and clear by_pieces_in_use when expanding memcpy/memset epilogue
       with move_by_pieces and store_by_pieces.
    3. Define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P to return true if
       by_pieces_in_use is true.

    gcc/

            PR target/121096
            * config/i386/i386-expand.cc (expand_cpymem_epilogue): Set and
            clear by_pieces_in_use when using by_pieces op.
            (expand_setmem_epilogue): Likewise.
            * config/i386/i386.cc (ix86_use_by_pieces_infrastructure_p): New.
            (TARGET_USE_BY_PIECES_INFRASTRUCTURE_P): Likewise.
            * config/i386/i386.h (machine_function): Add by_pieces_in_use.

    gcc/testsuite/

            PR target/121096
            * gcc.target/i386/memcpy-strategy-14.c: New test.
            * gcc.target/i386/memcpy-strategy-15.c: Likewise.
            * gcc.target/i386/memset-strategy-10.c: Likewise.
            * gcc.target/i386/memset-strategy-11.c: Likewise.
            * gcc.target/i386/memset-strategy-12.c: Likewise.
            * gcc.target/i386/memset-strategy-13.c: Likewise.
            * gcc.target/i386/memset-strategy-14.c: Likewise.
            * gcc.target/i386/memset-strategy-15.c: Likewise.

    Signed-off-by: H.J. 
Lu <hjl.to...@gmail.com> Diff: --- gcc/config/i386/i386-expand.cc | 4 ++++ gcc/config/i386/i386.cc | 21 +++++++++++++++++ gcc/config/i386/i386.h | 3 +++ gcc/testsuite/gcc.target/i386/memcpy-strategy-14.c | 10 +++++++++ gcc/testsuite/gcc.target/i386/memcpy-strategy-15.c | 10 +++++++++ gcc/testsuite/gcc.target/i386/memset-strategy-10.c | 24 ++++++++++++++++++++ gcc/testsuite/gcc.target/i386/memset-strategy-11.c | 9 ++++++++ gcc/testsuite/gcc.target/i386/memset-strategy-12.c | 8 +++++++ gcc/testsuite/gcc.target/i386/memset-strategy-13.c | 26 ++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/memset-strategy-14.c | 8 +++++++ gcc/testsuite/gcc.target/i386/memset-strategy-15.c | 9 ++++++++ 11 files changed, 132 insertions(+) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 6734d9f1464e..1c788ae098ad 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -8241,8 +8241,10 @@ expand_cpymem_epilogue (rtx destmem, rtx srcmem, unsigned HOST_WIDE_INT countval = UINTVAL (count); unsigned HOST_WIDE_INT epilogue_size = countval % max_size; unsigned int destalign = MEM_ALIGN (destmem); + cfun->machine->by_pieces_in_use = true; move_by_pieces (destmem, srcmem, epilogue_size, destalign, RETURN_BEGIN); + cfun->machine->by_pieces_in_use = false; return; } if (max_size > 8) @@ -8487,9 +8489,11 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, unsigned HOST_WIDE_INT countval = UINTVAL (count); unsigned HOST_WIDE_INT epilogue_size = countval % max_size; unsigned int destalign = MEM_ALIGN (destmem); + cfun->machine->by_pieces_in_use = true; store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val, vec_value ? 
vec_value : value, destalign, true, RETURN_BEGIN); + cfun->machine->by_pieces_in_use = false; return; } if (max_size > 32) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 1ca6c6121371..471be3e86158 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -11382,6 +11382,23 @@ ix86_address_cost (rtx x, machine_mode, addr_space_t, bool) return cost; } + +/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P. */ + +bool +ix86_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, + unsigned int align, + enum by_pieces_operation op, + bool speed_p) +{ + /* Return true when we are currently expanding memcpy/memset epilogue + with move_by_pieces or store_by_pieces. */ + if (cfun->machine->by_pieces_in_use) + return true; + + return default_use_by_pieces_infrastructure_p (size, align, op, + speed_p); +} /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as this is used for to form addresses to local data when -fPIC is in @@ -27934,6 +27951,10 @@ static const scoped_attribute_specs *const ix86_attribute_table[] = #undef TARGET_ADDRESS_COST #define TARGET_ADDRESS_COST ix86_address_cost +#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P +#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \ + ix86_use_by_pieces_infrastructure_p + #undef TARGET_OVERLAP_OP_BY_PIECES_P #define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 2eb141bab1ad..ac0ce687f36e 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2954,6 +2954,9 @@ struct GTY(()) machine_function { /* True if this is a recursive function. */ BOOL_BITFIELD recursive_function : 1; + /* True if by_pieces op is currently in use. */ + BOOL_BITFIELD by_pieces_in_use : 1; + /* The largest alignment, in bytes, of stack slot actually used. 
*/ unsigned int max_used_stack_alignment; diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-14.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-14.c new file mode 100644 index 000000000000..44cd65230292 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-14.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-Os -mno-avx -msse2 -mtune=generic -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-final { scan-assembler-times "movaps" 8 } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-15.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-15.c new file mode 100644 index 000000000000..ea8e4be4ac4d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-15.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-Os -mno-avx -msse2 -mtune=generic -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-final { scan-assembler-times "movups" 8 } } */ + +char *a; +char *b; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-10.c b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c new file mode 100644 index 000000000000..d6f2f4ed7ff2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-Os -march=x86-64 -mstringop-strategy=vector_loop" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** xorps %xmm0, %xmm0 +** xorl %eax, %eax +** movq %rax, 48\(%(e|r)di\) +** movups %xmm0, \(%(e|r)di\) +** movups %xmm0, 16\(%(e|r)di\) +** movups %xmm0, 32\(%(e|r)di\) +** ret +**... 
+*/ + +void +foo (char *a) +{ + __builtin_memset (a, 0, 56); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-11.c b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c new file mode 100644 index 000000000000..851c6faaa09d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-Os -mno-avx -msse2 -mtune=generic -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-final { scan-assembler-times "movaps" 4 } } */ + +char a[2048]; +void t (void) +{ + __builtin_memset (a, 0, 2048); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-12.c b/gcc/testsuite/gcc.target/i386/memset-strategy-12.c new file mode 100644 index 000000000000..06cac03426a1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-12.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-Os -mno-sse -mstringop-strategy=vector_loop" } */ + +void +foo (char *a) +{ + __builtin_memset (a, 0, 56); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-13.c b/gcc/testsuite/gcc.target/i386/memset-strategy-13.c new file mode 100644 index 000000000000..cc2129f60eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-13.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-Os -mno-sse -mstringop-strategy=unrolled_loop" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.} } } */ + +/* +**foo: +**.LFB[0-9]+: +** .cfi_startproc +** xorl %eax, %eax +** movq %rax, \(%(e|r)di\) +** movq %rax, 8\(%(e|r)di\) +** movq %rax, 16\(%(e|r)di\) +** movq %rax, 24\(%(e|r)di\) +** movq %rax, 32\(%(e|r)di\) +** movq %rax, 40\(%(e|r)di\) +** movq %rax, 48\(%(e|r)di\) +** ret +**... 
+*/ + +void +foo (char *a) +{ + __builtin_memset (a, 0, 56); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-14.c b/gcc/testsuite/gcc.target/i386/memset-strategy-14.c new file mode 100644 index 000000000000..144235ee0820 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-14.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-Os -march=x86-64 -mstringop-strategy=vector_loop" } */ + +void +foo (char *a, int c) +{ + __builtin_memset (a, c, 56); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-15.c b/gcc/testsuite/gcc.target/i386/memset-strategy-15.c new file mode 100644 index 000000000000..66f9fa600499 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-15.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-Os -mno-avx -msse2 -mtune=generic -mstringop-strategy=vector_loop" } */ +/* { dg-final { scan-assembler-times "movups" 4} } */ + +char *a; +void t (void) +{ + __builtin_memset (a, 0, 2048); +}