Use TImode for piecewise move in 64-bit mode. When a vector register is used for a piecewise move, we don't increase stack_alignment_needed, since a piecewise move does not require spilling a vector register. However, stack_realign_needed is set to true based on stack_alignment_estimated, which is updated by pseudo vector register usage, so we also need to check stack_realign_needed in order to eliminate the frame pointer.
Tested on x86-64. OK for trunk? H.J. --- gcc/ * config/i386/i386.c (ix86_finalize_stack_realign_flags): Also check stack_realign_needed for stack realignment. * config/i386/i386.h (MOVE_MAX_PIECES): Set to 16 in 64-bit mode if unaligned SSE load and store are optimal. gcc/testsuite/ * gcc.target/i386/pieces-memcpy-1.c: New test. * gcc.target/i386/pieces-memcpy-2.c: Likewise. * gcc.target/i386/pieces-memcpy-3.c: Likewise. * gcc.target/i386/pieces-memcpy-4.c: Likewise. * gcc.target/i386/pieces-memcpy-5.c: Likewise. * gcc.target/i386/pieces-memcpy-6.c: Likewise. --- gcc/config/i386/i386.c | 11 +++++++++-- gcc/config/i386/i386.h | 6 +++++- gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c | 17 +++++++++++++++++ gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c | 17 +++++++++++++++++ gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c | 17 +++++++++++++++++ gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c | 17 +++++++++++++++++ gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c | 17 +++++++++++++++++ gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c | 17 +++++++++++++++++ 8 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 93eaab1..60dc160 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -13286,8 +13286,15 @@ ix86_finalize_stack_realign_flags (void) /* If the only reason for frame_pointer_needed is that we conservatively assumed stack realignment might be needed, but in the end nothing that needed the stack alignment had been spilled, clear frame_pointer_needed - and say we don't need stack realignment. 
*/ - if (stack_realign + and say we don't need stack realignment. + + When vector register is used for piecewise move and store, we don't + increase stack_alignment_needed as there is no register spill for + piecewise move and store. Since stack_realign_needed is set to true + by checking stack_alignment_estimated which is updated by pseudo + vector register usage, we also need to check stack_realign_needed to + eliminate frame pointer. */ + if ((stack_realign || crtl->stack_realign_needed) && frame_pointer_needed && crtl->is_leaf && flag_omit_frame_pointer diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 9b66264..24db855 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1951,7 +1951,11 @@ typedef struct ix86_args { /* MOVE_MAX_PIECES is the number of bytes at a time which we can move efficiently, as opposed to MOVE_MAX which is the maximum number of bytes we can move with a single instruction. */ -#define MOVE_MAX_PIECES UNITS_PER_WORD +#define MOVE_MAX_PIECES \ + ((TARGET_64BIT \ + && TARGET_SSE2 \ + && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \ + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) ? 16 : UNITS_PER_WORD) /* If a memory-to-memory move would take MOVE_RATIO or more simple move-instruction pairs, we will do a movmem or libcall instead. diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c new file mode 100644 index 0000000..adc0aa8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ + +extern char *dst, *src; + +void +foo (void) +{ + __builtin_memcpy (dst, src, 64); +} + +/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 4 } } */ +/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */ +/* No need to dynamically realign the stack here. 
*/ +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */ +/* Nor use a frame pointer. */ +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c new file mode 100644 index 0000000..c52c1d9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ + +extern char *dst, *src; + +void +foo (void) +{ + __builtin_memcpy (dst, src, 33); +} + +/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */ +/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */ +/* No need to dynamically realign the stack here. */ +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */ +/* Nor use a frame pointer. */ +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c new file mode 100644 index 0000000..c532bbd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ + +extern char *dst, *src; + +void +foo (void) +{ + __builtin_memcpy (dst, src, 17); +} + +/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */ +/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 1 } } */ +/* No need to dynamically realign the stack here. */ +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */ +/* Nor use a frame pointer. 
*/ +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c new file mode 100644 index 0000000..4ef763d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */ + +extern char *dst, *src; + +void +foo (void) +{ + __builtin_memcpy (dst, src, 18); +} + +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */ +/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 1 } } */ +/* No need to dynamically realign the stack here. */ +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */ +/* Nor use a frame pointer. */ +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c new file mode 100644 index 0000000..2687560 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mavx512f -mtune=generic" } */ + +extern char *dst, *src; + +void +foo (void) +{ + __builtin_memcpy (dst, src, 19); +} + +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */ +/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 1 } } */ +/* No need to dynamically realign the stack here. */ +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */ +/* Nor use a frame pointer. */ +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c new file mode 100644 index 0000000..a205f83 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { ! 
ia32 } } } */ +/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */ + +extern char *dst, *src; + +void +foo (void) +{ + __builtin_memcpy (dst, src, 33); +} + +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */ +/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 2 } } */ +/* No need to dynamically realign the stack here. */ +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */ +/* Nor use a frame pointer. */ +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */ -- 2.7.4