We can use TImode/OImode/XImode integers for piecewise move and store.
When vector register is used for piecewise move and store, we don't
increase stack_alignment_needed since vector register spill isn't
required for piecewise move and store.  Since stack_realign_needed is
set to true by checking stack_alignment_estimated set by pseudo vector
register usage, we also need to check stack_realign_needed to eliminate
frame pointer.

gcc/

        * config/i386/i386.c (ix86_finalize_stack_frame_flags): Also
        check stack_realign_needed for stack realignment.
        (ix86_legitimate_constant_p): Always allow CONST_WIDE_INT smaller
        than the largest integer supported by vector register.
        * config/i386/i386.h (MOVE_MAX): Set to 64.
        (MOVE_MAX_PIECES): Set to bytes of the largest integer supported
        by vector register.
        (STORE_MAX_PIECES): New.

gcc/testsuite/

        * gcc.target/i386/pr90773-1.c: Adjust to expect movq for 32-bit.
        * gcc.target/i386/pr90773-4.c: Also run for 32-bit.
        * gcc.target/i386/pr90773-14.c: Likewise.
        * gcc.target/i386/pr90773-15.c: Likewise.
        * gcc.target/i386/pr90773-16.c: Likewise.
        * gcc.target/i386/pr90773-17.c: Likewise.
---
 gcc/config/i386/i386.c                     | 21 ++++++++++++---
 gcc/config/i386/i386.h                     | 31 +++++++++++++++++-----
 gcc/testsuite/gcc.target/i386/pr90773-1.c  | 10 +++----
 gcc/testsuite/gcc.target/i386/pr90773-14.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-15.c |  6 ++---
 gcc/testsuite/gcc.target/i386/pr90773-16.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-17.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-4.c  |  2 +-
 8 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index e6ee3ef630a..8ae0fa764f6 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -7925,8 +7925,17 @@ ix86_finalize_stack_frame_flags (void)
      assumed stack realignment might be needed or -fno-omit-frame-pointer
      is used, but in the end nothing that needed the stack alignment had
      been spilled nor stack access, clear frame_pointer_needed and say we
-     don't need stack realignment.  */
-  if ((stack_realign || (!flag_omit_frame_pointer && optimize))
+     don't need stack realignment.
+
+     When vector register is used for piecewise move and store, we don't
+     increase stack_alignment_needed as there is no register spill for
+     piecewise move and store.  Since stack_realign_needed is set to true
+     by checking stack_alignment_estimated which is updated by pseudo
+     vector register usage, we also need to check stack_realign_needed to
+     eliminate frame pointer.  */
+  if ((stack_realign
+       || (!flag_omit_frame_pointer && optimize)
+       || crtl->stack_realign_needed)
       && frame_pointer_needed
       && crtl->is_leaf
       && crtl->sp_is_unchanging
@@ -10385,7 +10394,13 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
          /* FALLTHRU */
        case E_OImode:
        case E_XImode:
-         if (!standard_sse_constant_p (x, mode))
+         if (!standard_sse_constant_p (x, mode)
+             && GET_MODE_SIZE (TARGET_AVX512F
+                               ? XImode
+                               : (TARGET_AVX
+                                  ? OImode
+                                  : (TARGET_SSE2
+                                     ? TImode : DImode))) < GET_MODE_SIZE 
(mode))
            return false;
        default:
          break;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 96b46bac238..b3213f85698 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1750,7 +1750,7 @@ typedef struct ix86_args {
 
 /* Max number of bytes we can move from memory to memory
    in one reasonably fast instruction.  */
-#define MOVE_MAX 16
+#define MOVE_MAX 64
 
 /* MOVE_MAX_PIECES is the number of bytes at a time which we can
    move efficiently, as opposed to  MOVE_MAX which is the maximum
@@ -1761,11 +1761,30 @@ typedef struct ix86_args {
    widest mode with MAX_FIXED_MODE_SIZE, we can only use TImode in
    64-bit mode.  */
 #define MOVE_MAX_PIECES \
-  ((TARGET_64BIT \
-    && TARGET_SSE2 \
-    && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
-    && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
-   ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD)
+  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+   ? 64 \
+   : ((TARGET_AVX \
+       && !TARGET_PREFER_AVX128 \
+       && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \
+       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+      ? 32 \
+      : ((TARGET_SSE2 \
+         && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
+         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+        ? 16 : UNITS_PER_WORD)))
+
+/* STORE_MAX_PIECES is the number of bytes at a time that we can
+   store efficiently.  */
+#define STORE_MAX_PIECES \
+  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+   ? 64 \
+   : ((TARGET_AVX \
+       && !TARGET_PREFER_AVX128 \
+       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+      ? 32 \
+      : ((TARGET_SSE2 \
+         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+        ? 16 : UNITS_PER_WORD)))
 
 /* If a memory-to-memory move would take MOVE_RATIO or more simple
    move-instruction pairs, we will do a cpymem or libcall instead.
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-1.c 
b/gcc/testsuite/gcc.target/i386/pr90773-1.c
index 1d9f282dc0d..4fd5a40d99d 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-options "-O2 -msse2 -mtune=generic" } */
 
 extern char *dst, *src;
 
@@ -9,9 +9,5 @@ foo (void)
   __builtin_memcpy (dst, src, 15);
 }
 
-/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 { 
target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 { 
target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { 
target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { 
target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { 
target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+11\\(%\[\^,\]+\\)," 1 { 
target ia32 } } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-14.c 
b/gcc/testsuite/gcc.target/i386/pr90773-14.c
index 6364916ecac..74ba5055960 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-14.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-14.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-15.c 
b/gcc/testsuite/gcc.target/i386/pr90773-15.c
index c0a96fed892..880f71d1567 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-15.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-15.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
@@ -9,6 +9,6 @@ foo (int c)
   __builtin_memset (dst, c, 17);
 }
 
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 
1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%.*, %xmm\[0-9\]+" 
1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, 
\\(%\[\^,\]+\\)" 1 } } */
-/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 
} } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+%.*, 16\\(%\[\^,\]+\\)" 1 } 
} */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-16.c 
b/gcc/testsuite/gcc.target/i386/pr90773-16.c
index d2d1ec6141c..32a976b10df 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-16.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-16.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-17.c 
b/gcc/testsuite/gcc.target/i386/pr90773-17.c
index 6c8da7d24ef..2d6fbf22a8b 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-17.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-17.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-4.c 
b/gcc/testsuite/gcc.target/i386/pr90773-4.c
index ec0bc0100ae..ee4c04678d1 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-4.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-4.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
 
 extern char *dst;
-- 
2.31.1

Reply via email to