With the avx512_two_epilogues tuning enabled for zen4 and zen5,
the gcc.target/i386/vect-epilogues-5.c testcase below regresses
and ends up using AVX2-sized vectors for the masked epilogue
rather than AVX512-sized vectors.  The following patch rectifies
this and adds coverage for the intended behavior.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

OK for trunk and 15 branch if that succeeds?

Thanks,
Richard.

        * config/i386/i386.cc (ix86_vector_costs::finish_cost):
        Do not suggest a first epilogue mode for AVX512 sized
        main loops with X86_TUNE_AVX512_TWO_EPILOGUES as that
        interferes with using a masked epilogue.

        * gcc.target/i386/vect-epilogues-1.c: New testcase.
        * gcc.target/i386/vect-epilogues-2.c: Likewise.
        * gcc.target/i386/vect-epilogues-3.c: Likewise.
        * gcc.target/i386/vect-epilogues-4.c: Likewise.
        * gcc.target/i386/vect-epilogues-5.c: Likewise.
---
 gcc/config/i386/i386.cc                          | 10 +++-------
 gcc/testsuite/gcc.target/i386/vect-epilogues-1.c | 14 ++++++++++++++
 gcc/testsuite/gcc.target/i386/vect-epilogues-2.c | 15 +++++++++++++++
 gcc/testsuite/gcc.target/i386/vect-epilogues-3.c | 15 +++++++++++++++
 gcc/testsuite/gcc.target/i386/vect-epilogues-4.c | 13 +++++++++++++
 gcc/testsuite/gcc.target/i386/vect-epilogues-5.c | 13 +++++++++++++
 6 files changed, 73 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-5.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 38df84f7db2..a6f0a582c3d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25545,14 +25545,10 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
   /* When X86_TUNE_AVX512_TWO_EPILOGUES is enabled arrange for both
      a AVX2 and a SSE epilogue for AVX512 vectorized loops.  */
   if (loop_vinfo
+      && LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32
       && ix86_tune_features[X86_TUNE_AVX512_TWO_EPILOGUES])
-    {
-      if (GET_MODE_SIZE (loop_vinfo->vector_mode) == 64)
-       m_suggested_epilogue_mode = V32QImode;
-      else if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
-              && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32)
-       m_suggested_epilogue_mode = V16QImode;
-    }
+    m_suggested_epilogue_mode = V16QImode;
   /* When a 128bit SSE vectorized epilogue still has a VF of 16 or larger
      enable a 64bit SSE epilogue.  */
   if (loop_vinfo
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-1.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-1.c
new file mode 100644
index 00000000000..a7f5f12c71b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -mno-avx512f -mtune=generic -fdump-tree-vect-optimized" } */
+
+int test (signed char *data, int n)
+{
+  int sum = 0;
+  for (int i = 0; i < n; ++i)
+    sum += data[i];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "loop vectorized using 32 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 8 byte vectors" "vect" { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-2.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-2.c
new file mode 100644
index 00000000000..d6c06edcacd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512bw -mtune=generic -fdump-tree-vect-optimized" } */
+
+int test (signed char *data, int n)
+{
+  int sum = 0;
+  for (int i = 0; i < n; ++i)
+    sum += data[i];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 32 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump-not "loop vectorized using 16 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump-not "loop vectorized using 8 byte vectors" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c
new file mode 100644
index 00000000000..0ee610f5e3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512bw -mtune=znver4 -fdump-tree-vect-optimized" } */
+
+int test (signed char *data, int n)
+{
+  int sum = 0;
+  for (int i = 0; i < n; ++i)
+    sum += data[i];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 32 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 8 byte vectors" "vect" { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-4.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-4.c
new file mode 100644
index 00000000000..498db6b5a13
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-4.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512bw -mtune=generic --param vect-partial-vector-usage=1 -fdump-tree-vect-optimized" } */
+
+int test (signed char *data, int n)
+{
+  int sum = 0;
+  for (int i = 0; i < n; ++i)
+    sum += data[i];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-not "loop vectorized using 32 byte vectors" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-5.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-5.c
new file mode 100644
index 00000000000..6772cabeb4a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-5.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512bw -mtune=znver4 --param vect-partial-vector-usage=1 -fdump-tree-vect-optimized" } */
+
+int test (signed char *data, int n)
+{
+  int sum = 0;
+  for (int i = 0; i < n; ++i)
+    sum += data[i];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-not "loop vectorized using 32 byte vectors" "vect" } } */
-- 
2.43.0

Reply via email to