There is no need to generate vzeroupper if the caller uses the upper bits
of AVX/AVX512 registers.  We track the caller's avx_u128_state and avoid
vzeroupper when the caller's avx_u128_state is AVX_U128_DIRTY.

Tested on i686 and x86-64 with and without --with-arch=native.

OK for trunk?

Thanks.

H.J.
---
gcc/

        PR target/88717
        * config/i386/i386.c (ix86_avx_u128_mode_entry): Set
        caller_avx_u128_dirty to true when caller is AVX_U128_DIRTY.
        (ix86_avx_u128_mode_exit): Set exit mode to AVX_U128_DIRTY if
        caller is AVX_U128_DIRTY.
        * config/i386/i386.h (machine_function): Add
        caller_avx_u128_dirty.

gcc/testsuite/

        PR target/88717
        * gcc.target/i386/pr88717.c: New test.
---
 gcc/config/i386/i386.c                  | 10 +++++++++-
 gcc/config/i386/i386.h                  |  3 +++
 gcc/testsuite/gcc.target/i386/pr88717.c | 24 ++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88717.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d01278d866f..9b49a2c1d9c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19100,7 +19100,11 @@ ix86_avx_u128_mode_entry (void)
       rtx incoming = DECL_INCOMING_RTL (arg);
 
       if (incoming && ix86_check_avx_upper_register (incoming))
-       return AVX_U128_DIRTY;
+       {
+         /* Caller is AVX_U128_DIRTY.  */
+         cfun->machine->caller_avx_u128_dirty = true;
+         return AVX_U128_DIRTY;
+       }
     }
 
   return AVX_U128_CLEAN;
@@ -19130,6 +19134,10 @@ ix86_mode_entry (int entity)
 static int
 ix86_avx_u128_mode_exit (void)
 {
+  /* Exit mode is set to AVX_U128_DIRTY if caller is AVX_U128_DIRTY.  */
+  if (cfun->machine->caller_avx_u128_dirty)
+    return AVX_U128_DIRTY;
+
   rtx reg = crtl->return_rtx;
 
   /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 83b025e0cf5..c053b657a55 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2747,6 +2747,9 @@ struct GTY(()) machine_function {
   /* If true, ENDBR is queued at function entrance.  */
   BOOL_BITFIELD endbr_queued_at_entrance : 1;
 
+  /* If true, caller is AVX_U128_DIRTY.  */
+  BOOL_BITFIELD caller_avx_u128_dirty : 1;
+
   /* The largest alignment, in bytes, of stack slot actually used.  */
   unsigned int max_used_stack_alignment;
 
diff --git a/gcc/testsuite/gcc.target/i386/pr88717.c b/gcc/testsuite/gcc.target/i386/pr88717.c
new file mode 100644
index 00000000000..01680998f1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88717.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mvzeroupper" } */
+
+#include <immintrin.h>
+
+__m128
+foo1 (__m256 x)
+{
+  return _mm256_castps256_ps128 (x);
+}
+
+void
+foo2 (float *p, __m256 x)
+{
+  *p = ((__v8sf)x)[0];
+}
+
+void
+foo3 (float *p, __m512 x)
+{
+  *p = ((__v16sf)x)[0];
+}
+
+/* { dg-final { scan-assembler-not "vzeroupper" } } */
-- 
2.20.1

Reply via email to