There is no need to generate vzeroupper if caller uses upper bits of AVX/AVX512 registers, We track caller's avx_u128_state and avoid vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.
Tested on i686 and x86-64 with and without --with-arch=native. OK for trunk? Thanks. H.J. --- gcc/ PR target/88717 * config/i386/i386.c (ix86_avx_u128_mode_entry): Set caller_avx_u128_dirty to true when caller is AVX_U128_DIRTY. (ix86_avx_u128_mode_exit): Set exit mode to AVX_U128_DIRTY if caller is AVX_U128_DIRTY. * config/i386/i386.h (machine_function): Add caller_avx_u128_dirty. gcc/testsuite/ PR target/88717 * gcc.target/i386/pr88717.c: New test. --- gcc/config/i386/i386.c | 10 +++++++++- gcc/config/i386/i386.h | 3 +++ gcc/testsuite/gcc.target/i386/pr88717.c | 24 ++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr88717.c diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index d01278d866f..9b49a2c1d9c 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19100,7 +19100,11 @@ ix86_avx_u128_mode_entry (void) rtx incoming = DECL_INCOMING_RTL (arg); if (incoming && ix86_check_avx_upper_register (incoming)) - return AVX_U128_DIRTY; + { + /* Caller is AVX_U128_DIRTY. */ + cfun->machine->caller_avx_u128_dirty = true; + return AVX_U128_DIRTY; + } } return AVX_U128_CLEAN; @@ -19130,6 +19134,10 @@ ix86_mode_entry (int entity) static int ix86_avx_u128_mode_exit (void) { + /* Exit mode is set to AVX_U128_DIRTY if caller is AVX_U128_DIRTY. */ + if (cfun->machine->caller_avx_u128_dirty) + return AVX_U128_DIRTY; + rtx reg = crtl->return_rtx; /* Exit mode is set to AVX_U128_DIRTY if there are 256bit diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 83b025e0cf5..c053b657a55 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2747,6 +2747,9 @@ struct GTY(()) machine_function { /* If true, ENDBR is queued at function entrance. */ BOOL_BITFIELD endbr_queued_at_entrance : 1; + /* If true, caller is AVX_U128_DIRTY. */ + BOOL_BITFIELD caller_avx_u128_dirty : 1; + /* The largest alignment, in bytes, of stack slot actually used. */ unsigned int max_used_stack_alignment; diff --git a/gcc/testsuite/gcc.target/i386/pr88717.c b/gcc/testsuite/gcc.target/i386/pr88717.c new file mode 100644 index 00000000000..01680998f1b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88717.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512f -mvzeroupper" } */ + +#include <immintrin.h> + +__m128 +foo1 (__m256 x) +{ + return _mm256_castps256_ps128 (x); +} + +void +foo2 (float *p, __m256 x) +{ + *p = ((__v8sf)x)[0]; +} + +void +foo3 (float *p, __m512 x) +{ + *p = ((__v16sf)x)[0]; +} + +/* { dg-final { scan-assembler-not "vzeroupper" } } */ -- 2.20.1