There is no SSE <-> AVX transition penalty if the upper bits of YMM/ZMM registers are unchanged, and a YMM/ZMM store doesn't change the upper bits of YMM/ZMM registers.

1. Since zeroing a YMM/ZMM register is implemented by zeroing the
   corresponding XMM register, don't set AVX_U128_DIRTY when zeroing a
   YMM/ZMM register.
2. Since a store doesn't change the INIT state of the upper bits of a
   YMM/ZMM register, don't set AVX_U128_DIRTY on a store if the source
   of the store was never non-zero.

Here are the vzeroupper count differences on SPEC CPU 2017 with
-Ofast -march=skylake-avx512:

                    Before   After    Diff
500.perlbench_r        226     225   -0.44%
502.gcc_r             1263    1103  -12.67%
503.bwaves_r            14      14    0.00%
505.mcf_r               29      28   -3.45%
507.cactuBSSN_r       4651    4628   -0.49%
508.namd_r             433     432   -0.23%
510.parest_r         20380   19347   -5.07%
511.povray_r           495     452   -8.69%
519.lbm_r                2       2    0.00%
520.omnetpp_r         5954    5677   -4.65%
521.wrf_r            12353   12339   -0.11%
523.xalancbmk_r      13137   13001   -1.04%
525.x264_r             192     191   -0.52%
526.blender_r         2515    2366   -5.92%
527.cam4_r            4601    4583   -0.39%
531.deepsjeng_r         20      19   -5.00%
538.imagick_r          898     805  -10.36%
541.leela_r            427     399   -6.56%
544.nab_r               74      74    0.00%
548.exchange2_r         72      72    0.00%
549.fotonik3d_r        318     318    0.00%
554.roms_r             558     554   -0.72%
557.xz_r                79      52  -34.18%

and performance differences are within noise range.

gcc/

	PR target/101456
	* config/i386/i386.c (ix86_avx_u128_mode_needed): Don't set
	AVX_U128_DIRTY when all bits are zero.

gcc/testsuite/

	PR target/101456
	* gcc.target/i386/pr101456-1.c: New test.
	* gcc.target/i386/pr101456-2.c: Likewise.
--- gcc/config/i386/i386.c | 88 ++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr101456-1.c | 33 ++++++++ gcc/testsuite/gcc.target/i386/pr101456-2.c | 33 ++++++++ 3 files changed, 154 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/pr101456-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr101456-2.c diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 876a19f4c1f..a1eb7c18d65 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -14149,6 +14149,94 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) return AVX_U128_CLEAN; } + rtx set = single_set (insn); + if (set) + { + rtx dest = SET_DEST (set); + rtx src = SET_SRC (set); + if (ix86_check_avx_upper_register (dest)) + { + /* This is an YMM/ZMM load. Return AVX_U128_DIRTY if the + source isn't zero. */ + if (standard_sse_constant_p (src, GET_MODE (dest)) != 1) + return AVX_U128_DIRTY; + else + return AVX_U128_ANY; + } + else if (ix86_check_avx_upper_register (src)) + { + /* This is an YMM/ZMM store. Check for the source operand + of SRC DEFs in the same basic block before INSN. */ + basic_block bb = BLOCK_FOR_INSN (insn); + rtx_insn *end = BB_END (bb); + + /* Return AVX_U128_DIRTY if there is no DEF in the same basic + block. */ + int status = AVX_U128_DIRTY; + + for (df_ref def = DF_REG_DEF_CHAIN (REGNO (src)); + def; def = DF_REF_NEXT_REG (def)) + if (DF_REF_BB (def) == bb) + { + /* Ignore DEF from different basic blocks. */ + rtx_insn *def_insn = DF_REF_INSN (def); + + /* Check if DEF_INSN is before INSN. */ + rtx_insn *next; + for (next = NEXT_INSN (def_insn); + next != nullptr && next != end && next != insn; + next = NEXT_INSN (next)) + ; + + /* Skip if DEF_INSN isn't before INSN. */ + if (next != insn) + continue; + + /* Return AVX_U128_DIRTY if the source operand of + DEF_INSN isn't constant zero. 
*/ + + if (CALL_P (def_insn)) + { + bool avx_upper_reg_found = false; + note_stores (def_insn, ix86_check_avx_upper_stores, + &avx_upper_reg_found); + + /* Return AVX_U128_DIRTY if call returns AVX. */ + if (avx_upper_reg_found) + return AVX_U128_DIRTY; + + continue; + } + + set = single_set (def_insn); + if (!set) + return AVX_U128_DIRTY; + + dest = SET_DEST (set); + + /* Skip if DEF_INSN is not an AVX load. */ + if (ix86_check_avx_upper_register (dest)) + { + src = SET_SRC (set); + /* Return AVX_U128_DIRTY if the source operand isn't + constant zero. */ + if (standard_sse_constant_p (src, GET_MODE (dest)) + != 1) + return AVX_U128_DIRTY; + } + + /* We get here only if all AVX loads are from constant + zero. */ + status = AVX_U128_ANY; + } + + return status; + } + + /* This isn't YMM/ZMM load/store. */ + return AVX_U128_ANY; + } + /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced. Hardware changes state only when a 256bit register is written to, but we need to prevent the compiler from moving optimal insertion diff --git a/gcc/testsuite/gcc.target/i386/pr101456-1.c b/gcc/testsuite/gcc.target/i386/pr101456-1.c new file mode 100644 index 00000000000..803fc6e0207 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101456-1.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake" } */ + +#include <x86intrin.h> + +extern __m256 x1; +extern __m256d x2; +extern __m256i x3; + +extern void bar (void); + +void +foo1 (void) +{ + x1 = _mm256_setzero_ps (); + bar (); +} + +void +foo2 (void) +{ + x2 = _mm256_setzero_pd (); + bar (); +} + +void +foo3 (void) +{ + x3 = _mm256_setzero_si256 (); + bar (); +} + +/* { dg-final { scan-assembler-not "vzeroupper" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr101456-2.c b/gcc/testsuite/gcc.target/i386/pr101456-2.c new file mode 100644 index 00000000000..554a0f1702c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101456-2.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options 
"-O2 -march=skylake" } */ + +#include <x86intrin.h> + +extern __m256 x1; +extern __m256d x2; +extern __m256i x3; + +extern __m256 bar (void); + +void +foo1 (void) +{ + bar (); + x1 = _mm256_setzero_ps (); +} + +void +foo2 (void) +{ + bar (); + x2 = _mm256_setzero_pd (); +} + +void +foo3 (void) +{ + bar (); + x3 = _mm256_setzero_si256 (); +} + +/* { dg-final { scan-assembler-times "vzeroupper" 3 } } */ -- 2.31.1