On AMD znver4 and znver5 targets, vpshufd and vpsrldq have latencies of 1 and 2 cycles and throughputs of 4 (2 on znver4) and 2, respectively, so it is better to generate shuffles instead of shifts wherever possible. This patch makes horizontal vector reduction generate an appropriate shuffle instruction to copy the upper half to the lower half instead of a simple right shift.
gcc/ChangeLog: * config/i386/i386-expand.cc (emit_reduc_half): Use shuffles to generate reduc half for V4SI, similar modes. * config/i386/i386.h (TARGET_SSE_REDUCTION_PREFER_PSHUF): New Macro. * config/i386/x86-tune.def (X86_TUNE_SSE_REDUCTION_PREFER_PSHUF): New tuning. gcc/testsuite/ChangeLog: * gcc.target/i386/reduc-pshuf.c: New test. --- gcc/config/i386/i386-expand.cc | 28 ++++++++++++++++++--- gcc/config/i386/i386.h | 2 ++ gcc/config/i386/x86-tune.def | 5 ++++ gcc/testsuite/gcc.target/i386/reduc-pshuf.c | 14 +++++++++++ 4 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/reduc-pshuf.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 7fd03c88630..c7aec716a55 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -18724,9 +18724,31 @@ emit_reduc_half (rtx dest, rtx src, int i) case E_V8HFmode: case E_V4SImode: case E_V2DImode: - d = gen_reg_rtx (V1TImode); - tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src), - GEN_INT (i / 2)); + if (TARGET_SSE_REDUCTION_PREFER_PSHUF) { + if (i == 128) { + d = gen_reg_rtx(V4SImode); + tem = gen_sse2_pshufd_1( + d, force_reg(V4SImode, gen_lowpart(V4SImode, src)), GEN_INT(2), + GEN_INT(3), GEN_INT(2), GEN_INT(3)); + } else if (i == 64) { + d = gen_reg_rtx(V4SImode); + tem = gen_sse2_pshufd_1( + d, force_reg(V4SImode, gen_lowpart(V4SImode, src)), GEN_INT(1), + GEN_INT(1), GEN_INT(1), GEN_INT(1)); + } else if (i == 32) { + d = gen_reg_rtx(V8HImode); + tem = gen_sse2_pshuflw_1( + d, force_reg(V8HImode, gen_lowpart(V8HImode, src)), GEN_INT(1), + GEN_INT(1), GEN_INT(1), GEN_INT(1)); + } else { + d = gen_reg_rtx(V1TImode); + tem = + gen_sse2_lshrv1ti3(d, gen_lowpart(V1TImode, src), GEN_INT(i / 2)); + } + } else { + d = gen_reg_rtx(V1TImode); + tem = gen_sse2_lshrv1ti3(d, gen_lowpart(V1TImode, src), GEN_INT(i / 2)); + } break; case E_V8SFmode: if (i == 256) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 
5aa056ff553..ef1700da0e7 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -491,6 +491,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV] #define TARGET_ALIGN_TIGHT_LOOPS \ ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS] +#define TARGET_SSE_REDUCTION_PREFER_PSHUF \ + ix86_tune_features[X86_TUNE_SSE_REDUCTION_PREFER_PSHUF] /* Feature tests against the various architecture variations. */ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index e6044c6032e..f7213de9c48 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -572,6 +572,11 @@ DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD, DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV, "sse_movcc_use_blendv", ~m_CORE_ATOM) +/* X86_TUNE_SSE_REDUCTION_PREFER_PSHUF: Prefer pshuf to reduce V16QI, + V8HI, V8HF, V4SI, V2DI modes when lshr are costlier. */ +DEF_TUNE (X86_TUNE_SSE_REDUCTION_PREFER_PSHUF, + "sse_reduction_prefer_pshuf", m_ZNVER4 | m_ZNVER5) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/ diff --git a/gcc/testsuite/gcc.target/i386/reduc-pshuf.c b/gcc/testsuite/gcc.target/i386/reduc-pshuf.c new file mode 100644 index 00000000000..26998afc14c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/reduc-pshuf.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=znver5 " } */ + +#define N 32 +#define T short +T foo(T *a) { +T sum = 0; +for ( int i = 0 ; i < N ; i++ ) + sum += a[i]; +return sum; +} + +/* { dg-final { scan-assembler-times "vpsrl" 0 } } */ +/* { dg-final { scan-assembler-times "vpshuf" 3 } } */ -- 2.34.1