For Crestmont, 4-operand VEX blendv instructions come from MSROM and are slower than the 3-instruction sequence (op1 & mask) | (op2 & ~mask). Legacy blendv instructions can still be handled by the decoder.
The patch add a new tune which is enabled for all processors except for SRF/CWF. It will use vpand + vpandn + vpor instead of vpblendvb(similar for vblendvps/vblendvpd) for SRF/CWF. gcc/ChangeLog: * config/i386/i386-expand.cc (ix86_expand_sse_movcc): Guard instruction blendv generation under new tune. * config/i386/i386.h (TARGET_SSE_MOVCC_USE_BLENDV): New Macro. * config/i386/x86-tune.def (X86_TUNE_SSE_MOVCC_USE_BLENDV): New tune. --- gcc/config/i386/i386-expand.cc | 24 +++++++++---------- gcc/config/i386/i386.h | 2 ++ gcc/config/i386/x86-tune.def | 8 +++++++ .../gcc.target/i386/sse_movcc_use_blendv.c | 12 ++++++++++ 4 files changed, 34 insertions(+), 12 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 124cb976ec8..e4087cccb7c 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -4254,23 +4254,23 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) switch (mode) { case E_V2SFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_mmx_blendvps; break; case E_V4SFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_sse4_1_blendvps; break; case E_V2DFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_sse4_1_blendvpd; break; case E_SFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_sse4_1_blendvss; break; case E_DFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_sse4_1_blendvsd; break; case E_V8QImode: @@ -4278,7 +4278,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V4HFmode: case E_V4BFmode: case E_V2SImode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) { gen = gen_mmx_pblendvb_v8qi; blend_mode = V8QImode; @@ -4288,14 +4288,14 @@ ix86_expand_sse_movcc (rtx 
dest, rtx cmp, rtx op_true, rtx op_false) case E_V2HImode: case E_V2HFmode: case E_V2BFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) { gen = gen_mmx_pblendvb_v4qi; blend_mode = V4QImode; } break; case E_V2QImode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_mmx_pblendvb_v2qi; break; case E_V16QImode: @@ -4305,18 +4305,18 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V4SImode: case E_V2DImode: case E_V1TImode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) { gen = gen_sse4_1_pblendvb; blend_mode = V16QImode; } break; case E_V8SFmode: - if (TARGET_AVX) + if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV) gen = gen_avx_blendvps256; break; case E_V4DFmode: - if (TARGET_AVX) + if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV) gen = gen_avx_blendvpd256; break; case E_V32QImode: @@ -4325,7 +4325,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V16BFmode: case E_V8SImode: case E_V4DImode: - if (TARGET_AVX2) + if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV) { gen = gen_avx2_pblendvb; blend_mode = V32QImode; diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index c1ec92ffb15..f01f31d208a 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -462,6 +462,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC] #define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC] #define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR] +#define TARGET_SSE_MOVCC_USE_BLENDV \ + ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV] /* Feature tests against the various architecture variations. 
*/ enum ix86_arch_indices { diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 3d123da95f0..b815b6dc255 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -534,6 +534,14 @@ DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5) DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD, "v2df_reduction_prefer_haddpd", m_NONE) +/* X86_TUNE_SSE_MOVCC_USE_BLENDV: Prefer blendv instructions to + 3-instruction sequence (op1 & mask) | (op2 & ~mask) + for vector condition move. + For Crestmont, 4-operand vex blendv instructions come from MSROM + which is slow. */ +DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV, + "sse_movcc_use_blendv", ~m_CORE_ATOM) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/ diff --git a/gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c b/gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c new file mode 100644 index 00000000000..ac9f1524949 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-march=sierraforest -O2" } */ +/* { dg-final { scan-assembler-not {(?n)vp?blendv(b|ps|pd)} } } */ + +void +foo (int* a, int* b, int* __restrict c) +{ + for (int i = 0; i != 200; i++) + { + c[i] += a[i] > b[i] ? 1 : -1; + } +} -- 2.31.1