Hi: This patch folds __builtin_ia32_pblendvb128 (a, b, c) into VEC_COND_EXPR (c < 0, b, a); similarly for the float versions, but with the mask operand VIEW_CONVERT_EXPRed to a same-sized integer vectype.
After folding, blendv related patterns can be redefined as vec_merge since all elements of mask operand is either const0_rtx or constm1_rtx now. It could potentially enable more rtl optimizations. Besides, although there's no pblendv{d,q} instructions, backend can still define their patterns and generate blendv{ps,pd} instead. Bootstrap and regtested on x86_64-linux-gnu{-m32,}. Ok for trunk? gcc/ChangeLog: * config/i386/i386-builtin.def (IX86_BUILTIN_BLENDVPD256, IX86_BUILTIN_BLENDVPS256, IX86_BUILTIN_PBLENDVB256, IX86_BUILTIN_BLENDVPD, IX86_BUILTIN_BLENDVPS, IX86_BUILTIN_PBLENDVB128): Replace icode with CODE_FOR_nothing. * config/i386/i386-expand.c (ix86_expand_sse_movcc): Use gen_avx_blendvd256/gen_avx_blendvq256/gen_sse4_1_blendvd/gen_sse4_1_blendvq for V8SI/V4DI/V4SI/V2DImode. * config/i386/i386.c (ix86_gimple_fold_builtin): Fold blendv builtins. * config/i386/mmx.md (mmx_blendvps): Change to define_expand. (*mmx_blendvps): New pattern implemented as vec_merge. * config/i386/sse.md (<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>): Change to define_expand. (<sse4_1_avx2>_pblendvb): Ditto. (*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>): New pattern implemented as vec_merge. (*<sse4_1_avx2>_pblendvb): Ditto. (*<sse4_1_avx2>_pblendvb_lt): Redefined as define_insn with pattern implemented as vec_merge instead of UNSPEC_BLENDV. (*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt): Ditto, and extend mode to V48_AVX. (*<sse4_1_avx2>_pblendvb_not_lt): New. (*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint): Deleted. (*<sse4_1_avx2>_pblendvb_lt): Ditto. (*<sse4_1_avx2>_pblendvb_not_lt): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/funcspec-8.c: Replace __builtin_ia32_blendvpd with __builtin_ia32_roundps_az. * gcc.target/i386/blendv-1.c: New test. * gcc.target/i386/blendv-2.c: New test. -- BR, Hongtao
From f78d9f2595c315b6343adc4c3b79b6596c45c65b Mon Sep 17 00:00:00 2001 From: liuhongt <hongtao....@intel.com> Date: Fri, 21 May 2021 09:48:18 +0800 Subject: [PATCH 1/2] [i386] Fold blendv builtins into gimple. Fold __builtin_ia32_pblendvb128 (a, b, c) into VEC_COND_EXPR (c < 0, b, a); similarly for the float versions, but with the mask operand VIEW_CONVERT_EXPRed to a same-sized integer vectype. After folding, blendv related patterns can be redefined as vec_merge since all elements of the mask operand are either const0_rtx or constm1_rtx now. It could potentially enable more rtl optimizations. Besides, although there are no pblendv{d,q} instructions, the backend can still define their patterns and generate blendv{ps,pd} instead. gcc/ChangeLog: * config/i386/i386-builtin.def (IX86_BUILTIN_BLENDVPD256, IX86_BUILTIN_BLENDVPS256, IX86_BUILTIN_PBLENDVB256, IX86_BUILTIN_BLENDVPD, IX86_BUILTIN_BLENDVPS, IX86_BUILTIN_PBLENDVB128): Replace icode with CODE_FOR_nothing. * config/i386/i386-expand.c (ix86_expand_sse_movcc): Use gen_avx_blendvd256/gen_avx_blendvq256/gen_sse4_1_blendvd/gen_sse4_1_blendvq for V8SI/V4DI/V4SI/V2DImode. * config/i386/i386.c (ix86_gimple_fold_builtin): Fold blendv builtins. * config/i386/mmx.md (mmx_blendvps): Change to define_expand. (*mmx_blendvps): New pattern implemented as vec_merge. * config/i386/sse.md (<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>): Change to define_expand. (<sse4_1_avx2>_pblendvb): Ditto. (*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>): New pattern implemented as vec_merge. (*<sse4_1_avx2>_pblendvb): Ditto. (*<sse4_1_avx2>_pblendvb_lt): Redefined as define_insn with pattern implemented as vec_merge instead of UNSPEC_BLENDV. (*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt): Ditto, and extend mode to V48_AVX. (*<sse4_1_avx2>_pblendvb_not_lt): New. (*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint): Deleted. (*<sse4_1_avx2>_pblendvb_lt): Ditto. (*<sse4_1_avx2>_pblendvb_not_lt): Ditto. 
gcc/testsuite/ChangeLog: * gcc.target/i386/funcspec-8.c: Replace __builtin_ia32_blendvpd with __builtin_ia32_roundps_az. * gcc.target/i386/blendv-1.c: New test. * gcc.target/i386/blendv-2.c: New test. --- gcc/config/i386/i386-builtin.def | 12 +- gcc/config/i386/i386-expand.c | 22 +- gcc/config/i386/i386.c | 37 ++++ gcc/config/i386/mmx.md | 38 +++- gcc/config/i386/sse.md | 227 +++++++++++---------- gcc/testsuite/gcc.target/i386/blendv-1.c | 51 +++++ gcc/testsuite/gcc.target/i386/blendv-2.c | 41 ++++ gcc/testsuite/gcc.target/i386/funcspec-8.c | 16 +- 8 files changed, 303 insertions(+), 141 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/blendv-1.c create mode 100644 gcc/testsuite/gcc.target/i386/blendv-2.c diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 80c2a2c0294..0c1507317ae 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -902,13 +902,13 @@ BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_ssse3_palignrdi, /* SSE4.1 */ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF) +BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF) +BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dppd, 
"__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI) @@ -1028,8 +1028,8 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpe BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF) +BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, 
UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF) +BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT) @@ -1154,7 +1154,7 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI) diff --git a/gcc/config/i386/i386-expand.c 
b/gcc/config/i386/i386-expand.c index 9f3d41955a2..dc155313c39 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3700,6 +3700,16 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) if (TARGET_SSE4_1) gen = gen_sse4_1_blendvpd; break; + /* Although x86 does not have pblendv{d,q} instructions, + backend can define their patterns and then generate pblendv{ps,pd}. */ + case E_V4SImode: + if (TARGET_SSE4_1) + gen = gen_sse4_1_blendvd; + break; + case E_V2DImode: + if (TARGET_SSE4_1) + gen = gen_sse4_1_blendvq; + break; case E_SFmode: if (TARGET_SSE4_1) { @@ -3731,8 +3741,6 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) break; case E_V16QImode: case E_V8HImode: - case E_V4SImode: - case E_V2DImode: if (TARGET_SSE4_1) { gen = gen_sse4_1_pblendvb; @@ -3743,6 +3751,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) cmp = gen_lowpart (V16QImode, cmp); } break; + case E_V8SImode: + if (TARGET_AVX) + gen = gen_avx_blendvd256; + break; + case E_V4DImode: + if (TARGET_AVX) + gen = gen_avx_blendvq256; + break; case E_V8SFmode: if (TARGET_AVX) gen = gen_avx_blendvps256; @@ -3753,8 +3769,6 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) break; case E_V32QImode: case E_V16HImode: - case E_V8SImode: - case E_V4DImode: if (TARGET_AVX2) { gen = gen_avx2_pblendvb; diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 743d8a25fe3..4a7ff768a32 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -17966,6 +17966,43 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) } break; + case IX86_BUILTIN_PBLENDVB128: + case IX86_BUILTIN_PBLENDVB256: + case IX86_BUILTIN_BLENDVPS: + case IX86_BUILTIN_BLENDVPD: + case IX86_BUILTIN_BLENDVPS256: + case IX86_BUILTIN_BLENDVPD256: + gcc_assert (n_args == 3); + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + arg2 = gimple_call_arg (stmt, 2); + if (gimple_call_lhs (stmt)) + { + 
location_t loc = gimple_location (stmt); + tree type = TREE_TYPE (arg2); + gimple_seq stmts = NULL; + if (VECTOR_FLOAT_TYPE_P (type)) + { + tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode + ? intSI_type_node : intDI_type_node; + type = get_same_sized_vectype (itype, type); + arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2); + } + tree zero_vec = build_zero_cst (type); + tree cmp_type = truth_type_for (type); + tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), + VEC_COND_EXPR, cmp, + arg1, arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + } + else + gsi_replace (gsi, gimple_build_nop (), false); + return true; + + case IX86_BUILTIN_PCMPEQB128: case IX86_BUILTIN_PCMPEQW128: case IX86_BUILTIN_PCMPEQD128: diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index d8479782e90..564f283a1a8 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -862,13 +862,30 @@ (define_expand "vcond<mode>v2sf" DONE; }) -(define_insn "mmx_blendvps" - [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,x") +;; NB: This expander should only be used if only all elements +;; of operands[3] are either const0_rtx or constm1_rtx. 
+(define_expand "mmx_blendvps" + [(set (match_operand:V2SF 0 "register_operand") (unspec:V2SF - [(match_operand:V2SF 1 "register_operand" "0,0,x") - (match_operand:V2SF 2 "register_operand" "Yr,*x,x") - (match_operand:V2SF 3 "register_operand" "Yz,Yz,x")] - UNSPEC_BLENDV))] + [(match_operand:V2SF 1 "register_operand") + (match_operand:V2SF 2 "register_operand") + (match_operand:V2SF 3 "register_operand")] + UNSPEC_BLENDV))] + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" +{ + operands[3] = gen_lowpart (V2SImode, operands[3]); + rtx tmp = gen_rtx_VEC_MERGE (V2SFmode, operands[2], + operands[1], operands[3]); + emit_move_insn (operands[0], tmp); + DONE; +}) + +(define_insn "*mmx_blendvps" + [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,x") + (vec_merge:V2SF + (match_operand:V2SF 2 "register_operand" "Yr,*x,x") + (match_operand:V2SF 1 "register_operand" "0,0,x") + (match_operand:V2SI 3 "register_operand" "Yz,Yz,x")))] "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "@ blendvps\t{%3, %2, %0|%0, %2, %3} @@ -1935,11 +1952,10 @@ (define_expand "vcond_mask_<mode><mmxintvecmodelower>" (define_insn "mmx_pblendvb" [(set (match_operand:V8QI 0 "register_operand" "=Yr,*x,x") - (unspec:V8QI - [(match_operand:V8QI 1 "register_operand" "0,0,x") - (match_operand:V8QI 2 "register_operand" "Yr,*x,x") - (match_operand:V8QI 3 "register_operand" "Yz,Yz,x")] - UNSPEC_BLENDV))] + (vec_merge:V8QI + (match_operand:V8QI 2 "register_operand" "Yr,*x,x") + (match_operand:V8QI 1 "register_operand" "0,0,x") + (match_operand:V8QI 3 "register_operand" "Yz,Yz,x")))] "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "@ pblendvb\t{%3, %2, %0|%0, %2, %3} diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a4503ddcb73..61fbf437f9f 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -547,6 +547,11 @@ (define_mode_iterator V48_AVX2 (V4SI "TARGET_AVX2") (V2DI "TARGET_AVX2") (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")]) +(define_mode_iterator V48_AVX + [V4SF V2DF V4SI V2DI + (V8SF "TARGET_AVX") 
(V4DF "TARGET_AVX") + (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")]) + (define_mode_iterator VI1_AVX512VLBW [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL") (V16QI "TARGET_AVX512VL")]) @@ -796,6 +801,14 @@ (define_mode_attr sseintvecmode (V32HI "V32HI") (V64QI "V64QI") (V32QI "V32QI") (V16QI "V16QI")]) +(define_mode_attr ssefloatvecmode + [(V16SF "V16SF") (V8DF "V8DF") + (V8SF "V8SF") (V4DF "V4DF") + (V4SF "V4SF") (V2DF "V2DF") + (V16SI "V16SF") (V8DI "V8DF") + (V8SI "V8SF") (V4DI "V4DF") + (V4SI "V4SF") (V2DI "V2DF")]) + (define_mode_attr sseintvecmode2 [(V8DF "XI") (V4DF "OI") (V2DF "TI") (V8SF "OI") (V4SF "TI")]) @@ -17637,26 +17650,50 @@ (define_insn "<sse4_1>_blend<ssemodesuffix><avxsizesuffix>" (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "<MODE>")]) -(define_insn "<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>" - [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") - (unspec:VF_128_256 - [(match_operand:VF_128_256 1 "register_operand" "0,0,x") - (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") - (match_operand:VF_128_256 3 "register_operand" "Yz,Yz,x")] +;; NB: This expander should only be used if only all elements +;; of operands[3] are either const0_rtx or constm1_rtx. 
+(define_expand "<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>" + [(set (match_operand:V48_AVX 0 "register_operand") + (unspec:V48_AVX + [(match_operand:V48_AVX 1 "register_operand") + (match_operand:V48_AVX 2 "vector_operand") + (match_operand:V48_AVX 3 "register_operand")] UNSPEC_BLENDV))] "TARGET_SSE4_1" +{ + if (FLOAT_MODE_P (<MODE>mode)) + operands[3] = gen_lowpart (<sseintvecmode>mode, operands[3]); + rtx tmp = gen_rtx_VEC_MERGE (<MODE>mode, operands[2], + operands[1], operands[3]); + emit_move_insn (operands[0], tmp); + DONE; +}) + +(define_mode_attr fblendvsuffix + [(V4SF "ps") (V2DF "pd") + (V8SF "ps") (V4DF "pd") + (V4SI "ps") (V2DI "pd") + (V8SI "ps") (V4DI "pd")]) + +(define_insn "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>" + [(set (match_operand:V48_AVX 0 "register_operand" "=Yr,*x,x") + (vec_merge:V48_AVX + (match_operand:V48_AVX 2 "vector_operand" "YrBm,*xBm,xm") + (match_operand:V48_AVX 1 "register_operand" "0,0,x") + (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x")))] + "TARGET_SSE4_1" "@ - blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} - blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} - vblendv<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + blendv<fblendvsuffix>\t{%3, %2, %0|%0, %2, %3} + blendv<fblendvsuffix>\t{%3, %2, %0|%0, %2, %3} + vblendv<fblendvsuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "length_immediate" "1") (set_attr "prefix_data16" "1,1,*") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,vex") - (set_attr "btver2_decode" "vector,vector,vector") - (set_attr "mode" "<MODE>")]) + (set_attr "btver2_decode" "vector,vector,vector") + (set_attr "mode" "<ssefloatvecmode>")]) ;; Also define scalar versions. These are used for conditional move. ;; Using subregs into vector modes causes register allocation lossage. 
@@ -17698,67 +17735,27 @@ (define_insn "sse4_1_blendv<ssemodesuffix>" ] (const_string "<ssevecmode>")))]) -(define_insn_and_split "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt" - [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") - (unspec:VF_128_256 - [(match_operand:VF_128_256 1 "register_operand" "0,0,x") - (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") - (lt:VF_128_256 - (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x") - (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))] - UNSPEC_BLENDV))] +(define_insn "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt" + [(set (match_operand:V48_AVX 0 "register_operand" "=Yr,*x,x") + (vec_merge:V48_AVX + (match_operand:V48_AVX 2 "vector_operand" "YrBm,*xBm,xm") + (match_operand:V48_AVX 1 "register_operand" "0,0,x") + (lt:<sseintvecmode> + (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x") + (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))))] "TARGET_SSE4_1" - "#" - "&& reload_completed" - [(set (match_dup 0) - (unspec:VF_128_256 - [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] - "operands[3] = gen_lowpart (<MODE>mode, operands[3]);" - [(set_attr "isa" "noavx,noavx,avx") - (set_attr "type" "ssemov") - (set_attr "length_immediate" "1") - (set_attr "prefix_data16" "1,1,*") - (set_attr "prefix_extra" "1") - (set_attr "prefix" "orig,orig,vex") - (set_attr "btver2_decode" "vector,vector,vector") - (set_attr "mode" "<MODE>")]) - -(define_mode_attr ssefltmodesuffix - [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")]) - -(define_mode_attr ssefltvecmode - [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")]) - -(define_insn_and_split "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint" - [(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x") - (unspec:<ssebytemode> - [(match_operand:<ssebytemode> 1 "register_operand" "0,0,x") - (match_operand:<ssebytemode> 2 "vector_operand" "YrBm,*xBm,xm") - (subreg:<ssebytemode> - 
(lt:VI48_AVX - (match_operand:VI48_AVX 3 "register_operand" "Yz,Yz,x") - (match_operand:VI48_AVX 4 "const0_operand" "C,C,C")) 0)] - UNSPEC_BLENDV))] - "TARGET_SSE4_1" - "#" - "&& reload_completed" - [(set (match_dup 0) - (unspec:<ssefltvecmode> - [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] -{ - operands[0] = gen_lowpart (<ssefltvecmode>mode, operands[0]); - operands[1] = gen_lowpart (<ssefltvecmode>mode, operands[1]); - operands[2] = gen_lowpart (<ssefltvecmode>mode, operands[2]); - operands[3] = gen_lowpart (<ssefltvecmode>mode, operands[3]); -} + "@ + blendv<fblendvsuffix>\t{%3, %2, %0|%0, %2, %3} + blendv<fblendvsuffix>\t{%3, %2, %0|%0, %2, %3} + vblendv<fblendvsuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "length_immediate" "1") (set_attr "prefix_data16" "1,1,*") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,vex") - (set_attr "btver2_decode" "vector,vector,vector") - (set_attr "mode" "<ssefltvecmode>")]) + (set_attr "btver2_decode" "vector,vector,vector") + (set_attr "mode" "<ssefloatvecmode>")]) (define_insn "<sse4_1>_dp<ssemodesuffix><avxsizesuffix>" [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") @@ -17837,14 +17834,30 @@ (define_insn "<sse4_1_avx2>_packusdw<mask_name>" (set_attr "prefix" "orig,orig,<mask_prefix>") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "<sse4_1_avx2>_pblendvb" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") +;; NB: This expander should only be used if only all elements +;; of operands[3] are either const0_rtx or constm1_rtx. 
+(define_expand "<sse4_1_avx2>_pblendvb" + [(set (match_operand:VI1_AVX2 0 "register_operand") (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x") - (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") - (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x")] + [(match_operand:VI1_AVX2 1 "register_operand") + (match_operand:VI1_AVX2 2 "vector_operand") + (match_operand:VI1_AVX2 3 "register_operand")] UNSPEC_BLENDV))] "TARGET_SSE4_1" +{ + rtx tmp = gen_rtx_VEC_MERGE (<MODE>mode, operands[2], + operands[1], operands[3]); + emit_move_insn (operands[0], tmp); + DONE; +}) + +(define_insn "*<sse4_1_avx2>_pblendvb" + [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") + (vec_merge:VI1_AVX2 + (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") + (match_operand:VI1_AVX2 1 "register_operand" "0,0,x") + (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x")))] + "TARGET_SSE4_1" "@ pblendvb\t{%3, %2, %0|%0, %2, %3} pblendvb\t{%3, %2, %0|%0, %2, %3} @@ -17857,50 +17870,19 @@ (define_insn "<sse4_1_avx2>_pblendvb" (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "<sseinsnmode>")]) -(define_split - [(set (match_operand:VI1_AVX2 0 "register_operand") - (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "vector_operand") - (match_operand:VI1_AVX2 2 "register_operand") - (not:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand"))] - UNSPEC_BLENDV))] - "TARGET_SSE4_1" - [(set (match_dup 0) - (unspec:VI1_AVX2 - [(match_dup 2) (match_dup 1) (match_dup 3)] - UNSPEC_BLENDV))]) - -(define_split - [(set (match_operand:VI1_AVX2 0 "register_operand") - (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "vector_operand") - (match_operand:VI1_AVX2 2 "register_operand") - (subreg:VI1_AVX2 (not (match_operand 3 "register_operand")) 0)] - UNSPEC_BLENDV))] - "TARGET_SSE4_1 - && GET_MODE_CLASS (GET_MODE (operands[3])) == MODE_VECTOR_INT - && GET_MODE_SIZE (GET_MODE (operands[3])) == <MODE_SIZE>" - [(set (match_dup 0) - (unspec:VI1_AVX2 - 
[(match_dup 2) (match_dup 1) (match_dup 4)] - UNSPEC_BLENDV))] - "operands[4] = gen_lowpart (<MODE>mode, operands[3]);") - -(define_insn_and_split "*<sse4_1_avx2>_pblendvb_lt" +(define_insn "*<sse4_1_avx2>_pblendvb_lt" [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") - (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x") - (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") - (lt:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x") - (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))] - UNSPEC_BLENDV))] + (vec_merge:VI1_AVX2 + (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") + (match_operand:VI1_AVX2 1 "register_operand" "0,0,x") + (lt:VI1_AVX2 + (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x") + (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))))] "TARGET_SSE4_1" - "#" - "" - [(set (match_dup 0) - (unspec:VI1_AVX2 - [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] - "" + "@ + pblendvb\t{%3, %2, %0|%0, %2, %3} + pblendvb\t{%3, %2, %0|%0, %2, %3} + vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") @@ -17909,6 +17891,27 @@ (define_insn_and_split "*<sse4_1_avx2>_pblendvb_lt" (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*<sse4_1_avx2>_pblendvb_not_lt" + [(set (match_operand:VI1_AVX2 0 "register_operand") + (vec_merge:VI1_AVX2 + (match_operand:VI1_AVX2 2 "register_operand") + (match_operand:VI1_AVX2 1 "vector_operand") + (lt:VI1_AVX2 + (subreg:VI1_AVX2 (not (match_operand 3 "register_operand")) 0) + (match_operand:VI1_AVX2 4 "const0_operand"))))] + "TARGET_SSE4_1 && ix86_pre_reload_split () + && GET_MODE_CLASS (GET_MODE (operands[3])) == MODE_VECTOR_INT + && GET_MODE_SIZE (GET_MODE (operands[3])) == GET_MODE_SIZE (<MODE>mode)" + "#" + "&& 1" + [(set (match_dup 0) + (vec_merge:VI1_AVX2 + (match_dup 1) + (match_dup 2) + (lt:VI1_AVX2 + 
(subreg:VI1_AVX2 (match_dup 3) 0) + (match_dup 4))))]) + (define_insn "sse4_1_pblendw" [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,x") (vec_merge:V8HI diff --git a/gcc/testsuite/gcc.target/i386/blendv-1.c b/gcc/testsuite/gcc.target/i386/blendv-1.c new file mode 100644 index 00000000000..fcbbfb9b446 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/blendv-1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx2 -O2 -mno-avx512f" } */ +/* { dg-final { scan-assembler-times {pblendvb[\t ]*%xmm} 1 } } */ +/* { dg-final { scan-assembler-times {pblendvb[\t ]*%ymm} 1 } } */ +/* { dg-final { scan-assembler-times {blendvps[\t ]*%xmm} 1 } } */ +/* { dg-final { scan-assembler-times {blendvps[\t ]*%ymm} 1 } } */ +/* { dg-final { scan-assembler-times {blendvpd[\t ]*%xmm} 1 } } */ +/* { dg-final { scan-assembler-times {blendvpd[\t ]*%ymm} 1 } } */ + +typedef float v4sf __attribute__ ((vector_size (16))); +typedef float v8sf __attribute__ ((vector_size (32))); +typedef double v2df __attribute__ ((vector_size (16))); +typedef double v4df __attribute__ ((vector_size (32))); +typedef char v16qi __attribute__ ((vector_size (16))); +typedef char v32qi __attribute__ ((vector_size (32))); + +v4sf +foo (v4sf a, v4sf b, v4sf c) +{ + return __builtin_ia32_blendvps (a, b, c); +} + +v8sf +foo2 (v8sf a, v8sf b, v8sf c) +{ + return __builtin_ia32_blendvps256 (a, b, c); +} + +v2df +foo3 (v2df a, v2df b, v2df c) +{ + return __builtin_ia32_blendvpd (a, b, c); +} + +v4df +foo4 (v4df a, v4df b, v4df c) +{ + return __builtin_ia32_blendvpd256 (a, b, c); +} + +v16qi +foo5 (v16qi a, v16qi b, v16qi c) +{ + return __builtin_ia32_pblendvb128 (a, b, c); +} + +v32qi +foo6 (v32qi a, v32qi b, v32qi c) +{ + return __builtin_ia32_pblendvb256 (a, b, c); +} diff --git a/gcc/testsuite/gcc.target/i386/blendv-2.c b/gcc/testsuite/gcc.target/i386/blendv-2.c new file mode 100644 index 00000000000..e61e0233411 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/blendv-2.c @@ -0,0 +1,41 @@ +/* { 
dg-do compile } */ +/* { dg-options "-mavx2 -O2 -mno-avx512f" } */ +/* { dg-final { scan-assembler-not {pblendv} } } */ +/* { dg-final { scan-assembler-not {blendvp} } } */ + +#include <x86intrin.h> +__m128 +foo (__m128 a, __m128 b) +{ + return _mm_blendv_ps (a, b, _mm_setzero_ps ()); +} + +__m256 +foo2 (__m256 a, __m256 b) +{ + return _mm256_blendv_ps (a, b, _mm256_set1_ps (-1.0)); +} + +__m128d +foo3 (__m128d a, __m128d b, __m128d c) +{ + return _mm_blendv_pd (a, b, _mm_set1_pd (1.0)); +} + +__m256d +foo4 (__m256d a, __m256d b, __m256d c) +{ + return _mm256_blendv_pd (a, b, _mm256_set1_pd (-134.3)); +} + +__m128i +foo5 (__m128i a, __m128i b, __m128i c) +{ + return _mm_blendv_epi8 (a, b, _mm_set1_epi8 (3)); +} + +__m256i +foo6 (__m256i a, __m256i b, __m256i c) +{ + return _mm256_blendv_epi8 (a, b, _mm256_set1_epi8 (-22)); +} diff --git a/gcc/testsuite/gcc.target/i386/funcspec-8.c b/gcc/testsuite/gcc.target/i386/funcspec-8.c index 0a6c709003a..f15541169e7 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-8.c +++ b/gcc/testsuite/gcc.target/i386/funcspec-8.c @@ -52,19 +52,19 @@ generic_psignd128 (__m128w a, __m128w b) #error "-msse4.1 should not be set for this test" #endif -__m128d sse4_1_blendvpd (__m128d a, __m128d b, __m128d c) __attribute__((__target__("sse4.1"))); -__m128d generic_blendvpd (__m128d a, __m128d b, __m128d c); +__m128 sse4_1_roundv4sf2 (__m128 a) __attribute__((__target__("sse4.1"))); +__m128 generic_roundv4sf2 (__m128 a); -__m128d -sse4_1_blendvpd (__m128d a, __m128d b, __m128d c) +__m128 +sse4_1_roundv4sf2 (__m128 a) { - return __builtin_ia32_blendvpd (a, b, c); + return __builtin_ia32_roundps_az (a); } -__m128d -generic_blendvpd (__m128d a, __m128d b, __m128d c) +__m128 +generic_blendvpd (__m128 a) { - return __builtin_ia32_blendvpd (a, b, c); /* { dg-error "needs isa option" } */ + return __builtin_ia32_roundps_az (a); /* { dg-error "needs isa option" } */ } #ifdef __SSE4_2__ -- 2.18.1