Hi: This patch folds __builtin_ia32_pblendvb128 (a, b, c) into VEC_COND_EXPR (c < 0, b, a); similarly for the float versions, but with the mask operand VIEW_CONVERT_EXPRed to a same-sized integer vectype.
After folding, blendv related patterns can be redefined as vec_merge since all elements of mask operand is either const0_rtx or constm1_rtx now. It could potentially enable more rtl optimizations. Besides, although there's no pblendv{d,q} instructions, backend can still define their patterns and generate blendv{ps,pd} instead. Bootstrap and regtested on x86_64-linux-gnu{-m32,}. Ok for trunk? gcc/ChangeLog: * config/i386/i386-builtin.def (IX86_BUILTIN_BLENDVPD256, IX86_BUILTIN_BLENDVPS256, IX86_BUILTIN_PBLENDVB256, IX86_BUILTIN_BLENDVPD, IX86_BUILTIN_BLENDVPS, IX86_BUILTIN_PBLENDVB128): Replace icode with CODE_FOR_nothing. * config/i386/i386-expand.c (ix86_expand_sse_movcc): Use gen_avx_blendvd256/gen_avx_blendvq256/gen_sse4_1_blendvd/gen_sse4_1_blendvq for V8SI/V4DI/V4SI/V2DImode. * config/i386/i386.c (ix86_gimple_fold_builtin): Fold blendv builtins. * config/i386/mmx.md (mmx_blendvps): Change to define_expand. (*mmx_blendvps): New pattern implemented as vec_merge. * config/i386/sse.md (<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>): Change to define_expand. (<sse4_1_avx2>_pblendvb): Ditto. (*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>): New pattern implemented as vec_merge. (*<sse4_1_avx2>_pblendvb): Ditto. (*<sse4_1_avx2>_pblendvb_lt): Redefined as define_insn with pattern implemented as vec_merge instead of UNSPEC_BLENDV. (*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt): Ditto, and extend mode to V48_AVX. (*<sse4_1_avx2>_pblendvb_not_lt): New. (*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint): Deleted. (*<sse4_1_avx2>_pblendvb_lt): Ditto. (*<sse4_1_avx2>_pblendvb_not_lt): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/funcspec-8.c: Replace __builtin_ia32_blendvpd with __builtin_ia32_roundps_az. * gcc.target/i386/blendv-1.c: New test. * gcc.target/i386/blendv-2.c: New test. -- BR, Hongtao
From f78d9f2595c315b6343adc4c3b79b6596c45c65b Mon Sep 17 00:00:00 2001 From: liuhongt <hongtao....@intel.com> Date: Fri, 21 May 2021 09:48:18 +0800 Subject: [PATCH 1/2] [i386] Fold blendv builtins into gimple. Fold __builtin_ia32_pblendvb128 (a, b, c) into VEC_COND_EXPR (c < 0, b, a); similarly for the float versions, but with the mask operand VIEW_CONVERT_EXPRed to a same-sized integer vectype. After folding, blendv related patterns can be redefined as vec_merge since all elements of the mask operand are either const0_rtx or constm1_rtx now. It could potentially enable more rtl optimizations. Besides, although there are no pblendv{d,q} instructions, the backend can still define their patterns and generate blendv{ps,pd} instead. gcc/ChangeLog: * config/i386/i386-builtin.def (IX86_BUILTIN_BLENDVPD256, IX86_BUILTIN_BLENDVPS256, IX86_BUILTIN_PBLENDVB256, IX86_BUILTIN_BLENDVPD, IX86_BUILTIN_BLENDVPS, IX86_BUILTIN_PBLENDVB128): Replace icode with CODE_FOR_nothing. * config/i386/i386-expand.c (ix86_expand_sse_movcc): Use gen_avx_blendvd256/gen_avx_blendvq256/gen_sse4_1_blendvd/gen_sse4_1_blendvq for V8SI/V4DI/V4SI/V2DImode. * config/i386/i386.c (ix86_gimple_fold_builtin): Fold blendv builtins. * config/i386/mmx.md (mmx_blendvps): Change to define_expand. (*mmx_blendvps): New pattern implemented as vec_merge. * config/i386/sse.md (<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>): Change to define_expand. (<sse4_1_avx2>_pblendvb): Ditto. (*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>): New pattern implemented as vec_merge. (*<sse4_1_avx2>_pblendvb): Ditto. (*<sse4_1_avx2>_pblendvb_lt): Redefined as define_insn with pattern implemented as vec_merge instead of UNSPEC_BLENDV. (*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt): Ditto, and extend mode to V48_AVX. (*<sse4_1_avx2>_pblendvb_not_lt): New. (*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint): Deleted. (*<sse4_1_avx2>_pblendvb_lt): Ditto. (*<sse4_1_avx2>_pblendvb_not_lt): Ditto. 
gcc/testsuite/ChangeLog: * gcc.target/i386/funcspec-8.c: Replace __builtin_ia32_blendvpd with __builtin_ia32_roundps_az. * gcc.target/i386/blendv-1.c: New test. * gcc.target/i386/blendv-2.c: New test. --- gcc/config/i386/i386-builtin.def | 12 +- gcc/config/i386/i386-expand.c | 22 +- gcc/config/i386/i386.c | 37 ++++ gcc/config/i386/mmx.md | 38 +++- gcc/config/i386/sse.md | 227 +++++++++++---------- gcc/testsuite/gcc.target/i386/blendv-1.c | 51 +++++ gcc/testsuite/gcc.target/i386/blendv-2.c | 41 ++++ gcc/testsuite/gcc.target/i386/funcspec-8.c | 16 +- 8 files changed, 303 insertions(+), 141 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/blendv-1.c create mode 100644 gcc/testsuite/gcc.target/i386/blendv-2.c diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 80c2a2c0294..0c1507317ae 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -902,13 +902,13 @@ BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_ssse3_palignrdi, /* SSE4.1 */ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF) +BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF) +BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dppd, 
"__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI) @@ -1028,8 +1028,8 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpe BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF) -BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF) +BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, 
UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF) +BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT) BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT) @@ -1154,7 +1154,7 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI) diff --git a/gcc/config/i386/i386-expand.c 
b/gcc/config/i386/i386-expand.c index 9f3d41955a2..dc155313c39 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3700,6 +3700,16 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) if (TARGET_SSE4_1) gen = gen_sse4_1_blendvpd; break; + /* Although x86 does not have pblendv{d,q} instructions, + backend can define their patterns and then generate pblendv{ps,pd}. */ + case E_V4SImode: + if (TARGET_SSE4_1) + gen = gen_sse4_1_blendvd; + break; + case E_V2DImode: + if (TARGET_SSE4_1) + gen = gen_sse4_1_blendvq; + break; case E_SFmode: if (TARGET_SSE4_1) { @@ -3731,8 +3741,6 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) break; case E_V16QImode: case E_V8HImode: - case E_V4SImode: - case E_V2DImode: if (TARGET_SSE4_1) { gen = gen_sse4_1_pblendvb; @@ -3743,6 +3751,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) cmp = gen_lowpart (V16QImode, cmp); } break; + case E_V8SImode: + if (TARGET_AVX) + gen = gen_avx_blendvd256; + break; + case E_V4DImode: + if (TARGET_AVX) + gen = gen_avx_blendvq256; + break; case E_V8SFmode: if (TARGET_AVX) gen = gen_avx_blendvps256; @@ -3753,8 +3769,6 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) break; case E_V32QImode: case E_V16HImode: - case E_V8SImode: - case E_V4DImode: if (TARGET_AVX2) { gen = gen_avx2_pblendvb; diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 743d8a25fe3..4a7ff768a32 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -17966,6 +17966,43 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) } break; + case IX86_BUILTIN_PBLENDVB128: + case IX86_BUILTIN_PBLENDVB256: + case IX86_BUILTIN_BLENDVPS: + case IX86_BUILTIN_BLENDVPD: + case IX86_BUILTIN_BLENDVPS256: + case IX86_BUILTIN_BLENDVPD256: + gcc_assert (n_args == 3); + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + arg2 = gimple_call_arg (stmt, 2); + if (gimple_call_lhs (stmt)) + { + 
location_t loc = gimple_location (stmt); + tree type = TREE_TYPE (arg2); + gimple_seq stmts = NULL; + if (VECTOR_FLOAT_TYPE_P (type)) + { + tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode + ? intSI_type_node : intDI_type_node; + type = get_same_sized_vectype (itype, type); + arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2); + } + tree zero_vec = build_zero_cst (type); + tree cmp_type = truth_type_for (type); + tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), + VEC_COND_EXPR, cmp, + arg1, arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + } + else + gsi_replace (gsi, gimple_build_nop (), false); + return true; + + case IX86_BUILTIN_PCMPEQB128: case IX86_BUILTIN_PCMPEQW128: case IX86_BUILTIN_PCMPEQD128: diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index d8479782e90..564f283a1a8 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -862,13 +862,30 @@ (define_expand "vcond<mode>v2sf" DONE; }) -(define_insn "mmx_blendvps" - [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,x") +;; NB: This expander should only be used if only all elements +;; of operands[3] are either const0_rtx or constm1_rtx. 
+(define_expand "mmx_blendvps" + [(set (match_operand:V2SF 0 "register_operand") (unspec:V2SF - [(match_operand:V2SF 1 "register_operand" "0,0,x") - (match_operand:V2SF 2 "register_operand" "Yr,*x,x") - (match_operand:V2SF 3 "register_operand" "Yz,Yz,x")] - UNSPEC_BLENDV))] + [(match_operand:V2SF 1 "register_operand") + (match_operand:V2SF 2 "register_operand") + (match_operand:V2SF 3 "register_operand")] + UNSPEC_BLENDV))] + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" +{ + operands[3] = gen_lowpart (V2SImode, operands[3]); + rtx tmp = gen_rtx_VEC_MERGE (V2SFmode, operands[2], + operands[1], operands[3]); + emit_move_insn (operands[0], tmp); + DONE; +}) + +(define_insn "*mmx_blendvps" + [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,x") + (vec_merge:V2SF + (match_operand:V2SF 2 "register_operand" "Yr,*x,x") + (match_operand:V2SF 1 "register_operand" "0,0,x") + (match_operand:V2SI 3 "register_operand" "Yz,Yz,x")))] "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "@ blendvps\t{%3, %2, %0|%0, %2, %3} @@ -1935,11 +1952,10 @@ (define_expand "vcond_mask_<mode><mmxintvecmodelower>" (define_insn "mmx_pblendvb" [(set (match_operand:V8QI 0 "register_operand" "=Yr,*x,x") - (unspec:V8QI - [(match_operand:V8QI 1 "register_operand" "0,0,x") - (match_operand:V8QI 2 "register_operand" "Yr,*x,x") - (match_operand:V8QI 3 "register_operand" "Yz,Yz,x")] - UNSPEC_BLENDV))] + (vec_merge:V8QI + (match_operand:V8QI 2 "register_operand" "Yr,*x,x") + (match_operand:V8QI 1 "register_operand" "0,0,x") + (match_operand:V8QI 3 "register_operand" "Yz,Yz,x")))] "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "@ pblendvb\t{%3, %2, %0|%0, %2, %3} diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a4503ddcb73..61fbf437f9f 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -547,6 +547,11 @@ (define_mode_iterator V48_AVX2 (V4SI "TARGET_AVX2") (V2DI "TARGET_AVX2") (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")]) +(define_mode_iterator V48_AVX + [V4SF V2DF V4SI V2DI + (V8SF "TARGET_AVX") 
(V4DF "TARGET_AVX") + (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")]) + (define_mode_iterator VI1_AVX512VLBW [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL") (V16QI "TARGET_AVX512VL")]) @@ -796,6 +801,14 @@ (define_mode_attr sseintvecmode (V32HI "V32HI") (V64QI "V64QI") (V32QI "V32QI") (V16QI "V16QI")]) +(define_mode_attr ssefloatvecmode + [(V16SF "V16SF") (V8DF "V8DF") + (V8SF "V8SF") (V4DF "V4DF") + (V4SF "V4SF") (V2DF "V2DF") + (V16SI "V16SF") (V8DI "V8DF") + (V8SI "V8SF") (V4DI "V4DF") + (V4SI "V4SF") (V2DI "V2DF")]) + (define_mode_attr sseintvecmode2 [(V8DF "XI") (V4DF "OI") (V2DF "TI") (V8SF "OI") (V4SF "TI")]) @@ -17637,26 +17650,50 @@ (define_insn "<sse4_1>_blend<ssemodesuffix><avxsizesuffix>" (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "<MODE>")]) -(define_insn "<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>" - [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") - (unspec:VF_128_256 - [(match_operand:VF_128_256 1 "register_operand" "0,0,x") - (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") - (match_operand:VF_128_256 3 "register_operand" "Yz,Yz,x")] +;; NB: This expander should only be used if only all elements +;; of operands[3] are either const0_rtx or constm1_rtx. 
+(define_expand "<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>" + [(set (match_operand:V48_AVX 0 "register_operand") + (unspec:V48_AVX + [(match_operand:V48_AVX 1 "register_operand") + (match_operand:V48_AVX 2 "vector_operand") + (match_operand:V48_AVX 3 "register_operand")] UNSPEC_BLENDV))] "TARGET_SSE4_1" +{ + if (FLOAT_MODE_P (<MODE>mode)) + operands[3] = gen_lowpart (<sseintvecmode>mode, operands[3]); + rtx tmp = gen_rtx_VEC_MERGE (<MODE>mode, operands[2], + operands[1], operands[3]); + emit_move_insn (operands[0], tmp); + DONE; +}) + +(define_mode_attr fblendvsuffix + [(V4SF "ps") (V2DF "pd") + (V8SF "ps") (V4DF "pd") + (V4SI "ps") (V2DI "pd") + (V8SI "ps") (V4DI "pd")]) + +(define_insn "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>" + [(set (match_operand:V48_AVX 0 "register_operand" "=Yr,*x,x") + (vec_merge:V48_AVX + (match_operand:V48_AVX 2 "vector_operand" "YrBm,*xBm,xm") + (match_operand:V48_AVX 1 "register_operand" "0,0,x") + (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x")))] + "TARGET_SSE4_1" "@ - blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} - blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3} - vblendv<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + blendv<fblendvsuffix>\t{%3, %2, %0|%0, %2, %3} + blendv<fblendvsuffix>\t{%3, %2, %0|%0, %2, %3} + vblendv<fblendvsuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "length_immediate" "1") (set_attr "prefix_data16" "1,1,*") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,vex") - (set_attr "btver2_decode" "vector,vector,vector") - (set_attr "mode" "<MODE>")]) + (set_attr "btver2_decode" "vector,vector,vector") + (set_attr "mode" "<ssefloatvecmode>")]) ;; Also define scalar versions. These are used for conditional move. ;; Using subregs into vector modes causes register allocation lossage. 
@@ -17698,67 +17735,27 @@ (define_insn "sse4_1_blendv<ssemodesuffix>" ] (const_string "<ssevecmode>")))]) -(define_insn_and_split "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt" - [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") - (unspec:VF_128_256 - [(match_operand:VF_128_256 1 "register_operand" "0,0,x") - (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") - (lt:VF_128_256 - (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x") - (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))] - UNSPEC_BLENDV))] +(define_insn "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt" + [(set (match_operand:V48_AVX 0 "register_operand" "=Yr,*x,x") + (vec_merge:V48_AVX + (match_operand:V48_AVX 2 "vector_operand" "YrBm,*xBm,xm") + (match_operand:V48_AVX 1 "register_operand" "0,0,x") + (lt:<sseintvecmode> + (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x") + (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))))] "TARGET_SSE4_1" - "#" - "&& reload_completed" - [(set (match_dup 0) - (unspec:VF_128_256 - [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] - "operands[3] = gen_lowpart (<MODE>mode, operands[3]);" - [(set_attr "isa" "noavx,noavx,avx") - (set_attr "type" "ssemov") - (set_attr "length_immediate" "1") - (set_attr "prefix_data16" "1,1,*") - (set_attr "prefix_extra" "1") - (set_attr "prefix" "orig,orig,vex") - (set_attr "btver2_decode" "vector,vector,vector") - (set_attr "mode" "<MODE>")]) - -(define_mode_attr ssefltmodesuffix - [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")]) - -(define_mode_attr ssefltvecmode - [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")]) - -(define_insn_and_split "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint" - [(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x") - (unspec:<ssebytemode> - [(match_operand:<ssebytemode> 1 "register_operand" "0,0,x") - (match_operand:<ssebytemode> 2 "vector_operand" "YrBm,*xBm,xm") - (subreg:<ssebytemode> - 
(lt:VI48_AVX - (match_operand:VI48_AVX 3 "register_operand" "Yz,Yz,x") - (match_operand:VI48_AVX 4 "const0_operand" "C,C,C")) 0)] - UNSPEC_BLENDV))] - "TARGET_SSE4_1" - "#" - "&& reload_completed" - [(set (match_dup 0) - (unspec:<ssefltvecmode> - [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] -{ - operands[0] = gen_lowpart (<ssefltvecmode>mode, operands[0]); - operands[1] = gen_lowpart (<ssefltvecmode>mode, operands[1]); - operands[2] = gen_lowpart (<ssefltvecmode>mode, operands[2]); - operands[3] = gen_lowpart (<ssefltvecmode>mode, operands[3]); -} + "@ + blendv<fblendvsuffix>\t{%3, %2, %0|%0, %2, %3} + blendv<fblendvsuffix>\t{%3, %2, %0|%0, %2, %3} + vblendv<fblendvsuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "length_immediate" "1") (set_attr "prefix_data16" "1,1,*") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,vex") - (set_attr "btver2_decode" "vector,vector,vector") - (set_attr "mode" "<ssefltvecmode>")]) + (set_attr "btver2_decode" "vector,vector,vector") + (set_attr "mode" "<ssefloatvecmode>")]) (define_insn "<sse4_1>_dp<ssemodesuffix><avxsizesuffix>" [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") @@ -17837,14 +17834,30 @@ (define_insn "<sse4_1_avx2>_packusdw<mask_name>" (set_attr "prefix" "orig,orig,<mask_prefix>") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "<sse4_1_avx2>_pblendvb" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") +;; NB: This expander should only be used if only all elements +;; of operands[3] are either const0_rtx or constm1_rtx. 
+(define_expand "<sse4_1_avx2>_pblendvb" + [(set (match_operand:VI1_AVX2 0 "register_operand") (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x") - (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") - (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x")] + [(match_operand:VI1_AVX2 1 "register_operand") + (match_operand:VI1_AVX2 2 "vector_operand") + (match_operand:VI1_AVX2 3 "register_operand")] UNSPEC_BLENDV))] "TARGET_SSE4_1" +{ + rtx tmp = gen_rtx_VEC_MERGE (<MODE>mode, operands[2], + operands[1], operands[3]); + emit_move_insn (operands[0], tmp); + DONE; +}) + +(define_insn "*<sse4_1_avx2>_pblendvb" + [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") + (vec_merge:VI1_AVX2 + (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") + (match_operand:VI1_AVX2 1 "register_operand" "0,0,x") + (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x")))] + "TARGET_SSE4_1" "@ pblendvb\t{%3, %2, %0|%0, %2, %3} pblendvb\t{%3, %2, %0|%0, %2, %3} @@ -17857,50 +17870,19 @@ (define_insn "<sse4_1_avx2>_pblendvb" (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "<sseinsnmode>")]) -(define_split - [(set (match_operand:VI1_AVX2 0 "register_operand") - (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "vector_operand") - (match_operand:VI1_AVX2 2 "register_operand") - (not:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand"))] - UNSPEC_BLENDV))] - "TARGET_SSE4_1" - [(set (match_dup 0) - (unspec:VI1_AVX2 - [(match_dup 2) (match_dup 1) (match_dup 3)] - UNSPEC_BLENDV))]) - -(define_split - [(set (match_operand:VI1_AVX2 0 "register_operand") - (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "vector_operand") - (match_operand:VI1_AVX2 2 "register_operand") - (subreg:VI1_AVX2 (not (match_operand 3 "register_operand")) 0)] - UNSPEC_BLENDV))] - "TARGET_SSE4_1 - && GET_MODE_CLASS (GET_MODE (operands[3])) == MODE_VECTOR_INT - && GET_MODE_SIZE (GET_MODE (operands[3])) == <MODE_SIZE>" - [(set (match_dup 0) - (unspec:VI1_AVX2 - 
[(match_dup 2) (match_dup 1) (match_dup 4)] - UNSPEC_BLENDV))] - "operands[4] = gen_lowpart (<MODE>mode, operands[3]);") - -(define_insn_and_split "*<sse4_1_avx2>_pblendvb_lt" +(define_insn "*<sse4_1_avx2>_pblendvb_lt" [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") - (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x") - (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") - (lt:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x") - (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))] - UNSPEC_BLENDV))] + (vec_merge:VI1_AVX2 + (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") + (match_operand:VI1_AVX2 1 "register_operand" "0,0,x") + (lt:VI1_AVX2 + (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x") + (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))))] "TARGET_SSE4_1" - "#" - "" - [(set (match_dup 0) - (unspec:VI1_AVX2 - [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] - "" + "@ + pblendvb\t{%3, %2, %0|%0, %2, %3} + pblendvb\t{%3, %2, %0|%0, %2, %3} + vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") @@ -17909,6 +17891,27 @@ (define_insn_and_split "*<sse4_1_avx2>_pblendvb_lt" (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*<sse4_1_avx2>_pblendvb_not_lt" + [(set (match_operand:VI1_AVX2 0 "register_operand") + (vec_merge:VI1_AVX2 + (match_operand:VI1_AVX2 2 "register_operand") + (match_operand:VI1_AVX2 1 "vector_operand") + (lt:VI1_AVX2 + (subreg:VI1_AVX2 (not (match_operand 3 "register_operand")) 0) + (match_operand:VI1_AVX2 4 "const0_operand"))))] + "TARGET_SSE4_1 && ix86_pre_reload_split () + && GET_MODE_CLASS (GET_MODE (operands[3])) == MODE_VECTOR_INT + && GET_MODE_SIZE (GET_MODE (operands[3])) == GET_MODE_SIZE (<MODE>mode)" + "#" + "&& 1" + [(set (match_dup 0) + (vec_merge:VI1_AVX2 + (match_dup 1) + (match_dup 2) + (lt:VI1_AVX2 + 
(subreg:VI1_AVX2 (match_dup 3) 0) + (match_dup 4))))]) + (define_insn "sse4_1_pblendw" [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,x") (vec_merge:V8HI diff --git a/gcc/testsuite/gcc.target/i386/blendv-1.c b/gcc/testsuite/gcc.target/i386/blendv-1.c new file mode 100644 index 00000000000..fcbbfb9b446 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/blendv-1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx2 -O2 -mno-avx512f" } */ +/* { dg-final { scan-assembler-times {pblendvb[\t ]*%xmm} 1 } } */ +/* { dg-final { scan-assembler-times {pblendvb[\t ]*%ymm} 1 } } */ +/* { dg-final { scan-assembler-times {blendvps[\t ]*%xmm} 1 } } */ +/* { dg-final { scan-assembler-times {blendvps[\t ]*%ymm} 1 } } */ +/* { dg-final { scan-assembler-times {blendvpd[\t ]*%xmm} 1 } } */ +/* { dg-final { scan-assembler-times {blendvpd[\t ]*%ymm} 1 } } */ + +typedef float v4sf __attribute__ ((vector_size (16))); +typedef float v8sf __attribute__ ((vector_size (32))); +typedef double v2df __attribute__ ((vector_size (16))); +typedef double v4df __attribute__ ((vector_size (32))); +typedef char v16qi __attribute__ ((vector_size (16))); +typedef char v32qi __attribute__ ((vector_size (32))); + +v4sf +foo (v4sf a, v4sf b, v4sf c) +{ + return __builtin_ia32_blendvps (a, b, c); +} + +v8sf +foo2 (v8sf a, v8sf b, v8sf c) +{ + return __builtin_ia32_blendvps256 (a, b, c); +} + +v2df +foo3 (v2df a, v2df b, v2df c) +{ + return __builtin_ia32_blendvpd (a, b, c); +} + +v4df +foo4 (v4df a, v4df b, v4df c) +{ + return __builtin_ia32_blendvpd256 (a, b, c); +} + +v16qi +foo5 (v16qi a, v16qi b, v16qi c) +{ + return __builtin_ia32_pblendvb128 (a, b, c); +} + +v32qi +foo6 (v32qi a, v32qi b, v32qi c) +{ + return __builtin_ia32_pblendvb256 (a, b, c); +} diff --git a/gcc/testsuite/gcc.target/i386/blendv-2.c b/gcc/testsuite/gcc.target/i386/blendv-2.c new file mode 100644 index 00000000000..e61e0233411 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/blendv-2.c @@ -0,0 +1,41 @@ +/* { 
dg-do compile } */ +/* { dg-options "-mavx2 -O2 -mno-avx512f" } */ +/* { dg-final { scan-assembler-not {pblendv} } } */ +/* { dg-final { scan-assembler-not {blendvp} } } */ + +#include <x86intrin.h> +__m128 +foo (__m128 a, __m128 b) +{ + return _mm_blendv_ps (a, b, _mm_setzero_ps ()); +} + +__m256 +foo2 (__m256 a, __m256 b) +{ + return _mm256_blendv_ps (a, b, _mm256_set1_ps (-1.0)); +} + +__m128d +foo3 (__m128d a, __m128d b, __m128d c) +{ + return _mm_blendv_pd (a, b, _mm_set1_pd (1.0)); +} + +__m256d +foo4 (__m256d a, __m256d b, __m256d c) +{ + return _mm256_blendv_pd (a, b, _mm256_set1_pd (-134.3)); +} + +__m128i +foo5 (__m128i a, __m128i b, __m128i c) +{ + return _mm_blendv_epi8 (a, b, _mm_set1_epi8 (3)); +} + +__m256i +foo6 (__m256i a, __m256i b, __m256i c) +{ + return _mm256_blendv_epi8 (a, b, _mm256_set1_epi8 (-22)); +} diff --git a/gcc/testsuite/gcc.target/i386/funcspec-8.c b/gcc/testsuite/gcc.target/i386/funcspec-8.c index 0a6c709003a..f15541169e7 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-8.c +++ b/gcc/testsuite/gcc.target/i386/funcspec-8.c @@ -52,19 +52,19 @@ generic_psignd128 (__m128w a, __m128w b) #error "-msse4.1 should not be set for this test" #endif -__m128d sse4_1_blendvpd (__m128d a, __m128d b, __m128d c) __attribute__((__target__("sse4.1"))); -__m128d generic_blendvpd (__m128d a, __m128d b, __m128d c); +__m128 sse4_1_roundv4sf2 (__m128 a) __attribute__((__target__("sse4.1"))); +__m128 generic_roundv4sf2 (__m128 a); -__m128d -sse4_1_blendvpd (__m128d a, __m128d b, __m128d c) +__m128 +sse4_1_roundv4sf2 (__m128 a) { - return __builtin_ia32_blendvpd (a, b, c); + return __builtin_ia32_roundps_az (a); } -__m128d -generic_blendvpd (__m128d a, __m128d b, __m128d c) +__m128 +generic_blendvpd (__m128 a) { - return __builtin_ia32_blendvpd (a, b, c); /* { dg-error "needs isa option" } */ + return __builtin_ia32_roundps_az (a); /* { dg-error "needs isa option" } */ } #ifdef __SSE4_2__ -- 2.18.1