Hi All,

This patch series adds support for a target to do a direct conversion for zero
extends using permutes.

To do this it uses a target hook use_permute_for_promotion which must be
implemented by targets.  This hook is used to indicate:

 1. can a target do this for the given modes.
 2. is it profitable for the target to do it.
 3. can the target convert between various vector modes with a VIEW_CONVERT.

Using permutations has a big benefit for multi-step zero extensions because they
both reduce the number of needed instructions and increase throughput as
the dependency chain is removed.

Concretely on AArch64 this changes:

void test4(unsigned char *x, long long *y, int n) {
    for(int i = 0; i < n; i++) {
        y[i] = x[i];
    }
}

from generating:

.L4:
        ldr     q30, [x4], 16
        add     x3, x3, 128
        zip1    v1.16b, v30.16b, v31.16b
        zip2    v30.16b, v30.16b, v31.16b
        zip1    v2.8h, v1.8h, v31.8h
        zip1    v0.8h, v30.8h, v31.8h
        zip2    v1.8h, v1.8h, v31.8h
        zip2    v30.8h, v30.8h, v31.8h
        zip1    v26.4s, v2.4s, v31.4s
        zip1    v29.4s, v0.4s, v31.4s
        zip1    v28.4s, v1.4s, v31.4s
        zip1    v27.4s, v30.4s, v31.4s
        zip2    v2.4s, v2.4s, v31.4s
        zip2    v0.4s, v0.4s, v31.4s
        zip2    v1.4s, v1.4s, v31.4s
        zip2    v30.4s, v30.4s, v31.4s
        stp     q26, q2, [x3, -128]
        stp     q28, q1, [x3, -96]
        stp     q29, q0, [x3, -64]
        stp     q27, q30, [x3, -32]
        cmp     x4, x5
        bne     .L4

and instead we get:

.L4:
        add     x3, x3, 128
        ldr     q23, [x4], 16
        tbl     v5.16b, {v23.16b}, v31.16b
        tbl     v4.16b, {v23.16b}, v30.16b
        tbl     v3.16b, {v23.16b}, v29.16b
        tbl     v2.16b, {v23.16b}, v28.16b
        tbl     v1.16b, {v23.16b}, v27.16b
        tbl     v0.16b, {v23.16b}, v26.16b
        tbl     v22.16b, {v23.16b}, v25.16b
        tbl     v23.16b, {v23.16b}, v24.16b
        stp     q5, q4, [x3, -128]
        stp     q3, q2, [x3, -96]
        stp     q1, q0, [x3, -64]
        stp     q22, q23, [x3, -32]
        cmp     x4, x5
        bne     .L4

Tests are added in the AArch64 patch introducing the hook.  The testsuite also
already had about 800 runtime tests that get affected by this.

Bootstrapped Regtested on aarch64-none-linux-gnu, arm-none-linux-gnueabihf,
x86_64-pc-linux-gnu -m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * target.def (use_permute_for_promotion): New.
        * doc/tm.texi.in: Document it.
        * doc/tm.texi: Regenerate.
        * targhooks.cc (default_use_permute_for_promotion): New.
        * targhooks.h (default_use_permute_for_promotion): New.
        * tree-vect-stmts.cc (vectorizable_conversion): Support direct
        conversion with permute.
        (vect_create_vectorized_promotion_stmts): Likewise.
        (supportable_widening_operation): Likewise.
        (vect_gen_perm_mask_any): Allow vector permutes where input registers
        are half the width of the result per the GCC 14 relaxation of
        VEC_PERM_EXPR.

---
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 
4deb3d2c283a2964972b94f434370a6f57ea816a..e8192590ac14005bf7cb5f731c16ee7eacb78143
 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6480,6 +6480,15 @@ type @code{internal_fn}) should be considered expensive 
when the mask is
 all zeros.  GCC can then try to branch around the instruction instead.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_USE_PERMUTE_FOR_PROMOTION 
(const_tree @var{in_type}, const_tree @var{out_type})
+This hook returns true if the operation promoting @var{in_type} to
+@var{out_type} should be done as a vector permute.  If @var{out_type} is
+a signed type the operation will be done as the related unsigned type and
+converted to @var{out_type}.  If the target supports the needed permute,
+is able to convert unsigned(@var{out_type}) to @var{out_type}, and it is
+beneficial to do so, the hook should return true, otherwise false.
+@end deftypefn
+
 @deftypefn {Target Hook} {class vector_costs *} TARGET_VECTORIZE_CREATE_COSTS 
(vec_info *@var{vinfo}, bool @var{costing_for_scalar})
 This hook should initialize target-specific data structures in preparation
 for modeling the costs of vectorizing a loop or basic block.  The default
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 
9f147ccb95cc6d4e79cdf5b265666ad502492145..c007bc707372dd374e8effc52d29b76f5bc283a1
 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4303,6 +4303,8 @@ address;  but often a machine-dependent strategy can 
generate better code.
 
 @hook TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
 
+@hook TARGET_VECTORIZE_USE_PERMUTE_FOR_PROMOTION
+
 @hook TARGET_VECTORIZE_CREATE_COSTS
 
 @hook TARGET_VECTORIZE_BUILTIN_GATHER
diff --git a/gcc/target.def b/gcc/target.def
index 
b31550108883c5c3f5ffc7e46a1e8a7b839ebe83..58545d5ef4248da5850edec8f4db9f2636973598
 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2056,6 +2056,20 @@ all zeros.  GCC can then try to branch around the 
instruction instead.",
  (unsigned ifn),
  default_empty_mask_is_expensive)
 
+/* Function to say whether a target supports and prefers to use permutes for
+   zero extensions or truncates.  */
+DEFHOOK
+(use_permute_for_promotion,
+ "This hook returns true if the operation promoting @var{in_type} to\n\
+@var{out_type} should be done as a vector permute.  If @var{out_type} is\n\
+a signed type the operation will be done as the related unsigned type and\n\
+converted to @var{out_type}.  If the target supports the needed permute,\n\
+is able to convert unsigned(@var{out_type}) to @var{out_type}, and it is\n\
+beneficial to do so, the hook should return true, otherwise false.",
+ bool,
+ (const_tree in_type, const_tree out_type),
+ default_use_permute_for_promotion)
+
 /* Target builtin that implements vector gather operation.  */
 DEFHOOK
 (builtin_gather,
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 
2704d6008f14d2aa65671f002af886d3b802effa..723f8f4fda7808b6899f10f8b3fafad74d3c536f
 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -124,6 +124,7 @@ extern opt_machine_mode default_vectorize_related_mode 
(machine_mode,
 extern opt_machine_mode default_get_mask_mode (machine_mode);
 extern bool default_empty_mask_is_expensive (unsigned);
 extern bool default_conditional_operation_is_expensive (unsigned);
+extern bool default_use_permute_for_promotion (const_tree, const_tree);
 extern vector_costs *default_vectorize_create_costs (vec_info *, bool);
 
 /* OpenACC hooks.  */
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index 
dc040df9fcd1182b62d83088ee7fb3a248c99f51..a487eab794fe9f1089ecb58fdfc881fdb19d28f3
 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1615,6 +1615,14 @@ default_conditional_operation_is_expensive (unsigned ifn)
   return ifn == IFN_MASK_STORE;
 }
 
+/* By default no targets prefer permutes over multi step extension.  */
+
+bool
+default_use_permute_for_promotion (const_tree, const_tree)
+{
+  return false;
+}
+
 /* By default consider masked stores to be expensive.  */
 
 bool
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
4f6905f15417f90c6f36e1711a7a25071f0f507c..f2939655e4ec34111baa8894eaf769d29b1c5b82
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5129,6 +5129,111 @@ vect_create_vectorized_promotion_stmts (vec_info *vinfo,
   gimple *new_stmt1, *new_stmt2;
   vec<tree> vec_tmp = vNULL;
 
+  /* If we're using a VEC_PERM_EXPR then we're widening to the final type in
+     one go.  */
+  if (ch1 == VEC_PERM_EXPR
+      && op_type == unary_op)
+    {
+      vec_tmp.create (vec_oprnds0->length () * 2);
+      bool failed_p = false;
+
+      /* Extending with a vec-perm requires 2 instructions per step.  */
+      FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
+       {
+         tree vectype_in = TREE_TYPE (vop0);
+         tree vectype_out = TREE_TYPE (vec_dest);
+         machine_mode mode_in = TYPE_MODE (vectype_in);
+         machine_mode mode_out = TYPE_MODE (vectype_out);
+         unsigned bitsize_in = element_precision (vectype_in);
+         unsigned tot_in, tot_out;
+         unsigned HOST_WIDE_INT count;
+
+         /* We can't really support VLA here as the indexes depend on the VL.
+            VLA should really use widening instructions like widening
+            loads.  */
+         if (!GET_MODE_BITSIZE (mode_in).is_constant (&tot_in)
+             || !GET_MODE_BITSIZE (mode_out).is_constant (&tot_out)
+             || !TYPE_VECTOR_SUBPARTS (vectype_in).is_constant (&count)
+             || !TYPE_UNSIGNED (vectype_in)
+             || !targetm.vectorize.use_permute_for_promotion (vectype_in,
+                                                              vectype_out))
+           {
+             failed_p = true;
+             break;
+           }
+
+         unsigned steps = tot_out / bitsize_in;
+         tree zero = build_zero_cst (vectype_in);
+
+         unsigned chunk_size
+           = exact_div (TYPE_VECTOR_SUBPARTS (vectype_in),
+                        TYPE_VECTOR_SUBPARTS (vectype_out)).to_constant ();
+         unsigned step_size = chunk_size * (tot_out / tot_in);
+         unsigned nunits = tot_out / bitsize_in;
+
+         vec_perm_builder sel (steps, 1, 1);
+         sel.quick_grow (steps);
+
+         /* Flood fill with the out of range value first.  */
+         for (unsigned long i = 0; i < steps; ++i)
+           sel[i] = count;
+
+         tree var;
+         tree elem_in = TREE_TYPE (vectype_in);
+         machine_mode elem_mode_in = TYPE_MODE (elem_in);
+         unsigned long idx = 0;
+         tree vc_in = get_related_vectype_for_scalar_type (elem_mode_in,
+                                                           elem_in, nunits);
+
+         for (unsigned long j = 0; j < chunk_size; j++)
+           {
+             if (WORDS_BIG_ENDIAN)
+               for (int i = steps - 1; i >= 0; i -= step_size, idx++)
+                 sel[i] = idx;
+             else
+               for (int i = 0; i < (int)steps; i += step_size, idx++)
+                 sel[i] = idx;
+
+             vec_perm_indices indices (sel, 2, steps);
+
+             tree perm_mask = vect_gen_perm_mask_checked (vc_in, indices);
+             auto vec_oprnd = make_ssa_name (vc_in);
+             auto new_stmt = gimple_build_assign (vec_oprnd, VEC_PERM_EXPR,
+                                                  vop0, zero, perm_mask);
+             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+
+             tree intvect_out = unsigned_type_for (vectype_out);
+             var = make_ssa_name (intvect_out);
+             new_stmt = gimple_build_assign (var, build1 (VIEW_CONVERT_EXPR,
+                                                          intvect_out,
+                                                          vec_oprnd));
+             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+
+             gcc_assert (ch2.is_tree_code ());
+
+             var = make_ssa_name (vectype_out);
+             if (ch2 == VIEW_CONVERT_EXPR)
+                 new_stmt = gimple_build_assign (var,
+                                                 build1 (VIEW_CONVERT_EXPR,
+                                                         vectype_out,
+                                                         vec_oprnd));
+             else
+                 new_stmt = gimple_build_assign (var, (tree_code)ch2,
+                                                 vec_oprnd);
+
+             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+             vec_tmp.safe_push (var);
+           }
+       }
+
+      if (!failed_p)
+       {
+         vec_oprnds0->release ();
+         *vec_oprnds0 = vec_tmp;
+         return;
+       }
+    }
+
   vec_tmp.create (vec_oprnds0->length () * 2);
   FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
     {
@@ -5495,6 +5600,20 @@ vectorizable_conversion (vec_info *vinfo,
          || GET_MODE_SIZE (lhs_mode) <= GET_MODE_SIZE (rhs_mode))
        goto unsupported;
 
+      /* Check to see if the target can use a permute to perform the zero
+        extension.  */
+      intermediate_type = unsigned_type_for (vectype_out);
+      if (TYPE_UNSIGNED (vectype_in)
+         && VECTOR_TYPE_P (intermediate_type)
+         && TYPE_VECTOR_SUBPARTS (intermediate_type).is_constant ()
+         && targetm.vectorize.use_permute_for_promotion (vectype_in,
+                                                         intermediate_type))
+       {
+         code1 = VEC_PERM_EXPR;
+         code2 = FLOAT_EXPR;
+         break;
+       }
+
       fltsz = GET_MODE_SIZE (lhs_mode);
       FOR_EACH_2XWIDER_MODE (rhs_mode_iter, rhs_mode)
        {
@@ -9804,7 +9923,8 @@ vect_gen_perm_mask_any (tree vectype, const 
vec_perm_indices &sel)
   tree mask_type;
 
   poly_uint64 nunits = sel.length ();
-  gcc_assert (known_eq (nunits, TYPE_VECTOR_SUBPARTS (vectype)));
+  gcc_assert (known_eq (nunits, TYPE_VECTOR_SUBPARTS (vectype))
+             || known_eq (nunits, TYPE_VECTOR_SUBPARTS (vectype) * 2));
 
   mask_type = build_vector_type (ssizetype, nunits);
   return vec_perm_indices_to_tree (mask_type, sel);
@@ -14397,8 +14517,20 @@ supportable_widening_operation (vec_info *vinfo,
       break;
 
     CASE_CONVERT:
-      c1 = VEC_UNPACK_LO_EXPR;
-      c2 = VEC_UNPACK_HI_EXPR;
+      {
+       tree cvt_type = unsigned_type_for (vectype_out);
+       if (TYPE_UNSIGNED (vectype_in)
+         && VECTOR_TYPE_P (cvt_type)
+         && TYPE_VECTOR_SUBPARTS (cvt_type).is_constant ()
+         && targetm.vectorize.use_permute_for_promotion (vectype_in, cvt_type))
+         {
+           *code1 = VEC_PERM_EXPR;
+           *code2 = VIEW_CONVERT_EXPR;
+           return true;
+         }
+       c1 = VEC_UNPACK_LO_EXPR;
+       c2 = VEC_UNPACK_HI_EXPR;
+      }
       break;
 
     case FLOAT_EXPR:




-- 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 4deb3d2c283a2964972b94f434370a6f57ea816a..e8192590ac14005bf7cb5f731c16ee7eacb78143 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6480,6 +6480,15 @@ type @code{internal_fn}) should be considered expensive when the mask is
 all zeros.  GCC can then try to branch around the instruction instead.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_USE_PERMUTE_FOR_PROMOTION (const_tree @var{in_type}, const_tree @var{out_type})
+This hook returns true if the operation promoting @var{in_type} to
+@var{out_type} should be done as a vector permute.  If @var{out_type} is
+a signed type the operation will be done as the related unsigned type and
+converted to @var{out_type}.  If the target supports the needed permute,
+is able to convert unsigned(@var{out_type}) to @var{out_type}, and it is
+beneficial to do so, the hook should return true, otherwise false.
+@end deftypefn
+
 @deftypefn {Target Hook} {class vector_costs *} TARGET_VECTORIZE_CREATE_COSTS (vec_info *@var{vinfo}, bool @var{costing_for_scalar})
 This hook should initialize target-specific data structures in preparation
 for modeling the costs of vectorizing a loop or basic block.  The default
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 9f147ccb95cc6d4e79cdf5b265666ad502492145..c007bc707372dd374e8effc52d29b76f5bc283a1 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4303,6 +4303,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
 
+@hook TARGET_VECTORIZE_USE_PERMUTE_FOR_PROMOTION
+
 @hook TARGET_VECTORIZE_CREATE_COSTS
 
 @hook TARGET_VECTORIZE_BUILTIN_GATHER
diff --git a/gcc/target.def b/gcc/target.def
index b31550108883c5c3f5ffc7e46a1e8a7b839ebe83..58545d5ef4248da5850edec8f4db9f2636973598 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2056,6 +2056,20 @@ all zeros.  GCC can then try to branch around the instruction instead.",
  (unsigned ifn),
  default_empty_mask_is_expensive)
 
+/* Function to say whether a target supports and prefers to use permutes for
+   zero extensions or truncates.  */
+DEFHOOK
+(use_permute_for_promotion,
+ "This hook returns true if the operation promoting @var{in_type} to\n\
+@var{out_type} should be done as a vector permute.  If @var{out_type} is\n\
+a signed type the operation will be done as the related unsigned type and\n\
+converted to @var{out_type}.  If the target supports the needed permute,\n\
+is able to convert unsigned(@var{out_type}) to @var{out_type}, and it is\n\
+beneficial to do so, the hook should return true, otherwise false.",
+ bool,
+ (const_tree in_type, const_tree out_type),
+ default_use_permute_for_promotion)
+
 /* Target builtin that implements vector gather operation.  */
 DEFHOOK
 (builtin_gather,
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 2704d6008f14d2aa65671f002af886d3b802effa..723f8f4fda7808b6899f10f8b3fafad74d3c536f 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -124,6 +124,7 @@ extern opt_machine_mode default_vectorize_related_mode (machine_mode,
 extern opt_machine_mode default_get_mask_mode (machine_mode);
 extern bool default_empty_mask_is_expensive (unsigned);
 extern bool default_conditional_operation_is_expensive (unsigned);
+extern bool default_use_permute_for_promotion (const_tree, const_tree);
 extern vector_costs *default_vectorize_create_costs (vec_info *, bool);
 
 /* OpenACC hooks.  */
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index dc040df9fcd1182b62d83088ee7fb3a248c99f51..a487eab794fe9f1089ecb58fdfc881fdb19d28f3 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1615,6 +1615,14 @@ default_conditional_operation_is_expensive (unsigned ifn)
   return ifn == IFN_MASK_STORE;
 }
 
+/* By default no targets prefer permutes over multi step extension.  */
+
+bool
+default_use_permute_for_promotion (const_tree, const_tree)
+{
+  return false;
+}
+
 /* By default consider masked stores to be expensive.  */
 
 bool
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 4f6905f15417f90c6f36e1711a7a25071f0f507c..f2939655e4ec34111baa8894eaf769d29b1c5b82 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5129,6 +5129,111 @@ vect_create_vectorized_promotion_stmts (vec_info *vinfo,
   gimple *new_stmt1, *new_stmt2;
   vec<tree> vec_tmp = vNULL;
 
+  /* If we're using a VEC_PERM_EXPR then we're widening to the final type in
+     one go.  */
+  if (ch1 == VEC_PERM_EXPR
+      && op_type == unary_op)
+    {
+      vec_tmp.create (vec_oprnds0->length () * 2);
+      bool failed_p = false;
+
+      /* Extending with a vec-perm requires 2 instructions per step.  */
+      FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
+	{
+	  tree vectype_in = TREE_TYPE (vop0);
+	  tree vectype_out = TREE_TYPE (vec_dest);
+	  machine_mode mode_in = TYPE_MODE (vectype_in);
+	  machine_mode mode_out = TYPE_MODE (vectype_out);
+	  unsigned bitsize_in = element_precision (vectype_in);
+	  unsigned tot_in, tot_out;
+	  unsigned HOST_WIDE_INT count;
+
+	  /* We can't really support VLA here as the indexes depend on the VL.
+	     VLA should really use widening instructions like widening
+	     loads.  */
+	  if (!GET_MODE_BITSIZE (mode_in).is_constant (&tot_in)
+	      || !GET_MODE_BITSIZE (mode_out).is_constant (&tot_out)
+	      || !TYPE_VECTOR_SUBPARTS (vectype_in).is_constant (&count)
+	      || !TYPE_UNSIGNED (vectype_in)
+	      || !targetm.vectorize.use_permute_for_promotion (vectype_in,
+							       vectype_out))
+	    {
+	      failed_p = true;
+	      break;
+	    }
+
+	  unsigned steps = tot_out / bitsize_in;
+	  tree zero = build_zero_cst (vectype_in);
+
+	  unsigned chunk_size
+	    = exact_div (TYPE_VECTOR_SUBPARTS (vectype_in),
+			 TYPE_VECTOR_SUBPARTS (vectype_out)).to_constant ();
+	  unsigned step_size = chunk_size * (tot_out / tot_in);
+	  unsigned nunits = tot_out / bitsize_in;
+
+	  vec_perm_builder sel (steps, 1, 1);
+	  sel.quick_grow (steps);
+
+	  /* Flood fill with the out of range value first.  */
+	  for (unsigned long i = 0; i < steps; ++i)
+	    sel[i] = count;
+
+	  tree var;
+	  tree elem_in = TREE_TYPE (vectype_in);
+	  machine_mode elem_mode_in = TYPE_MODE (elem_in);
+	  unsigned long idx = 0;
+	  tree vc_in = get_related_vectype_for_scalar_type (elem_mode_in,
+							    elem_in, nunits);
+
+	  for (unsigned long j = 0; j < chunk_size; j++)
+	    {
+	      if (WORDS_BIG_ENDIAN)
+		for (int i = steps - 1; i >= 0; i -= step_size, idx++)
+		  sel[i] = idx;
+	      else
+		for (int i = 0; i < (int)steps; i += step_size, idx++)
+		  sel[i] = idx;
+
+	      vec_perm_indices indices (sel, 2, steps);
+
+	      tree perm_mask = vect_gen_perm_mask_checked (vc_in, indices);
+	      auto vec_oprnd = make_ssa_name (vc_in);
+	      auto new_stmt = gimple_build_assign (vec_oprnd, VEC_PERM_EXPR,
+						   vop0, zero, perm_mask);
+	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+
+	      tree intvect_out = unsigned_type_for (vectype_out);
+	      var = make_ssa_name (intvect_out);
+	      new_stmt = gimple_build_assign (var, build1 (VIEW_CONVERT_EXPR,
+							   intvect_out,
+							   vec_oprnd));
+	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+
+	      gcc_assert (ch2.is_tree_code ());
+
+	      var = make_ssa_name (vectype_out);
+	      if (ch2 == VIEW_CONVERT_EXPR)
+		  new_stmt = gimple_build_assign (var,
+						  build1 (VIEW_CONVERT_EXPR,
+							  vectype_out,
+							  vec_oprnd));
+	      else
+		  new_stmt = gimple_build_assign (var, (tree_code)ch2,
+						  vec_oprnd);
+
+	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+	      vec_tmp.safe_push (var);
+	    }
+	}
+
+      if (!failed_p)
+	{
+	  vec_oprnds0->release ();
+	  *vec_oprnds0 = vec_tmp;
+	  return;
+	}
+    }
+
   vec_tmp.create (vec_oprnds0->length () * 2);
   FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
     {
@@ -5495,6 +5600,20 @@ vectorizable_conversion (vec_info *vinfo,
 	  || GET_MODE_SIZE (lhs_mode) <= GET_MODE_SIZE (rhs_mode))
 	goto unsupported;
 
+      /* Check to see if the target can use a permute to perform the zero
+	 extension.  */
+      intermediate_type = unsigned_type_for (vectype_out);
+      if (TYPE_UNSIGNED (vectype_in)
+	  && VECTOR_TYPE_P (intermediate_type)
+	  && TYPE_VECTOR_SUBPARTS (intermediate_type).is_constant ()
+	  && targetm.vectorize.use_permute_for_promotion (vectype_in,
+							  intermediate_type))
+	{
+	  code1 = VEC_PERM_EXPR;
+	  code2 = FLOAT_EXPR;
+	  break;
+	}
+
       fltsz = GET_MODE_SIZE (lhs_mode);
       FOR_EACH_2XWIDER_MODE (rhs_mode_iter, rhs_mode)
 	{
@@ -9804,7 +9923,8 @@ vect_gen_perm_mask_any (tree vectype, const vec_perm_indices &sel)
   tree mask_type;
 
   poly_uint64 nunits = sel.length ();
-  gcc_assert (known_eq (nunits, TYPE_VECTOR_SUBPARTS (vectype)));
+  gcc_assert (known_eq (nunits, TYPE_VECTOR_SUBPARTS (vectype))
+	      || known_eq (nunits, TYPE_VECTOR_SUBPARTS (vectype) * 2));
 
   mask_type = build_vector_type (ssizetype, nunits);
   return vec_perm_indices_to_tree (mask_type, sel);
@@ -14397,8 +14517,20 @@ supportable_widening_operation (vec_info *vinfo,
       break;
 
     CASE_CONVERT:
-      c1 = VEC_UNPACK_LO_EXPR;
-      c2 = VEC_UNPACK_HI_EXPR;
+      {
+	tree cvt_type = unsigned_type_for (vectype_out);
+	if (TYPE_UNSIGNED (vectype_in)
+	  && VECTOR_TYPE_P (cvt_type)
+	  && TYPE_VECTOR_SUBPARTS (cvt_type).is_constant ()
+	  && targetm.vectorize.use_permute_for_promotion (vectype_in, cvt_type))
+	  {
+	    *code1 = VEC_PERM_EXPR;
+	    *code2 = VIEW_CONVERT_EXPR;
+	    return true;
+	  }
+	c1 = VEC_UNPACK_LO_EXPR;
+	c2 = VEC_UNPACK_HI_EXPR;
+      }
       break;
 
     case FLOAT_EXPR:



Reply via email to