This isn't really a 'PATCH' yet; it's something I was working on but had to put on hold. Feel free to re-use any bits of it, or trash all of it if you'd like.
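
Rough shape of the idea, for whoever picks this up: the svdupq gimple
folder no longer punts on non-constant arguments; it emits a fixed-length
CONSTRUCTOR feeding a variable-length VEC_PERM_EXPR, expr.cc recognises
that pair and hands it to a new TARGET_VECTORIZE_VLA_CONSTRUCTOR hook,
and on aarch64 the hook simply reuses aarch64_expand_vector_init.  The
kind of source it improves (adapted from test0 in the new testcase below;
dup_pair is just an illustrative name):

#include <arm_sve.h>

/* svdupq_n_f32 builds a 128-bit pattern from its scalar arguments and
   replicates it across the full (variable-length) SVE vector.  With this
   draft the pattern is assembled with an Advanced SIMD insert and then
   broadcast with a single move: ins v0.s[1], v1.s[0]; mov z0.d, d0.  */
svfloat32_t
dup_pair (float x, float y)
{
  return svdupq_n_f32 (x, y, x, y);
}
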
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 82f9eba5c397af04924bdebdc684a1d77682d3fd..08625aad7b1a8dc9c9f8c491cb13d8af0b46a946 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -842,13 +842,45 @@ public:
     for (unsigned int i = 0; i < nargs; ++i)
       {
        tree elt = gimple_call_arg (f.call, i);
-       if (!CONSTANT_CLASS_P (elt))
-         return NULL;
        builder.quick_push (elt);
        for (unsigned int j = 1; j < factor; ++j)
          builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
       }
-    return gimple_build_assign (f.lhs, builder.build ());
+    builder.finalize ();
+    unsigned int n_elts
+      = builder.nelts_per_pattern () == 1 ? builder.npatterns ()
+                                         : builder.full_nelts ().coeffs[0];
+
+    if (n_elts == 1)
+      return gimple_build_assign (f.lhs, build1 (VEC_DUPLICATE_EXPR, vec_type,
+                                                builder.elt (0)));
+    tree list = NULL_TREE;
+    tree *pp = &list;
+    for (unsigned int i = 0; i < n_elts; ++i)
+      {
+       *pp = build_tree_list (NULL_TREE, builder.elt (i));
+       pp = &TREE_CHAIN (*pp);
+      }
+
+    poly_uint64 vec_len = TYPE_VECTOR_SUBPARTS (vec_type);
+    vec_perm_builder sel (vec_len, n_elts, 1);
+    for (unsigned int i = 0; i < n_elts; i++)
+      sel.quick_push (i);
+    vec_perm_indices indices (sel, 1, n_elts);
+
+    tree elt_type = TREE_TYPE (vec_type);
+
+    tree ctor_type = build_vector_type (elt_type, n_elts);
+    tree ctor = make_ssa_name_fn (cfun, ctor_type, 0);
+    gimple *ctor_stmt
+      = gimple_build_assign (ctor,
+                            build_constructor_from_list (ctor_type, list));
+    gsi_insert_before (f.gsi, ctor_stmt, GSI_SAME_STMT);
+
+    tree mask_type = build_vector_type (ssizetype, vec_len);
+    tree mask = vec_perm_indices_to_tree (mask_type, indices);
+    return gimple_build_assign (f.lhs, fold_build3 (VEC_PERM_EXPR, vec_type,
+                                                   ctor, ctor, mask));
   }
 
   rtx
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index bd60e65b0c3f05f1c931f03807170f3b9d699de5..dec935211e5a064239c858880a696e6ca3fe1ae2 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2544,6 +2544,17 @@
   }
 )
 
+;; Duplicate a scalar register to fill an SVE vector (LE version).
+(define_insn "*aarch64_vec_duplicate_reg<mode>_le"
+  [(set (match_operand:SVE_FULL 0 "register_operand" "=w,w")
+       (vec_duplicate:SVE_FULL
+         (match_operand:<VEL> 1 "register_operand" "w,r")))]
+  "TARGET_SVE && !BYTES_BIG_ENDIAN"
+  "@
+   mov\t%0.<Vetype>, %<Vetype>1
+   mov\t%0.<Vetype>, %<vwcore>1"
+)
+
 ;; Duplicate an Advanced SIMD vector to fill an SVE vector (BE version).
 ;; The SVE register layout puts memory lane N into (architectural)
 ;; register lane N, whereas the Advanced SIMD layout puts the memory
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a08043e18d609e258ebfe033875201163d129aba..9b118e4101d0a5995a833769433be49321ab2151 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6033,7 +6033,6 @@ rtx
 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
 {
   machine_mode src_mode = GET_MODE (src);
-  gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
   insn_code icode = (BYTES_BIG_ENDIAN
                     ? code_for_aarch64_vec_duplicate_vq_be (mode)
                     : code_for_aarch64_vec_duplicate_vq_le (mode));
@@ -21806,20 +21805,29 @@ aarch64_simd_make_constant (rtx vals)
 }
 
 static void
-aarch64_vec_duplicate (rtx target, machine_mode mode, machine_mode element_mode,
+aarch64_vec_duplicate (rtx target, rtx op, machine_mode mode, machine_mode element_mode,
                       int narrow_n_elts)
 {
   poly_uint64 size = narrow_n_elts * GET_MODE_BITSIZE (element_mode);
-  scalar_mode i_mode = int_mode_for_size (size, 0).require ();
   machine_mode o_mode;
-  if (aarch64_sve_mode_p (mode))
-    o_mode = aarch64_full_sve_mode (i_mode).require ();
+  rtx input, output;
+  bool sve = aarch64_sve_mode_p (mode);
+  if (sve && known_eq (size, 128U))
+    {
+      o_mode = mode;
+      output = target;
+      input = op;
+    }
   else
-    o_mode
-      = aarch64_simd_container_mode (i_mode,
-                                    GET_MODE_BITSIZE (mode));
-  rtx input = simplify_gen_subreg (i_mode, target, mode, 0);
-  rtx output = simplify_gen_subreg (o_mode, target, mode, 0);
+    {
+      scalar_mode i_mode = int_mode_for_size (size, 0).require ();
+      o_mode
+       = sve ? aarch64_full_sve_mode (i_mode).require ()
+             : aarch64_simd_container_mode (i_mode,
+                                            GET_MODE_BITSIZE (mode));
+      input = simplify_gen_subreg (i_mode, op, GET_MODE (op), 0);
+      output = simplify_gen_subreg (o_mode, target, mode, 0);
+    }
   aarch64_emit_move (output, gen_vec_duplicate (o_mode, input));
 }
 
@@ -21910,6 +21918,16 @@ aarch64_expand_vector_init (rtx target, rtx_vector_builder &v)
       return;
     }
 
+  /* We are constructing a VLS vector that we may later duplicate into a VLA
+     one.  TODO: maybe split this into one path for Advanced SIMD and one
+     for SVE?  */
+  machine_mode real_mode = mode;
+  rtx real_target = target;
+  if (aarch64_sve_mode_p (real_mode))
+    {
+      mode = aarch64_vq_mode (GET_MODE_INNER (real_mode)).require ();
+      target = simplify_gen_subreg (mode, target, real_mode, 0);
+    }
+
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
@@ -22000,8 +22018,8 @@ aarch64_expand_vector_init (rtx target, rtx_vector_builder &v)
          x = copy_to_mode_reg (inner_mode, x);
          emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
        }
-       if (!known_eq (v.full_nelts (), n_elts))
-         aarch64_vec_duplicate (target, mode, GET_MODE (v0), n_elts);
+      if (!known_eq (v.full_nelts (), n_elts))
+       aarch64_vec_duplicate (real_target, target, real_mode, GET_MODE (v0), n_elts);
       return;
     }
 
@@ -22048,7 +22066,7 @@ aarch64_expand_vector_init (rtx target, rtx_vector_builder &v)
       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
     }
   if (!known_eq (v.full_nelts (), n_elts))
-    aarch64_vec_duplicate (target, mode, inner_mode, n_elts);
+    aarch64_vec_duplicate (real_target, target, real_mode, inner_mode, n_elts);
 }
 
 /* Emit RTL corresponding to:
@@ -23947,11 +23965,7 @@ aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
   if (BYTES_BIG_ENDIAN
       || !d->one_vector_p
       || d->vec_flags != VEC_SVE_DATA
-      || d->op_vec_flags != VEC_ADVSIMD
-      || d->perm.encoding ().nelts_per_pattern () != 1
-      || !known_eq (d->perm.encoding ().npatterns (),
-                   GET_MODE_NUNITS (d->op_mode))
-      || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
+      || d->perm.encoding ().nelts_per_pattern () != 1)
     return false;
 
   int npatterns = d->perm.encoding ().npatterns ();
@@ -23962,7 +23976,10 @@ aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
   if (d->testing_p)
     return true;
 
-  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  machine_mode mode = GET_MODE (d->target);
+  machine_mode element_mode = GET_MODE_INNER (mode);
+  aarch64_vec_duplicate (d->target, d->op0, mode, element_mode,
+                        d->perm.encoding ().npatterns ());
   return true;
 }
 
@@ -24194,6 +24211,15 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   return ret;
 }
 
+/* Implement TARGET_VECTORIZE_VLA_CONSTRUCTOR.  */
+
+static bool
+aarch64_vectorize_vla_constructor (rtx target, rtx_vector_builder &builder)
+{
+  aarch64_expand_vector_init (target, builder);
+  return true;
+}
+
 /* Generate a byte permute mask for a register of mode MODE,
    which has NUNITS units.  */
 
@@ -27667,6 +27693,10 @@ aarch64_libgcc_floating_mode_supported_p
 #define TARGET_VECTORIZE_VEC_PERM_CONST \
   aarch64_vectorize_vec_perm_const
 
+#undef TARGET_VECTORIZE_VLA_CONSTRUCTOR
+#define TARGET_VECTORIZE_VLA_CONSTRUCTOR \
+  aarch64_vectorize_vla_constructor
+
 #undef TARGET_VECTORIZE_RELATED_MODE
 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
 #undef TARGET_VECTORIZE_GET_MASK_MODE
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index b0ea39884aa3ced5c0ccc1e792088aa66997ec3b..eda3f014984f62d96d7fe0b3c0c439905375f25a 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6112,6 +6112,11 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_VLA_CONSTRUCTOR (rtx @var{target}, rtx_vector_builder @var{&builder})
+This hook is used to expand a variable-length (VLA) vector constructor
+into @var{target} using the @code{rtx_vector_builder} @var{builder}.
+Return true on success; return false to fall back to the generic
+expansion path.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
 This hook should return the decl of a function that implements the
 vectorized variant of the function with the @code{combined_fn} code
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index f869ddd5e5b8b7acbd8e9765fb103af24a1085b6..07f4f77877b18a23f6fd205a8dd8daf1a03c2923 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_VLA_CONSTRUCTOR
+
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
 @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
diff --git a/gcc/expr.cc b/gcc/expr.cc
index f9753d48245d56039206647be8576246a3b25ed3..b9eb550cac4c68464c95cffa8da19b3984b80782 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -10264,6 +10264,44 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode,
 
     case VEC_PERM_EXPR:
       {
+       /* The hook expands directly into TARGET, so punt if we don't
+          have one; only constant selectors are handled here.  */
+       if (target
+           && TREE_CODE (treeop2) == VECTOR_CST
+           && targetm.vectorize.vla_constructor)
+         {
+           tree ctor0, ctor1;
+           if (TREE_CODE (treeop0) == SSA_NAME
+               && is_gimple_assign (SSA_NAME_DEF_STMT (treeop0)))
+             ctor0 = gimple_assign_rhs1 (SSA_NAME_DEF_STMT (treeop0));
+           else
+             ctor0 = treeop0;
+           if (TREE_CODE (treeop1) == SSA_NAME
+               && is_gimple_assign (SSA_NAME_DEF_STMT (treeop1)))
+             ctor1 = gimple_assign_rhs1 (SSA_NAME_DEF_STMT (treeop1));
+           else
+             ctor1 = treeop1;
+
+           if (TREE_CODE (ctor0) == CONSTRUCTOR
+               && TREE_CODE (ctor1) == CONSTRUCTOR)
+             {
+               unsigned int nelts = vector_cst_encoded_nelts (treeop2);
+               unsigned int ctor_nelts = CONSTRUCTOR_NELTS (ctor0);
+               machine_mode mode = GET_MODE (target);
+               rtx_vector_builder builder (mode, nelts, 1);
+               for (unsigned int i = 0; i < nelts; ++i)
+                 {
+                   unsigned HOST_WIDE_INT index
+                     = tree_to_uhwi (VECTOR_CST_ENCODED_ELT (treeop2, i));
+                   tree op
+                     = index >= ctor_nelts
+                       ? CONSTRUCTOR_ELT (ctor1, index - ctor_nelts)->value
+                       : CONSTRUCTOR_ELT (ctor0, index)->value;
+                   builder.quick_push (expand_normal (op));
+                 }
+               builder.finalize ();
+               if (targetm.vectorize.vla_constructor (target, builder))
+                 return target;
+             }
+         }
        expand_operands (treeop0, treeop1, target, &op0, &op1, EXPAND_NORMAL);
        vec_perm_builder sel;
        if (TREE_CODE (treeop2) == VECTOR_CST
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..3c219b6a90d9cc1a6393a3ebc24e54fcf14c6377 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,13 @@ implementation approaches itself.",
        const vec_perm_indices &sel),
  NULL)
 
+DEFHOOK
+(vla_constructor,
+ "This hook is used to expand a vla constructor into @var{target}\n\
+using the rtx_vector_builder @var{builder}.",
+ bool, (rtx target, rtx_vector_builder &builder),
+ NULL)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..b46b8f0d7a9c52f6efe6acf10f589703cec3bd08 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -262,6 +262,8 @@ enum poly_value_estimate_kind
 extern bool verify_type_context (location_t, type_context_kind, const_tree,
                                 bool = false);
 
+class rtx_vector_builder;
+
 /* The target structure.  This holds all the backend hooks.  */
 #define DEFHOOKPOD(NAME, DOC, TYPE, INIT) TYPE NAME;
 #define DEFHOOK(NAME, DOC, TYPE, PARAMS, INIT) TYPE (* NAME) PARAMS;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_opt_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_opt_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..01f652931555534f43e0487766c568c72a5df686
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_opt_1.c
@@ -0,0 +1,134 @@
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+#include <arm_sve.h>
+
+/*
+** test0:
+**     ins     v0.s\[1\], v1.s\[0\]
+**     mov     z0.d, d0
+**     ret
+*/
+svfloat32_t test0(float x, float y) {
+    return svdupq_n_f32(x, y, x, y);
+}
+/*
+** test1:
+**     mov     z0.s, s0
+**     ret
+*/
+
+svfloat32_t test1(float x) {
+    return svdupq_n_f32(x, x, x, x);
+}
+
+/*
+** test2:
+**     mov     z0.s, w0
+**     ret
+*/
+
+svint32_t test2(int x) {
+    return svdupq_n_s32(x, x, x, x);
+}
+
+/*
+** test3:
+**     sxth    w0, w0
+**     fmov    d0, x0
+**     ins     v0.h\[1\], w1
+**     ins     v0.h\[2\], w2
+**     ins     v0.h\[3\], w3
+**     mov     z0.d, d0
+**     ret
+*/
+
+svint16_t test3(short a, short b, short c, short d)
+{
+    return svdupq_n_s16(a, b, c, d, a, b, c, d);
+}
+
+/*
+** test4:
+**     dup     v0.4h, w0
+**     ins     v0.h\[1\], w1
+**     ins     v0.h\[3\], w1
+**     mov     z0.d, d0
+**     ret
+*/
+
+svint16_t test4(short a, short b)
+{
+    return svdupq_n_s16(a, b, a, b, a, b, a, b);
+}
+
+/*
+** test5:
+**     mov     z0.h, w0
+**     ret
+*/
+
+svint16_t test5(short a)
+{
+    return svdupq_n_s16(a, a, a, a, a, a, a, a);
+}
+/*
+** test6:
+**     sxtb    w0, w0
+**     fmov    d0, x0
+**     ins     v0.b\[1\], w1
+**     ins     v0.b\[2\], w2
+**     ins     v0.b\[3\], w3
+**     ins     v0.b\[4\], w4
+**     ins     v0.b\[5\], w5
+**     ins     v0.b\[6\], w6
+**     ins     v0.b\[7\], w7
+**     mov     z0.d, d0
+**     ret
+*/
+
+svint8_t test6(char a, char b, char c, char d, char e, char f, char g, char h)
+{
+    return svdupq_n_s8(a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h);
+}
+
+/*
+** test7:
+**     dup     v0.8b, w0
+**     ins     v0.b\[1\], w1
+**     ins     v0.b\[2\], w2
+**     ins     v0.b\[3\], w3
+**     mov     z0.s, s0
+**     ret
+*/
+
+svint8_t test7(char a, char b, char c, char d)
+{
+    return svdupq_n_s8(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d);
+}
+
+
+// We can do better than this
+/*
+** test8:
+**     sxtb    w0, w0
+**     fmov    d0, x0
+**     ins     v0.d\[1\], x1
+**     ins     v0.b\[1\], w1
+**     mov     z0.h, h0
+**     ret
+*/
+
+svint8_t test8(char a, char b)
+{
+    return svdupq_n_s8(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
+}
+
+/*
+** test9:
+**     mov     z0.b, w0
+**     ret
+*/
+
+svint8_t test9(char a)
+{
+    return svdupq_n_s8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a);
+}
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..eaae1eefe02af3f51073310e7d17c33286b2bead 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1513,6 +1513,11 @@ lower_vec_perm (gimple_stmt_iterator *gsi)
   if (!TYPE_VECTOR_SUBPARTS (vect_type).is_constant (&elements))
     return;
 
+  /* It is possible to have a VEC_PERM_EXPR with a VLA mask and a VLS
+     CONSTRUCTOR; such an expression has a VLA result type, so we cannot
+     lower it here.  */
+  if (!TYPE_VECTOR_SUBPARTS (mask_type).is_constant ())
+    return;
+
   if (TREE_CODE (mask) == SSA_NAME)
     {
       gimple *def_stmt = SSA_NAME_DEF_STMT (mask);

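For anyone wiring the new hook into another backend: a minimal sketch of
an implementation, mirroring aarch64_vectorize_vla_constructor above.
my_target_vectorize_vla_constructor and my_target_expand_vector_init are
hypothetical stand-ins for the target's own routines; returning false
makes expand_expr_real_2 fall back to the generic VEC_PERM_EXPR
expansion, so a target can bail out on cases it does not handle.

/* Implement TARGET_VECTORIZE_VLA_CONSTRUCTOR (sketch only).  */

static bool
my_target_vectorize_vla_constructor (rtx target, rtx_vector_builder &builder)
{
  /* Reuse the target's existing vector-init expander, as aarch64 does
     with aarch64_expand_vector_init.  */
  my_target_expand_vector_init (target, builder);
  return true;
}

#undef TARGET_VECTORIZE_VLA_CONSTRUCTOR
#define TARGET_VECTORIZE_VLA_CONSTRUCTOR my_target_vectorize_vla_constructor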