Hi,
For the following test-case:

/* Build an 8-lane vector with X in the even lanes and Y in the odd lanes.  */
int16x8_t foo(int16_t x, int16_t y)
{
  int16x8_t interleaved = { x, y, x, y, x, y, x, y };
  return interleaved;
}

Code gen at -O3:
foo:
        dup    v0.8h, w0
        ins     v0.h[1], w1
        ins     v0.h[3], w1
        ins     v0.h[5], w1
        ins     v0.h[7], w1
        ret

For a 16-element vector, this results in 8 ins instructions, which is
probably not optimal.
I guess the above code-gen would be equivalent to the following?
dup v0.8h, w0
dup v1.8h, w1
zip1 v0.8h, v0.8h, v1.8h

I have attached a patch that does this when the number of elements is >= 8,
which should hopefully be better than the current code-gen.
The patch passes bootstrap+test on aarch64-linux-gnu.
Does the patch look OK?

Thanks,
Prathamesh
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c91df6f5006..e5dea70e363 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
+  /* Check for interleaving case.
+     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
+     Generate following code:
+     dup v0.h, x
+     dup v1.h, y
+     zip1 v0.h, v0.h, v1.h
+     for "large enough" initializer.  */
+
+  if (n_elts >= 8)
+    {
+      int i;
+      for (i = 2; i < n_elts; i++)
+       if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
+         break;
+
+      if (i == n_elts)
+       {
+         machine_mode mode = GET_MODE (target);
+         rtx dest[2];
+
+         for (int i = 0; i < 2; i++)
+           {
+             rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
+             dest[i] = gen_reg_rtx (mode);
+             aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
+           }
+
+         rtvec v = gen_rtvec (2, dest[0], dest[1]);
+         emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+         return;
+       }
+    }
+
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
new file mode 100644
index 00000000000..ee775048589
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+**     ...
+**     dup     v[0-9]+\.8h, w[0-9]+
+**     dup     v[0-9]+\.8h, w[0-9]+
+**     zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**     ...
+**     ret
+*/
+
+int16x8_t foo(int16_t x, int y)
+{
+  int16x8_t interleaved = (int16x8_t) {x, y, x, y, x, y, x, y};
+  return interleaved;
+}
+
+/*
+** foo2:
+**     ...
+**     dup     v[0-9]+\.8h, w[0-9]+
+**     movi    v[0-9]+\.8h, 0x1
+**     zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**     ...
+**     ret
+*/
+
+int16x8_t foo2(int16_t x)
+{
+  int16x8_t interleaved = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
+  return interleaved;
+}

Reply via email to