The following patch enables vector permutations optimization by trying to use 
ins instruction instead of slow and generic tbl.

example:
#define vector __attribute__((vector_size(4*sizeof(float))))

vector float f0(vector float a, vector float b)
{
  return __builtin_shuffle (a, a, (vector int){3, 1, 2, 3});
}


was compiled into:
...
        adrp    x0, .LC0
        ldr     q1, [x0, #:lo12:.LC0]
        tbl     v0.16b, {v0.16b}, v1.16b
...

and after patch:
...
        ins     v0.s[0], v0.s[3]
...

bootstrapped and tested on aarch64-linux-gnu with no regressions


This patch was initially introduced by me with Andrew Pinksi 
<apin...@marvell.com> being involved later.

Please note that test in this patch depends on another commit (PR82199), which 
I sent not long ago.

(I have no write access to repo)

Thanks,
Dmitrij
>From d4ccbcdf67648a095706213a0fe0ac856bb077bb Mon Sep 17 00:00:00 2001
From: Dmitrij Pochepko <dmitrij.poche...@bell-sw.com>
Date: Wed, 17 Jun 2020 12:04:10 +0300
Subject: [PATCH] vector creation from two parts of two vectors produces TBL
 rather than ins (PR93720)

The following patch enables vector permutations optimization by trying to use ins instruction instead of slow and generic tbl.

example:

vector float f0(vector float a, vector float b)
{
  return __builtin_shuffle (a, a, (vector int){3, 1, 2, 3});
}

was compiled into:
...
        adrp    x0, .LC0
        ldr     q1, [x0, #:lo12:.LC0]
        tbl     v0.16b, {v0.16b}, v1.16b
...

and after patch:
...
        ins     v0.s[0], v0.s[3]
...

bootstrapped and tested on aarch64-linux-gnu with no regressions

This patch was initially introduced by me with Andrew Pinksi <apin...@marvell.com> being involved later.
---
 gcc/config/aarch64/aarch64.c              | 85 +++++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/vins-1.c | 23 +++++++++
 gcc/testsuite/gcc.target/aarch64/vins-2.c | 23 +++++++++
 gcc/testsuite/gcc.target/aarch64/vins-3.c | 23 +++++++++
 4 files changed, 154 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vins-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vins-2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vins-3.c

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ab7b39e..e0bde6d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -20516,6 +20516,89 @@ aarch64_evpc_sel (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize patterns suitable for the INS instructions.  */
+static bool
+aarch64_evpc_ins (struct expand_vec_perm_d *d)
+{
+  machine_mode mode = d->vmode;
+  unsigned HOST_WIDE_INT nelt;
+
+  if (d->vec_flags != VEC_ADVSIMD)
+    return false;
+
+  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
+  for (unsigned int i = 0; i < encoded_nelts; ++i)
+    if (!d->perm[i].is_constant ())
+      return false;
+
+  /* to_constant is safe since this routine is specific to Advanced SIMD
+     vectors.  */
+  nelt = d->perm.length ().to_constant ();
+  rtx insv = d->op0;
+
+  HOST_WIDE_INT idx = -1;
+
+  for (unsigned HOST_WIDE_INT i = 0; i < nelt; i ++)
+    {
+      if (d->perm[i].to_constant () == (HOST_WIDE_INT) i)
+	continue;
+      if (idx != -1)
+	{
+	  idx = -1;
+	  break;
+	}
+      idx = i;
+    }
+
+  if (idx == -1)
+    {
+      insv = d->op1;
+      for (unsigned HOST_WIDE_INT i = 0; i < nelt; i ++)
+	{
+	  if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
+	    continue;
+	  if (idx != -1)
+	    return false;
+	  idx = i;
+	}
+
+      if (idx == -1)
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  gcc_assert (idx != -1);
+
+  unsigned extractindex = d->perm[idx].to_constant ();
+  rtx extractv = d->op0;
+  if (extractindex >= nelt)
+    {
+      extractv = d->op1;
+      extractindex -= nelt;
+    }
+  gcc_assert (extractindex < nelt);
+
+  machine_mode inner_mode = GET_MODE_INNER (mode);
+
+  enum insn_code inscode = optab_handler (vec_set_optab, mode);
+  gcc_assert (inscode != CODE_FOR_nothing);
+  enum insn_code iextcode = convert_optab_handler (vec_extract_optab, mode,
+						   inner_mode);
+  gcc_assert (iextcode != CODE_FOR_nothing);
+  rtx tempinner = gen_reg_rtx (inner_mode);
+  emit_insn (GEN_FCN (iextcode) (tempinner, extractv, GEN_INT (extractindex)));
+
+  rtx temp = gen_reg_rtx (mode);
+  emit_move_insn (temp, insv);
+  emit_insn (GEN_FCN (inscode) (temp, tempinner, GEN_INT (idx)));
+
+  emit_move_insn (d->target, temp);
+
+  return true;
+}
+
 static bool
 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 {
@@ -20550,6 +20633,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	return true;
       else if (aarch64_evpc_sel (d))
 	return true;
+      else if (aarch64_evpc_ins (d))
+	return true;
       else if (aarch64_evpc_reencode (d))
 	return true;
       if (d->vec_flags == VEC_SVE_DATA)
diff --git a/gcc/testsuite/gcc.target/aarch64/vins-1.c b/gcc/testsuite/gcc.target/aarch64/vins-1.c
new file mode 100644
index 0000000..8e3725b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins-1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(4*sizeof(float))))
+
+vector float f0(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){3, 1, 2, 3});
+}
+vector float f1(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 0, 2, 3});
+}
+vector float f2(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 1, 0, 3});
+}
+vector float f3(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 1, 2, 0});
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]*ins\[ \t\]+v\[0-9\]+\.s" 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vins-2.c b/gcc/testsuite/gcc.target/aarch64/vins-2.c
new file mode 100644
index 0000000..c50aa5a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins-2.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(8*sizeof(short))))
+
+vector short f0(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){6, 1, 2, 3, 4, 5, 6, 7});
+}
+vector short f2(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){0, 1, 2, 1, 4, 5, 6, 7});
+}
+vector short f4(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){0, 1, 2, 3, 0, 5, 6, 7});
+}
+vector short f6(vector short a, vector short b)
+{
+  return __builtin_shuffle (a, a, (vector short){0, 1, 2, 3, 4, 5, 6, 1});
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]*ins\[ \t\]+v\[0-9\]+\.s" 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vins-3.c b/gcc/testsuite/gcc.target/aarch64/vins-3.c
new file mode 100644
index 0000000..e02d58c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vins-3.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(4*sizeof(float))))
+
+vector float f0(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){0, 5, 6, 7});
+}
+vector float f1(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){4, 0, 6, 7});
+}
+vector float f2(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){4, 5, 0, 7});
+}
+vector float f3(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){4, 5, 6, 0});
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]*ins\[ \t\]+v\[0-9\]+\.s" 4 } } */
-- 
2.7.4

Reply via email to