Hi,

Please take a look at the new version (attached).

Thanks,
Dmitrij

On Sat, Jul 11, 2020 at 09:39:13AM +0100, Richard Sandiford wrote:
...
> This should push “newelt” instead.
Done.

...
> This test is endian-agnostic and should work for all targets, but…
...
> 
> …the shuffles in these tests are specific to little-endian targets.
> For big-endian targets, architectural lane 0 is the last in memory
> rather than first, so e.g. the big-endian permute vector for vzip_4.c
> would be: { 0, 1, 5, 6 } instead.
> 
> I guess the options are:
> 
> (1) add __ARM_BIG_ENDIAN preprocessor tests to choose the right mask
> (2) restrict the tests to aarch64_little_endian.
> (3) have two scan-assembler lines with opposite zip1/zip2 choices, e.g:
> 
> /* { dg-final { scan-assembler-times {[ \t]*zip1[ \t]+v[0-9]+\.2d} 1 { target aarch64_big_endian } } } */
> /* { dg-final { scan-assembler-times {[ \t]*zip2[ \t]+v[0-9]+\.2d} 1 { target aarch64_little_endian } } } */
> 
> (untested).
> 
> I guess (3) is probably best, but (2) is fine too if you're not set
> up to test big-endian.
> 
> Sorry for not noticing last time.  I only realised when testing the
> patch locally.
> 
> Thanks,
> Richard

I restricted the tests to aarch64_little_endian, because I don't have a
big-endian setup to check them.
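
For reference, here is a small standalone check (not part of the patch; the
typedefs and values are my own) showing the equivalence the re-encoding relies
on: shuffling four floats with {0, 1, 4, 5} selects the same bytes as shuffling
the same data reinterpreted as two 64-bit lanes with {0, 2}:

#include <string.h>

typedef float v4sf __attribute__ ((vector_size (16)));
typedef int v4si __attribute__ ((vector_size (16)));
typedef long long v2di __attribute__ ((vector_size (16)));

int
main (void)
{
  v4sf a = { 1.0f, 2.0f, 3.0f, 4.0f };
  v4sf b = { 5.0f, 6.0f, 7.0f, 8.0f };

  /* The original permutation on 32-bit elements: a[0], a[1], b[0], b[1].  */
  v4sf r1 = __builtin_shuffle (a, b, (v4si) { 0, 1, 4, 5 });

  /* The same permutation re-encoded on 64-bit elements: lane 0 of a,
     lane 0 of b.  */
  v2di a2, b2, r2;
  memcpy (&a2, &a, sizeof a2);
  memcpy (&b2, &b, sizeof b2);
  r2 = __builtin_shuffle (a2, b2, (v2di) { 0, 2 });

  /* Returns 0 when the two results match byte for byte.  */
  return memcmp (&r1, &r2, sizeof r1) != 0;
}
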
From 197a9bc05f96c3f100b3f4748c9dd12a60de86d1 Mon Sep 17 00:00:00 2001
From: Dmitrij Pochepko <dmitrij.poche...@bell-sw.com>
Date: Mon, 13 Jul 2020 17:44:08 +0300
Subject: [PATCH] __builtin_shuffle should sometimes produce zip1 rather than
 TBL (PR82199)

The following patch optimizes vector permutations by re-encoding them with a wider vector element size where applicable.
This allows simpler instructions to be used in those cases.

Example:

vector float f(vector float a, vector float b)
{
  return __builtin_shuffle (a, b, (vector int){0, 1, 4, 5});
}

was compiled into:
...
	adrp    x0, .LC0
	ldr     q2, [x0, #:lo12:.LC0]
	tbl     v0.16b, {v0.16b - v1.16b}, v2.16b
...

and after the patch:
...
	zip1    v0.2d, v0.2d, v1.2d
...

Bootstrapped and tested on aarch64-linux-gnu with no regressions.

gcc/ChangeLog:

2020-07-13     Andrew Pinski   <apin...@marvell.com>

	PR target/82199

	* config/aarch64/aarch64.c (aarch64_evpc_reencode): New function.
	(aarch64_expand_vec_perm_const_1): Call it.

gcc/testsuite/ChangeLog:

2020-07-13      Andrew Pinski   <apin...@marvell.com>

	PR target/82199

	* gcc.target/aarch64/vdup_n_3.c: New test.
	* gcc.target/aarch64/vzip_1.c: New test.
	* gcc.target/aarch64/vzip_2.c: New test.
	* gcc.target/aarch64/vzip_3.c: New test.
	* gcc.target/aarch64/vzip_4.c: New test.

Co-Authored-By: Dmitrij Pochepko        <dmitrij.poche...@bell-sw.com>
---
 gcc/config/aarch64/aarch64.c                | 57 +++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/vdup_n_3.c | 16 ++++++++
 gcc/testsuite/gcc.target/aarch64/vzip_1.c   | 12 ++++++
 gcc/testsuite/gcc.target/aarch64/vzip_2.c   | 13 +++++++
 gcc/testsuite/gcc.target/aarch64/vzip_3.c   | 13 +++++++
 gcc/testsuite/gcc.target/aarch64/vzip_4.c   | 13 +++++++
 6 files changed, 124 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vdup_n_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vzip_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vzip_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vzip_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vzip_4.c

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 17dbe67..e259d05 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -19991,6 +19991,8 @@ struct expand_vec_perm_d
   bool testing_p;
 };
 
+static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
+
 /* Generate a variable permutation.  */
 
 static void
@@ -20176,6 +20178,59 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to re-encode the PERM constant so it combines adjacent even and odd
+   elements.  This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
+   We then retry the new constant against the full suite of patterns.  */
+static bool
+aarch64_evpc_reencode (struct expand_vec_perm_d *d)
+{
+  expand_vec_perm_d newd;
+  unsigned HOST_WIDE_INT nelt;
+
+  if (d->vec_flags != VEC_ADVSIMD)
+    return false;
+
+  /* Get the new mode.  Always twice the size of the inner mode
+     and half the number of elements.  */
+  poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
+  unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
+  auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
+  machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
+
+  if (new_mode == word_mode)
+    return false;
+
+  /* to_constant is safe since this routine is specific to Advanced SIMD
+     vectors.  */
+  nelt = d->perm.length ().to_constant ();
+
+  vec_perm_builder newpermconst;
+  newpermconst.new_vector (nelt / 2, nelt / 2, 1);
+
+  /* Convert the perm constant if we can.  Require even, odd as the pairs.  */
+  for (unsigned int i = 0; i < nelt; i += 2)
+    {
+      poly_int64 elt0 = d->perm[i];
+      poly_int64 elt1 = d->perm[i + 1];
+      poly_int64 newelt;
+      if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
+	return false;
+      newpermconst.quick_push (newelt.to_constant ());
+    }
+  newpermconst.finalize ();
+
+  newd.vmode = new_mode;
+  newd.vec_flags = VEC_ADVSIMD;
+  newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
+  newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
+  newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
+  newd.testing_p = d->testing_p;
+  newd.one_vector_p = d->one_vector_p;
+
+  newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
+  return aarch64_expand_vec_perm_const_1 (&newd);
+}
+
 /* Recognize patterns suitable for the UZP instructions.  */
 static bool
 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
@@ -20573,6 +20628,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	return true;
       else if (aarch64_evpc_sel (d))
 	return true;
+      else if (aarch64_evpc_reencode (d))
+	return true;
       if (d->vec_flags == VEC_SVE_DATA)
 	return aarch64_evpc_sve_tbl (d);
       else if (d->vec_flags == VEC_ADVSIMD)
diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_n_3.c b/gcc/testsuite/gcc.target/aarch64/vdup_n_3.c
new file mode 100644
index 0000000..5234f5e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vdup_n_3.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(4*sizeof(float))))
+
+/* These are both dups. */
+vector float f(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){0, 1, 0, 1});
+}
+vector float f1(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, a, (vector int){2, 3, 2, 3});
+}
+
+/* { dg-final { scan-assembler-times {[ \t]*dup[ \t]+v[0-9]+\.2d} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vzip_1.c b/gcc/testsuite/gcc.target/aarch64/vzip_1.c
new file mode 100644
index 0000000..837adb0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vzip_1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target aarch64_little_endian } */
+
+#define vector __attribute__((vector_size(2*sizeof(float))))
+
+vector float f(vector float a, vector float b)
+{
+  return __builtin_shuffle (a, b, (vector int){0, 2});
+}
+
+/* { dg-final { scan-assembler-times {[ \t]*zip1[ \t]+v[0-9]+\.2s} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vzip_2.c b/gcc/testsuite/gcc.target/aarch64/vzip_2.c
new file mode 100644
index 0000000..2287d81
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vzip_2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target aarch64_little_endian } */
+
+#define vector __attribute__((vector_size(4*sizeof(float))))
+
+vector float f(vector float a, vector float b)
+{
+  /* This is the same as zip1 v.2d as {0, 1, 4, 5} can be converted to {0, 2}. */
+  return __builtin_shuffle (a, b, (vector int){0, 1, 4, 5});
+}
+
+/* { dg-final { scan-assembler-times {[ \t]*zip1[ \t]+v[0-9]+\.2d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vzip_3.c b/gcc/testsuite/gcc.target/aarch64/vzip_3.c
new file mode 100644
index 0000000..91c444b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vzip_3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target aarch64_little_endian } */
+
+#define vector __attribute__((vector_size(4*sizeof(float))))
+
+vector float f(vector float a, vector float b)
+{
+  /* This is the same as zip1 v.2d as {4, 5, 0, 1} can be converted to {2, 0}. */
+  return __builtin_shuffle (a, b, (vector int){4, 5, 0, 1});
+}
+
+/* { dg-final { scan-assembler-times {[ \t]*zip1[ \t]+v[0-9]+\.2d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vzip_4.c b/gcc/testsuite/gcc.target/aarch64/vzip_4.c
new file mode 100644
index 0000000..b4047e2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vzip_4.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target aarch64_little_endian } */
+
+#define vector __attribute__((vector_size(4*sizeof(float))))
+
+vector float f(vector float a, vector float b)
+{
+  /* This is the same as zip2 v.2d as {2, 3, 6, 7} can be converted to {1, 3}. */
+  return __builtin_shuffle (a, b, (vector int){2, 3, 6, 7});
+}
+
+/* { dg-final { scan-assembler-times {[ \t]*zip2[ \t]+v[0-9]+\.2d} 1 } } */
-- 
2.7.4
