On Thu, 5 Nov 2020 at 12:55, Christophe Lyon <christophe.l...@linaro.org> wrote:
>
> On Thu, 5 Nov 2020 at 10:36, Kyrylo Tkachov <kyrylo.tkac...@arm.com> wrote:
> >
> > Hi Christophe,
> >
> > > -----Original Message-----
> > > From: Gcc-patches <gcc-patches-boun...@gcc.gnu.org> On Behalf Of
> > > Christophe Lyon via Gcc-patches
> > > Sent: 15 October 2020 18:23
> > > To: gcc-patches@gcc.gnu.org
> > > Subject: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
> > >
> > > This patch adds implementations for vceqq_p64, vceqz_p64 and
> > > vceqzq_p64 intrinsics.
> > >
> > > vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
> > > into their high and low halves.
> > >
> > > vceqz[q] simply call the vceq and vceqq with a second argument equal
> > > to zero.
> > >
> > > The added (executable) testcases make sure that the poly64x2_t
> > > variants have results with one element of all zeroes (false) and the
> > > other element with all bits set to one (true).
> > >
> > > 2020-10-15  Christophe Lyon  <christophe.l...@linaro.org>
> > >
> > >         gcc/
> > >         * config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64): New.
> > >
> > >         gcc/testsuite/
> > >         * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
> > >         vceqz_p64, vceqq_p64 and vceqzq_p64.
> > > ---
> > >  gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
> > >  .../aarch64/advsimd-intrinsics/p64_p128.c          | 46 +++++++++++++++++++++-
> > >  2 files changed, 76 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> > > index aa21730..f7eff37 100644
> > > --- a/gcc/config/arm/arm_neon.h
> > > +++ b/gcc/config/arm/arm_neon.h
> > > @@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
> > >    return vreinterpret_u64_u32 (__m);
> > >  }
> > >
> > > +__extension__ extern __inline uint64x1_t
> > > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > +vceqz_p64 (poly64x1_t __a)
> > > +{
> > > +  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
> > > +  return vceq_p64 (__a, __b);
> > > +}
> >
> > This approach is okay, but can we have some kind of test to confirm it
> > generates the VCEQ instruction with immediate zero rather than having a
> > separate DUP...
>
> I had checked that manually, but I'll add a test.
> However, I have noticed that although vceqz_p64 uses vceq.i32 dX, dY, #0,
> the vceqzq_p64 version below first sets
> vmov dZ, #0
> and then emits two
> vceq dX, dY, dZ
>
> I'm looking at why this happens.
>
Hi,

Here is an updated version, which adds two tests (arm/simd/vceqz_p64.c
and arm/simd/vceqzq_p64.c).

The vceqzq_p64 test does not currently expect instructions with
immediate zero, because we generate:
vmov.i32 q9, #0  @ v4si
[...]
vceq.i32 d16, d16, d19
vceq.i32 d17, d17, d19

Looking at the traces, I can see this in reload:
(insn 19 8 15 2 (set (reg:V2SI 48 d16 [orig:128 _18 ] [128])
        (neg:V2SI (eq:V2SI (reg:V2SI 48 d16 [orig:139 v1 ] [139])
                (reg:V2SI 54 d19 [ _5+8 ])))) "/home/christophe.lyon/src/GCC/builds/gcc-fsf-git-neon-intrinsics/tools/lib/gcc/arm-none-linux-gnueabihf/11.0.0/include/arm_neon.h":2404:22 1650 {neon_vceqv2si_insn}
     (expr_list:REG_EQUAL (neg:V2SI (eq:V2SI (subreg:V2SI (reg:DI 48 d16 [orig:139 v1 ] [139]) 0)
                (const_vector:V2SI [
                        (const_int 0 [0]) repeated x2
                    ])))
        (nil)))
(insn 15 19 20 2 (set (reg:V2SI 50 d17 [orig:121 _11 ] [121])
        (neg:V2SI (eq:V2SI (reg:V2SI 50 d17 [orig:141 v2 ] [141])
                (reg:V2SI 54 d19 [ _5+8 ])))) "/home/christophe.lyon/src/GCC/builds/gcc-fsf-git-neon-intrinsics/tools/lib/gcc/arm-none-linux-gnueabihf/11.0.0/include/arm_neon.h":2404:22 1650 {neon_vceqv2si_insn}
     (expr_list:REG_EQUAL (neg:V2SI (eq:V2SI (subreg:V2SI (reg:DI 50 d17 [orig:141 v2 ] [141]) 0)
                (const_vector:V2SI [
                        (const_int 0 [0]) repeated x2
                    ])))
        (nil)))

but it says:
Choosing alt 0 in insn 19:  (0) =w  (1) w  (2) w {neon_vceqv2si_insn}
    alt=0,overall=0,losers=0,rld_nregs=0
Choosing alt 0 in insn 15:  (0) =w  (1) w  (2) w {neon_vceqv2si_insn}
    alt=0,overall=0,losers=0,rld_nregs=0

Why isn't it picking alternative 1 with the Dz constraint?

(For reference, a small standalone example illustrating the intended
semantics of the new intrinsics is included below, after the quoted
thread and before the updated patch.)

Christophe

> Thanks,
>
> Christophe
>
> > Thanks,
> > Kyrill
> >
> > > +
> > > +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> > > +__extension__ extern __inline uint64x2_t
> > > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> > > +{
> > > +  poly64_t __high_a = vget_high_p64 (__a);
> > > +  poly64_t __high_b = vget_high_p64 (__b);
> > > +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
> > > +
> > > +  poly64_t __low_a = vget_low_p64 (__a);
> > > +  poly64_t __low_b = vget_low_p64 (__b);
> > > +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
> > > +  return vcombine_u64 (__low, __high);
> > > +}
> > > +
> > > +__extension__ extern __inline uint64x2_t
> > > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > +vceqzq_p64 (poly64x2_t __a)
> > > +{
> > > +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> > > +  return vceqq_p64 (__a, __b);
> > > +}
> > > +
> > >  /* The vtst_p64 intrinsic does not map to a single instruction.
> > >     We emulate it in way similar to vceq_p64 above but here we do
> > >     a reduction with max since if any two corresponding bits
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > index a3210a9..6aed096 100644
> > > --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > @@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1,
> > >
> > >  /* Expected results: vceq.  */
> > >  VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
> > > +VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> > > +
> > > +/* Expected results: vceqz.  */
> > > +VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
> > > +VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> > >
> > >  /* Expected results: vcombine.  */
> > >  VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0,
> > >                                                    0x88 };
> > > @@ -213,7 +218,7 @@ int main (void)
> > >
> > >    /* vceq_p64 tests.  */
> > >  #undef TEST_MSG
> > > -#define TEST_MSG "VCEQ"
> > > +#define TEST_MSG "VCEQ/VCEQQ"
> > >
> > >  #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)            \
> > >    VECT_VAR(vceq_vector_res, T3, W, N) =                   \
> > > @@ -227,16 +232,55 @@ int main (void)
> > >    DECL_VARIABLE(vceq_vector, poly, 64, 1);
> > >    DECL_VARIABLE(vceq_vector2, poly, 64, 1);
> > >    DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
> > > +  DECL_VARIABLE(vceq_vector, poly, 64, 2);
> > > +  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
> > > +  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
> > >
> > >    CLEAN(result, uint, 64, 1);
> > > +  CLEAN(result, uint, 64, 2);
> > >
> > >    VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
> > > +  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
> > >
> > >    VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
> > > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
> > > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
> > >
> > >    TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
> > > +  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
> > >
> > >    CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
> > > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
> > > +
> > > +  /* vceqz_p64 tests.  */
> > > +#undef TEST_MSG
> > > +#define TEST_MSG "VCEQZ/VCEQZQ"
> > > +
> > > +#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)           \
> > > +  VECT_VAR(vceqz_vector_res, T3, W, N) =                  \
> > > +    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));  \
> > > +  vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceqz_vector_res, T3, W, N))
> > > +
> > > +#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)            \
> > > +  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> > > +
> > > +  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
> > > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
> > > +  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
> > > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
> > > +
> > > +  CLEAN(result, uint, 64, 1);
> > > +  CLEAN(result, uint, 64, 2);
> > > +
> > > +  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
> > > +  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
> > > +  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
> > > +
> > > +  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
> > > +  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
> > > +
> > > +  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
> > > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
> > >
> > >    /* vcombine_p64 tests.  */
> > >  #undef TEST_MSG
> > > --
> > > 2.7.4
> >
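
For reference, here is a small standalone example illustrating the intended
semantics of the new intrinsics. It is only an illustration and not part of
the patch: the input values, the printed output and the compile options are
mine, but the expected lane values mirror the vceq_expected and
vceqz_expected arrays used in p64_p128.c. It should build for an ARM target
with the crypto extensions enabled (e.g. -mfpu=crypto-neon-fp-armv8
-mfloat-abi=hard with a suitable -march/-mcpu).

#include <arm_neon.h>
#include <inttypes.h>
#include <stdio.h>

int main (void)
{
  /* Lane 0 differs between a and b, lane 1 is identical.  */
  poly64x2_t a = vcombine_p64 (vcreate_p64 (0x88), vcreate_p64 (0x12345678));
  poly64x2_t b = vcombine_p64 (vcreate_p64 (0x99), vcreate_p64 (0x12345678));
  /* Only lane 1 of c is zero.  */
  poly64x2_t c = vcombine_p64 (vcreate_p64 (0x99), vcreate_p64 (0x0));

  /* Expect { 0x0, 0xffffffffffffffff }: lane 0 differs, lane 1 matches.  */
  uint64x2_t eq = vceqq_p64 (a, b);
  /* Expect { 0x0, 0xffffffffffffffff }: only lane 1 of c is zero.  */
  uint64x2_t ez = vceqzq_p64 (c);

  printf ("vceqq_p64:  %016" PRIx64 " %016" PRIx64 "\n",
          vgetq_lane_u64 (eq, 0), vgetq_lane_u64 (eq, 1));
  printf ("vceqzq_p64: %016" PRIx64 " %016" PRIx64 "\n",
          vgetq_lane_u64 (ez, 0), vgetq_lane_u64 (ez, 1));
  return 0;
}
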
From e0ca6975a559c445572ae6db30add4081c8207f6 Mon Sep 17 00:00:00 2001
From: Christophe Lyon <christophe.l...@linaro.org>
Date: Thu, 15 Oct 2020 17:13:59 +0000
Subject: [PATCH v2] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64
 intrinsics

This patch adds implementations for vceqq_p64, vceqz_p64 and
vceqzq_p64 intrinsics.

vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
into their high and low halves.

vceqz[q] simply call the vceq and vceqq with a second argument equal
to zero.

The added (executable) testcases make sure that the poly64x2_t
variants have results with one element of all zeroes (false) and the
other element with all bits set to one (true).

2020-10-15  Christophe Lyon  <christophe.l...@linaro.org>

        gcc/
        * config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64): New.

        gcc/testsuite/
        * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
        vceqz_p64, vceqq_p64 and vceqzq_p64.
        * gcc.target/arm/simd/vceqz_p64.c: New test.
        * gcc.target/arm/simd/vceqzq_p64.c: New test.
---
 gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
 .../aarch64/advsimd-intrinsics/p64_p128.c          | 46 +++++++++++++++++++++-
 gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c      | 17 ++++++++
 gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c     | 17 ++++++++
 4 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index aa21730..fd57ed5 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
   return vreinterpret_u64_u32 (__m);
 }
 
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_p64 (poly64x1_t __a)
+{
+  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
+  return vceq_p64 (__a, __b);
+}
+
+/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+  poly64_t __high_a = vget_high_p64 (__a);
+  poly64_t __high_b = vget_high_p64 (__b);
+  uint64x1_t __high = vceq_p64 (__high_a, __high_b);
+
+  poly64_t __low_a = vget_low_p64 (__a);
+  poly64_t __low_b = vget_low_p64 (__b);
+  uint64x1_t __low = vceq_p64 (__low_a, __low_b);
+  return vcombine_u64 (__low, __high);
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_p64 (poly64x2_t __a)
+{
+  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
+  return vceqq_p64 (__a, __b);
+}
+
 /* The vtst_p64 intrinsic does not map to a single instruction.
    We emulate it in way similar to vceq_p64 above but here we do
    a reduction with max since if any two corresponding bits
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
index a3210a9..6aed096 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
@@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1,
 
 /* Expected results: vceq.  */
 VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
+VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
+
+/* Expected results: vceqz.  */
+VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
+VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
 
 /* Expected results: vcombine.  */
 VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0,
                                                   0x88 };
@@ -213,7 +218,7 @@ int main (void)
 
   /* vceq_p64 tests.  */
 #undef TEST_MSG
-#define TEST_MSG "VCEQ"
+#define TEST_MSG "VCEQ/VCEQQ"
 
 #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)          \
   VECT_VAR(vceq_vector_res, T3, W, N) =                 \
@@ -227,16 +232,55 @@ int main (void)
   DECL_VARIABLE(vceq_vector, poly, 64, 1);
   DECL_VARIABLE(vceq_vector2, poly, 64, 1);
   DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
+  DECL_VARIABLE(vceq_vector, poly, 64, 2);
+  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
+  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
 
   CLEAN(result, uint, 64, 1);
+  CLEAN(result, uint, 64, 2);
 
   VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
+  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
 
   VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
+  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
+  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
 
   TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
+  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
 
   CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
+  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
+
+  /* vceqz_p64 tests.  */
+#undef TEST_MSG
+#define TEST_MSG "VCEQZ/VCEQZQ"
+
+#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)         \
+  VECT_VAR(vceqz_vector_res, T3, W, N) =                \
+    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N)); \
+  vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceqz_vector_res, T3, W, N))
+
+#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)          \
+  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
+
+  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
+  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
+  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
+  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
+
+  CLEAN(result, uint, 64, 1);
+  CLEAN(result, uint, 64, 2);
+
+  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
+  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
+  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
+
+  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
+  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
+
+  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
+  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
 
   /* vcombine_p64 tests.  */
 #undef TEST_MSG
diff --git a/gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c b/gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c
new file mode 100644
index 0000000..f26cbff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c
@@ -0,0 +1,17 @@
+/* Test the `vceqz_p64' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-options "-save-temps -O2 -fno-inline" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+
+poly64x1_t v1;
+uint64x1_t result1;
+
+void func()
+{
+  result1 = vceqz_p64 (v1);
+}
+
+/* { dg-final { scan-assembler-times "vceq\.i32\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, #0\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c b/gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c
new file mode 100644
index 0000000..355efd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c
@@ -0,0 +1,17 @@
+/* Test the `vceqzq_p64' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-options "-save-temps -O2 -fno-inline" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+
+poly64x2_t v2;
+uint64x2_t result2;
+
+void func()
+{
+  result2 = vceqzq_p64 (v2);
+}
+
+/* { dg-final { scan-assembler-times "vceq\.i32\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+\n" 2 } } */
-- 
2.7.4