On Thu, 5 Nov 2020 at 12:55, Christophe Lyon <christophe.l...@linaro.org> wrote:
>
> On Thu, 5 Nov 2020 at 10:36, Kyrylo Tkachov <kyrylo.tkac...@arm.com> wrote:
> >
> > Hi Christophe,
> >
> > > -----Original Message-----
> > > From: Gcc-patches <gcc-patches-boun...@gcc.gnu.org> On Behalf Of
> > > Christophe Lyon via Gcc-patches
> > > Sent: 15 October 2020 18:23
> > > To: gcc-patches@gcc.gnu.org
> > > Subject: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
> > >
> > > This patch adds implementations for vceqq_p64, vceqz_p64 and
> > > vceqzq_p64 intrinsics.
> > >
> > > vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
> > > into their high and low halves.
> > >
> > > vceqz[q] simply call the vceq and vceqq with a second argument equal
> > > to zero.
> > >
> > > The added (executable) testcases make sure that the poly64x2_t
> > > variants have results with one element of all zeroes (false) and the
> > > other element with all bits set to one (true).
> > >
> > > 2020-10-15  Christophe Lyon  <christophe.l...@linaro.org>
> > >
> > >         gcc/
> > >         * config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64): New.
> > >
> > >         gcc/testsuite/
> > >         * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
> > >         vceqz_p64, vceqq_p64 and vceqzq_p64.
> > > ---
> > >  gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
> > >  .../aarch64/advsimd-intrinsics/p64_p128.c          | 46 +++++++++++++++++++++-
> > >  2 files changed, 76 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> > > index aa21730..f7eff37 100644
> > > --- a/gcc/config/arm/arm_neon.h
> > > +++ b/gcc/config/arm/arm_neon.h
> > > @@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
> > >    return vreinterpret_u64_u32 (__m);
> > >  }
> > >
> > > +__extension__ extern __inline uint64x1_t
> > > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > +vceqz_p64 (poly64x1_t __a)
> > > +{
> > > +  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
> > > +  return vceq_p64 (__a, __b);
> > > +}
> >
> > This approach is okay, but can we have some kind of test to confirm it
> > generates the VCEQ instruction with immediate zero rather than having a
> > separate DUP...
>
> I had checked that manually, but I'll add a test.
> However, I have noticed that although vceqz_p64 uses vceq.i32 dX, dY, #0,
> the vceqzq_p64 version below first sets
> vmov dZ, #0
> and then emits two
> vceq dX, dY, dZ
>
> I'm looking at why this happens.
>
Hi,

Here is an updated version, which adds two tests (arm/simd/vceqz_p64.c
and arm/simd/vceqzq_p64.c).

The vceqzq_p64 test does not currently expect instructions with
immediate zero, because we generate:
vmov.i32 q9, #0  @ v4si
[...]
vceq.i32 d16, d16, d19
vceq.i32 d17, d17, d19

Looking at the traces, I can see this in reload:
(insn 19 8 15 2 (set (reg:V2SI 48 d16 [orig:128 _18 ] [128])
        (neg:V2SI (eq:V2SI (reg:V2SI 48 d16 [orig:139 v1 ] [139])
                (reg:V2SI 54 d19 [ _5+8 ])))) "/home/christophe.lyon/src/GCC/builds/gcc-fsf-git-neon-intrinsics/tools/lib/gcc/arm-none-linux-gnueabihf/11.0.0/include/arm_neon.h":2404:22 1650 {neon_vceqv2si_insn}
     (expr_list:REG_EQUAL (neg:V2SI (eq:V2SI (subreg:V2SI (reg:DI 48 d16 [orig:139 v1 ] [139]) 0)
                (const_vector:V2SI [
                        (const_int 0 [0]) repeated x2
                    ])))
        (nil)))
(insn 15 19 20 2 (set (reg:V2SI 50 d17 [orig:121 _11 ] [121])
        (neg:V2SI (eq:V2SI (reg:V2SI 50 d17 [orig:141 v2 ] [141])
                (reg:V2SI 54 d19 [ _5+8 ])))) "/home/christophe.lyon/src/GCC/builds/gcc-fsf-git-neon-intrinsics/tools/lib/gcc/arm-none-linux-gnueabihf/11.0.0/include/arm_neon.h":2404:22 1650 {neon_vceqv2si_insn}
     (expr_list:REG_EQUAL (neg:V2SI (eq:V2SI (subreg:V2SI (reg:DI 50 d17 [orig:141 v2 ] [141]) 0)
                (const_vector:V2SI [
                        (const_int 0 [0]) repeated x2
                    ])))
        (nil)))

but it says:
Choosing alt 0 in insn 19:  (0) =w  (1) w  (2) w {neon_vceqv2si_insn}
    alt=0,overall=0,losers=0,rld_nregs=0
Choosing alt 0 in insn 15:  (0) =w  (1) w  (2) w {neon_vceqv2si_insn}
    alt=0,overall=0,losers=0,rld_nregs=0

Why isn't it picking alternative 1 with the Dz constraint?

(For reference, a small standalone example illustrating the intended
semantics of the new intrinsics is included below, after the quoted
thread and before the updated patch.)

Christophe

> Thanks,
>
> Christophe
>
> > Thanks,
> > Kyrill
> >
> > > +
> > > +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> > > +__extension__ extern __inline uint64x2_t
> > > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> > > +{
> > > +  poly64_t __high_a = vget_high_p64 (__a);
> > > +  poly64_t __high_b = vget_high_p64 (__b);
> > > +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
> > > +
> > > +  poly64_t __low_a = vget_low_p64 (__a);
> > > +  poly64_t __low_b = vget_low_p64 (__b);
> > > +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
> > > +  return vcombine_u64 (__low, __high);
> > > +}
> > > +
> > > +__extension__ extern __inline uint64x2_t
> > > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > +vceqzq_p64 (poly64x2_t __a)
> > > +{
> > > +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> > > +  return vceqq_p64 (__a, __b);
> > > +}
> > > +
> > >  /* The vtst_p64 intrinsic does not map to a single instruction.
> > >     We emulate it in way similar to vceq_p64 above but here we do
> > >     a reduction with max since if any two corresponding bits
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > index a3210a9..6aed096 100644
> > > --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > @@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1,
> > >
> > >  /* Expected results: vceq.  */
> > >  VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
> > > +VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> > > +
> > > +/* Expected results: vceqz.  */
> > > +VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
> > > +VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> > >
> > >  /* Expected results: vcombine.  */
> > >  VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0,
> > >                                                    0x88 };
> > > @@ -213,7 +218,7 @@ int main (void)
> > >
> > >    /* vceq_p64 tests.  */
> > >  #undef TEST_MSG
> > > -#define TEST_MSG "VCEQ"
> > > +#define TEST_MSG "VCEQ/VCEQQ"
> > >
> > >  #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)            \
> > >    VECT_VAR(vceq_vector_res, T3, W, N) =                   \
> > > @@ -227,16 +232,55 @@ int main (void)
> > >    DECL_VARIABLE(vceq_vector, poly, 64, 1);
> > >    DECL_VARIABLE(vceq_vector2, poly, 64, 1);
> > >    DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
> > > +  DECL_VARIABLE(vceq_vector, poly, 64, 2);
> > > +  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
> > > +  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
> > >
> > >    CLEAN(result, uint, 64, 1);
> > > +  CLEAN(result, uint, 64, 2);
> > >
> > >    VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
> > > +  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
> > >
> > >    VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
> > > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
> > > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
> > >
> > >    TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
> > > +  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
> > >
> > >    CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
> > > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
> > > +
> > > +  /* vceqz_p64 tests.  */
> > > +#undef TEST_MSG
> > > +#define TEST_MSG "VCEQZ/VCEQZQ"
> > > +
> > > +#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)           \
> > > +  VECT_VAR(vceqz_vector_res, T3, W, N) =                  \
> > > +    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));  \
> > > +  vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceqz_vector_res, T3, W, N))
> > > +
> > > +#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)            \
> > > +  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> > > +
> > > +  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
> > > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
> > > +  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
> > > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
> > > +
> > > +  CLEAN(result, uint, 64, 1);
> > > +  CLEAN(result, uint, 64, 2);
> > > +
> > > +  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
> > > +  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
> > > +  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
> > > +
> > > +  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
> > > +  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
> > > +
> > > +  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
> > > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
> > >
> > >    /* vcombine_p64 tests.  */
> > >  #undef TEST_MSG
> > > --
> > > 2.7.4
> >
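
For reference, here is a small standalone example illustrating the intended
semantics of the new intrinsics. It is only an illustration and not part of
the patch: the input values, the printed output and the compile options are
mine, but the expected lane values mirror the vceq_expected and
vceqz_expected arrays used in p64_p128.c. It should build for an ARM target
with the crypto extensions enabled (e.g. -mfpu=crypto-neon-fp-armv8
-mfloat-abi=hard with a suitable -march/-mcpu).

#include <arm_neon.h>
#include <inttypes.h>
#include <stdio.h>

int main (void)
{
  /* Lane 0 differs between a and b, lane 1 is identical.  */
  poly64x2_t a = vcombine_p64 (vcreate_p64 (0x88), vcreate_p64 (0x12345678));
  poly64x2_t b = vcombine_p64 (vcreate_p64 (0x99), vcreate_p64 (0x12345678));
  /* Only lane 1 of c is zero.  */
  poly64x2_t c = vcombine_p64 (vcreate_p64 (0x99), vcreate_p64 (0x0));

  /* Expect { 0x0, 0xffffffffffffffff }: lane 0 differs, lane 1 matches.  */
  uint64x2_t eq = vceqq_p64 (a, b);
  /* Expect { 0x0, 0xffffffffffffffff }: only lane 1 of c is zero.  */
  uint64x2_t ez = vceqzq_p64 (c);

  printf ("vceqq_p64:  %016" PRIx64 " %016" PRIx64 "\n",
          vgetq_lane_u64 (eq, 0), vgetq_lane_u64 (eq, 1));
  printf ("vceqzq_p64: %016" PRIx64 " %016" PRIx64 "\n",
          vgetq_lane_u64 (ez, 0), vgetq_lane_u64 (ez, 1));
  return 0;
}
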
From e0ca6975a559c445572ae6db30add4081c8207f6 Mon Sep 17 00:00:00 2001
From: Christophe Lyon <christophe.l...@linaro.org>
Date: Thu, 15 Oct 2020 17:13:59 +0000
Subject: [PATCH v2] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64
 intrinsics

This patch adds implementations for vceqq_p64, vceqz_p64 and
vceqzq_p64 intrinsics.

vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
into their high and low halves.

vceqz[q] simply call the vceq and vceqq with a second argument equal
to zero.

The added (executable) testcases make sure that the poly64x2_t
variants have results with one element of all zeroes (false) and the
other element with all bits set to one (true).

2020-10-15  Christophe Lyon  <christophe.l...@linaro.org>

        gcc/
        * config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64): New.

        gcc/testsuite/
        * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
        vceqz_p64, vceqq_p64 and vceqzq_p64.
        * gcc.target/arm/simd/vceqz_p64.c: New test.
        * gcc.target/arm/simd/vceqzq_p64.c: New test.
---
 gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
 .../aarch64/advsimd-intrinsics/p64_p128.c          | 46 +++++++++++++++++++++-
 gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c      | 17 ++++++++
 gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c     | 17 ++++++++
 4 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index aa21730..fd57ed5 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
   return vreinterpret_u64_u32 (__m);
 }
 
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_p64 (poly64x1_t __a)
+{
+  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
+  return vceq_p64 (__a, __b);
+}
+
+/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+  poly64_t __high_a = vget_high_p64 (__a);
+  poly64_t __high_b = vget_high_p64 (__b);
+  uint64x1_t __high = vceq_p64 (__high_a, __high_b);
+
+  poly64_t __low_a = vget_low_p64 (__a);
+  poly64_t __low_b = vget_low_p64 (__b);
+  uint64x1_t __low = vceq_p64 (__low_a, __low_b);
+  return vcombine_u64 (__low, __high);
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_p64 (poly64x2_t __a)
+{
+  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
+  return vceqq_p64 (__a, __b);
+}
+
 /* The vtst_p64 intrinsic does not map to a single instruction.
    We emulate it in way similar to vceq_p64 above but here we do
    a reduction with max since if any two corresponding bits
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
index a3210a9..6aed096 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
@@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1,
 
 /* Expected results: vceq.  */
 VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
+VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
+
+/* Expected results: vceqz.  */
+VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
+VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
 
 /* Expected results: vcombine.  */
 VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0,
                                                   0x88 };
@@ -213,7 +218,7 @@ int main (void)
 
   /* vceq_p64 tests.  */
 #undef TEST_MSG
-#define TEST_MSG "VCEQ"
+#define TEST_MSG "VCEQ/VCEQQ"
 
 #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)          \
   VECT_VAR(vceq_vector_res, T3, W, N) =                 \
@@ -227,16 +232,55 @@ int main (void)
   DECL_VARIABLE(vceq_vector, poly, 64, 1);
   DECL_VARIABLE(vceq_vector2, poly, 64, 1);
   DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
+  DECL_VARIABLE(vceq_vector, poly, 64, 2);
+  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
+  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
 
   CLEAN(result, uint, 64, 1);
+  CLEAN(result, uint, 64, 2);
 
   VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
+  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
 
   VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
+  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
+  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
 
   TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
+  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
 
   CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
+  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
+
+  /* vceqz_p64 tests.  */
+#undef TEST_MSG
+#define TEST_MSG "VCEQZ/VCEQZQ"
+
+#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)         \
+  VECT_VAR(vceqz_vector_res, T3, W, N) =                \
+    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N)); \
+  vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceqz_vector_res, T3, W, N))
+
+#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)          \
+  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
+
+  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
+  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
+  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
+  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
+
+  CLEAN(result, uint, 64, 1);
+  CLEAN(result, uint, 64, 2);
+
+  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
+  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
+  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
+
+  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
+  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
+
+  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
+  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
 
   /* vcombine_p64 tests.  */
 #undef TEST_MSG
diff --git a/gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c b/gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c
new file mode 100644
index 0000000..f26cbff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c
@@ -0,0 +1,17 @@
+/* Test the `vceqz_p64' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-options "-save-temps -O2 -fno-inline" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+
+poly64x1_t v1;
+uint64x1_t result1;
+
+void func()
+{
+  result1 = vceqz_p64 (v1);
+}
+
+/* { dg-final { scan-assembler-times "vceq\.i32\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, #0\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c b/gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c
new file mode 100644
index 0000000..355efd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c
@@ -0,0 +1,17 @@
+/* Test the `vceqzq_p64' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-options "-save-temps -O2 -fno-inline" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+
+poly64x2_t v2;
+uint64x2_t result2;
+
+void func()
+{
+  result2 = vceqzq_p64 (v2);
+}
+
+/* { dg-final { scan-assembler-times "vceq\.i32\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+\n" 2 } } */
-- 
2.7.4