Jonathan Wright via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Hi,
>
> As subject, this patch rewrites the vsri[q]_n_p* Neon intrinsics to use RTL
> builtins rather than inline assembly code, allowing for better scheduling
> and optimization.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
OK, thanks.

Richard

> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-02-10  Jonathan Wright  <jonathan.wri...@arm.com>
>
> 	* config/aarch64/aarch64-simd-builtins.def: Add polynomial
> 	ssri_n builtin generator macro.
> 	* config/aarch64/arm_neon.h (vsri_n_p8): Use RTL builtin
> 	instead of inline asm.
> 	(vsri_n_p16): Likewise.
> 	(vsri_n_p64): Likewise.
> 	(vsriq_n_p8): Likewise.
> 	(vsriq_n_p16): Likewise.
> 	(vsriq_n_p64): Likewise.
>
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index afd1e9706fb6922b175d1027458362acfc8eb442..4c55a424233d437184ceaf66b6983b79a907fce4 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -436,6 +436,7 @@
>    BUILTIN_VQN (USHIFT2IMM, uqrshrn2_n, 0, NONE)
>    /* Implemented by aarch64_<sur>s<lr>i_n<mode>.  */
>    BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0, NONE)
> +  BUILTIN_VALLP (SHIFTINSERTP, ssri_n, 0, NONE)
>    BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0, NONE)
>    BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0, NONE)
>    BUILTIN_VALLP (SHIFTINSERTP, ssli_n, 0, NONE)
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index d0a65eddfed53025adeea4146a551bdd506c7da1..41cd6ccb354b0231409372c0f1b5e1b87e4a9169 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -9070,83 +9070,47 @@ vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
>    return __builtin_aarch64_ssli_nv8hi_ppps (__a, __b, __c);
>  }
>
> -#define vsri_n_p8(a, b, c)                                      \
> -  __extension__                                                 \
> -    ({                                                          \
> -       poly8x8_t b_ = (b);                                      \
> -       poly8x8_t a_ = (a);                                      \
> -       poly8x8_t result;                                        \
> -       __asm__ ("sri %0.8b,%2.8b,%3"                            \
> -                : "=w"(result)                                  \
> -                : "0"(a_), "w"(b_), "i"(c)                      \
> -                : /* No clobbers */);                           \
> -       result;                                                  \
> -     })
> -
> -#define vsri_n_p16(a, b, c)                                     \
> -  __extension__                                                 \
> -    ({                                                          \
> -       poly16x4_t b_ = (b);                                     \
> -       poly16x4_t a_ = (a);                                     \
> -       poly16x4_t result;                                       \
> -       __asm__ ("sri %0.4h,%2.4h,%3"                            \
> -                : "=w"(result)                                  \
> -                : "0"(a_), "w"(b_), "i"(c)                      \
> -                : /* No clobbers */);                           \
> -       result;                                                  \
> -     })
> -
> -#define vsri_n_p64(a, b, c)                                     \
> -  __extension__                                                 \
> -    ({                                                          \
> -       poly64x1_t b_ = (b);                                     \
> -       poly64x1_t a_ = (a);                                     \
> -       poly64x1_t result;                                       \
> -       __asm__ ("sri %d0,%d2,%3"                                \
> -                : "=w"(result)                                  \
> -                : "0"(a_), "w"(b_), "i"(c)                      \
> -                : /* No clobbers.  */);                         \
> -       result;                                                  \
> -     })
> -
> -#define vsriq_n_p8(a, b, c)                                     \
> -  __extension__                                                 \
> -    ({                                                          \
> -       poly8x16_t b_ = (b);                                     \
> -       poly8x16_t a_ = (a);                                     \
> -       poly8x16_t result;                                       \
> -       __asm__ ("sri %0.16b,%2.16b,%3"                          \
> -                : "=w"(result)                                  \
> -                : "0"(a_), "w"(b_), "i"(c)                      \
> -                : /* No clobbers */);                           \
> -       result;                                                  \
> -     })
> -
> -#define vsriq_n_p16(a, b, c)                                    \
> -  __extension__                                                 \
> -    ({                                                          \
> -       poly16x8_t b_ = (b);                                     \
> -       poly16x8_t a_ = (a);                                     \
> -       poly16x8_t result;                                       \
> -       __asm__ ("sri %0.8h,%2.8h,%3"                            \
> -                : "=w"(result)                                  \
> -                : "0"(a_), "w"(b_), "i"(c)                      \
> -                : /* No clobbers */);                           \
> -       result;                                                  \
> -     })
> -
> -#define vsriq_n_p64(a, b, c)                                    \
> -  __extension__                                                 \
> -    ({                                                          \
> -       poly64x2_t b_ = (b);                                     \
> -       poly64x2_t a_ = (a);                                     \
> -       poly64x2_t result;                                       \
> -       __asm__ ("sri %0.2d,%2.2d,%3"                            \
> -                : "=w"(result)                                  \
> -                : "0"(a_), "w"(b_), "i"(c)                      \
> -                : /* No clobbers.  */);                         \
> -       result;                                                  \
> -     })
> +__extension__ extern __inline poly8x8_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vsri_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
> +{
> +  return __builtin_aarch64_ssri_nv8qi_ppps (__a, __b, __c);
> +}
> +
> +__extension__ extern __inline poly16x4_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
> +{
> +  return __builtin_aarch64_ssri_nv4hi_ppps (__a, __b, __c);
> +}
> +
> +__extension__ extern __inline poly64x1_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vsri_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
> +{
> +  return (poly64x1_t) __builtin_aarch64_ssri_ndi_ppps (__a[0], __b[0], __c);
> +}
> +
> +__extension__ extern __inline poly8x16_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vsriq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
> +{
> +  return __builtin_aarch64_ssri_nv16qi_ppps (__a, __b, __c);
> +}
> +
> +__extension__ extern __inline poly16x8_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
> +{
> +  return __builtin_aarch64_ssri_nv8hi_ppps (__a, __b, __c);
> +}
> +
> +__extension__ extern __inline poly64x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vsriq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
> +{
> +  return __builtin_aarch64_ssri_nv2di_ppps (__a, __b, __c);
> +}
>
>  __extension__ extern __inline uint8x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
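[Editor's note: for readers unfamiliar with the instruction behind these
intrinsics, SRI (shift right and insert) shifts each element of the second
operand right by the immediate and inserts the result into the destination,
leaving the top bits of each destination element untouched.  Below is a
minimal sketch of that semantics.  It is illustrative only, not GCC code:
sri8_model is a hypothetical scalar helper, and the program assumes an
aarch64 toolchain where arm_neon.h is available.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar model of one 8-bit SRI lane: shift the source element right
   by C and insert it into the low bits of the destination, keeping
   the destination's top C bits.  Hypothetical helper, for illustration.  */
static uint8_t
sri8_model (uint8_t a, uint8_t b, unsigned c)
{
  uint8_t keep_mask = (uint8_t) ~(0xFFu >> c); /* top C bits of A survive */
  return (uint8_t) ((a & keep_mask) | (b >> c));
}

int
main (void)
{
  poly8x8_t a = vdup_n_p8 (0xAB);
  poly8x8_t b = vdup_n_p8 (0xFF);

  /* With this patch, the intrinsic expands to the RTL builtin and emits
     a single SRI instruction rather than an opaque inline asm block.  */
  poly8x8_t r = vsri_n_p8 (a, b, 4);

  /* Both print 0xaf: the top four bits of A, the low four from B >> 4.  */
  printf ("intrinsic 0x%02x, model 0x%02x\n",
          vget_lane_p8 (r, 0), sri8_model (0xAB, 0xFF, 4));
  return 0;
}

Because the intrinsic now expands through an RTL builtin, the compiler can
schedule the SRI instruction against surrounding code and optimize across
it, which is the improvement the cover letter describes.]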