Tejas Belagod via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Hi,
>
> Loops containing long long shifts fail to vectorize due to the vectorizer
> not being able to recognize long long right shifts. This is due to a bug
> in the iterator used for the vashr and vlshr patterns in aarch64-simd.md.
>
> Tested and bootstrapped on aarch64-linux. OK?
>
> 2021-08-05 Tejas Belagod <tejas.bela...@arm.com>
>
> gcc/ChangeLog:
>
> PR target/101609
> * config/aarch64/aarch64-simd.md (vlshr<mode>3, vashr<mode>3): Use
> the right iterator.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/vect-shr-reg.c: New testcase.
> * gcc.target/aarch64/vect-shr-reg-run.c: Likewise.
OK, thanks. Nice how we're still finding these little easter eggs
from the dawn of the port. :-)

Richard

>
>
> Thanks,
> Tejas Belagod.
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index
> c5638d096fa84a27b4ea397f62cd0d05a28e7c8c..48eddf64e05afe3788abfa05141f6544a9323ea1
> 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1299,13 +1299,10 @@ (define_expand "vashl<mode>3"
> DONE;
> })
>
> -;; Using mode VDQ_BHSI as there is no V2DImode neg!
> -;; Negating individual lanes most certainly offsets the
> -;; gain from vectorization.
> (define_expand "vashr<mode>3"
> - [(match_operand:VDQ_BHSI 0 "register_operand")
> - (match_operand:VDQ_BHSI 1 "register_operand")
> - (match_operand:VDQ_BHSI 2 "register_operand")]
> + [(match_operand:VDQ_I 0 "register_operand")
> + (match_operand:VDQ_I 1 "register_operand")
> + (match_operand:VDQ_I 2 "register_operand")]
> "TARGET_SIMD"
> {
> rtx neg = gen_reg_rtx (<MODE>mode);
> @@ -1333,9 +1330,9 @@ (define_expand "aarch64_ashr_simddi"
> )
>
> (define_expand "vlshr<mode>3"
> - [(match_operand:VDQ_BHSI 0 "register_operand")
> - (match_operand:VDQ_BHSI 1 "register_operand")
> - (match_operand:VDQ_BHSI 2 "register_operand")]
> + [(match_operand:VDQ_I 0 "register_operand")
> + (match_operand:VDQ_I 1 "register_operand")
> + (match_operand:VDQ_I 2 "register_operand")]
> "TARGET_SIMD"
> {
> rtx neg = gen_reg_rtx (<MODE>mode);
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-shr-reg-run.c
> b/gcc/testsuite/gcc.target/aarch64/vect-shr-reg-run.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..3190448e0936b9d5265f538304f9d20f13927339
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-shr-reg-run.c
> @@ -0,0 +1,53 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -march=armv8.2-a" } */
> +
> +#include "vect-shr-reg.c"
> +
> +int
> +main(void)
> +{
> + int64_t a[16];
> + int64_t b[16];
> + int64_t c[17];
> + uint64_t ua[16];
> + uint64_t ub[16];
> + uint64_t uc[17];
> +
> + int64_t res_a[16];
> + uint64_t res_ua[16];
> +
> + int i;
> +
> + /* Set up inputs. */
> + for (i = 0; i < 16; i++)
> + {
> + b[i] = -2;
> + c[i] = 34;
> + ub[i] = 0xffffffffffffffff;
> + uc[i] = 52;
> + }
> +
> + /* Set up reference values. */
> + for (i = 0; i < 16; i++)
> + {
> + res_a[i] = -1LL;
> + res_ua[i] = 0x0fffLL;
> + }
> +
> + /* Do the shifts. */
> + f (ua, ub, uc);
> + g (a, b, c);
> +
> + /* Compare outputs against reference values. */
> + for (i = 0; i < 16; i++)
> + {
> + if (a[i] != res_a[i])
> + __builtin_abort ();
> +
> + if (ua[i] != res_ua[i])
> + __builtin_abort ();
> + }
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-shr-reg.c
> b/gcc/testsuite/gcc.target/aarch64/vect-shr-reg.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..5736dafb5a19957032e7b4bc1e90b218f52788fb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-shr-reg.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=armv8.2-a" } */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +
> +#pragma GCC target "+nosve"
> +
> +int __attribute__((noinline))
> +f(uint64_t *__restrict a, uint64_t *__restrict b, uint64_t *__restrict c)
> +{
> + int i;
> +
> + for (i = 0; i < 16; i++)
> + a[i] = b[i] >> c[i];
> +}
> +
> +
> +int __attribute__((noinline))
> +g(int64_t *__restrict a, int64_t *__restrict b, int64_t *__restrict c)
> +{
> + int i;
> +
> + for (i = 0; i < 16; i++)
> + a[i] = b[i] >> c[i];
> +}
> +
> +/* { dg-final { scan-assembler "neg\\tv" } } */
> +/* { dg-final { scan-assembler "ushl\\tv" } } */
> +/* { dg-final { scan-assembler "sshl\\tv" } } */