On Wed, 5 May 2021 at 19:39, Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > Hi All, > > This adds optabs implementing usdot_prod. > > The following testcase: > > #define N 480 > #define SIGNEDNESS_1 unsigned > #define SIGNEDNESS_2 signed > #define SIGNEDNESS_3 signed > #define SIGNEDNESS_4 unsigned > > SIGNEDNESS_1 int __attribute__ ((noipa)) > f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, > SIGNEDNESS_4 char *restrict b) > { > for (__INTPTR_TYPE__ i = 0; i < N; ++i) > { > int av = a[i]; > int bv = b[i]; > SIGNEDNESS_2 short mult = av * bv; > res += mult; > } > return res; > } > > Generates > > f: > vmov.i32 q8, #0 @ v4si > add r3, r2, #480 > .L2: > vld1.8 {q10}, [r2]! > vld1.8 {q9}, [r1]! > vusdot.s8 q8, q9, q10 > cmp r3, r2 > bne .L2 > vadd.i32 d16, d16, d17 > vpadd.i32 d16, d16, d16 > vmov.32 r3, d16[0] > add r0, r0, r3 > bx lr > > instead of > > f: > vmov.i32 q8, #0 @ v4si > add r3, r2, #480 > .L2: > vld1.8 {q9}, [r2]! > vld1.8 {q11}, [r1]! > cmp r3, r2 > vmull.s8 q10, d18, d22 > vmull.s8 q9, d19, d23 > vaddw.s16 q8, q8, d20 > vaddw.s16 q8, q8, d21 > vaddw.s16 q8, q8, d18 > vaddw.s16 q8, q8, d19 > bne .L2 > vadd.i32 d16, d16, d17 > vpadd.i32 d16, d16, d16 > vmov.32 r3, d16[0] > add r0, r0, r3 > bx lr > > For NEON. I couldn't figure out if the MVE instruction vmlaldav.s16 could be > used to emulate this. Because it would require additional widening to work I > left MVE out of this patch set but perhaps someone should take a look. > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
I guess you mean arm-linux-gnueabihf rather than aarch64-none-linux-gnu? > > Ok for master? > > Thanks, > Tamar > > gcc/ChangeLog: > > * config/arm/neon.md (usdot_prod<vsi2qi>): New. > > gcc/testsuite/ChangeLog: > > * gcc.target/arm/simd/vusdot-autovec.c: New test. > > --- inline copy of patch -- > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md > index > fec2cc91d24b6eff7b6fc8fdd54f39b3d646c468..23ad411178db77c5d19bee7452bc1070331c1aa0 > 100644 > --- a/gcc/config/arm/neon.md > +++ b/gcc/config/arm/neon.md > @@ -3075,6 +3075,24 @@ (define_expand "<sup>dot_prod<vsi2qi>" > DONE; > }) > > +;; Auto-vectorizer pattern for usdot > +(define_expand "usdot_prod<vsi2qi>" > + [(set (match_operand:VCVTI 0 "register_operand") > + (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1 > + "register_operand") > + (match_operand:<VSI2QI> 2 > + "register_operand")] > + UNSPEC_DOT_US) > + (match_operand:VCVTI 3 "register_operand")))] > + "TARGET_I8MM" > +{ > + emit_insn ( > + gen_neon_usdot<vsi2qi> (operands[3], operands[3], operands[1], > + operands[2])); > + emit_insn (gen_rtx_SET (operands[0], operands[3])); > + DONE; > +}) > + > (define_expand "neon_copysignf<mode>" > [(match_operand:VCVTF 0 "register_operand") > (match_operand:VCVTF 1 "register_operand") > diff --git a/gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c > b/gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..7cc56f68817d77d6950df0ab372d6fbaad6b3813 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/arm/simd/vusdot-autovec.c > @@ -0,0 +1,38 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */ > + > +#define N 480 > +#define SIGNEDNESS_1 unsigned > +#define SIGNEDNESS_2 signed > +#define SIGNEDNESS_3 signed > +#define SIGNEDNESS_4 unsigned > + > +SIGNEDNESS_1 int __attribute__ ((noipa)) > +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, > + SIGNEDNESS_4 char *restrict b) > +{ > + for (__INTPTR_TYPE__ i = 0; i < N; ++i) > + { > + 
int av = a[i]; > + int bv = b[i]; > + SIGNEDNESS_2 short mult = av * bv; > + res += mult; > + } > + return res; > +} > + > +SIGNEDNESS_1 int __attribute__ ((noipa)) > +g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b, > + SIGNEDNESS_4 char *restrict a) > +{ > + for (__INTPTR_TYPE__ i = 0; i < N; ++i) > + { > + int av = a[i]; > + int bv = b[i]; > + SIGNEDNESS_2 short mult = av * bv; > + res += mult; > + } > + return res; > +} > + > +/* { dg-final { scan-assembler-times {vusdot.s8} 2 { target { > arm-*-*-gnueabihf } } } } */ > > > --