ping
> -----Original Message----- > From: Tamar Christina > Sent: Tuesday, December 21, 2021 12:31 PM > To: gcc-patches@gcc.gnu.org > Cc: nd <n...@arm.com>; Ramana Radhakrishnan > <ramana.radhakrish...@arm.com>; Richard Earnshaw > <richard.earns...@arm.com>; ni...@redhat.com; Kyrylo Tkachov > <kyrylo.tkac...@arm.com> > Subject: [AArch32]: correct dot-product RTL patterns. > > Hi All, > > The previous fix for this problem was wrong due to a subtle difference > between where NEON expects the RMW values and where intrinsics expects > them. > > The insn pattern is modeled after the intrinsics and so needs an expand for > the vectorizer optab to switch the RTL. > > However operand[3] is not expected to be written to so the current pattern > is bogus. > > Instead we use the expand to shuffle around the RTL. > > The vectorizer expects operands[3] and operands[0] to be the same but the > aarch64 intrinsics expanders expect operands[0] and operands[1] to be the > same. > > This also fixes some issues with big-endian, each dot product performs 4 8- > byte multiplications. However compared to AArch64 we don't enter lanes in > GCC lane indexed in AArch32 aside from loads/stores. This means no lane > remappings are done in arm-builtins.c and so none should be done at the > instruction side. > > There are some other instructions that need inspections as I think there are > more incorrect ones. > > Third there was a bug in the ACLE specication for dot product which has now > been fixed[1]. This means some intrinsics were missing and are added by > this patch. > > Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues. > > Ok for master? and active branches after some stew? > > [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3 > > Thanks, > Tamar > > gcc/ChangeLog: > > * config/arm/arm_neon.h (vdot_laneq_u32, vdotq_laneq_u32, > vdot_laneq_s32, vdotq_laneq_s32): New. > * config/arm/arm_neon_builtins.def (sdot_laneq, udot_laneq: New. > * config/arm/neon.md (neon_<sup>dot<vsi2qi>): New. > (<sup>dot_prod<vsi2qi>): Re-order rtl. > (neon_<sup>dot_lane<vsi2qi>): Fix rtl order and endiannes. > (neon_<sup>dot_laneq<vsi2qi>): New. > > gcc/testsuite/ChangeLog: > > * gcc.target/arm/simd/vdot-compile.c: Add new cases. > * gcc.target/arm/simd/vdot-exec.c: Likewise. > > --- inline copy of patch -- > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index > 3364b37f69dfc33082388246c03149d9ad66a634..af6ac63dc3b47830d92f199d93 > 153ff510f658e9 100644 > --- a/gcc/config/arm/arm_neon.h > +++ b/gcc/config/arm/arm_neon.h > @@ -18243,6 +18243,35 @@ vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, > int8x8_t __b, const int __index) > return __builtin_neon_sdot_lanev16qi (__r, __a, __b, __index); } > > +__extension__ extern __inline uint32x2_t __attribute__ > +((__always_inline__, __gnu_inline__, __artificial__)) > +vdot_laneq_u32 (uint32x2_t __r, uint8x8_t __a, uint8x16_t __b, const > +int __index) { > + return __builtin_neon_udot_laneqv8qi_uuuus (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline uint32x4_t __attribute__ > +((__always_inline__, __gnu_inline__, __artificial__)) > +vdotq_laneq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b, > + const int __index) > +{ > + return __builtin_neon_udot_laneqv16qi_uuuus (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x2_t __attribute__ > +((__always_inline__, __gnu_inline__, __artificial__)) > +vdot_laneq_s32 (int32x2_t __r, int8x8_t __a, int8x16_t __b, const int > +__index) { > + return __builtin_neon_sdot_laneqv8qi (__r, __a, __b, __index); } > + > +__extension__ extern __inline int32x4_t __attribute__ > +((__always_inline__, __gnu_inline__, __artificial__)) > +vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int > +__index) { > + return __builtin_neon_sdot_laneqv16qi (__r, __a, __b, __index); } > + > #pragma GCC pop_options > #endif > > diff --git a/gcc/config/arm/arm_neon_builtins.def > b/gcc/config/arm/arm_neon_builtins.def > index > fafb5c6fc51c16679ead1afda7cccfea8264fd15..f83dd4327c16c0af68f72eb6d9ca > 8cf21e2e56b5 100644 > --- a/gcc/config/arm/arm_neon_builtins.def > +++ b/gcc/config/arm/arm_neon_builtins.def > @@ -342,6 +342,8 @@ VAR2 (TERNOP, sdot, v8qi, v16qi) > VAR2 (UTERNOP, udot, v8qi, v16qi) > VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi) > VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) > +VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi) > +VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi) > > VAR1 (USTERNOP, usdot, v8qi) > VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) diff --git > a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index > 8b0a396947cc8e7345f178b926128d7224fb218a..848166311b5f82c5facb66e97c > 2260a5aba5d302 100644 > --- a/gcc/config/arm/neon.md > +++ b/gcc/config/arm/neon.md > @@ -2866,20 +2866,49 @@ (define_expand "cmul<conj_op><mode>3" > }) > > > -;; These instructions map to the __builtins for the Dot Product operations. > -(define_insn "neon_<sup>dot<vsi2qi>" > +;; These map to the auto-vectorizer Dot Product optab. > +;; The auto-vectorizer expects a dot product builtin that also does an > +;; accumulation into the provided register. > +;; Given the following pattern > +;; > +;; for (i=0; i<len; i++) { > +;; c = a[i] * b[i]; > +;; r += c; > +;; } > +;; return result; > +;; > +;; This can be auto-vectorized to > +;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; ;; ;; given > +enough iterations. However the vectorizer can keep unrolling the loop > +;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; ;; r += > +a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; ;; ... > +;; > +;; and so the vectorizer provides r, in which the result has to be > accumulated. > +(define_insn "<sup>dot_prod<vsi2qi>" > [(set (match_operand:VCVTI 0 "register_operand" "=w") > - (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0") > - (unspec:VCVTI [(match_operand:<VSI2QI> 2 > - "register_operand" > "w") > - (match_operand:<VSI2QI> 3 > - "register_operand" > "w")] > - DOTPROD)))] > + (plus:VCVTI > + (unspec:VCVTI [(match_operand:<VSI2QI> 1 "register_operand" > "w") > + (match_operand:<VSI2QI> 2 "register_operand" > "w")] > + DOTPROD) > + (match_operand:VCVTI 3 "register_operand" "0")))] > "TARGET_DOTPROD" > - "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %<V_reg>3" > + "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" > [(set_attr "type" "neon_dot<q>")] > ) > > +;; These instructions map to the __builtins for the Dot Product > +operations (define_expand "neon_<sup>dot<vsi2qi>" > + [(set (match_operand:VCVTI 0 "register_operand" "=w") > + (plus:VCVTI > + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand") > + (match_operand:<VSI2QI> 3 "register_operand")] > + DOTPROD) > + (match_operand:VCVTI 1 "register_operand")))] > + "TARGET_DOTPROD" > +) > + > ;; These instructions map to the __builtins for the Dot Product operations. > (define_insn "neon_usdot<vsi2qi>" > [(set (match_operand:VCVTI 0 "register_operand" "=w") @@ -2898,17 > +2927,40 @@ (define_insn "neon_usdot<vsi2qi>" > ;; indexed operations. > (define_insn "neon_<sup>dot_lane<vsi2qi>" > [(set (match_operand:VCVTI 0 "register_operand" "=w") > - (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0") > - (unspec:VCVTI [(match_operand:<VSI2QI> 2 > - "register_operand" > "w") > - (match_operand:V8QI 3 "register_operand" > "t") > - (match_operand:SI 4 "immediate_operand" > "i")] > - DOTPROD)))] > + (plus:VCVTI > + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" > "w") > + (match_operand:V8QI 3 "register_operand" "t") > + (match_operand:SI 4 "immediate_operand" "i")] > + DOTPROD) > + (match_operand:VCVTI 1 "register_operand" "0")))] > + "TARGET_DOTPROD" > + "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"; > + [(set_attr "type" "neon_dot<q>")] > +) > + > +;; These instructions map to the __builtins for the Dot Product ;; > +indexed operations. > +(define_insn "neon_<sup>dot_laneq<vsi2qi>" > + [(set (match_operand:VCVTI 0 "register_operand" "=w") > + (plus:VCVTI > + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" > "w") > + (match_operand:V16QI 3 "register_operand" "t") > + (match_operand:SI 4 "immediate_operand" "i")] > + DOTPROD) > + (match_operand:VCVTI 1 "register_operand" "0")))] > "TARGET_DOTPROD" > { > - operands[4] > - = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL > (operands[4]))); > - return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"; > + int lane = INTVAL (operands[4]); > + if (lane > GET_MODE_NUNITS (V2SImode) - 1) > + { > + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode)); > + return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]"; > + } > + else > + { > + operands[4] = GEN_INT (lane); > + return > "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]"; > + } > } > [(set_attr "type" "neon_dot<q>")] > ) > @@ -2932,43 +2984,6 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>" > [(set_attr "type" "neon_dot<q>")] > ) > > -;; These expands map to the Dot Product optab the vectorizer checks for. > -;; The auto-vectorizer expects a dot product builtin that also does an -;; > accumulation into the provided register. > -;; Given the following pattern > -;; > -;; for (i=0; i<len; i++) { > -;; c = a[i] * b[i]; > -;; r += c; > -;; } > -;; return result; > -;; > -;; This can be auto-vectorized to > -;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; -;; -;; given enough > iterations. However the vectorizer can keep unrolling the loop -;; r += > a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; -;; r += a[8]*b[8] + a[9]*b[9] > + > a[10]*b[10] + a[11]*b[11]; -;; ... > -;; > -;; and so the vectorizer provides r, in which the result has to be > accumulated. > -(define_expand "<sup>dot_prod<vsi2qi>" > - [(set (match_operand:VCVTI 0 "register_operand") > - (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1 > - "register_operand") > - (match_operand:<VSI2QI> 2 > - "register_operand")] > - DOTPROD) > - (match_operand:VCVTI 3 "register_operand")))] > - "TARGET_DOTPROD" > -{ > - emit_insn ( > - gen_neon_<sup>dot<vsi2qi> (operands[3], operands[3], operands[1], > - operands[2])); > - emit_insn (gen_rtx_SET (operands[0], operands[3])); > - DONE; > -}) > - > ;; Auto-vectorizer pattern for usdot > (define_expand "usdot_prod<vsi2qi>" > [(set (match_operand:VCVTI 0 "register_operand") diff --git > a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c > b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c > index > b3bd3bf00e3822fdd60b5955165583d5a5cdc1d0..d3541e829a44fa07972096a02 > 226adea1d26f09d 100644 > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c > @@ -49,8 +49,28 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, int8x8_t > y) > return vdotq_lane_s32 (r, x, y, 0); > } > > -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0- > 9]+} 4 } } */ > +int32x2_t sfoo_laneq1 (int32x2_t r, int8x8_t x, int8x16_t y) { > + return vdot_laneq_s32 (r, x, y, 0); > +} > + > +int32x4_t sfooq_lane1 (int32x4_t r, int8x16_t x, int8x16_t y) { > + return vdotq_laneq_s32 (r, x, y, 0); > +} > + > +int32x2_t sfoo_laneq2 (int32x2_t r, int8x8_t x, int8x16_t y) { > + return vdot_laneq_s32 (r, x, y, 2); > +} > + > +int32x4_t sfooq_lane2 (int32x4_t r, int8x16_t x, int8x16_t y) { > + return vdotq_laneq_s32 (r, x, y, 2); > +} > + > +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, > +d[0-9]+, d[0-9]+} 6 } } */ > /* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, q[0- > 9]+} 2 } } */ > -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0- > 9]+\[#?[0-9]\]} 2 } } */ > -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, d[0- > 9]+\[#?[0-9]\]} 2 } } */ > +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, > +d[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */ > +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, > +q[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */ > > diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c > b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c > index > 054f4703394b4184284dac371415bef8e9bac45d..97b7898bd6a0fc9a898eba0ea > 15fbf38eb1405a3 100644 > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c > @@ -2,6 +2,7 @@ > /* { dg-additional-options "-O3" } */ > /* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw } */ > /* { dg-add-options arm_v8_2a_dotprod_neon } */ > +/* { dg-additional-options "-w" } */ > > #include <arm_neon.h> > > @@ -33,7 +34,20 @@ extern void abort(); > t3 f##_##rx1 = {0}; \ > f##_##rx1 = f (f##_##rx1, f##_##x, f##_##y, ORDER (1, 1)); \ > if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4) \ > - abort (); \ > + abort (); > + > +#define P2(n1,n2) n1,n1,n1,n1,n2,n2,n2,n2,n1,n1,n1,n1,n2,n2,n2,n2 > +#define TEST_LANEQ(t1, t2, t3, f, r1, r2, n1, n2, n3, n4) \ > + ARR(f, x, t1, r1); \ > + ARR(f, y, t2, r2); \ > + t3 f##_##rx = {0}; \ > + f##_##rx = f (f##_##rx, f##_##x, f##_##y, ORDER (3, 2)); \ > + if (f##_##rx[0] != n1 || f##_##rx[1] != n2) \ > + abort (); \ > + t3 f##_##rx1 = {0}; \ > + f##_##rx1 = f (f##_##rx1, f##_##x, f##_##y, ORDER (3, 3)); \ > + if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4) \ > + abort (); > > int > main() > @@ -45,11 +59,16 @@ main() > TEST (int8x16_t, int8x16_t, int32x4_t, vdotq_s32, P(1,2), P(-2,-3), -8, > -24); > > TEST_LANE (uint8x8_t, uint8x8_t, uint32x2_t, vdot_lane_u32, P(1,2), P(2,3), > 8, 16, 12, 24); > - > TEST_LANE (int8x8_t, int8x8_t, int32x2_t, vdot_lane_s32, P(1,2), P(-2,-3), > -8, > -16, -12, -24); > > TEST_LANE (uint8x16_t, uint8x8_t, uint32x4_t, vdotq_lane_u32, P(1,2), > P(2,3), 8, 16, 12, 24); > TEST_LANE (int8x16_t, int8x8_t, int32x4_t, vdotq_lane_s32, P(1,2), > P(-2,-3), > -8, -16, -12, -24); > > + TEST_LANEQ (uint8x8_t, uint8x16_t, uint32x2_t, vdot_laneq_u32, > + P2(1,2), P2(2,3), 8, 16, 12, 24); TEST_LANEQ (int8x8_t, int8x16_t, > + int32x2_t, vdot_laneq_s32, P2(1,2), P2(-2,-3), -8, -16, -12, -24); > + > + TEST_LANEQ (uint8x16_t, uint8x16_t, uint32x4_t, vdotq_laneq_u32, > + P2(1,2), P2(2,3), 8, 16, 12, 24); TEST_LANEQ (int8x16_t, int8x16_t, > + int32x4_t, vdotq_laneq_s32, P2(1,2), P2(-2,-3), -8, -16, -12, -24); > + > return 0; > } > > > --