RE: [AArch32]: correct dot-product RTL patterns.

Tamar Christina via Gcc-patches Mon, 10 Jan 2022 23:10:43 -0800

ping


> -----Original Message-----
> From: Tamar Christina
> Sent: Tuesday, December 21, 2021 12:31 PM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <n...@arm.com>; Ramana Radhakrishnan
> <ramana.radhakrish...@arm.com>; Richard Earnshaw
> <richard.earns...@arm.com>; ni...@redhat.com; Kyrylo Tkachov
> <kyrylo.tkac...@arm.com>
> Subject: [AArch32]: correct dot-product RTL patterns.
> 
> Hi All,
> 
> The previous fix for this problem was wrong due to a subtle difference
> between where NEON expects the RMW values and where intrinsics expects
> them.
> 
> The insn pattern is modeled after the intrinsics and so needs an expand for
> the vectorizer optab to switch the RTL.
> 
> However operand[3] is not expected to be written to so the current pattern
> is bogus.
> 
> Instead we use the expand to shuffle around the RTL.
> 
> The vectorizer expects operands[3] and operands[0] to be the same but the
> aarch64 intrinsics expanders expect operands[0] and operands[1] to be the
> same.
> 
> This also fixes some issues with big-endian, each dot product performs 4 8-
> byte multiplications.  However compared to AArch64 we don't enter lanes in
> GCC lane indexed in AArch32 aside from loads/stores.  This means no lane
> remappings are done in arm-builtins.c and so none should be done at the
> instruction side.
> 
> There are some other instructions that need inspections as I think there are
> more incorrect ones.
> 
> Third there was a bug in the ACLE specication for dot product which has now
> been fixed[1].  This means some intrinsics were missing and are added by
> this patch.
> 
> Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues.
> 
> Ok for master? and active branches after some stew?
> 
> [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>       * config/arm/arm_neon.h (vdot_laneq_u32, vdotq_laneq_u32,
>       vdot_laneq_s32, vdotq_laneq_s32): New.
>       * config/arm/arm_neon_builtins.def (sdot_laneq, udot_laneq: New.
>       * config/arm/neon.md (neon_<sup>dot<vsi2qi>): New.
>       (<sup>dot_prod<vsi2qi>): Re-order rtl.
>       (neon_<sup>dot_lane<vsi2qi>): Fix rtl order and endiannes.
>       (neon_<sup>dot_laneq<vsi2qi>): New.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/arm/simd/vdot-compile.c: Add new cases.
>       * gcc.target/arm/simd/vdot-exec.c: Likewise.
> 
> --- inline copy of patch --
> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index
> 3364b37f69dfc33082388246c03149d9ad66a634..af6ac63dc3b47830d92f199d93
> 153ff510f658e9 100644
> --- a/gcc/config/arm/arm_neon.h
> +++ b/gcc/config/arm/arm_neon.h
> @@ -18243,6 +18243,35 @@ vdotq_lane_s32 (int32x4_t __r, int8x16_t __a,
> int8x8_t __b, const int __index)
>    return __builtin_neon_sdot_lanev16qi (__r, __a, __b, __index);  }
> 
> +__extension__ extern __inline uint32x2_t __attribute__
> +((__always_inline__, __gnu_inline__, __artificial__))
> +vdot_laneq_u32 (uint32x2_t __r, uint8x8_t __a, uint8x16_t __b, const
> +int __index) {
> +  return __builtin_neon_udot_laneqv8qi_uuuus (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline uint32x4_t __attribute__
> +((__always_inline__, __gnu_inline__, __artificial__))
> +vdotq_laneq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b,
> +             const int __index)
> +{
> +  return __builtin_neon_udot_laneqv16qi_uuuus (__r, __a, __b, __index);
> +}
> +
> +__extension__ extern __inline int32x2_t __attribute__
> +((__always_inline__, __gnu_inline__, __artificial__))
> +vdot_laneq_s32 (int32x2_t __r, int8x8_t __a, int8x16_t __b, const int
> +__index) {
> +  return __builtin_neon_sdot_laneqv8qi (__r, __a, __b, __index); }
> +
> +__extension__ extern __inline int32x4_t __attribute__
> +((__always_inline__, __gnu_inline__, __artificial__))
> +vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int
> +__index) {
> +  return __builtin_neon_sdot_laneqv16qi (__r, __a, __b, __index); }
> +
>  #pragma GCC pop_options
>  #endif
> 
> diff --git a/gcc/config/arm/arm_neon_builtins.def
> b/gcc/config/arm/arm_neon_builtins.def
> index
> fafb5c6fc51c16679ead1afda7cccfea8264fd15..f83dd4327c16c0af68f72eb6d9ca
> 8cf21e2e56b5 100644
> --- a/gcc/config/arm/arm_neon_builtins.def
> +++ b/gcc/config/arm/arm_neon_builtins.def
> @@ -342,6 +342,8 @@ VAR2 (TERNOP, sdot, v8qi, v16qi)
>  VAR2 (UTERNOP, udot, v8qi, v16qi)
>  VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi)
>  VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
> +VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi)
> +VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi)
> 
>  VAR1 (USTERNOP, usdot, v8qi)
>  VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) diff --git
> a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index
> 8b0a396947cc8e7345f178b926128d7224fb218a..848166311b5f82c5facb66e97c
> 2260a5aba5d302 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -2866,20 +2866,49 @@ (define_expand "cmul<conj_op><mode>3"
>  })
> 
> 
> -;; These instructions map to the __builtins for the Dot Product operations.
> -(define_insn "neon_<sup>dot<vsi2qi>"
> +;; These map to the auto-vectorizer Dot Product optab.
> +;; The auto-vectorizer expects a dot product builtin that also does an
> +;; accumulation into the provided register.
> +;; Given the following pattern
> +;;
> +;; for (i=0; i<len; i++) {
> +;;     c = a[i] * b[i];
> +;;     r += c;
> +;; }
> +;; return result;
> +;;
> +;; This can be auto-vectorized to
> +;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; ;; ;; given
> +enough iterations.  However the vectorizer can keep unrolling the loop
> +;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; ;; r +=
> +a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; ;; ...
> +;;
> +;; and so the vectorizer provides r, in which the result has to be 
> accumulated.
> +(define_insn "<sup>dot_prod<vsi2qi>"
>    [(set (match_operand:VCVTI 0 "register_operand" "=w")
> -     (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
> -                 (unspec:VCVTI [(match_operand:<VSI2QI> 2
> -                                                     "register_operand"
> "w")
> -                                (match_operand:<VSI2QI> 3
> -                                                     "register_operand"
> "w")]
> -             DOTPROD)))]
> +     (plus:VCVTI
> +       (unspec:VCVTI [(match_operand:<VSI2QI> 1 "register_operand"
> "w")
> +                      (match_operand:<VSI2QI> 2 "register_operand"
> "w")]
> +                      DOTPROD)
> +       (match_operand:VCVTI 3 "register_operand" "0")))]
>    "TARGET_DOTPROD"
> -  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
> +  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
>    [(set_attr "type" "neon_dot<q>")]
>  )
> 
> +;; These instructions map to the __builtins for the Dot Product
> +operations (define_expand "neon_<sup>dot<vsi2qi>"
> +  [(set (match_operand:VCVTI 0 "register_operand" "=w")
> +     (plus:VCVTI
> +       (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand")
> +                      (match_operand:<VSI2QI> 3 "register_operand")]
> +                      DOTPROD)
> +       (match_operand:VCVTI 1 "register_operand")))]
> +  "TARGET_DOTPROD"
> +)
> +
>  ;; These instructions map to the __builtins for the Dot Product operations.
>  (define_insn "neon_usdot<vsi2qi>"
>    [(set (match_operand:VCVTI 0 "register_operand" "=w") @@ -2898,17
> +2927,40 @@ (define_insn "neon_usdot<vsi2qi>"
>  ;; indexed operations.
>  (define_insn "neon_<sup>dot_lane<vsi2qi>"
>    [(set (match_operand:VCVTI 0 "register_operand" "=w")
> -     (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
> -                 (unspec:VCVTI [(match_operand:<VSI2QI> 2
> -                                                     "register_operand"
> "w")
> -                                (match_operand:V8QI 3 "register_operand"
> "t")
> -                                (match_operand:SI 4 "immediate_operand"
> "i")]
> -             DOTPROD)))]
> +     (plus:VCVTI
> +       (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand"
> "w")
> +                      (match_operand:V8QI 3 "register_operand" "t")
> +                      (match_operand:SI 4 "immediate_operand" "i")]
> +                      DOTPROD)
> +       (match_operand:VCVTI 1 "register_operand" "0")))]
> +  "TARGET_DOTPROD"
> +  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
> +  [(set_attr "type" "neon_dot<q>")]
> +)
> +
> +;; These instructions map to the __builtins for the Dot Product ;;
> +indexed operations.
> +(define_insn "neon_<sup>dot_laneq<vsi2qi>"
> +  [(set (match_operand:VCVTI 0 "register_operand" "=w")
> +     (plus:VCVTI
> +       (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand"
> "w")
> +                      (match_operand:V16QI 3 "register_operand" "t")
> +                      (match_operand:SI 4 "immediate_operand" "i")]
> +                      DOTPROD)
> +       (match_operand:VCVTI 1 "register_operand" "0")))]
>    "TARGET_DOTPROD"
>    {
> -    operands[4]
> -      = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL
> (operands[4])));
> -    return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
> +    int lane = INTVAL (operands[4]);
> +    if (lane > GET_MODE_NUNITS (V2SImode) - 1)
> +      {
> +     operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode));
> +     return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
> +      }
> +    else
> +      {
> +     operands[4] = GEN_INT (lane);
> +     return
> "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
> +      }
>    }
>    [(set_attr "type" "neon_dot<q>")]
>  )
> @@ -2932,43 +2984,6 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>"
>    [(set_attr "type" "neon_dot<q>")]
>  )
> 
> -;; These expands map to the Dot Product optab the vectorizer checks for.
> -;; The auto-vectorizer expects a dot product builtin that also does an -;;
> accumulation into the provided register.
> -;; Given the following pattern
> -;;
> -;; for (i=0; i<len; i++) {
> -;;     c = a[i] * b[i];
> -;;     r += c;
> -;; }
> -;; return result;
> -;;
> -;; This can be auto-vectorized to
> -;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; -;; -;; given enough
> iterations.  However the vectorizer can keep unrolling the loop -;; r +=
> a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; -;; r += a[8]*b[8] + a[9]*b[9] 
> +
> a[10]*b[10] + a[11]*b[11]; -;; ...
> -;;
> -;; and so the vectorizer provides r, in which the result has to be 
> accumulated.
> -(define_expand "<sup>dot_prod<vsi2qi>"
> -  [(set (match_operand:VCVTI 0 "register_operand")
> -     (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1
> -                                                     "register_operand")
> -                                (match_operand:<VSI2QI> 2
> -                                                     "register_operand")]
> -                  DOTPROD)
> -                 (match_operand:VCVTI 3 "register_operand")))]
> -  "TARGET_DOTPROD"
> -{
> -  emit_insn (
> -    gen_neon_<sup>dot<vsi2qi> (operands[3], operands[3], operands[1],
> -                              operands[2]));
> -  emit_insn (gen_rtx_SET (operands[0], operands[3]));
> -  DONE;
> -})
> -
>  ;; Auto-vectorizer pattern for usdot
>  (define_expand "usdot_prod<vsi2qi>"
>    [(set (match_operand:VCVTI 0 "register_operand") diff --git
> a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
> b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
> index
> b3bd3bf00e3822fdd60b5955165583d5a5cdc1d0..d3541e829a44fa07972096a02
> 226adea1d26f09d 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
> @@ -49,8 +49,28 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, int8x8_t
> y)
>    return vdotq_lane_s32 (r, x, y, 0);
>  }
> 
> -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-
> 9]+} 4 } } */
> +int32x2_t sfoo_laneq1 (int32x2_t r, int8x8_t x, int8x16_t y) {
> +  return vdot_laneq_s32 (r, x, y, 0);
> +}
> +
> +int32x4_t sfooq_lane1 (int32x4_t r, int8x16_t x, int8x16_t y) {
> +  return vdotq_laneq_s32 (r, x, y, 0);
> +}
> +
> +int32x2_t sfoo_laneq2 (int32x2_t r, int8x8_t x, int8x16_t y) {
> +  return vdot_laneq_s32 (r, x, y, 2);
> +}
> +
> +int32x4_t sfooq_lane2 (int32x4_t r, int8x16_t x, int8x16_t y) {
> +  return vdotq_laneq_s32 (r, x, y, 2);
> +}
> +
> +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+,
> +d[0-9]+, d[0-9]+} 6 } } */
>  /* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, q[0-
> 9]+} 2 } } */
> -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-
> 9]+\[#?[0-9]\]} 2 } } */
> -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, d[0-
> 9]+\[#?[0-9]\]} 2 } } */
> +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+,
> +d[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */
> +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+,
> +q[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */
> 
> diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
> b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
> index
> 054f4703394b4184284dac371415bef8e9bac45d..97b7898bd6a0fc9a898eba0ea
> 15fbf38eb1405a3 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
> @@ -2,6 +2,7 @@
>  /* { dg-additional-options "-O3" } */
>  /* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw } */
>  /* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +/* { dg-additional-options "-w" } */
> 
>  #include <arm_neon.h>
> 
> @@ -33,7 +34,20 @@ extern void abort();
>       t3 f##_##rx1 = {0};                         \
>       f##_##rx1 =  f (f##_##rx1, f##_##x, f##_##y, ORDER (1, 1));  \
>       if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4)   \
> -       abort (); \
> +       abort ();
> +
> +#define P2(n1,n2) n1,n1,n1,n1,n2,n2,n2,n2,n1,n1,n1,n1,n2,n2,n2,n2
> +#define TEST_LANEQ(t1, t2, t3, f, r1, r2, n1, n2, n3, n4) \
> +     ARR(f, x, t1, r1);                  \
> +     ARR(f, y, t2, r2);                  \
> +     t3 f##_##rx = {0};                  \
> +     f##_##rx = f (f##_##rx, f##_##x, f##_##y, ORDER (3, 2));  \
> +     if (f##_##rx[0] != n1 || f##_##rx[1] != n2)   \
> +       abort ();                                 \
> +     t3 f##_##rx1 = {0};                         \
> +     f##_##rx1 =  f (f##_##rx1, f##_##x, f##_##y, ORDER (3, 3));  \
> +     if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4)   \
> +       abort ();
> 
>  int
>  main()
> @@ -45,11 +59,16 @@ main()
>    TEST (int8x16_t, int8x16_t, int32x4_t, vdotq_s32, P(1,2), P(-2,-3), -8, 
> -24);
> 
>    TEST_LANE (uint8x8_t, uint8x8_t, uint32x2_t, vdot_lane_u32, P(1,2), P(2,3),
> 8, 16, 12, 24);
> -
>    TEST_LANE (int8x8_t, int8x8_t, int32x2_t, vdot_lane_s32, P(1,2), P(-2,-3), 
> -8,
> -16, -12, -24);
> 
>    TEST_LANE (uint8x16_t, uint8x8_t, uint32x4_t, vdotq_lane_u32, P(1,2),
> P(2,3), 8, 16, 12, 24);
>    TEST_LANE (int8x16_t, int8x8_t, int32x4_t, vdotq_lane_s32, P(1,2), 
> P(-2,-3),
> -8, -16, -12, -24);
> 
> +  TEST_LANEQ (uint8x8_t, uint8x16_t, uint32x2_t, vdot_laneq_u32,
> + P2(1,2), P2(2,3), 8, 16, 12, 24);  TEST_LANEQ (int8x8_t, int8x16_t,
> + int32x2_t, vdot_laneq_s32, P2(1,2), P2(-2,-3), -8, -16, -12, -24);
> +
> +  TEST_LANEQ (uint8x16_t, uint8x16_t, uint32x4_t, vdotq_laneq_u32,
> + P2(1,2), P2(2,3), 8, 16, 12, 24);  TEST_LANEQ (int8x16_t, int8x16_t,
> + int32x4_t, vdotq_laneq_s32, P2(1,2), P2(-2,-3), -8, -16, -12, -24);
> +
>    return 0;
>  }
> 
> 
> --

RE: [AArch32]: correct dot-product RTL patterns.

Reply via email to