[PATCH 1/2]AArch64 Add fallback case using sdot for usdot

Tamar Christina via Gcc-patches Thu, 16 Jun 2022 03:49:35 -0700

Hi All,

The usdot operation is common in video encoders and decoders, including some of
the most widely used ones.


This patch adds a +dotprod version of the optab as a fallback for when you do
have sdot but not usdot available.

The fallback works by adding a bias to the unsigned argument to convert it to
a signed value and then correcting for the bias later on.

Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is
signed (assuming both are 8-bit values).  Because the largest value a signed
byte can hold is 127, the bias of 128 cannot itself be used as an sdot operand,
so we split the bias correction into:

   (x - 128)y + 127y + y

Concretely for:

/* Number of elements processed by the loop in f.  */
#define N 480
/* Signedness knobs: SIGNEDNESS_1 is the accumulator/return type,
   SIGNEDNESS_2 the narrowed product, SIGNEDNESS_3 the elements of a and
   SIGNEDNESS_4 the elements of b.  */
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned

/* Add to res the products a[i] * b[i] for i in [0, N), where a holds signed
   chars and b holds unsigned chars, and return the updated sum.  Each product
   is stored in a signed short before being accumulated.  */
SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
   SIGNEDNESS_4 char *restrict b)
{
  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
    {
      /* Widen both bytes to int before multiplying.  */
      int av = a[i];
      int bv = b[i];
      /* Mixed-sign byte product, narrowed to short before accumulation.  */
      SIGNEDNESS_2 short mult = av * bv;
      res += mult;
    }
  return res;
}

we generate:

        movi    v5.16b, 0x7f
        mov     x3, 0
        movi    v4.16b, 0x1
        movi    v3.16b, 0xffffffffffffff80
        movi    v0.4s, 0
.L2:
        ldr     q2, [x2, x3]
        ldr     q1, [x1, x3]
        add     x3, x3, 16
        sub     v2.16b, v2.16b, v3.16b
        sdot    v0.4s, v2.16b, v1.16b
        sdot    v0.4s, v5.16b, v1.16b
        sdot    v0.4s, v4.16b, v1.16b
        cmp     x3, 480
        bne     .L2

instead of:

        movi    v0.4s, 0
        mov     x3, 0
.L2:
        ldr     q2, [x1, x3]
        ldr     q1, [x2, x3]
        add     x3, x3, 16
        sxtl    v4.8h, v2.8b
        sxtl2   v3.8h, v2.16b
        uxtl    v2.8h, v1.8b
        uxtl2   v1.8h, v1.16b
        mul     v2.8h, v2.8h, v4.8h
        mul     v1.8h, v1.8h, v3.8h
        saddw   v0.4s, v0.4s, v2.4h
        saddw2  v0.4s, v0.4s, v2.8h
        saddw   v0.4s, v0.4s, v1.4h
        saddw2  v0.4s, v0.4s, v1.8h
        cmp     x3, 480
        bne     .L2

The new sequence is significantly faster as the operations it uses are well
optimized.  Note that execution tests are already in the mid-end testsuite.

Thanks to James Greenhalgh for the tip-off.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Generate fallback
        or call original insn ...
        (usdot_prod<vsi2qi>_insn): ...here.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
 
 ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
 ;; (vector) Dot Product operation and the vectorized optab.
-(define_insn "usdot_prod<vsi2qi>"
+(define_insn "usdot_prod<vsi2qi>_insn"
   [(set (match_operand:VS 0 "register_operand" "=w")
        (plus:VS
          (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
@@ -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
   [(set_attr "type" "neon_dot<q>")]
 )
 
+;; usdot auto-vec fallback code
+(define_expand "usdot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand")
+       (plus:VS
+         (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+                     (match_operand:<VSI2QI> 2 "register_operand")]
+         UNSPEC_USDOT)
+         (match_operand:VS 3 "register_operand")))]
+  "TARGET_DOTPROD || TARGET_I8MM"
+{
+  if (TARGET_I8MM)
+    {
+      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
+                                             operands[2], operands[3]));
+      DONE;
+    }
+
+  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
+  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+  rtx signbit = gen_int_mode (val, elemmode);
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
+  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
+                                   gen_int_mode (val - 1, elemmode));
+  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
+  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
+  c1 = force_reg (<VSI2QI>mode, c1);
+  c2 = force_reg (<VSI2QI>mode, c2);
+  dup = force_reg (<VSI2QI>mode, dup);
+  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
+  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
+  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
+  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
+  DONE;
+})
+
 ;; These instructions map to the __builtins for the Dot Product
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c 
b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-not {\tusdot\t} } } */
+/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */




--

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>"
 
 ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
 ;; (vector) Dot Product operation and the vectorized optab.
-(define_insn "usdot_prod<vsi2qi>"
+(define_insn "usdot_prod<vsi2qi>_insn"
   [(set (match_operand:VS 0 "register_operand" "=w")
        (plus:VS
          (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
@@ -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>"
   [(set_attr "type" "neon_dot<q>")]
 )
 
+;; usdot auto-vec fallback code
+(define_expand "usdot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand")
+       (plus:VS
+         (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+                     (match_operand:<VSI2QI> 2 "register_operand")]
+         UNSPEC_USDOT)
+         (match_operand:VS 3 "register_operand")))]
+  "TARGET_DOTPROD || TARGET_I8MM"
+{
+  if (TARGET_I8MM)
+    {
+      emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
+                                             operands[2], operands[3]));
+      DONE;
+    }
+
+  machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
+  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+  rtx signbit = gen_int_mode (val, elemmode);
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx tmp = gen_reg_rtx (<VSI2QI>mode);
+  rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
+                                   gen_int_mode (val - 1, elemmode));
+  rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
+  rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
+  c1 = force_reg (<VSI2QI>mode, c1);
+  c2 = force_reg (<VSI2QI>mode, c2);
+  dup = force_reg (<VSI2QI>mode, dup);
+  emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
+  emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
+  emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
+  emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
+  DONE;
+})
+
 ;; These instructions map to the __builtins for the Dot Product
 ;; indexed operations.
 (define_insn "aarch64_<sur>dot_lane<vsi2qi>"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c 
b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+   SIGNEDNESS_4 char *restrict b)
+{
+  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+    {
+      int av = a[i];
+      int bv = b[i];
+      SIGNEDNESS_2 short mult = av * bv;
+      res += mult;
+    }
+  return res;
+}
+
+/* { dg-final { scan-assembler-not {\tusdot\t} } } */
+/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */

[PATCH 1/2]AArch64 Add fallback case using sdot for usdot

Reply via email to