Hi Richard,
On 5/4/19 5:13 PM, Richard Sandiford wrote:
Kyrill Tkachov <kyrylo.tkac...@foss.arm.com> writes:
@@ -764,6 +780,13 @@ (define_insn "aarch64_<sur>adalp<mode>_3"
;; UABAL tmp.8h, op1.16b, op2.16b
;; UADALP op3.4s, tmp.8h
;; MOV op0, op3 // should be eliminated in later passes.
+;;
+;; For TARGET_DOTPROD we do:
+;; MOV tmp1.16b, #1 // Can be CSE'd and hoisted out of loops.
+;; UABD tmp2.16b, op1.16b, op2.16b
+;; UDOT op3.4s, tmp2.16b, tmp1.16b
+;; MOV op0, op3 // RA will tie the operands of UDOT appropriately.
+;;
;; The signed version just uses the signed variants of the above instructions.
It looks like the code does what the comment says, and uses SDOT for the
signed optab. Doesn't it need to be UDOT for both? The signedness of the
optab applies to the inputs (and so to SABD vs. UABD), but the absolute
difference is always unsigned.
I think you're right, updated.
(define_expand "<sur>sadv16qi"
@@ -773,6 +796,18 @@ (define_expand "<sur>sadv16qi"
(use (match_operand:V4SI 3 "register_operand"))]
"TARGET_SIMD"
{
+ if (TARGET_DOTPROD)
+ {
+ rtx ones = gen_reg_rtx (V16QImode);
+ emit_move_insn (ones,
+ aarch64_simd_gen_const_vector_dup (V16QImode,
+ HOST_WIDE_INT_1));
Easier as:
rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
Indeed.
+ rtx abd = gen_reg_rtx (V16QImode);
+ emit_insn (gen_<sur>abdv16qi_3 (abd, operands[1], operands[2]));
+ emit_insn (gen_aarch64_<sur>dotv16qi (operands[0], operands[3],
+ abd, ones));
Nit: indented too far.
Thanks, fixed (and a couple of other minor edits after seeing
Alejandro's SVE patch).
2019-08-05 Kyrylo Tkachov <kyrylo.tkac...@arm.com>
* config/aarch64/iterators.md (MAX_OPP): New code attr.
* config/aarch64/aarch64-simd.md (<su>abd<mode>_3): New define_expand.
(*aarch64_<su>abd<mode>_3): Rename to...
(aarch64_<su>abd<mode>_3): ... This.
(<sur>sadv16qi): Add TARGET_DOTPROD expansion.
2019-08-05 Kyrylo Tkachov <kyrylo.tkac...@arm.com>
* gcc.target/aarch64/ssadv16qi.c: Add +nodotprod to pragma.
* gcc.target/aarch64/usadv16qi.c: Likewise.
* gcc.target/aarch64/ssadv16qi-dotprod.c: New test.
* gcc.target/aarch64/usadv16qi-dotprod.c: Likewise.
Thanks,
Richard
+ DONE;
+ }
rtx reduc = gen_reg_rtx (V8HImode);
emit_insn (gen_aarch64_<sur>abdl2v16qi_3 (reduc, operands[1],
operands[2]));
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 16e4dbda73ab928054590c47a4398408162c0332..5afb692493c6e9fa31355693e7843e4f0b1b281c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1059,6 +1059,9 @@ (define_code_attr f16mac [(plus "a") (minus "s")])
;; Map smax to smin and umax to umin.
(define_code_attr max_opp [(smax "smin") (umax "umin")])
+;; Same as above, but giving the upper-case rtx code names (SMIN/UMIN).
+(define_code_attr MAX_OPP [(smax "SMIN") (umax "UMIN")])
+
;; The number of subvectors in an SVE_STRUCT.
(define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2")
(VNx8SI "2") (VNx4DI "2")
diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c
new file mode 100644
index 0000000000000000000000000000000000000000..e08c33785303e86815554e67a300189a67dfc1da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+/* { dg-additional-options "-O3" } */
+
+#pragma GCC target "+nosve"
+
+#define N 1024
+
+signed char pix1[N], pix2[N];
+
+int foo (void)
+{
+ int i_sum = 0;
+ int i;
+
+ for (i = 0; i < N; i++)
+ i_sum += __builtin_abs (pix1[i] - pix2[i]);
+
+ return i_sum;
+}
+
+/* { dg-final { scan-assembler-not {\tsshll\t} } } */
+/* { dg-final { scan-assembler-not {\tsshll2\t} } } */
+/* { dg-final { scan-assembler-not {\tssubl\t} } } */
+/* { dg-final { scan-assembler-not {\tssubl2\t} } } */
+/* { dg-final { scan-assembler-not {\tabs\t} } } */
+
+/* { dg-final { scan-assembler {\tsabd\t} } } */
+/* { dg-final { scan-assembler {\tsdot\t} } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c b/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
index 40b28843616e84df137210b45ec16abed2a37c75..85a867a113013f560bfd0a3142805b9c95ad8c5a 100644
--- a/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
+++ b/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
@@ -1,7 +1,7 @@
/* { dg-do compile } */
/* { dg-options "-O3" } */
-#pragma GCC target "+nosve"
+#pragma GCC target "+nosve+nodotprod"
#define N 1024
diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea8de4d69758bd6bc9af9e33e1498f838b706949
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+/* { dg-additional-options "-O3" } */
+
+#pragma GCC target "+nosve"
+
+#define N 1024
+
+unsigned char pix1[N], pix2[N];
+
+int foo (void)
+{
+ int i_sum = 0;
+ int i;
+
+ for (i = 0; i < N; i++)
+ i_sum += __builtin_abs (pix1[i] - pix2[i]);
+
+ return i_sum;
+}
+
+/* { dg-final { scan-assembler-not {\tushll\t} } } */
+/* { dg-final { scan-assembler-not {\tushll2\t} } } */
+/* { dg-final { scan-assembler-not {\tusubl\t} } } */
+/* { dg-final { scan-assembler-not {\tusubl2\t} } } */
+/* { dg-final { scan-assembler-not {\tabs\t} } } */
+
+/* { dg-final { scan-assembler {\tuabd\t} } } */
+/* { dg-final { scan-assembler {\tudot\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi.c b/gcc/testsuite/gcc.target/aarch64/usadv16qi.c
index 69ceaf4259ea43e95078ce900d2498c3a2291369..a66e1209662cefaa95c90d8d2694f9c7c0de4152 100644
--- a/gcc/testsuite/gcc.target/aarch64/usadv16qi.c
+++ b/gcc/testsuite/gcc.target/aarch64/usadv16qi.c
@@ -1,7 +1,7 @@
/* { dg-do compile } */
/* { dg-options "-O3" } */
-#pragma GCC target "+nosve"
+#pragma GCC target "+nosve+nodotprod"
#define N 1024
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index eb99d3ab881e29f3069991e4f778be95d51ec4da..ebb16d676e21caa6ec783727822f901b1fe8405b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -705,12 +705,28 @@
[(set_attr "type" "neon_abs<q>")]
)
+;; Helper expander for aarch64_<su>abd<mode>_3 to save the callers
+;; the hassle of constructing the other arm of the MINUS.
+(define_expand "<su>abd<mode>_3"
+ [(use (match_operand:VDQ_BHSI 0 "register_operand"))
+ (USMAX:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand")
+ (match_operand:VDQ_BHSI 2 "register_operand"))]
+ "TARGET_SIMD"
+ {
+ rtx other_arm
+ = gen_rtx_<MAX_OPP> (<MODE>mode, operands[1], operands[2]);
+ emit_insn (gen_aarch64_<su>abd<mode>_3 (operands[0], operands[1],
+ operands[2], other_arm));
+ DONE;
+ }
+)
+
;; It's tempting to represent SABD as ABS (MINUS op1 op2).
;; This isn't accurate as ABS treats always its input as a signed value.
;; So (ABS:QI (minus:QI 64 -128)) == (ABS:QI (192 or -64 signed)) == 64.
;; Whereas SABD would return 192 (-64 signed) on the above example.
;; Use MINUS ([us]max (op1, op2), [us]min (op1, op2)) instead.
-(define_insn "*aarch64_<su>abd<mode>_3"
+(define_insn "aarch64_<su>abd<mode>_3"
[(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
(minus:VDQ_BHSI
(USMAX:VDQ_BHSI
@@ -764,7 +780,16 @@
;; UABAL tmp.8h, op1.16b, op2.16b
;; UADALP op3.4s, tmp.8h
;; MOV op0, op3 // should be eliminated in later passes.
-;; The signed version just uses the signed variants of the above instructions.
+;;
+;; For TARGET_DOTPROD we do:
+;; MOV tmp1.16b, #1 // Can be CSE'd and hoisted out of loops.
+;; UABD tmp2.16b, op1.16b, op2.16b
+;; UDOT op3.4s, tmp2.16b, tmp1.16b
+;; MOV op0, op3 // RA will tie the operands of UDOT appropriately.
+;;
+;; The signed version just uses the signed variants of the above instructions,
+;; but for TARGET_DOTPROD it still emits a UDOT, as the absolute difference is
+;; unsigned.
(define_expand "<sur>sadv16qi"
[(use (match_operand:V4SI 0 "register_operand"))
@@ -773,6 +798,15 @@
(use (match_operand:V4SI 3 "register_operand"))]
"TARGET_SIMD"
{
+ if (TARGET_DOTPROD)
+ {
+ rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
+ rtx abd = gen_reg_rtx (V16QImode);
+ emit_insn (gen_<sur>abdv16qi_3 (abd, operands[1], operands[2]));
+ emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3],
+ abd, ones));
+ DONE;
+ }
rtx reduc = gen_reg_rtx (V8HImode);
emit_insn (gen_aarch64_<sur>abdl2v16qi_3 (reduc, operands[1],
operands[2]));
diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c
new file mode 100644
index 0000000000000000000000000000000000000000..08b6831cfbee2c44cf6a33f91986e2953c622148
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+/* { dg-additional-options "-O3" } */
+
+#pragma GCC target "+nosve"
+
+#define N 1024
+
+signed char pix1[N], pix2[N];
+
+int foo (void)
+{
+ int i_sum = 0;
+ int i;
+
+ for (i = 0; i < N; i++)
+ i_sum += __builtin_abs (pix1[i] - pix2[i]);
+
+ return i_sum;
+}
+
+/* { dg-final { scan-assembler-not {\tsshll\t} } } */
+/* { dg-final { scan-assembler-not {\tsshll2\t} } } */
+/* { dg-final { scan-assembler-not {\tssubl\t} } } */
+/* { dg-final { scan-assembler-not {\tssubl2\t} } } */
+/* { dg-final { scan-assembler-not {\tabs\t} } } */
+
+/* { dg-final { scan-assembler {\tsabd\t} } } */
+/* { dg-final { scan-assembler {\tudot\t} } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c b/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
index 40b28843616e84df137210b45ec16abed2a37c75..85a867a113013f560bfd0a3142805b9c95ad8c5a 100644
--- a/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
+++ b/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
@@ -1,7 +1,7 @@
/* { dg-do compile } */
/* { dg-options "-O3" } */
-#pragma GCC target "+nosve"
+#pragma GCC target "+nosve+nodotprod"
#define N 1024
diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea8de4d69758bd6bc9af9e33e1498f838b706949
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+/* { dg-additional-options "-O3" } */
+
+#pragma GCC target "+nosve"
+
+#define N 1024
+
+unsigned char pix1[N], pix2[N];
+
+int foo (void)
+{
+ int i_sum = 0;
+ int i;
+
+ for (i = 0; i < N; i++)
+ i_sum += __builtin_abs (pix1[i] - pix2[i]);
+
+ return i_sum;
+}
+
+/* { dg-final { scan-assembler-not {\tushll\t} } } */
+/* { dg-final { scan-assembler-not {\tushll2\t} } } */
+/* { dg-final { scan-assembler-not {\tusubl\t} } } */
+/* { dg-final { scan-assembler-not {\tusubl2\t} } } */
+/* { dg-final { scan-assembler-not {\tabs\t} } } */
+
+/* { dg-final { scan-assembler {\tuabd\t} } } */
+/* { dg-final { scan-assembler {\tudot\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi.c b/gcc/testsuite/gcc.target/aarch64/usadv16qi.c
index 69ceaf4259ea43e95078ce900d2498c3a2291369..a66e1209662cefaa95c90d8d2694f9c7c0de4152 100644
--- a/gcc/testsuite/gcc.target/aarch64/usadv16qi.c
+++ b/gcc/testsuite/gcc.target/aarch64/usadv16qi.c
@@ -1,7 +1,7 @@
/* { dg-do compile } */
/* { dg-options "-O3" } */
-#pragma GCC target "+nosve"
+#pragma GCC target "+nosve+nodotprod"
#define N 1024