Hi James,

On 06/02/19 17:33, James Greenhalgh wrote:
On Mon, Feb 04, 2019 at 04:23:32AM -0600, Kyrill Tkachov wrote:
Hi all,

Richard raised a concern about the RTL we use to represent the AdvSIMD SABD
(vector signed absolute difference) instruction.
We currently represent it as ABS (MINUS op1 op2).

This isn't exactly what SABD does. ABS treats its input as a signed value
and returns the absolute value of that.

For example:
(sabd:QI 64 -128) == 192 (unsigned) aka -64 (signed)
whereas
(minus:QI 64 -128) == 192 (unsigned) aka -64 (signed), (abs ...) of that is 64.

A better way to describe the instruction is with
MINUS (SMAX (op1 op2) SMIN (op1 op2)).
This patch implements that, and also implements similar semantics for the
UABD instruction, which uses UMAX and UMIN.

That way for the example above we'll have:
(minus:QI (smax:QI (64 -128)) (smin:QI (64 -128))) == (minus:QI 64 -128) == 192
(or -64 signed), which matches what SABD does.

Bootstrapped and tested on aarch64-none-linux-gnu.

Ok for trunk?
Not without a comment explaining the above subtlety and preferably a
testcase which would fail today on trunk.

Otherwise, OK.

I'll take that as approval with the changes requested.
I'll commit the updated patch (attached) later today.

Thanks,
Kyrill

2019-02-07  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * config/aarch64/iterators.md (max_opp): New code_attr.
    (USMAX): New code iterator.
    * config/aarch64/predicates.md (aarch64_smin): New predicate.
    (aarch64_umin): Likewise.
    * config/aarch64/aarch64-simd.md (abd<mode>_3): Rename to...
    (*aarch64_<su>abd<mode>_3): ... Change RTL representation to
    MINUS (MAX MIN).

2019-02-07  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * gcc.target/aarch64/abd_1.c: New test.
    * gcc.dg/sabd_1.c: Likewise.

James

Thanks,
Kyrill

2019-04-02  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

      * config/aarch64/iterators.md (max_opp): New code_attr.
      (USMAX): New code iterator.
      * config/aarch64/predicates.md (aarch64_smin): New predicate.
      (aarch64_umin): Likewise.
      * config/aarch64/aarch64-simd.md (abd<mode>_3): Rename to...
      (*aarch64_<su>abd<mode>_3): ... Change RTL representation to
      MINUS (MAX MIN).

2019-04-02  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

      * gcc.target/aarch64/abd_1.c: New test.

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 96360d877a404a6795aef5458345e87de9006cf4..eb99d3ab881e29f3069991e4f778be95d51ec4da 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -705,13 +705,22 @@ (define_insn "aarch64_abs<mode>"
   [(set_attr "type" "neon_abs<q>")]
 )
 
-(define_insn "abd<mode>_3"
+;; It's tempting to represent SABD as ABS (MINUS op1 op2).
+;; This isn't accurate as ABS always treats its input as a signed value.
+;; So (ABS:QI (minus:QI 64 -128)) == (ABS:QI (192 or -64 signed)) == 64.
+;; Whereas SABD would return 192 (-64 signed) on the above example.
+;; Use MINUS ([us]max (op1, op2), [us]min (op1, op2)) instead.
+(define_insn "*aarch64_<su>abd<mode>_3"
   [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-	(abs:VDQ_BHSI (minus:VDQ_BHSI
-		       (match_operand:VDQ_BHSI 1 "register_operand" "w")
-		       (match_operand:VDQ_BHSI 2 "register_operand" "w"))))]
-  "TARGET_SIMD"
-  "sabd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+	(minus:VDQ_BHSI
+	  (USMAX:VDQ_BHSI
+	    (match_operand:VDQ_BHSI 1 "register_operand" "w")
+	    (match_operand:VDQ_BHSI 2 "register_operand" "w"))
+	  (match_operator 3 "aarch64_<max_opp>"
+	    [(match_dup 1)
+	     (match_dup 2)])))]
+  "TARGET_SIMD"
+  "<su>abd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
   [(set_attr "type" "neon_abd<q>")]
 )
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e9ede2934a43539279ae68049fb66b84114d90ea..16e4dbda73ab928054590c47a4398408162c0332 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1056,6 +1056,9 @@ (define_mode_attr f16quad [(V2SF "") (V4SF "q")])
 
 (define_code_attr f16mac [(plus "a") (minus "s")])
 
+;; Map smax to smin and umax to umin.
+(define_code_attr max_opp [(smax "smin") (umax "umin")])
+
 ;; The number of subvectors in an SVE_STRUCT.
 (define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2")
 				(VNx8SI  "2") (VNx4DI  "2")
@@ -1204,6 +1207,9 @@ (define_code_iterator MAXMIN [smax smin umax umin])
 
 (define_code_iterator FMAXMIN [smax smin])
 
+;; Signed and unsigned max operations.
+(define_code_iterator USMAX [smax umax])
+
 ;; Code iterator for variants of vector max and min.
 (define_code_iterator ADDSUB [plus minus])
 
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 855cf7b52f840a81e0bbd2b31fd59fc860060626..b8e6d232ff6237a58addda1ec0e953a1054dc616 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -319,6 +319,12 @@ (define_predicate "aarch64_reg_or_imm"
   (ior (match_operand 0 "register_operand")
        (match_operand 0 "const_scalar_int_operand")))
 
+(define_predicate "aarch64_smin"
+  (match_code "smin"))
+
+(define_predicate "aarch64_umin"
+  (match_code "umin"))
+
 ;; True for integer comparisons and for FP comparisons other than LTGT or UNEQ.
 (define_special_predicate "aarch64_comparison_operator"
   (match_code "eq,ne,le,lt,ge,gt,geu,gtu,leu,ltu,unordered,
diff --git a/gcc/testsuite/gcc.dg/sabd_1.c b/gcc/testsuite/gcc.dg/sabd_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..587e3055fce5796ab58eb271fd204fdcbec91df4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sabd_1.c
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fwrapv" } */
+/* Make sure vectorized absolute difference behaves the same as the scalar version.  */
+
+#define N 16
+signed char a[] = {-100, -100, -100, -100,-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100 };
+signed char b[] = { 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100 };
+
+signed char out[N];
+
+__attribute__ ((noinline,noipa))
+void
+foo (void)
+{
+  for (int i = 0; i < N; i++)
+    {
+      signed char diff = b[i] - a[i];
+      out[i] = diff > 0 ? diff : -diff;
+    }
+}
+
+signed char out2[N];
+
+__attribute__ ((noinline,noipa))
+void
+foo_scalar (void)
+{
+  for (int i = 0; i < N; i++)
+    {
+      asm volatile ("");
+      signed char diff = b[i] - a[i];
+      out2[i] = diff > 0 ? diff : -diff;
+    }
+}
+
+int
+main (void)
+{
+  foo ();
+  foo_scalar ();
+  for (int i = 0; i < N; i++)
+    if (out[i] != out2[i])
+      __builtin_abort ();
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/abd_1.c b/gcc/testsuite/gcc.target/aarch64/abd_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..a27cb5ef0c12cd0068ff1728e20cce7f68eddc2d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/abd_1.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#pragma GCC target "+nosve"
+
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+#define N 1024
+
+#define FUNC(T)								\
+void									\
+sabd_##T (signed T * restrict a, signed T * restrict b,		\
+		signed T * restrict out)				\
+{									\
+  for (int i = 0; i < N; i++)						\
+    out[i] = MAX (a[i], b[i]) - MIN (a[i], b[i]);			\
+}									\
+									\
+void									\
+uabd_##T (unsigned T * restrict a, unsigned T * restrict b,	\
+		  unsigned T * restrict out)				\
+{									\
+  for (int i = 0; i < N; i++)						\
+    out[i] = MAX (a[i], b[i]) - MIN (a[i], b[i]);			\
+}
+
+FUNC(char)
+FUNC(short)
+FUNC(int)
+
+/* { dg-final { scan-assembler-times "sabd\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, v\[0-9\]+\.16b" 1 } } */
+/* { dg-final { scan-assembler-times "uabd\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b, v\[0-9\]+\.16b" 1 } } */
+/* { dg-final { scan-assembler-times "sabd\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.8h" 1 } } */
+/* { dg-final { scan-assembler-times "uabd\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.8h" 1 } } */
+/* { dg-final { scan-assembler-times "sabd\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s" 1 } } */
+/* { dg-final { scan-assembler-times "uabd\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s" 1 } } */

Reply via email to