[Aarch64] Use vector wide add for mixed-mode adds

Michael Collison Sun, 06 Sep 2015 22:55:29 -0700

This patch is designed to address code that was not being vectorized due to missing widening patterns in the aarch64 backend. Code such as:


int t6(int len, void * dummy, short * __restrict x)
{
  len = len & ~31;
  int result = 0;
  __asm volatile ("");
  for (int i = 0; i < len; i++)
    result += x[i];
  return result;
}

Validated on aarch64-none-elf, aarch64_be-none-elf, and aarch64-none-linux-gnu.

Note that there are three non-execution tree dump vectorization regressions where previously code was being vectorized. They are:


Passed now fails          [PASS => FAIL]:
  gcc.dg/vect/slp-multitypes-4.c -flto -ffat-lto-objects  scan-tree-dump-times vect 
"vectorized 1 loops" 1
  gcc.dg/vect/slp-multitypes-4.c -flto -ffat-lto-objects  scan-tree-dump-times vect 
"vectorizing stmts using SLP" 1
  gcc.dg/vect/slp-multitypes-4.c scan-tree-dump-times vect "vectorized 1 loops" 
1
  gcc.dg/vect/slp-multitypes-4.c scan-tree-dump-times vect "vectorizing stmts using 
SLP" 1
  gcc.dg/vect/slp-multitypes-5.c -flto -ffat-lto-objects  scan-tree-dump-times vect 
"vectorized 1 loops" 1
  gcc.dg/vect/slp-multitypes-5.c -flto -ffat-lto-objects  scan-tree-dump-times vect 
"vectorizing stmts using SLP" 1
  gcc.dg/vect/slp-multitypes-5.c scan-tree-dump-times vect "vectorized 1 loops" 
1
  gcc.dg/vect/slp-multitypes-5.c scan-tree-dump-times vect "vectorizing stmts using 
SLP" 1
  gcc.dg/vect/slp-reduc-3.c -flto -ffat-lto-objects  scan-tree-dump-times vect 
"vectorizing stmts using SLP" 1
  gcc.dg/vect/slp-reduc-3.c scan-tree-dump-times vect "vectorizing stmts using 
SLP" 1
  gcc.dg/vect/vect-125.c -flto -ffat-lto-objects  scan-tree-dump vect "vectorized 1 
loops"
  gcc.dg/vect/vect-125.c scan-tree-dump vect "vectorized 1 loops"

I would like to treat these as separate bugs and resolve them separately.


--------------------------------------------------------------------------------------------------------------------------------------------------------

2015-09-04  Michael Collison  <michael.colli...@linaro.org>

    * config/aarch64/aarch64-simd.md (widen_ssum, widen_usum,
aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>_internal): New patterns.
    * config/aarch64/iterators.md (Vhalf, VDBLW): New mode attributes.
    * gcc.target/aarch64/saddw-1.c: New test.
    * gcc.target/aarch64/saddw-2.c: New test.
    * gcc.target/aarch64/uaddw-1.c: New test.
    * gcc.target/aarch64/uaddw-2.c: New test.
    * gcc.target/aarch64/uaddw-3.c: New test.

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md

index 9777418..d6c5d61 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2636,6 +2636,60 @@

 ;; <su><addsub>w<q>.

+(define_expand "widen_ssum<mode>3"
+  [(set (match_operand:<VDBLW> 0 "register_operand" "")

+ (plus:<VDBLW> (sign_extend:<VDBLW> (match_operand:VQW 1"register_operand" ""))

+              (match_operand:<VDBLW> 2 "register_operand" "")))]
+  "TARGET_SIMD"
+  {
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+    rtx temp = gen_reg_rtx (GET_MODE (operands[0]));
+
+    emit_insn (gen_aarch64_saddw<mode>_internal (temp, operands[2],
+                        operands[1], p));
+    emit_insn (gen_aarch64_saddw2<mode> (operands[0], temp, operands[1]));
+    DONE;
+  }
+)
+
+(define_expand "widen_ssum<mode>3"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "")
+    (plus:<VWIDE> (sign_extend:<VWIDE>
+               (match_operand:VD_BHSI 1 "register_operand" ""))
+              (match_operand:<VWIDE> 2 "register_operand" "")))]
+  "TARGET_SIMD"
+{

+ emit_insn (gen_aarch64_saddw<mode> (operands[0], operands[2],operands[1]));

+  DONE;
+})
+
+(define_expand "widen_usum<mode>3"
+  [(set (match_operand:<VDBLW> 0 "register_operand" "=&w")

+ (plus:<VDBLW> (zero_extend:<VDBLW> (match_operand:VQW 1"register_operand" ""))

+              (match_operand:<VDBLW> 2 "register_operand" "")))]
+  "TARGET_SIMD"
+  {
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+    rtx temp = gen_reg_rtx (GET_MODE (operands[0]));
+
+    emit_insn (gen_aarch64_uaddw<mode>_internal (temp, operands[2],
+                         operands[1], p));
+    emit_insn (gen_aarch64_uaddw2<mode> (operands[0], temp, operands[1]));
+    DONE;
+  }
+)
+
+(define_expand "widen_usum<mode>3"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "")
+    (plus:<VWIDE> (zero_extend:<VWIDE>
+               (match_operand:VD_BHSI 1 "register_operand" ""))
+              (match_operand:<VWIDE> 2 "register_operand" "")))]
+  "TARGET_SIMD"
+{

+ emit_insn (gen_aarch64_uaddw<mode> (operands[0], operands[2],operands[1]));

+  DONE;
+})
+
 (define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
         (ADDSUB:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
@@ -2646,6 +2700,18 @@
   [(set_attr "type" "neon_<ADDSUB:optab>_widen")]
 )

+(define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+        (ADDSUB:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
+            (ANY_EXTEND:<VWIDE>
+              (vec_select:<VHALF>
+               (match_operand:VQW 2 "register_operand" "w")
+               (match_operand:VQW 3 "vect_par_cnst_lo_half" "")))))]
+  "TARGET_SIMD"

+ "<ANY_EXTEND:su><ADDSUB:optab>w\\t%0.<Vwtype>, %1.<Vwtype>,%2.<Vhalftype>"

+  [(set_attr "type" "neon_<ADDSUB:optab>_widen")]
+)
+
 (define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w2<mode>_internal"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
         (ADDSUB:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")

diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md

index b8a45d1..cd2914e 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -427,6 +427,13 @@
              (V2DI "DI")    (V2SF  "SF")
              (V4SF "V2SF")  (V2DF  "DF")])

+;; Half modes of all vector modes, in lower-case.
+(define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
+             (V4HI "v2hi")  (V8HI  "v4hi")
+             (V2SI "si")    (V4SI  "v2si")
+             (V2DI "di")    (V2SF  "sf")
+             (V4SF "v2sf")  (V2DF  "df")])
+
 ;; Double modes of vector modes.
 (define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI")
             (V2SI "V4SI")  (V2SF "V4SF")
@@ -439,6 +446,11 @@
             (SI   "v2si")  (DI   "v2di")
             (DF   "v2df")])

+;; Modes with double-width elements.
+(define_mode_attr VDBLW [(V8QI "V4HI") (V16QI "V8HI")
+                  (V4HI "V2SI") (V8HI "V4SI")
+                  (V2SI "DI")   (V4SI "V2DI")])
+
 ;; Narrowed modes for VDN.
 (define_mode_attr VNARROWD [(V4HI "V8QI") (V2SI "V4HI")
                 (DI   "V2SI")])

diff --git a/gcc/testsuite/gcc.target/aarch64/saddw-1.c b/gcc/testsuite/gcc.target/aarch64/saddw-1.c

new file mode 100644
index 0000000..9db5d00
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saddw-1.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, short * __restrict x)
+{
+  len = len & ~31;
+  int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "saddw" } } */
+/* { dg-final { scan-assembler "saddw2" } } */
+
+
+

diff --git a/gcc/testsuite/gcc.target/aarch64/saddw-2.c b/gcc/testsuite/gcc.target/aarch64/saddw-2.c

new file mode 100644
index 0000000..6f8c8fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/saddw-2.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+int
+t6(int len, void * dummy, int * __restrict x)
+{
+  len = len & ~31;
+  long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "saddw" } } */
+/* { dg-final { scan-assembler "saddw2" } } */
+
+

diff --git a/gcc/testsuite/gcc.target/aarch64/uaddw-1.c b/gcc/testsuite/gcc.target/aarch64/uaddw-1.c

new file mode 100644
index 0000000..e34574f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/uaddw-1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, unsigned short * __restrict x)
+{
+  len = len & ~31;
+  unsigned int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "uaddw" } } */
+/* { dg-final { scan-assembler "uaddw2" } } */

diff --git a/gcc/testsuite/gcc.target/aarch64/uaddw-2.c b/gcc/testsuite/gcc.target/aarch64/uaddw-2.c

new file mode 100644
index 0000000..fd3b578
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/uaddw-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+int
+t6(int len, void * dummy, unsigned short * __restrict x)
+{
+  len = len & ~31;
+  unsigned int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "uaddw" } } */
+/* { dg-final { scan-assembler "uaddw2" } } */
+

diff --git a/gcc/testsuite/gcc.target/aarch64/uaddw-3.c b/gcc/testsuite/gcc.target/aarch64/uaddw-3.c

new file mode 100644
index 0000000..04bc7c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/uaddw-3.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, char * __restrict x)
+{
+  len = len & ~31;
+  unsigned short result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "uaddw" } } */
+/* { dg-final { scan-assembler "uaddw2" } } */
+
+
+
--
1.9.1



--
Michael Collison
Linaro Toolchain Working Group
michael.colli...@linaro.org

[Aarch64] Use vector wide add for mixed-mode adds

Reply via email to