[PATCH][Arm] Auto-vectorization for MVE: vsub

Dennis Zhang Mon, 17 Aug 2020 11:43:21 -0700

Hi all,

This patch enables MVE vsub instructions for auto-vectorization.
It adds RTL templates for MVE vsub instructions using 'minus' instead of 
unspec expression to make the instructions recognizable for vectorization.
MVE target is added in sub<mode>3 optab. The sub<mode>3 optab is 
modified to use a mode iterator that selects available modes for various 
targets correspondingly.
MVE vector modes are enabled in arm_preferred_simd_mode in arm.c to 
support vectorization.


This patch also fixes 'vreinterpretq_*.c' MVE intrinsic tests. The tests 
generate wrong instruction numbers because of unexpected icf optimization.
This bug is exposed by the MVE vector modes enabled in this patch, 
therefore it is corrected in this patch to avoid test failures.

MVE instructions are documented here: 
https://developer.arm.com/architectures/instruction-sets/simd-isas/helium/helium-intrinsics

The patch is regtested for arm-none-eabi and bootstrapped for 
arm-none-linux-gnueabihf.

Is it OK for trunk please?

Thanks
Dennis

gcc/ChangeLog:

2020-08-10  Dennis Zhang  <dennis.zh...@arm.com>

        * config/arm/arm.c (arm_preferred_simd_mode): Enable MVE vector modes.
        * config/arm/arm.h (TARGET_NEON_IWMMXT): New macro.
        (TARGET_NEON_IWMMXT_MVE, TARGET_NEON_IWMMXT_MVE_FP): Likewise.
        (TARGET_NEON_MVE_HFP): Likewise.
        * config/arm/iterators.md (VSEL): New mode iterator to select modes
        for corresponding targets.
        * config/arm/mve.md (mve_vsubq<mode>): New entry for vsub instruction
        using expression 'minus'.
        (mve_vsubq_f<mode>): Use minus instead of VSUBQ_F unspec.
        * config/arm/neon.md (sub<mode>3): Removed here. Integrated in the
        sub<mode>3 in vec-common.md
        * config/arm/vec-common.md (sub<mode>3): Enable MVE target. Use VSEL
        to select available modes. Exclude TARGET_NEON_FP16INST from
        TARGET_NEON statement. Intergrate TARGET_NEON_FP16INST which is
        originally in neon.md.

gcc/testsuite/ChangeLog:

2020-08-10  Dennis Zhang  <dennis.zh...@arm.com>

        * gcc.target/arm/mve/intrinsics/vreinterpretq_f16.c: Use additional
        option -fno-ipa-icf and change the instruction count from 8 to 16.
        * gcc.target/arm/mve/intrinsics/vreinterpretq_f32.c: Likewise.
        * gcc.target/arm/mve/intrinsics/vreinterpretq_s16.c: Likewise.
        * gcc.target/arm/mve/intrinsics/vreinterpretq_s32.c: Likewise.
        * gcc.target/arm/mve/intrinsics/vreinterpretq_s64.c: Likewise.
        * gcc.target/arm/mve/intrinsics/vreinterpretq_s8.c: Likewise.
        * gcc.target/arm/mve/intrinsics/vreinterpretq_u16.c: Likewise.
        * gcc.target/arm/mve/intrinsics/vreinterpretq_u32.c: Likewise.
        * gcc.target/arm/mve/intrinsics/vreinterpretq_u64.c: Likewise.
        * gcc.target/arm/mve/intrinsics/vreinterpretq_u8.c: Likewise.
        * gcc.target/arm/mve/mve.exp: Include tests in subdir 'vect'.
        * gcc.target/arm/mve/vect/vect_sub_0.c: New test.
        * gcc.target/arm/mve/vect/vect_sub_1.c: New test.

diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 30e1d6dc994..eb8c9599357 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -334,6 +334,14 @@ emission of floating point pcs attributes.  */
 						isa_bit_mve_float) \
 			       && !TARGET_GENERAL_REGS_ONLY)
 
+#define TARGET_NEON_IWMMXT	(TARGET_NEON || TARGET_REALLY_IWMMXT)
+#define TARGET_NEON_IWMMXT_MVE	(TARGET_NEON || TARGET_REALLY_IWMMXT \
+				 || TARGET_HAVE_MVE)
+#define TARGET_NEON_IWMMXT_MVE_FP ((TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT) \
+				   || TARGET_NEON || TARGET_REALLY_IWMMXT)
+#define TARGET_NEON_MVE_HFP	((TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT) \
+				 || TARGET_NEON_FP16INST)
+
 /* MVE have few common instructions as VFP, like VLDM alias VPOP, VLDR, VSTM
    alia VPUSH, VSTR and VMOV, VMSR and VMRS.  In the same manner it updates few
    registers such as FPCAR, FPCCR, FPDSCR, FPSCR, MVFR0, MVFR1 and MVFR2.  All
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 6b7ca829f1c..dcbcbbeced0 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -28913,6 +28913,30 @@ arm_preferred_simd_mode (scalar_mode mode)
       default:;
       }
 
+  if (TARGET_HAVE_MVE)
+    switch (mode)
+      {
+      case QImode:
+	return V16QImode;
+      case HImode:
+	return V8HImode;
+      case SImode:
+	return V4SImode;
+
+      default:;
+      }
+
+  if (TARGET_HAVE_MVE_FLOAT)
+    switch (mode)
+      {
+      case HFmode:
+	return V8HFmode;
+      case SFmode:
+	return V4SFmode;
+
+      default:;
+      }
+
   return word_mode;
 }
 
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 0bc9eba0722..52c3a8a4355 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -80,6 +80,19 @@
 ;; Integer and float modes supported by Neon and IWMMXT but not MVE.
 (define_mode_iterator VNINOTM1 [V2SI V4HI V8QI V2SF])
 
+;; Select modes for NEON, IWMMXT and MVE.
+(define_mode_iterator VSEL [(V16QI "TARGET_NEON_IWMMXT_MVE")
+			    (V8HI  "TARGET_NEON_IWMMXT_MVE")
+			    (V4SI  "TARGET_NEON_IWMMXT_MVE")
+			    (V4SF  "TARGET_NEON_IWMMXT_MVE_FP")
+			    (V8HF  "TARGET_NEON_MVE_HFP")
+			    (V4HF  "TARGET_NEON_FP16INST")
+			    (V2SI  "TARGET_NEON_IWMMXT")
+			    (V4HI  "TARGET_NEON_IWMMXT")
+			    (V8QI  "TARGET_NEON_IWMMXT")
+			    (V2SF  "TARGET_NEON_IWMMXT")
+			    (V2DI  "TARGET_NEON_IWMMXT")])
+
 ;; Integer and float modes supported by Neon and IWMMXT, except V2DI.
 (define_mode_iterator VALLW [V2SI V4HI V8QI V2SF V4SI V8HI V16QI V4SF])
 
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 3a57901bd5b..7853b642262 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -2574,6 +2574,17 @@
   [(set_attr "type" "mve_move")
 ])
 
+(define_insn "mve_vsubq<mode>"
+  [
+   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
+	(minus:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w")
+		     (match_operand:MVE_2 2 "s_register_operand" "w")))
+  ]
+  "TARGET_HAVE_MVE"
+  "vsub.i%#<V_sz_elem>\t%q0, %q1, %q2"
+  [(set_attr "type" "mve_move")
+])
+
 ;;
 ;; [vabdq_f])
 ;;
@@ -3480,9 +3491,8 @@
 (define_insn "mve_vsubq_f<mode>"
   [
    (set (match_operand:MVE_0 0 "s_register_operand" "=w")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VSUBQ_F))
+	(minus:MVE_0 (match_operand:MVE_0 1 "s_register_operand" "w")
+		     (match_operand:MVE_0 2 "s_register_operand" "w")))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
   "vsub.f%#<V_sz_elem>\t%q0, %q1, %q2"
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 3e7b51d8ab6..ec933b5711e 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -552,6 +552,10 @@
     (const_string "neon_add<q>")))]
 )
 
+;; These insns implement the patterns defined by the expander sub<mode>3
+;; in vec-common.md file. For NEON fp16 extension, the pattern is only valid
+;; when flag-unsafe-math-optimizations is enabled.
+
 (define_insn "*sub<mode>3_neon"
   [(set (match_operand:VDQ 0 "s_register_operand" "=w")
         (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
@@ -564,17 +568,6 @@
                     (const_string "neon_sub<q>")))]
 )
 
-(define_insn "sub<mode>3"
- [(set
-   (match_operand:VH 0 "s_register_operand" "=w")
-   (minus:VH
-    (match_operand:VH 1 "s_register_operand" "w")
-    (match_operand:VH 2 "s_register_operand" "w")))]
- "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations"
- "vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
- [(set_attr "type" "neon_sub<q>")]
-)
-
 (define_insn "sub<mode>3_fp16"
  [(set
    (match_operand:VH 0 "s_register_operand" "=w")
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index b7e3619caf4..98664b17585 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -120,15 +120,20 @@
 })
 
 ;; Vector arithmetic. Expanders are blank, then unnamed insns implement
-;; patterns separately for IWMMXT and Neon.
+;; patterns separately for MVE, IWMMXT and Neon.
 
 (define_expand "sub<mode>3"
-  [(set (match_operand:VALL 0 "s_register_operand")
-        (minus:VALL (match_operand:VALL 1 "s_register_operand")
-                    (match_operand:VALL 2 "s_register_operand")))]
-  "(TARGET_NEON && ((<MODE>mode != V2SFmode && <MODE>mode != V4SFmode)
-		    || flag_unsafe_math_optimizations))
-   || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))"
+  [(set (match_operand:VSEL 0 "s_register_operand")
+	(minus:VSEL (match_operand:VSEL 1 "s_register_operand")
+		    (match_operand:VSEL 2 "s_register_operand")))]
+  "((TARGET_NEON && !TARGET_NEON_FP16INST)
+    && ((<MODE>mode != V2SFmode && <MODE>mode != V4SFmode)
+	|| flag_unsafe_math_optimizations))
+   || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))
+   || (TARGET_HAVE_MVE && VALID_MVE_SI_MODE(<MODE>mode))
+   || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+       && VALID_MVE_SF_MODE(<MODE>mode))
+   || (TARGET_NEON_FP16INST && flag_unsafe_math_optimizations)"
 {
 })
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_f16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_f16.c
index f59f69734ed..4b8e252d998 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_f16.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_f16.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-add-options arm_v8_1m_mve_fp } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -fno-ipa-icf" } */
 
 #include "arm_mve.h"
 int8x16_t value1;
@@ -41,4 +41,4 @@ foo1 ()
   return vaddq_f16 (r7, vreinterpretq_f16 (value9));
 }
 
-/* { dg-final { scan-assembler-times "vadd.f16" 8 } } */
+/* { dg-final { scan-assembler-times "vadd.f16" 16 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_f32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_f32.c
index dac47c7e924..9c4cec7f648 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_f32.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_f32.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-add-options arm_v8_1m_mve_fp } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -fno-ipa-icf" } */
 
 #include "arm_mve.h"
 int16x8_t value1;
@@ -41,4 +41,4 @@ foo1 ()
   return vaddq_f32 (r7, vreinterpretq_f32 (value9));
 }
 
-/* { dg-final { scan-assembler-times "vadd.f32" 8 } } */
+/* { dg-final { scan-assembler-times "vadd.f32" 16 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s16.c
index edc2f2f3bc6..67c64ff5ca7 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s16.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s16.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-add-options arm_v8_1m_mve_fp } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -fno-ipa-icf" } */
 
 #include "arm_mve.h"
 int8x16_t value1;
@@ -41,4 +41,4 @@ foo1 ()
   return vaddq_s16 (r7, vreinterpretq_s16 (value9));
 }
 
-/* { dg-final { scan-assembler-times "vadd.i16" 8 } } */
+/* { dg-final { scan-assembler-times "vadd.i16" 16 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s32.c
index 880de06a781..e213383e799 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s32.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s32.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-add-options arm_v8_1m_mve_fp } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -fno-ipa-icf" } */
 
 #include "arm_mve.h"
 int16x8_t value1;
@@ -41,4 +41,4 @@ foo1 ()
   return vaddq_s32 (r7, vreinterpretq_s32 (value9));
 }
 
-/* { dg-final { scan-assembler-times "vadd.i32" 8 } } */
+/* { dg-final { scan-assembler-times "vadd.i32" 16 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s64.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s64.c
index b0e81542956..7f5b3991602 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s64.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s64.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-add-options arm_v8_1m_mve_fp } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -fno-ipa-icf" } */
 
 #include "arm_mve.h"
 int16x8_t value1;
@@ -42,4 +42,4 @@ foo1 (mve_pred16_t __p)
   return vpselq_s64 (r7, vreinterpretq_s64 (value9), __p);
 }
 
-/* { dg-final { scan-assembler-times "vpsel" 8 } } */
+/* { dg-final { scan-assembler-times "vpsel" 16 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s8.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s8.c
index a5ceebb10b9..59b2dc4a598 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s8.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_s8.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-add-options arm_v8_1m_mve_fp } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -fno-ipa-icf" } */
 
 #include "arm_mve.h"
 int16x8_t value1;
@@ -41,4 +41,4 @@ foo1 ()
   return vaddq_s8 (r7, vreinterpretq_s8 (value9));
 }
 
-/* { dg-final { scan-assembler-times "vadd.i8" 8 } } */
+/* { dg-final { scan-assembler-times "vadd.i8" 16 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u16.c
index cd31c23500a..5ae5b6941f9 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u16.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u16.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-add-options arm_v8_1m_mve_fp } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -fno-ipa-icf" } */
 
 #include "arm_mve.h"
 int8x16_t value1;
@@ -41,4 +41,4 @@ foo1 ()
   return vaddq_u16 (r7, vreinterpretq_u16 (value9));
 }
 
-/* { dg-final { scan-assembler-times "vadd.i16" 8 } } */
+/* { dg-final { scan-assembler-times "vadd.i16" 16 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u32.c
index faa66c9e1cc..85f10ad6a04 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u32.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u32.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-add-options arm_v8_1m_mve_fp } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -fno-ipa-icf" } */
 
 #include "arm_mve.h"
 int16x8_t value1;
@@ -41,4 +41,4 @@ foo1 ()
   return vaddq_u32 (r7, vreinterpretq_u32 (value9));
 }
 
-/* { dg-final { scan-assembler-times "vadd.i32" 8 } } */
+/* { dg-final { scan-assembler-times "vadd.i32" 16 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u64.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u64.c
index 853b28a2aac..2786771e85e 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u64.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u64.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-add-options arm_v8_1m_mve_fp } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -fno-ipa-icf" } */
 
 #include "arm_mve.h"
 int16x8_t value1;
@@ -42,4 +42,4 @@ foo1 (mve_pred16_t __p)
   return vpselq_u64 (r7, vreinterpretq_u64 (value9), __p);
 }
 
-/* { dg-final { scan-assembler-times "vpsel" 8 } } */
+/* { dg-final { scan-assembler-times "vpsel" 16 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u8.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u8.c
index bdf8cd588e1..2e87fe7da6a 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u8.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vreinterpretq_u8.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-add-options arm_v8_1m_mve_fp } */
-/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-O2 -fno-ipa-icf" } */
 
 #include "arm_mve.h"
 int16x8_t value1;
@@ -41,4 +41,4 @@ foo1 ()
   return vaddq_u8 (r7, vreinterpretq_u8 (value9));
 }
 
-/* { dg-final { scan-assembler-times "vadd.i8" 8 } } */
+/* { dg-final { scan-assembler-times "vadd.i8" 16 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/mve.exp b/gcc/testsuite/gcc.target/arm/mve/mve.exp
index e84cb068940..4a651438eaa 100644
--- a/gcc/testsuite/gcc.target/arm/mve/mve.exp
+++ b/gcc/testsuite/gcc.target/arm/mve/mve.exp
@@ -43,6 +43,8 @@ dg-init
 # Main loop.
 dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/intrinsics/*.\[cCS\]]] \
 	"" $DEFAULT_CFLAGS
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vect/*.\[cCS\]]] \
+	"" $DEFAULT_CFLAGS
 
 # All done.
 set dg_runtest_extra_prunes ""
diff --git a/gcc/testsuite/gcc.target/arm/mve/vect/vect_sub_0.c b/gcc/testsuite/gcc.target/arm/mve/vect/vect_sub_0.c
new file mode 100644
index 00000000000..68af9f0c316
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/vect/vect_sub_0.c
@@ -0,0 +1,55 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O3" } */
+
+#include <stdint.h>
+
+void test_vsub_i32 (int32_t * dest, int32_t * a, int32_t * b) {
+  int i;
+  for (i=0; i<4; i++) {
+    dest[i] = a[i] - b[i];
+  }
+}
+
+void test_vsub_i32_u (uint32_t * dest, uint32_t * a, uint32_t * b) {
+  int i;
+  for (i=0; i<4; i++) {
+    dest[i] = a[i] - b[i];
+  }
+}
+
+/* { dg-final { scan-assembler-times {vsub\.i32\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */
+
+void test_vsub_i16 (int16_t * dest, int16_t * a, int16_t * b) {
+  int i;
+  for (i=0; i<8; i++) {
+    dest[i] = a[i] - b[i];
+  }
+}
+
+void test_vsub_i16_u (uint16_t * dest, uint16_t * a, uint16_t * b) {
+  int i;
+  for (i=0; i<8; i++) {
+    dest[i] = a[i] - b[i];
+  }
+}
+
+/* { dg-final { scan-assembler-times {vsub\.i16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */
+
+void test_vsub_i8 (int8_t * dest, int8_t * a, int8_t * b) {
+  int i;
+  for (i=0; i<16; i++) {
+    dest[i] = a[i] - b[i];
+  }
+}
+
+void test_vsub_i8_u (uint8_t * dest, uint8_t * a, uint8_t * b) {
+  int i;
+  for (i=0; i<16; i++) {
+    dest[i] = a[i] - b[i];
+  }
+}
+
+/* { dg-final { scan-assembler-times {vsub\.i8\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/vect/vect_sub_1.c b/gcc/testsuite/gcc.target/arm/mve/vect/vect_sub_1.c
new file mode 100644
index 00000000000..3106f4aa6f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/vect/vect_sub_1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3" } */
+
+void test_vsub_f32 (float * dest, float * a, float * b) {
+  int i;
+  for (i=0; i<4; i++) {
+    dest[i] = a[i] - b[i];
+  }
+}
+
+/* { dg-final { scan-assembler-times {vsub\.f32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+

[PATCH][Arm] Auto-vectorization for MVE: vsub

Reply via email to