Hi All,

This patch adds an implementation of the optabs for complex additions.  With
this, the following C code:
void f90 (float complex a[restrict N], float complex b[restrict N],
          float complex c[restrict N])
{
  for (int i = 0; i < N; i++)
    c[i] = a[i] + (b[i] * I);
}

generates

f90:
        add     r3, r2, #1600
.L2:
        vld1.32 {q8}, [r0]!
        vld1.32 {q9}, [r1]!
        vcadd.f32       q8, q8, q9, #90
        vst1.32 {q8}, [r2]!
        cmp     r3, r2
        bne     .L2
        bx      lr

instead of

f90:
        add     r3, r2, #1600
.L2:
        vld2.32 {d24-d27}, [r0]!
        vld2.32 {d20-d23}, [r1]!
        vsub.f32        q8, q12, q11
        vadd.f32        q9, q13, q10
        vst2.32 {d16-d19}, [r2]!
        cmp     r3, r2
        bne     .L2
        bx      lr

Bootstrapped and regtested on arm-none-linux-gnueabihf with no issues.
Codegen tested for -march=armv8.1-m.main+mve.fp -mfloat-abi=hard -mfpu=auto
with no issues.  Matching tests for these are in the mid-end patches.

Note that the mid-end patches are still being respun; I may need to change the
order of some parameters, but no other change is expected, and I would like to
keep the size of future patches down.

As such, OK for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/arm/arm_mve.h (__arm_vcaddq_rot90_u8,
        __arm_vcaddq_rot270_u8, __arm_vcaddq_rot90_s8,
        __arm_vcaddq_rot270_s8, __arm_vcaddq_rot90_u16,
        __arm_vcaddq_rot270_u16, __arm_vcaddq_rot90_s16,
        __arm_vcaddq_rot270_s16, __arm_vcaddq_rot90_u32,
        __arm_vcaddq_rot270_u32, __arm_vcaddq_rot90_s32,
        __arm_vcaddq_rot270_s32, __arm_vcmulq_rot90_f16,
        __arm_vcmulq_rot270_f16, __arm_vcmulq_rot180_f16, __arm_vcmulq_f16,
        __arm_vcaddq_rot90_f16, __arm_vcaddq_rot270_f16,
        __arm_vcmulq_rot90_f32, __arm_vcmulq_rot270_f32,
        __arm_vcmulq_rot180_f32, __arm_vcmulq_f32, __arm_vcaddq_rot90_f32,
        __arm_vcaddq_rot270_f32, __arm_vcmlaq_f16, __arm_vcmlaq_rot180_f16,
        __arm_vcmlaq_rot270_f16, __arm_vcmlaq_rot90_f16, __arm_vcmlaq_f32,
        __arm_vcmlaq_rot180_f32, __arm_vcmlaq_rot270_f32,
        __arm_vcmlaq_rot90_f32): Update builtin calls.
        * config/arm/arm_mve_builtins.def (vcaddq_rot90_u, vcaddq_rot270_u,
        vcaddq_rot90_s, vcaddq_rot270_s, vcaddq_rot90_f, vcaddq_rot270_f,
        vcmulq_f, vcmulq_rot90_f, vcmulq_rot180_f, vcmulq_rot270_f, vcmlaq_f,
        vcmlaq_rot90_f, vcmlaq_rot180_f, vcmlaq_rot270_f): Removed.
        (vcaddq_rot90, vcaddq_rot270, vcmulq, vcmulq_rot90, vcmulq_rot180,
        vcmulq_rot270, vcmlaq, vcmlaq_rot90, vcmlaq_rot180, vcmlaq_rot270):
        New.
        * config/arm/constraints.md (Dz): Include MVE.
        * config/arm/iterators.md (mve_rotsplit1, mve_rotsplit2): New.
        (rot): Add UNSPEC_VCMLS, UNSPEC_VCMUL and UNSPEC_VCMUL180.
        (rot_op, rotsplit1, rotsplit2, fcmac1, VCMLA_OP, VCMUL_OP): New.
        * config/arm/mve.md (VCADDQ_ROT270_S, VCADDQ_ROT90_S,
        VCADDQ_ROT270_U, VCADDQ_ROT90_U, VCADDQ_ROT270_F, VCADDQ_ROT90_F,
        VCMULQ_F, VCMULQ_ROT180_F, VCMULQ_ROT270_F, VCMULQ_ROT90_F,
        VCMLAQ_F, VCMLAQ_ROT180_F, VCMLAQ_ROT90_F, VCMLAQ_ROT270_F,
        VCADDQ_ROT270, VCADDQ_ROT90): Removed.
        (mve_rot, VCMUL): New.
        (mve_vcaddq_rot270_<supf><mode>, mve_vcaddq_rot90_<supf><mode>,
        mve_vcaddq_rot270_f<mode>, mve_vcaddq_rot90_f<mode>,
        mve_vcmulq_f<mode>, mve_vcmulq_rot180_f<mode>,
        mve_vcmulq_rot270_f<mode>, mve_vcmulq_rot90_f<mode>,
        mve_vcmlaq_f<mode>, mve_vcmlaq_rot180_f<mode>,
        mve_vcmlaq_rot270_f<mode>, mve_vcmlaq_rot90_f<mode>): Removed.
        (mve_vcmlaq<mve_rot><mode>, mve_vcmulq<mve_rot><mode>,
        mve_vcaddq<mve_rot><mode>, cadd<rot><mode>3): New.
        (cmul<rot_op><mode>3): Exclude MVE types.
        * config/arm/unspecs.md (UNSPEC_VCMUL, UNSPEC_VCMUL90,
        UNSPEC_VCMUL180, UNSPEC_VCMUL270, UNSPEC_VCMLS, UNSPEC_VCMLS180): New.
        * config/arm/vec-common.md (cadd<rot><mode>3, cmul<rot_op><mode>3,
        arm_vcmla<rot><mode>, cml<fcmac1><rot_op><mode>4): New.

--
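(Not part of the patch, just for readers unfamiliar with the rotation forms:
the sketch below shows the scalar computation that the single
vcadd.f32 ..., #90 above performs per complex element.  The reference
function name, the <complex.h> helpers and the N value of 200, chosen to
match the 1600-byte bound in the assembly, are illustrative assumptions only.)

  #include <complex.h>

  #define N 200   /* assumed: 1600 bytes / sizeof (float complex) == 200 */

  /* Scalar reference for the f90 loop: a + b*I has real part
     re(a) - im(b) and imaginary part im(a) + re(b), which is what one
     VCADD with rotation #90 computes per lane pair.  The #270 form
     similarly corresponds to a - b*I.  */
  void f90_ref (float complex a[restrict N], float complex b[restrict N],
                float complex c[restrict N])
  {
    for (int i = 0; i < N; i++)
      {
        float re = crealf (a[i]) - cimagf (b[i]);
        float im = cimagf (a[i]) + crealf (b[i]);
        c[i] = re + im * I;
      }
  }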
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index 6c0d1e2e634a32196eb31079166a7733dcd3a4b6..45014621f2533497e90ddf5257fb04e1fd9325b4 100644 --- a/gcc/config/arm/arm_mve.h +++ b/gcc/config/arm/arm_mve.h @@ -3981,14 +3981,16 @@ __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_mve_vcaddq_rot90_uv16qi (__a, __b); + return (uint8x16_t) + __builtin_mve_vcaddq_rot90v16qi ((int8x16_t)__a, (int8x16_t)__b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_mve_vcaddq_rot270_uv16qi (__a, __b); + return (uint8x16_t) + __builtin_mve_vcaddq_rot270v16qi ((int8x16_t)__a, (int8x16_t)__b); } __extension__ extern __inline uint8x16_t @@ -4520,14 +4522,14 @@ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_mve_vcaddq_rot90_sv16qi (__a, __b); + return __builtin_mve_vcaddq_rot90v16qi (__a, __b); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_mve_vcaddq_rot270_sv16qi (__a, __b); + return __builtin_mve_vcaddq_rot270v16qi (__a, __b); } __extension__ extern __inline int8x16_t @@ -4821,14 +4823,16 @@ __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_mve_vcaddq_rot90_uv8hi (__a, __b); + return (uint16x8_t) + __builtin_mve_vcaddq_rot90v8hi ((int16x8_t)__a, (int16x8_t)__b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_mve_vcaddq_rot270_uv8hi (__a, __b); + return (uint16x8_t) + __builtin_mve_vcaddq_rot270v8hi ((int16x8_t)__a, (int16x8_t)__b); } __extension__ extern __inline uint16x8_t @@ -5360,14 +5364,14 @@ __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_mve_vcaddq_rot90_sv8hi (__a, __b); + return __builtin_mve_vcaddq_rot90v8hi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_mve_vcaddq_rot270_sv8hi (__a, __b); + return __builtin_mve_vcaddq_rot270v8hi (__a, __b); } __extension__ extern __inline int16x8_t @@ -5661,14 +5665,16 @@ __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_mve_vcaddq_rot90_uv4si (__a, __b); + return (uint32x4_t) + __builtin_mve_vcaddq_rot90v4si ((int32x4_t)__a, (int32x4_t)__b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_mve_vcaddq_rot270_uv4si (__a, __b); + return (uint32x4_t) + __builtin_mve_vcaddq_rot270v4si ((int32x4_t)__a, (int32x4_t)__b); } __extension__ extern __inline uint32x4_t @@ -6200,14 +6206,14 @@ 
__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_mve_vcaddq_rot90_sv4si (__a, __b); + return __builtin_mve_vcaddq_rot90v4si (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_mve_vcaddq_rot270_sv4si (__a, __b); + return __builtin_mve_vcaddq_rot270v4si (__a, __b); } __extension__ extern __inline int32x4_t @@ -17342,42 +17348,42 @@ __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_rot90_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_mve_vcmulq_rot90_fv8hf (__a, __b); + return __builtin_mve_vcmulq_rot90v8hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_rot270_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_mve_vcmulq_rot270_fv8hf (__a, __b); + return __builtin_mve_vcmulq_rot270v8hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_rot180_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_mve_vcmulq_rot180_fv8hf (__a, __b); + return __builtin_mve_vcmulq_rot180v8hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_mve_vcmulq_fv8hf (__a, __b); + return __builtin_mve_vcmulqv8hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_mve_vcaddq_rot90_fv8hf (__a, __b); + return __builtin_mve_vcaddq_rot90v8hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_mve_vcaddq_rot270_fv8hf (__a, __b); + return __builtin_mve_vcaddq_rot270v8hf (__a, __b); } __extension__ extern __inline float16x8_t @@ -17594,42 +17600,42 @@ __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_rot90_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_mve_vcmulq_rot90_fv4sf (__a, __b); + return __builtin_mve_vcmulq_rot90v4sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_rot270_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_mve_vcmulq_rot270_fv4sf (__a, __b); + return __builtin_mve_vcmulq_rot270v4sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_rot180_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_mve_vcmulq_rot180_fv4sf (__a, __b); + return __builtin_mve_vcmulq_rot180v4sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_mve_vcmulq_fv4sf (__a, __b); + return __builtin_mve_vcmulqv4sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
__arm_vcaddq_rot90_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_mve_vcaddq_rot90_fv4sf (__a, __b); + return __builtin_mve_vcaddq_rot90v4sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_mve_vcaddq_rot270_fv4sf (__a, __b); + return __builtin_mve_vcaddq_rot270v4sf (__a, __b); } __extension__ extern __inline float32x4_t @@ -17784,28 +17790,28 @@ __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c) { - return __builtin_mve_vcmlaq_fv8hf (__a, __b, __c); + return __builtin_mve_vcmlaqv8hf (__a, __b, __c); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_rot180_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c) { - return __builtin_mve_vcmlaq_rot180_fv8hf (__a, __b, __c); + return __builtin_mve_vcmlaq_rot180v8hf (__a, __b, __c); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_rot270_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c) { - return __builtin_mve_vcmlaq_rot270_fv8hf (__a, __b, __c); + return __builtin_mve_vcmlaq_rot270v8hf (__a, __b, __c); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_rot90_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c) { - return __builtin_mve_vcmlaq_rot90_fv8hf (__a, __b, __c); + return __builtin_mve_vcmlaq_rot90v8hf (__a, __b, __c); } __extension__ extern __inline float16x8_t @@ -18092,28 +18098,28 @@ __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return __builtin_mve_vcmlaq_fv4sf (__a, __b, __c); + return __builtin_mve_vcmlaqv4sf (__a, __b, __c); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_rot180_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return __builtin_mve_vcmlaq_rot180_fv4sf (__a, __b, __c); + return __builtin_mve_vcmlaq_rot180v4sf (__a, __b, __c); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_rot270_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return __builtin_mve_vcmlaq_rot270_fv4sf (__a, __b, __c); + return __builtin_mve_vcmlaq_rot270v4sf (__a, __b, __c); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_rot90_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return __builtin_mve_vcmlaq_rot90_fv4sf (__a, __b, __c); + return __builtin_mve_vcmlaq_rot90v4sf (__a, __b, __c); } __extension__ extern __inline float32x4_t diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index f38926ffd8e44f63d25a8fb9bf8f7d8680570ef0..56b652fff0a6729d04982cc13a479587180b0208 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -125,8 +125,6 @@ VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, 
vcmpcsq_n_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcaddq_rot90_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcaddq_rot270_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvq_p_u, v16qi, v8hi, v4si) @@ -202,8 +200,6 @@ VAR3 (BINOP_NONE_NONE_NONE, vhcaddq_rot270_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vhaddq_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vhaddq_n_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, veorq_s, v16qi, v8hi, v4si) -VAR3 (BINOP_NONE_NONE_NONE, vcaddq_rot90_s, v16qi, v8hi, v4si) -VAR3 (BINOP_NONE_NONE_NONE, vcaddq_rot270_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vbrsrq_n_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vbicq_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vandq_s, v16qi, v8hi, v4si) @@ -264,12 +260,6 @@ VAR2 (BINOP_NONE_NONE_NONE, vmaxnmq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vmaxnmavq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vmaxnmaq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, veorq_f, v8hf, v4sf) -VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot90_f, v8hf, v4sf) -VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot270_f, v8hf, v4sf) -VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot180_f, v8hf, v4sf) -VAR2 (BINOP_NONE_NONE_NONE, vcmulq_f, v8hf, v4sf) -VAR2 (BINOP_NONE_NONE_NONE, vcaddq_rot90_f, v8hf, v4sf) -VAR2 (BINOP_NONE_NONE_NONE, vcaddq_rot270_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vbicq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vandq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vaddq_n_f, v8hf, v4sf) @@ -470,10 +460,6 @@ VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmsq_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmasq_n_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmaq_n_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmaq_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot90_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot270_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot180_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vshrntq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vshrnbq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vrshrntq_n_s, v8hi, v4si) @@ -892,3 +878,15 @@ VAR3 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vshlcq_m_vec_s, v16qi, v8hi, v4si) VAR3 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vshlcq_m_carry_s, v16qi, v8hi, v4si) VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlcq_m_vec_u, v16qi, v8hi, v4si) VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlcq_m_carry_u, v16qi, v8hi, v4si) + +/* optabs without any suffixes. 
*/ +VAR5 (BINOP_NONE_NONE_NONE, vcaddq_rot90, v16qi, v8hi, v4si, v8hf, v4sf) +VAR5 (BINOP_NONE_NONE_NONE, vcaddq_rot270, v16qi, v8hi, v4si, v8hf, v4sf) +VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot90, v8hf, v4sf) +VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot270, v8hf, v4sf) +VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot180, v8hf, v4sf) +VAR2 (BINOP_NONE_NONE_NONE, vcmulq, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot90, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot270, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot180, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq, v8hf, v4sf) diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md index 789e3332abb7495b308509d03ed241d39498a8b6..6ebddb95b4f9c835f10f5265573f27a06ccbd11f 100644 --- a/gcc/config/arm/constraints.md +++ b/gcc/config/arm/constraints.md @@ -310,7 +310,7 @@ (define_constraint "Dz" "@internal In ARM/Thumb-2 state a vector of constant zeros." (and (match_code "const_vector") - (match_test "TARGET_NEON && op == CONST0_RTX (mode)"))) + (match_test "(TARGET_NEON || TARGET_HAVE_MVE) && op == CONST0_RTX (mode)"))) (define_constraint "Da" "@internal diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 43e1ebd87cf69e716474bb6ee9bcdd405523d8da..daeb13ce863034844a3afc501f31846e4b22ea8b 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -1177,6 +1177,7 @@ (define_int_attr crypto_mode [(UNSPEC_SHA1H "V4SI") (UNSPEC_AESMC "V16QI") (define_int_attr rot [(UNSPEC_VCADD90 "90") (UNSPEC_VCADD270 "270") + (UNSPEC_VCMLS "0") (UNSPEC_VCMLA "0") (UNSPEC_VCMLA90 "90") (UNSPEC_VCMLA180 "180") @@ -1224,6 +1225,34 @@ (define_int_attr rotsplit2 [(UNSPEC_VCMLA "90") (define_int_attr fcmac1 [(UNSPEC_VCMLA "a") (UNSPEC_VCMLA180 "a") (UNSPEC_VCMLS "s") (UNSPEC_VCMLS180 "s")]) +(define_int_attr mve_rotsplit1 [(UNSPEC_VCMLA "") + (UNSPEC_VCMLA180 "") + (UNSPEC_VCMUL "") + (UNSPEC_VCMUL180 "") + (UNSPEC_VCMLS "_rot270") + (UNSPEC_VCMLS180 "_rot90")]) + +(define_int_attr mve_rotsplit2 [(UNSPEC_VCMLA "_rot90") + (UNSPEC_VCMLA180 "_rot270") + (UNSPEC_VCMUL "_rot90") + (UNSPEC_VCMUL180 "_rot270") + (UNSPEC_VCMLS "_rot180") + (UNSPEC_VCMLS180 "_rot180")]) + +(define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90") + (UNSPEC_VCADD270 "_rot270") + (UNSPEC_VCMLA "") + (UNSPEC_VCMLA90 "_rot90") + (UNSPEC_VCMLA180 "_rot180") + (UNSPEC_VCMLA270 "_rot270") + (UNSPEC_VCMUL "") + (UNSPEC_VCMUL90 "_rot90") + (UNSPEC_VCMUL180 "_rot180") + (UNSPEC_VCMUL270 "_rot270")]) + +(define_int_iterator VCMUL [UNSPEC_VCMUL UNSPEC_VCMUL90 + UNSPEC_VCMUL180 UNSPEC_VCMUL270]) + (define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") (UNSPEC_QSUB8 "qsub8") (UNSPEC_SHADD8 "shadd8") (UNSPEC_SHSUB8 "shsub8") (UNSPEC_UHADD8 "uhadd8") (UNSPEC_UHSUB8 "uhsub8") @@ -1276,9 +1305,8 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s") (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u") (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VANDQ_S "s") (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u") - (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s") - (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s") - (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u") + (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") + (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCMPEQQ_N_S "s") (VCMPEQQ_N_U "u") (VCMPNEQ_N_S "s") (VCMPNEQ_N_U "u") (VEORQ_S "s") (VEORQ_U "u") (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s") @@ -1546,8 +1574,6 @@ (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S]) (define_int_iterator VANDQ [VANDQ_U VANDQ_S]) (define_int_iterator VBICQ [VBICQ_S 
VBICQ_U]) (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S]) -(define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U]) -(define_int_iterator VCADDQ_ROT90 [VCADDQ_ROT90_U VCADDQ_ROT90_S]) (define_int_iterator VCMPEQQ [VCMPEQQ_U VCMPEQQ_S]) (define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S VCMPEQQ_N_U]) (define_int_iterator VCMPNEQ_N [VCMPNEQ_N_U VCMPNEQ_N_S]) @@ -1757,3 +1783,11 @@ (define_int_iterator VADCQ_M [VADCQ_M_U VADCQ_M_S]) (define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48]) (define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48]) (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U]) +;; Define iterators for VCMLA operations +(define_int_iterator VCMLA_OP [UNSPEC_VCMLA + UNSPEC_VCMLA180 + UNSPEC_VCMLS]) + +;; Define iterators for VCMLA operations as MUL +(define_int_iterator VCMUL_OP [UNSPEC_VCMUL + UNSPEC_VCMUL180]) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index ecbaaa9150132fa6568b5d2217e7233e2ec6ebb6..336e15ebf7d3193db47414a3100a8ba44356087c 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -937,34 +937,28 @@ (define_insn "mve_vbrsrq_n_<supf><mode>" ]) ;; -;; [vcaddq_rot270_s, vcaddq_rot270_u]) +;; [vcaddq, vcaddq_rot90, vcadd_rot180, vcadd_rot270]) ;; -(define_insn "mve_vcaddq_rot270_<supf><mode>" +(define_insn "mve_vcaddq<mve_rot><mode>" [ (set (match_operand:MVE_2 0 "s_register_operand" "<earlyclobber_32>") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w")] - VCADDQ_ROT270)) + VCADD)) ] "TARGET_HAVE_MVE" - "vcadd.i%#<V_sz_elem> %q0, %q1, %q2, #270" + "vcadd.i%#<V_sz_elem> %q0, %q1, %q2, #<rot>" [(set_attr "type" "mve_move") ]) -;; -;; [vcaddq_rot90_u, vcaddq_rot90_s]) -;; -(define_insn "mve_vcaddq_rot90_<supf><mode>" - [ - (set (match_operand:MVE_2 0 "s_register_operand" "<earlyclobber_32>") - (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:MVE_2 2 "s_register_operand" "w")] - VCADDQ_ROT90)) - ] - "TARGET_HAVE_MVE" - "vcadd.i%#<V_sz_elem> %q0, %q1, %q2, #90" - [(set_attr "type" "mve_move") -]) +;; Auto vectorizer pattern for int vcadd +(define_expand "cadd<rot><mode>3" + [(set (match_operand:MVE_2 0 "register_operand") + (unspec:MVE_2 [(match_operand:MVE_2 1 "register_operand") + (match_operand:MVE_2 2 "register_operand")] + VCADD))] + "TARGET_HAVE_MVE && !BYTES_BIG_ENDIAN" +) ;; ;; [vcmpcsq_n_u]) @@ -2059,32 +2053,17 @@ (define_insn "mve_vbicq_n_<supf><mode>" ]) ;; -;; [vcaddq_rot270_f]) -;; -(define_insn "mve_vcaddq_rot270_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCADDQ_ROT270_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcadd.f%#<V_sz_elem> %q0, %q1, %q2, #270" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcaddq_rot90_f]) +;; [vcaddq, vcaddq_rot90, vcadd_rot180, vcadd_rot270]) ;; -(define_insn "mve_vcaddq_rot90_f<mode>" +(define_insn "mve_vcaddq<mve_rot><mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w")] - VCADDQ_ROT90_F)) + VCADD)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcadd.f%#<V_sz_elem> %q0, %q1, %q2, #90" + "vcadd.f%#<V_sz_elem> %q0, %q1, %q2, #<rot>" [(set_attr "type" "mve_move") ]) @@ -2269,62 +2248,17 @@ (define_insn "mve_vcmpneq_n_f<mode>" ]) ;; -;; [vcmulq_f]) -;; -(define_insn "mve_vcmulq_f<mode>" - 
[ - (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMULQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #0" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmulq_rot180_f]) -;; -(define_insn "mve_vcmulq_rot180_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMULQ_ROT180_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #180" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmulq_rot270_f]) +;; [vcmulq, vcmulq_rot90, vcmulq_rot180, vcmulq_rot270]) ;; -(define_insn "mve_vcmulq_rot270_f<mode>" +(define_insn "mve_vcmulq<mve_rot><mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMULQ_ROT270_F)) + VCMUL)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #270" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmulq_rot90_f]) -;; -(define_insn "mve_vcmulq_rot90_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMULQ_ROT90_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #90" + "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #<rot>" [(set_attr "type" "mve_move") ]) @@ -4098,66 +4032,20 @@ (define_insn "mve_vaddlvaq_p_<supf>v4si" [(set_attr "type" "mve_move") (set_attr "length""8")]) ;; -;; [vcmlaq_f]) -;; -(define_insn "mve_vcmlaq_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") - (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:MVE_0 3 "s_register_operand" "w")] - VCMLAQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #0" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmlaq_rot180_f]) +;; [vcmlaq, vcmlaq_rot90, vcmlaq_rot180, vcmlaq_rot270]) ;; -(define_insn "mve_vcmlaq_rot180_f<mode>" +(define_insn "mve_vcmlaq<mve_rot><mode>" [ - (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") - (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:MVE_0 3 "s_register_operand" "w")] - VCMLAQ_ROT180_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #180" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmlaq_rot270_f]) -;; -(define_insn "mve_vcmlaq_rot270_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") - (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:MVE_0 3 "s_register_operand" "w")] - VCMLAQ_ROT270_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #270" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmlaq_rot90_f]) -;; -(define_insn "mve_vcmlaq_rot90_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") - (match_operand:MVE_0 2 "s_register_operand" "w") - 
(match_operand:MVE_0 3 "s_register_operand" "w")] - VCMLAQ_ROT90_F)) + (set (match_operand:MVE_0 0 "s_register_operand" "=w,w") + (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0,Dz") + (match_operand:MVE_0 2 "s_register_operand" "w,w") + (match_operand:MVE_0 3 "s_register_operand" "w,w")] + VCMLA)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #90" + "@ + vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot> + vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>" [(set_attr "type" "mve_move") ]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 2d76769837884be7bde7a4c4234607a559ea5499..d47606d1fcf20d818a6d4022fb491bba79c657f7 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -3030,6 +3030,26 @@ (define_insn "neon_vcmlaq_lane<rot><mode>" [(set_attr "type" "neon_fcmla")] ) +;; The complex mul operations always need to expand to two instructions. +;; The first operation does half the computation and the second does the +;; remainder. Because of this, expand early. +(define_expand "cmul<rot_op><mode>3" + [(set (match_operand:VDF 0 "register_operand") + (unspec:VDF [(match_operand:VDF 1 "register_operand") + (match_operand:VDF 2 "register_operand")] + VCMUL_OP))] + "TARGET_COMPLEX && !BYTES_BIG_ENDIAN" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + rtx res1 = gen_reg_rtx (<MODE>mode); + emit_move_insn (tmp, CONST0_RTX (<MODE>mode)); + emit_insn (gen_neon_vcmla<rotsplit1><mode> (res1, tmp, + operands[1], operands[2])); + emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], res1, + operands[1], operands[2])); + DONE; +}) + ;; These instructions map to the __builtins for the Dot Product operations. (define_insn "neon_<sup>dot<vsi2qi>" diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index a3844e9a72a54489b17738d05d6c5280ae059ffa..663b20d23ba5554ebe7e8e7b459c607cea9b01f1 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -510,6 +510,12 @@ (define_c_enum "unspec" [ UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270 + UNSPEC_VCMUL + UNSPEC_VCMUL90 + UNSPEC_VCMUL180 + UNSPEC_VCMUL270 + UNSPEC_VCMLS + UNSPEC_VCMLS180 UNSPEC_MATMUL_S UNSPEC_MATMUL_U UNSPEC_MATMUL_US @@ -604,8 +610,6 @@ (define_c_enum "unspec" [ VANDQ_S VBICQ_S VBRSRQ_N_S - VCADDQ_ROT270_S - VCADDQ_ROT90_S VCMPEQQ_S VCMPEQQ_N_S VCMPNEQ_N_S @@ -651,8 +655,6 @@ (define_c_enum "unspec" [ VANDQ_U VBICQ_U VBRSRQ_N_U - VCADDQ_ROT270_U - VCADDQ_ROT90_U VCMPEQQ_U VCMPEQQ_N_U VCMPNEQ_N_U @@ -723,8 +725,6 @@ (define_c_enum "unspec" [ VADDQ_N_F VANDQ_F VBICQ_F - VCADDQ_ROT270_F - VCADDQ_ROT90_F VCMPEQQ_F VCMPEQQ_N_F VCMPGEQ_F @@ -737,10 +737,6 @@ (define_c_enum "unspec" [ VCMPLTQ_N_F VCMPNEQ_F VCMPNEQ_N_F - VCMULQ_F - VCMULQ_ROT180_F - VCMULQ_ROT270_F - VCMULQ_ROT90_F VEORQ_F VMAXNMAQ_F VMAXNMAVQ_F @@ -914,7 +910,6 @@ (define_c_enum "unspec" [ VMLSLDAVAQ_S VQSHRUNBQ_N_S VQRSHRUNTQ_N_S - VCMLAQ_F VMINNMAQ_M_F VFMASQ_N_F VDUPQ_M_N_F @@ -936,14 +931,12 @@ (define_c_enum "unspec" [ VADDLVAQ_P_S VQMOVUNBQ_M_S VCMPLEQ_M_F - VCMLAQ_ROT180_F VMLSLDAVAXQ_S VRNDXQ_M_F VFMSQ_F VMINNMVQ_P_F VMAXNMVQ_P_F VPSELQ_F - VCMLAQ_ROT90_F VQMOVUNTQ_M_S VREV64Q_M_F VNEGQ_M_F @@ -956,7 +949,6 @@ (define_c_enum "unspec" [ VRMLALDAVHQ_P_S VRMLALDAVHXQ_P_S VCMPEQQ_M_N_F - VCMLAQ_ROT270_F VMAXNMAQ_M_F VRNDQ_M_F VMLALDAVQ_P_U diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 250e503876ac9491234b225d6fdac16acc48c41f..22a0dfa2e52e532165ba90a3e0c82cc4bbebd4c4 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -172,3 
+172,73 @@ (define_expand "vec_set<mode>" GEN_INT (elem), operands[0])); DONE; }) + +(define_expand "cadd<rot><mode>3" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF [(match_operand:VF 1 "register_operand") + (match_operand:VF 2 "register_operand")] + VCADD))] + "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT + && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN" +) + +;; The complex mul operations always need to expand to two instructions. +;; The first operation does half the computation and the second does the +;; remainder. Because of this, expand early. +(define_expand "cmul<rot_op><mode>3" + [(set (match_operand:VQ_HSF 0 "register_operand") + (unspec:VQ_HSF [(match_operand:VQ_HSF 1 "register_operand") + (match_operand:VQ_HSF 2 "register_operand")] + VCMUL_OP))] + "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT)) + && !BYTES_BIG_ENDIAN" +{ + rtx res1 = gen_reg_rtx (<MODE>mode); + if (TARGET_COMPLEX) + { + rtx tmp = gen_reg_rtx (<MODE>mode); + emit_move_insn (tmp, CONST0_RTX (<MODE>mode)); + emit_insn (gen_neon_vcmla<rotsplit1><mode> (res1, tmp, + operands[1], operands[2])); + emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], res1, + operands[1], operands[2])); + } + else + { + emit_insn (gen_mve_vcmulq<mve_rotsplit1><mode> (operands[0], operands[1], + operands[2])); + emit_insn (gen_mve_vcmulq<mve_rotsplit2><mode> (operands[0], operands[1], + operands[2])); + } + DONE; +}) + +(define_expand "arm_vcmla<rot><mode>" + [(set (match_operand:VF 0 "register_operand") + (plus:VF (match_operand:VF 1 "register_operand") + (unspec:VF [(match_operand:VF 2 "register_operand") + (match_operand:VF 3 "register_operand")] + VCMLA)))] + "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT + && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN" +) + +;; The complex mla/mls operations always need to expand to two instructions. +;; The first operation does half the computation and the second does the +;; remainder. Because of this, expand early. +(define_expand "cml<fcmac1><rot_op><mode>4" + [(set (match_operand:VF 0 "register_operand") + (plus:VF (match_operand:VF 1 "register_operand") + (unspec:VF [(match_operand:VF 2 "register_operand") + (match_operand:VF 3 "register_operand")] + VCMLA_OP)))] + "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT + && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + emit_insn (gen_arm_vcmla<rotsplit1><mode> (tmp, operands[1], + operands[2], operands[3])); + emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], tmp, + operands[2], operands[3])); + DONE; +})
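
(Again not part of the patch: the complex multiply-accumulate kernels that the
new cml<fcmac1><rot_op><mode>4 and cmul<rot_op><mode>3 expanders are meant to
catch look roughly like the sketch below.  The real tests live in the mid-end
patches; the function name and N here are placeholders.)

  #include <complex.h>

  #define N 200   /* placeholder trip count */

  /* Complex fused multiply-accumulate: c[i] += a[i] * b[i].  With the
     expanders above this is expected to vectorize to a VCMLA #0 / VCMLA #90
     pair rather than a vld2/vst2 based sequence.  */
  void fma0 (float complex a[restrict N], float complex b[restrict N],
             float complex c[restrict N])
  {
    for (int i = 0; i < N; i++)
      c[i] += a[i] * b[i];
  }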