Hi All,

This patch adds an implementation of the optabs for complex additions.  With
this, the following C code:
void f90 (float complex a[restrict N], float complex b[restrict N],
          float complex c[restrict N])
{
  for (int i = 0; i < N; i++)
    c[i] = a[i] + (b[i] * I);
}

generates

f90:
        add     r3, r2, #1600
.L2:
        vld1.32 {q8}, [r0]!
        vld1.32 {q9}, [r1]!
        vcadd.f32       q8, q8, q9, #90
        vst1.32 {q8}, [r2]!
        cmp     r3, r2
        bne     .L2
        bx      lr

instead of

f90:
        add     r3, r2, #1600
.L2:
        vld2.32 {d24-d27}, [r0]!
        vld2.32 {d20-d23}, [r1]!
        vsub.f32        q8, q12, q11
        vadd.f32        q9, q13, q10
        vst2.32 {d16-d19}, [r2]!
        cmp     r3, r2
        bne     .L2
        bx      lr

Bootstrapped and regtested on arm-none-linux-gnueabihf with no issues.
Codegen tested for -march=armv8.1-m.main+mve.fp -mfloat-abi=hard -mfpu=auto
with no issues.  Matching tests for these are in the mid-end patches.

Note that the mid-end patches are still being respun; I may need to change the
order of some parameters, but no other change is expected, and I would like to
keep the size of future patches down.

As such, OK for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/arm/arm_mve.h (__arm_vcaddq_rot90_u8,
        __arm_vcaddq_rot270_u8, __arm_vcaddq_rot90_s8,
        __arm_vcaddq_rot270_s8, __arm_vcaddq_rot90_u16,
        __arm_vcaddq_rot270_u16, __arm_vcaddq_rot90_s16,
        __arm_vcaddq_rot270_s16, __arm_vcaddq_rot90_u32,
        __arm_vcaddq_rot270_u32, __arm_vcaddq_rot90_s32,
        __arm_vcaddq_rot270_s32, __arm_vcmulq_rot90_f16,
        __arm_vcmulq_rot270_f16, __arm_vcmulq_rot180_f16, __arm_vcmulq_f16,
        __arm_vcaddq_rot90_f16, __arm_vcaddq_rot270_f16,
        __arm_vcmulq_rot90_f32, __arm_vcmulq_rot270_f32,
        __arm_vcmulq_rot180_f32, __arm_vcmulq_f32, __arm_vcaddq_rot90_f32,
        __arm_vcaddq_rot270_f32, __arm_vcmlaq_f16, __arm_vcmlaq_rot180_f16,
        __arm_vcmlaq_rot270_f16, __arm_vcmlaq_rot90_f16, __arm_vcmlaq_f32,
        __arm_vcmlaq_rot180_f32, __arm_vcmlaq_rot270_f32,
        __arm_vcmlaq_rot90_f32): Update builtin calls.
        * config/arm/arm_mve_builtins.def (vcaddq_rot90_u, vcaddq_rot270_u,
        vcaddq_rot90_s, vcaddq_rot270_s, vcaddq_rot90_f, vcaddq_rot270_f,
        vcmulq_f, vcmulq_rot90_f, vcmulq_rot180_f, vcmulq_rot270_f, vcmlaq_f,
        vcmlaq_rot90_f, vcmlaq_rot180_f, vcmlaq_rot270_f): Removed.
        (vcaddq_rot90, vcaddq_rot270, vcmulq, vcmulq_rot90, vcmulq_rot180,
        vcmulq_rot270, vcmlaq, vcmlaq_rot90, vcmlaq_rot180, vcmlaq_rot270):
        New.
        * config/arm/constraints.md (Dz): Include MVE.
        * config/arm/iterators.md (mve_rotsplit1, mve_rotsplit2): New.
        (rot): Add UNSPEC_VCMLS, UNSPEC_VCMUL and UNSPEC_VCMUL180.
        (rot_op, rotsplit1, rotsplit2, fcmac1, VCMLA_OP, VCMUL_OP): New.
        * config/arm/mve.md (VCADDQ_ROT270_S, VCADDQ_ROT90_S,
        VCADDQ_ROT270_U, VCADDQ_ROT90_U, VCADDQ_ROT270_F, VCADDQ_ROT90_F,
        VCMULQ_F, VCMULQ_ROT180_F, VCMULQ_ROT270_F, VCMULQ_ROT90_F,
        VCMLAQ_F, VCMLAQ_ROT180_F, VCMLAQ_ROT90_F, VCMLAQ_ROT270_F,
        VCADDQ_ROT270, VCADDQ_ROT90): Removed.
        (mve_rot, VCMUL): New.
        (mve_vcaddq_rot270_<supf><mode>, mve_vcaddq_rot90_<supf><mode>,
        mve_vcaddq_rot270_f<mode>, mve_vcaddq_rot90_f<mode>,
        mve_vcmulq_f<mode>, mve_vcmulq_rot180_f<mode>,
        mve_vcmulq_rot270_f<mode>, mve_vcmulq_rot90_f<mode>,
        mve_vcmlaq_f<mode>, mve_vcmlaq_rot180_f<mode>,
        mve_vcmlaq_rot270_f<mode>, mve_vcmlaq_rot90_f<mode>): Removed.
        (mve_vcmlaq<mve_rot><mode>, mve_vcmulq<mve_rot><mode>,
        mve_vcaddq<mve_rot><mode>, cadd<rot><mode>3): New.
        (cmul<rot_op><mode>3): Exclude MVE types.
        * config/arm/unspecs.md (UNSPEC_VCMUL, UNSPEC_VCMUL90,
        UNSPEC_VCMUL180, UNSPEC_VCMUL270, UNSPEC_VCMLS, UNSPEC_VCMLS180): New.
        * config/arm/vec-common.md (cadd<rot><mode>3, cmul<rot_op><mode>3,
        arm_vcmla<rot><mode>, cml<fcmac1><rot_op><mode>4): New.

--
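(Not part of the patch, just for readers unfamiliar with the rotation forms:
the sketch below shows the scalar computation that the single
vcadd.f32 ..., #90 above performs per complex element.  The reference
function name, the <complex.h> helpers and the N value of 200, chosen to
match the 1600-byte bound in the assembly, are illustrative assumptions only.)

  #include <complex.h>

  #define N 200   /* assumed: 1600 bytes / sizeof (float complex) == 200 */

  /* Scalar reference for the f90 loop: a + b*I has real part
     re(a) - im(b) and imaginary part im(a) + re(b), which is what one
     VCADD with rotation #90 computes per lane pair.  The #270 form
     similarly corresponds to a - b*I.  */
  void f90_ref (float complex a[restrict N], float complex b[restrict N],
                float complex c[restrict N])
  {
    for (int i = 0; i < N; i++)
      {
        float re = crealf (a[i]) - cimagf (b[i]);
        float im = cimagf (a[i]) + crealf (b[i]);
        c[i] = re + im * I;
      }
  }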
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index 6c0d1e2e634a32196eb31079166a7733dcd3a4b6..45014621f2533497e90ddf5257fb04e1fd9325b4 100644 --- a/gcc/config/arm/arm_mve.h +++ b/gcc/config/arm/arm_mve.h @@ -3981,14 +3981,16 @@ __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_mve_vcaddq_rot90_uv16qi (__a, __b); + return (uint8x16_t) + __builtin_mve_vcaddq_rot90v16qi ((int8x16_t)__a, (int8x16_t)__b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_mve_vcaddq_rot270_uv16qi (__a, __b); + return (uint8x16_t) + __builtin_mve_vcaddq_rot270v16qi ((int8x16_t)__a, (int8x16_t)__b); } __extension__ extern __inline uint8x16_t @@ -4520,14 +4522,14 @@ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_mve_vcaddq_rot90_sv16qi (__a, __b); + return __builtin_mve_vcaddq_rot90v16qi (__a, __b); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_mve_vcaddq_rot270_sv16qi (__a, __b); + return __builtin_mve_vcaddq_rot270v16qi (__a, __b); } __extension__ extern __inline int8x16_t @@ -4821,14 +4823,16 @@ __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_mve_vcaddq_rot90_uv8hi (__a, __b); + return (uint16x8_t) + __builtin_mve_vcaddq_rot90v8hi ((int16x8_t)__a, (int16x8_t)__b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_mve_vcaddq_rot270_uv8hi (__a, __b); + return (uint16x8_t) + __builtin_mve_vcaddq_rot270v8hi ((int16x8_t)__a, (int16x8_t)__b); } __extension__ extern __inline uint16x8_t @@ -5360,14 +5364,14 @@ __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_mve_vcaddq_rot90_sv8hi (__a, __b); + return __builtin_mve_vcaddq_rot90v8hi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_mve_vcaddq_rot270_sv8hi (__a, __b); + return __builtin_mve_vcaddq_rot270v8hi (__a, __b); } __extension__ extern __inline int16x8_t @@ -5661,14 +5665,16 @@ __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_mve_vcaddq_rot90_uv4si (__a, __b); + return (uint32x4_t) + __builtin_mve_vcaddq_rot90v4si ((int32x4_t)__a, (int32x4_t)__b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_mve_vcaddq_rot270_uv4si (__a, __b); + return (uint32x4_t) + __builtin_mve_vcaddq_rot270v4si ((int32x4_t)__a, (int32x4_t)__b); } __extension__ extern __inline uint32x4_t @@ -6200,14 +6206,14 @@ 
__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_mve_vcaddq_rot90_sv4si (__a, __b); + return __builtin_mve_vcaddq_rot90v4si (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_mve_vcaddq_rot270_sv4si (__a, __b); + return __builtin_mve_vcaddq_rot270v4si (__a, __b); } __extension__ extern __inline int32x4_t @@ -17342,42 +17348,42 @@ __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_rot90_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_mve_vcmulq_rot90_fv8hf (__a, __b); + return __builtin_mve_vcmulq_rot90v8hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_rot270_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_mve_vcmulq_rot270_fv8hf (__a, __b); + return __builtin_mve_vcmulq_rot270v8hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_rot180_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_mve_vcmulq_rot180_fv8hf (__a, __b); + return __builtin_mve_vcmulq_rot180v8hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_mve_vcmulq_fv8hf (__a, __b); + return __builtin_mve_vcmulqv8hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot90_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_mve_vcaddq_rot90_fv8hf (__a, __b); + return __builtin_mve_vcaddq_rot90v8hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_mve_vcaddq_rot270_fv8hf (__a, __b); + return __builtin_mve_vcaddq_rot270v8hf (__a, __b); } __extension__ extern __inline float16x8_t @@ -17594,42 +17600,42 @@ __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_rot90_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_mve_vcmulq_rot90_fv4sf (__a, __b); + return __builtin_mve_vcmulq_rot90v4sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_rot270_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_mve_vcmulq_rot270_fv4sf (__a, __b); + return __builtin_mve_vcmulq_rot270v4sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_rot180_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_mve_vcmulq_rot180_fv4sf (__a, __b); + return __builtin_mve_vcmulq_rot180v4sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmulq_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_mve_vcmulq_fv4sf (__a, __b); + return __builtin_mve_vcmulqv4sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
__arm_vcaddq_rot90_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_mve_vcaddq_rot90_fv4sf (__a, __b); + return __builtin_mve_vcaddq_rot90v4sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcaddq_rot270_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_mve_vcaddq_rot270_fv4sf (__a, __b); + return __builtin_mve_vcaddq_rot270v4sf (__a, __b); } __extension__ extern __inline float32x4_t @@ -17784,28 +17790,28 @@ __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c) { - return __builtin_mve_vcmlaq_fv8hf (__a, __b, __c); + return __builtin_mve_vcmlaqv8hf (__a, __b, __c); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_rot180_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c) { - return __builtin_mve_vcmlaq_rot180_fv8hf (__a, __b, __c); + return __builtin_mve_vcmlaq_rot180v8hf (__a, __b, __c); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_rot270_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c) { - return __builtin_mve_vcmlaq_rot270_fv8hf (__a, __b, __c); + return __builtin_mve_vcmlaq_rot270v8hf (__a, __b, __c); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_rot90_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c) { - return __builtin_mve_vcmlaq_rot90_fv8hf (__a, __b, __c); + return __builtin_mve_vcmlaq_rot90v8hf (__a, __b, __c); } __extension__ extern __inline float16x8_t @@ -18092,28 +18098,28 @@ __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return __builtin_mve_vcmlaq_fv4sf (__a, __b, __c); + return __builtin_mve_vcmlaqv4sf (__a, __b, __c); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_rot180_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return __builtin_mve_vcmlaq_rot180_fv4sf (__a, __b, __c); + return __builtin_mve_vcmlaq_rot180v4sf (__a, __b, __c); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_rot270_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return __builtin_mve_vcmlaq_rot270_fv4sf (__a, __b, __c); + return __builtin_mve_vcmlaq_rot270v4sf (__a, __b, __c); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vcmlaq_rot90_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return __builtin_mve_vcmlaq_rot90_fv4sf (__a, __b, __c); + return __builtin_mve_vcmlaq_rot90v4sf (__a, __b, __c); } __extension__ extern __inline float32x4_t diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index f38926ffd8e44f63d25a8fb9bf8f7d8680570ef0..56b652fff0a6729d04982cc13a479587180b0208 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -125,8 +125,6 @@ VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_n_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, 
vcmpcsq_n_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcaddq_rot90_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_UNONE, vcaddq_rot270_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si) VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvq_p_u, v16qi, v8hi, v4si) @@ -202,8 +200,6 @@ VAR3 (BINOP_NONE_NONE_NONE, vhcaddq_rot270_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vhaddq_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vhaddq_n_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, veorq_s, v16qi, v8hi, v4si) -VAR3 (BINOP_NONE_NONE_NONE, vcaddq_rot90_s, v16qi, v8hi, v4si) -VAR3 (BINOP_NONE_NONE_NONE, vcaddq_rot270_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vbrsrq_n_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vbicq_s, v16qi, v8hi, v4si) VAR3 (BINOP_NONE_NONE_NONE, vandq_s, v16qi, v8hi, v4si) @@ -264,12 +260,6 @@ VAR2 (BINOP_NONE_NONE_NONE, vmaxnmq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vmaxnmavq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vmaxnmaq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, veorq_f, v8hf, v4sf) -VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot90_f, v8hf, v4sf) -VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot270_f, v8hf, v4sf) -VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot180_f, v8hf, v4sf) -VAR2 (BINOP_NONE_NONE_NONE, vcmulq_f, v8hf, v4sf) -VAR2 (BINOP_NONE_NONE_NONE, vcaddq_rot90_f, v8hf, v4sf) -VAR2 (BINOP_NONE_NONE_NONE, vcaddq_rot270_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vbicq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vandq_f, v8hf, v4sf) VAR2 (BINOP_NONE_NONE_NONE, vaddq_n_f, v8hf, v4sf) @@ -470,10 +460,6 @@ VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmsq_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmasq_n_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmaq_n_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmaq_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot90_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot270_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot180_f, v8hf, v4sf) -VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_f, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vshrntq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vshrnbq_n_s, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_IMM, vrshrntq_n_s, v8hi, v4si) @@ -892,3 +878,15 @@ VAR3 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vshlcq_m_vec_s, v16qi, v8hi, v4si) VAR3 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vshlcq_m_carry_s, v16qi, v8hi, v4si) VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlcq_m_vec_u, v16qi, v8hi, v4si) VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlcq_m_carry_u, v16qi, v8hi, v4si) + +/* optabs without any suffixes. 
*/ +VAR5 (BINOP_NONE_NONE_NONE, vcaddq_rot90, v16qi, v8hi, v4si, v8hf, v4sf) +VAR5 (BINOP_NONE_NONE_NONE, vcaddq_rot270, v16qi, v8hi, v4si, v8hf, v4sf) +VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot90, v8hf, v4sf) +VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot270, v8hf, v4sf) +VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot180, v8hf, v4sf) +VAR2 (BINOP_NONE_NONE_NONE, vcmulq, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot90, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot270, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot180, v8hf, v4sf) +VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq, v8hf, v4sf) diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md index 789e3332abb7495b308509d03ed241d39498a8b6..6ebddb95b4f9c835f10f5265573f27a06ccbd11f 100644 --- a/gcc/config/arm/constraints.md +++ b/gcc/config/arm/constraints.md @@ -310,7 +310,7 @@ (define_constraint "Dz" "@internal In ARM/Thumb-2 state a vector of constant zeros." (and (match_code "const_vector") - (match_test "TARGET_NEON && op == CONST0_RTX (mode)"))) + (match_test "(TARGET_NEON || TARGET_HAVE_MVE) && op == CONST0_RTX (mode)"))) (define_constraint "Da" "@internal diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 43e1ebd87cf69e716474bb6ee9bcdd405523d8da..daeb13ce863034844a3afc501f31846e4b22ea8b 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -1177,6 +1177,7 @@ (define_int_attr crypto_mode [(UNSPEC_SHA1H "V4SI") (UNSPEC_AESMC "V16QI") (define_int_attr rot [(UNSPEC_VCADD90 "90") (UNSPEC_VCADD270 "270") + (UNSPEC_VCMLS "0") (UNSPEC_VCMLA "0") (UNSPEC_VCMLA90 "90") (UNSPEC_VCMLA180 "180") @@ -1224,6 +1225,34 @@ (define_int_attr rotsplit2 [(UNSPEC_VCMLA "90") (define_int_attr fcmac1 [(UNSPEC_VCMLA "a") (UNSPEC_VCMLA180 "a") (UNSPEC_VCMLS "s") (UNSPEC_VCMLS180 "s")]) +(define_int_attr mve_rotsplit1 [(UNSPEC_VCMLA "") + (UNSPEC_VCMLA180 "") + (UNSPEC_VCMUL "") + (UNSPEC_VCMUL180 "") + (UNSPEC_VCMLS "_rot270") + (UNSPEC_VCMLS180 "_rot90")]) + +(define_int_attr mve_rotsplit2 [(UNSPEC_VCMLA "_rot90") + (UNSPEC_VCMLA180 "_rot270") + (UNSPEC_VCMUL "_rot90") + (UNSPEC_VCMUL180 "_rot270") + (UNSPEC_VCMLS "_rot180") + (UNSPEC_VCMLS180 "_rot180")]) + +(define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90") + (UNSPEC_VCADD270 "_rot270") + (UNSPEC_VCMLA "") + (UNSPEC_VCMLA90 "_rot90") + (UNSPEC_VCMLA180 "_rot180") + (UNSPEC_VCMLA270 "_rot270") + (UNSPEC_VCMUL "") + (UNSPEC_VCMUL90 "_rot90") + (UNSPEC_VCMUL180 "_rot180") + (UNSPEC_VCMUL270 "_rot270")]) + +(define_int_iterator VCMUL [UNSPEC_VCMUL UNSPEC_VCMUL90 + UNSPEC_VCMUL180 UNSPEC_VCMUL270]) + (define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") (UNSPEC_QSUB8 "qsub8") (UNSPEC_SHADD8 "shadd8") (UNSPEC_SHSUB8 "shsub8") (UNSPEC_UHADD8 "uhadd8") (UNSPEC_UHSUB8 "uhsub8") @@ -1276,9 +1305,8 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s") (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u") (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VANDQ_S "s") (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u") - (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s") - (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s") - (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u") + (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") + (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCMPEQQ_N_S "s") (VCMPEQQ_N_U "u") (VCMPNEQ_N_S "s") (VCMPNEQ_N_U "u") (VEORQ_S "s") (VEORQ_U "u") (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s") @@ -1546,8 +1574,6 @@ (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S]) (define_int_iterator VANDQ [VANDQ_U VANDQ_S]) (define_int_iterator VBICQ [VBICQ_S 
VBICQ_U]) (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S]) -(define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U]) -(define_int_iterator VCADDQ_ROT90 [VCADDQ_ROT90_U VCADDQ_ROT90_S]) (define_int_iterator VCMPEQQ [VCMPEQQ_U VCMPEQQ_S]) (define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S VCMPEQQ_N_U]) (define_int_iterator VCMPNEQ_N [VCMPNEQ_N_U VCMPNEQ_N_S]) @@ -1757,3 +1783,11 @@ (define_int_iterator VADCQ_M [VADCQ_M_U VADCQ_M_S]) (define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48]) (define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48]) (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U]) +;; Define iterators for VCMLA operations +(define_int_iterator VCMLA_OP [UNSPEC_VCMLA + UNSPEC_VCMLA180 + UNSPEC_VCMLS]) + +;; Define iterators for VCMLA operations as MUL +(define_int_iterator VCMUL_OP [UNSPEC_VCMUL + UNSPEC_VCMUL180]) diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index ecbaaa9150132fa6568b5d2217e7233e2ec6ebb6..336e15ebf7d3193db47414a3100a8ba44356087c 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -937,34 +937,28 @@ (define_insn "mve_vbrsrq_n_<supf><mode>" ]) ;; -;; [vcaddq_rot270_s, vcaddq_rot270_u]) +;; [vcaddq, vcaddq_rot90, vcadd_rot180, vcadd_rot270]) ;; -(define_insn "mve_vcaddq_rot270_<supf><mode>" +(define_insn "mve_vcaddq<mve_rot><mode>" [ (set (match_operand:MVE_2 0 "s_register_operand" "<earlyclobber_32>") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w") (match_operand:MVE_2 2 "s_register_operand" "w")] - VCADDQ_ROT270)) + VCADD)) ] "TARGET_HAVE_MVE" - "vcadd.i%#<V_sz_elem> %q0, %q1, %q2, #270" + "vcadd.i%#<V_sz_elem> %q0, %q1, %q2, #<rot>" [(set_attr "type" "mve_move") ]) -;; -;; [vcaddq_rot90_u, vcaddq_rot90_s]) -;; -(define_insn "mve_vcaddq_rot90_<supf><mode>" - [ - (set (match_operand:MVE_2 0 "s_register_operand" "<earlyclobber_32>") - (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:MVE_2 2 "s_register_operand" "w")] - VCADDQ_ROT90)) - ] - "TARGET_HAVE_MVE" - "vcadd.i%#<V_sz_elem> %q0, %q1, %q2, #90" - [(set_attr "type" "mve_move") -]) +;; Auto vectorizer pattern for int vcadd +(define_expand "cadd<rot><mode>3" + [(set (match_operand:MVE_2 0 "register_operand") + (unspec:MVE_2 [(match_operand:MVE_2 1 "register_operand") + (match_operand:MVE_2 2 "register_operand")] + VCADD))] + "TARGET_HAVE_MVE && !BYTES_BIG_ENDIAN" +) ;; ;; [vcmpcsq_n_u]) @@ -2059,32 +2053,17 @@ (define_insn "mve_vbicq_n_<supf><mode>" ]) ;; -;; [vcaddq_rot270_f]) -;; -(define_insn "mve_vcaddq_rot270_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCADDQ_ROT270_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcadd.f%#<V_sz_elem> %q0, %q1, %q2, #270" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcaddq_rot90_f]) +;; [vcaddq, vcaddq_rot90, vcadd_rot180, vcadd_rot270]) ;; -(define_insn "mve_vcaddq_rot90_f<mode>" +(define_insn "mve_vcaddq<mve_rot><mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w")] - VCADDQ_ROT90_F)) + VCADD)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcadd.f%#<V_sz_elem> %q0, %q1, %q2, #90" + "vcadd.f%#<V_sz_elem> %q0, %q1, %q2, #<rot>" [(set_attr "type" "mve_move") ]) @@ -2269,62 +2248,17 @@ (define_insn "mve_vcmpneq_n_f<mode>" ]) ;; -;; [vcmulq_f]) -;; -(define_insn "mve_vcmulq_f<mode>" - 
[ - (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMULQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #0" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmulq_rot180_f]) -;; -(define_insn "mve_vcmulq_rot180_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMULQ_ROT180_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #180" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmulq_rot270_f]) +;; [vcmulq, vcmulq_rot90, vcmulq_rot180, vcmulq_rot270]) ;; -(define_insn "mve_vcmulq_rot270_f<mode>" +(define_insn "mve_vcmulq<mve_rot><mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMULQ_ROT270_F)) + VCMUL)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #270" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmulq_rot90_f]) -;; -(define_insn "mve_vcmulq_rot90_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VCMULQ_ROT90_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #90" + "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #<rot>" [(set_attr "type" "mve_move") ]) @@ -4098,66 +4032,20 @@ (define_insn "mve_vaddlvaq_p_<supf>v4si" [(set_attr "type" "mve_move") (set_attr "length""8")]) ;; -;; [vcmlaq_f]) -;; -(define_insn "mve_vcmlaq_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") - (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:MVE_0 3 "s_register_operand" "w")] - VCMLAQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #0" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmlaq_rot180_f]) +;; [vcmlaq, vcmlaq_rot90, vcmlaq_rot180, vcmlaq_rot270]) ;; -(define_insn "mve_vcmlaq_rot180_f<mode>" +(define_insn "mve_vcmlaq<mve_rot><mode>" [ - (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") - (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:MVE_0 3 "s_register_operand" "w")] - VCMLAQ_ROT180_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #180" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmlaq_rot270_f]) -;; -(define_insn "mve_vcmlaq_rot270_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") - (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:MVE_0 3 "s_register_operand" "w")] - VCMLAQ_ROT270_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #270" - [(set_attr "type" "mve_move") -]) - -;; -;; [vcmlaq_rot90_f]) -;; -(define_insn "mve_vcmlaq_rot90_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") - (match_operand:MVE_0 2 "s_register_operand" "w") - 
(match_operand:MVE_0 3 "s_register_operand" "w")] - VCMLAQ_ROT90_F)) + (set (match_operand:MVE_0 0 "s_register_operand" "=w,w") + (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0,Dz") + (match_operand:MVE_0 2 "s_register_operand" "w,w") + (match_operand:MVE_0 3 "s_register_operand" "w,w")] + VCMLA)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #90" + "@ + vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot> + vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>" [(set_attr "type" "mve_move") ]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 2d76769837884be7bde7a4c4234607a559ea5499..d47606d1fcf20d818a6d4022fb491bba79c657f7 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -3030,6 +3030,26 @@ (define_insn "neon_vcmlaq_lane<rot><mode>" [(set_attr "type" "neon_fcmla")] ) +;; The complex mul operations always need to expand to two instructions. +;; The first operation does half the computation and the second does the +;; remainder. Because of this, expand early. +(define_expand "cmul<rot_op><mode>3" + [(set (match_operand:VDF 0 "register_operand") + (unspec:VDF [(match_operand:VDF 1 "register_operand") + (match_operand:VDF 2 "register_operand")] + VCMUL_OP))] + "TARGET_COMPLEX && !BYTES_BIG_ENDIAN" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + rtx res1 = gen_reg_rtx (<MODE>mode); + emit_move_insn (tmp, CONST0_RTX (<MODE>mode)); + emit_insn (gen_neon_vcmla<rotsplit1><mode> (res1, tmp, + operands[1], operands[2])); + emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], res1, + operands[1], operands[2])); + DONE; +}) + ;; These instructions map to the __builtins for the Dot Product operations. (define_insn "neon_<sup>dot<vsi2qi>" diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index a3844e9a72a54489b17738d05d6c5280ae059ffa..663b20d23ba5554ebe7e8e7b459c607cea9b01f1 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -510,6 +510,12 @@ (define_c_enum "unspec" [ UNSPEC_VCMLA90 UNSPEC_VCMLA180 UNSPEC_VCMLA270 + UNSPEC_VCMUL + UNSPEC_VCMUL90 + UNSPEC_VCMUL180 + UNSPEC_VCMUL270 + UNSPEC_VCMLS + UNSPEC_VCMLS180 UNSPEC_MATMUL_S UNSPEC_MATMUL_U UNSPEC_MATMUL_US @@ -604,8 +610,6 @@ (define_c_enum "unspec" [ VANDQ_S VBICQ_S VBRSRQ_N_S - VCADDQ_ROT270_S - VCADDQ_ROT90_S VCMPEQQ_S VCMPEQQ_N_S VCMPNEQ_N_S @@ -651,8 +655,6 @@ (define_c_enum "unspec" [ VANDQ_U VBICQ_U VBRSRQ_N_U - VCADDQ_ROT270_U - VCADDQ_ROT90_U VCMPEQQ_U VCMPEQQ_N_U VCMPNEQ_N_U @@ -723,8 +725,6 @@ (define_c_enum "unspec" [ VADDQ_N_F VANDQ_F VBICQ_F - VCADDQ_ROT270_F - VCADDQ_ROT90_F VCMPEQQ_F VCMPEQQ_N_F VCMPGEQ_F @@ -737,10 +737,6 @@ (define_c_enum "unspec" [ VCMPLTQ_N_F VCMPNEQ_F VCMPNEQ_N_F - VCMULQ_F - VCMULQ_ROT180_F - VCMULQ_ROT270_F - VCMULQ_ROT90_F VEORQ_F VMAXNMAQ_F VMAXNMAVQ_F @@ -914,7 +910,6 @@ (define_c_enum "unspec" [ VMLSLDAVAQ_S VQSHRUNBQ_N_S VQRSHRUNTQ_N_S - VCMLAQ_F VMINNMAQ_M_F VFMASQ_N_F VDUPQ_M_N_F @@ -936,14 +931,12 @@ (define_c_enum "unspec" [ VADDLVAQ_P_S VQMOVUNBQ_M_S VCMPLEQ_M_F - VCMLAQ_ROT180_F VMLSLDAVAXQ_S VRNDXQ_M_F VFMSQ_F VMINNMVQ_P_F VMAXNMVQ_P_F VPSELQ_F - VCMLAQ_ROT90_F VQMOVUNTQ_M_S VREV64Q_M_F VNEGQ_M_F @@ -956,7 +949,6 @@ (define_c_enum "unspec" [ VRMLALDAVHQ_P_S VRMLALDAVHXQ_P_S VCMPEQQ_M_N_F - VCMLAQ_ROT270_F VMAXNMAQ_M_F VRNDQ_M_F VMLALDAVQ_P_U diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 250e503876ac9491234b225d6fdac16acc48c41f..22a0dfa2e52e532165ba90a3e0c82cc4bbebd4c4 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -172,3 
+172,73 @@ (define_expand "vec_set<mode>" GEN_INT (elem), operands[0])); DONE; }) + +(define_expand "cadd<rot><mode>3" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF [(match_operand:VF 1 "register_operand") + (match_operand:VF 2 "register_operand")] + VCADD))] + "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT + && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN" +) + +;; The complex mul operations always need to expand to two instructions. +;; The first operation does half the computation and the second does the +;; remainder. Because of this, expand early. +(define_expand "cmul<rot_op><mode>3" + [(set (match_operand:VQ_HSF 0 "register_operand") + (unspec:VQ_HSF [(match_operand:VQ_HSF 1 "register_operand") + (match_operand:VQ_HSF 2 "register_operand")] + VCMUL_OP))] + "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT)) + && !BYTES_BIG_ENDIAN" +{ + rtx res1 = gen_reg_rtx (<MODE>mode); + if (TARGET_COMPLEX) + { + rtx tmp = gen_reg_rtx (<MODE>mode); + emit_move_insn (tmp, CONST0_RTX (<MODE>mode)); + emit_insn (gen_neon_vcmla<rotsplit1><mode> (res1, tmp, + operands[1], operands[2])); + emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], res1, + operands[1], operands[2])); + } + else + { + emit_insn (gen_mve_vcmulq<mve_rotsplit1><mode> (operands[0], operands[1], + operands[2])); + emit_insn (gen_mve_vcmulq<mve_rotsplit2><mode> (operands[0], operands[1], + operands[2])); + } + DONE; +}) + +(define_expand "arm_vcmla<rot><mode>" + [(set (match_operand:VF 0 "register_operand") + (plus:VF (match_operand:VF 1 "register_operand") + (unspec:VF [(match_operand:VF 2 "register_operand") + (match_operand:VF 3 "register_operand")] + VCMLA)))] + "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT + && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN" +) + +;; The complex mla/mls operations always need to expand to two instructions. +;; The first operation does half the computation and the second does the +;; remainder. Because of this, expand early. +(define_expand "cml<fcmac1><rot_op><mode>4" + [(set (match_operand:VF 0 "register_operand") + (plus:VF (match_operand:VF 1 "register_operand") + (unspec:VF [(match_operand:VF 2 "register_operand") + (match_operand:VF 3 "register_operand")] + VCMLA_OP)))] + "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT + && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + emit_insn (gen_arm_vcmla<rotsplit1><mode> (tmp, operands[1], + operands[2], operands[3])); + emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], tmp, + operands[2], operands[3])); + DONE; +})
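
(Again not part of the patch: the complex multiply-accumulate kernels that the
new cml<fcmac1><rot_op><mode>4 and cmul<rot_op><mode>3 expanders are meant to
catch look roughly like the sketch below.  The real tests live in the mid-end
patches; the function name and N here are placeholders.)

  #include <complex.h>

  #define N 200   /* placeholder trip count */

  /* Complex fused multiply-accumulate: c[i] += a[i] * b[i].  With the
     expanders above this is expected to vectorize to a VCMLA #0 / VCMLA #90
     pair rather than a vld2/vst2 based sequence.  */
  void fma0 (float complex a[restrict N], float complex b[restrict N],
             float complex c[restrict N])
  {
    for (int i = 0; i < N; i++)
      c[i] += a[i] * b[i];
  }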