Hi James,

Committed with the requested refactoring, plus a few trivial defines and iterators that were missing because the patch series was split.
Thanks,
Tamar

-----Original Message-----
From: James Greenhalgh <james.greenha...@arm.com>
Sent: Wednesday, January 9, 2019 3:37 PM
To: Tamar Christina <tamar.christ...@arm.com>
Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw <richard.earns...@arm.com>; Marcus Shawcroft <marcus.shawcr...@arm.com>
Subject: Re: [PATCH 6/9][GCC][AArch64] Add Armv8.3-a complex intrinsics

On Fri, Dec 21, 2018 at 11:57:55AM -0600, Tamar Christina wrote:
> Hi All,
>
> This updated patch adds NEON intrinsics and tests for the Armv8.3-a
> complex multiplication and add instructions with a rotate along the
> Argand plane.
>
> The instructions are documented in the ArmARM [1] and the intrinsics
> specification will be published on the Arm website [2].
>
> The Lane versions of these instructions are special in that they always
> select a pair: using index 0 means selecting lanes 0 and 1.  Because of
> this, the range check for the intrinsics requires special handling.
>
> There are a few complexities with the intrinsics for the laneq variants
> on AArch64:
>
> 1) The architecture does not have a version for V2SF.  However, since
> the instructions always select a pair of values, the only valid index
> for V2SF would have been 0.  As such the lane versions for V2SF are all
> mapped to the 3SAME variant of the instructions and not the By element
> variant.
>
> 2) Because of point 1 above, the laneq versions of the instruction
> become tricky.  The valid indices are 0 and 1.  For index 0 we treat it
> the same as the lane version of this instruction and just pass the
> lower half of the register to the 3SAME instruction.  When the index is
> 1 we extract the upper half of the register and pass that to the 3SAME
> version of the instruction.
>
> 3) The architecture forbids the laneq version of the V4HF instruction
> from having an index greater than 1.  For indices 0-1 we do no extra
> work.  For indices 2-3 we extract the upper parts of the register and
> pass that to the instruction it would have normally used, and re-map
> the index into the range 0-1.
>
> [1] https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile
> [2] https://developer.arm.com/docs/101028/latest
>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> Additional runtime checks done but not posted with the patch.
>
> Ok for trunk?

OK with a refactor.

This isn't a great fit for Stage 4, but it is also completely self-contained.  I hope we can slow down new content in the AArch64 back-end and start stabilising the port for release.

Thanks,
James

> @@ -1395,6 +1494,80 @@ aarch64_expand_builtin (tree exp,
> }
>
> return target;
> +
> + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V2SF:
> + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V2SF:
> + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V2SF:
> + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V2SF:
> + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V4HF:
> + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V4HF:
> + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V4HF:
> + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V4HF:

Pull all of this out to a new function please.
> + int bcode = fcode - AARCH64_SIMD_FCMLA_LANEQ_BUILTIN_BASE - 1;
> + aarch64_fcmla_laneq_builtin_datum* d
> + = &aarch64_fcmla_lane_builtin_data[bcode];
> + machine_mode quadmode = GET_MODE_2XWIDER_MODE (d->mode).require ();
> + op0 = force_reg (d->mode, expand_normal (CALL_EXPR_ARG (exp, 0)));
> + rtx op1 = force_reg (d->mode, expand_normal (CALL_EXPR_ARG (exp, 1)));
> + rtx op2 = force_reg (quadmode, expand_normal (CALL_EXPR_ARG (exp, 2)));
> + tree tmp = CALL_EXPR_ARG (exp, 3);
> + rtx lane_idx = expand_expr (tmp, NULL_RTX, VOIDmode,
> + EXPAND_INITIALIZER);
> +
> + /* Validate that the lane index is a constant. */
> + if (!CONST_INT_P (lane_idx))
> + {
> + error ("%Kargument %d must be a constant immediate", exp, 4);
> + return const0_rtx;
> + }
> +
> + /* Validate that the index is within the expected range. */
> + int nunits = GET_MODE_NUNITS (quadmode).to_constant ();
> + aarch64_simd_lane_bounds (lane_idx, 0, nunits / 2, exp);
> +
> + /* Keep to GCC-vector-extension lane indices in the RTL. */
> + lane_idx = aarch64_endian_lane_rtx (quadmode, INTVAL
> + (lane_idx));
> +
> + /* Generate the correct register and mode. */
> + int lane = INTVAL (lane_idx);
> +
> + if (lane < nunits / 4)
> + op2 = simplify_gen_subreg (d->mode, op2, quadmode, 0);
> + else
> + {
> + /* Select the upper 64 bits, either a V2SF or V4HF, this however
> + is quite messy, as the operation required even though simple
> + doesn't have a simple RTL pattern, and seems it's quite hard to
> + define using a single RTL pattern. The target generic version
> + gen_highpart_mode generates code that isn't optimal. */
> + rtx temp1 = gen_reg_rtx (d->mode);
> + rtx temp2 = gen_reg_rtx (DImode);
> + temp1 = simplify_gen_subreg (d->mode, op2, quadmode, 0);
> + temp1 = simplify_gen_subreg (V2DImode, temp1, d->mode, 0);
> + emit_insn (gen_aarch64_get_lanev2di (temp2, temp1 , const1_rtx));
> + op2 = simplify_gen_subreg (d->mode, temp2, GET_MODE (temp2), 0);
> +
> + /* And recalculate the index. */
> + lane -= nunits / 4;
> + }
> +
> + if (!target)
> + target = gen_reg_rtx (d->mode);
> + else
> + target = force_reg (d->mode, target);
> +
> + rtx pat = NULL_RTX;
> +
> + if (d->lane)
> + pat = GEN_FCN (d->icode) (target, op0, op1, op2,
> + gen_int_mode (lane, SImode));
> + else
> + pat = GEN_FCN (d->icode) (target, op0, op1, op2);
> +
> + if (!pat)
> + return NULL_RTX;
> +
> + emit_insn (pat);
> + return target;
> }
>
> if (fcode >= AARCH64_SIMD_BUILTIN_BASE && fcode <=
> AARCH64_SIMD_BUILTIN_MAX)
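For reference, here is a minimal usage sketch of the pair-wise lane indexing described in the submission above.  It is illustrative only and not part of the patch; it assumes a compiler containing these changes and -march=armv8.3-a, and the example function name is made up.

#include <arm_neon.h>

/* A float32x4_t holds two complex values: pair 0 in lanes {0,1} and
   pair 1 in lanes {2,3}, so the laneq index may only be 0 or 1.  */
float32x2_t
example_fcmla_pair (float32x2_t acc, float32x2_t a, float32x4_t b)
{
  /* Index 1 selects the complex value in lanes 2 and 3 of b; with
     index 0 the expansion simply passes the lower half of b to the
     3SAME form of the instruction.  */
  return vcmla_laneq_f32 (acc, a, b, 1);
}

The f16 laneq forms work the same way, except that indices 2-3 additionally trigger the upper-half extraction and index re-mapping described in point 3.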
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index e2d24168465d590b679555eab154eab0cb9071c7..df0e035e39a94b7978f7c30317779dbdda7c182e 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -42,6 +42,7 @@ #include "langhooks.h" #include "gimple-iterator.h" #include "case-cfn-macros.h" +#include "emit-rtl.h" #define v8qi_UP E_V8QImode #define v4hi_UP E_V4HImode @@ -102,7 +103,10 @@ enum aarch64_type_qualifiers /* Lane indices - must be in range, and flipped for bigendian. */ qualifier_lane_index = 0x200, /* Lane indices for single lane structure loads and stores. */ - qualifier_struct_load_store_lane_index = 0x400 + qualifier_struct_load_store_lane_index = 0x400, + /* Lane indices selected in pairs. - must be in range, and flipped for + bigendian. */ + qualifier_lane_pair_index = 0x800, }; typedef struct @@ -171,6 +175,11 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_quadop_lane_pair_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_none, + qualifier_none, qualifier_lane_pair_index }; +#define TYPES_QUADOP_LANE_PAIR (aarch64_types_quadop_lane_pair_qualifiers) static enum aarch64_type_qualifiers aarch64_types_quadop_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, @@ -356,6 +365,18 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { CRC32_BUILTIN (crc32cw, SI) \ CRC32_BUILTIN (crc32cx, DI) +/* The next 8 FCMLA instrinsics require some special handling compared the + normal simd intrinsics. */ +#define AARCH64_SIMD_FCMLA_LANEQ_BUILTINS \ + FCMLA_LANEQ_BUILTIN (0, v2sf, fcmla, V2SF, false) \ + FCMLA_LANEQ_BUILTIN (90, v2sf, fcmla, V2SF, false) \ + FCMLA_LANEQ_BUILTIN (180, v2sf, fcmla, V2SF, false) \ + FCMLA_LANEQ_BUILTIN (270, v2sf, fcmla, V2SF, false) \ + FCMLA_LANEQ_BUILTIN (0, v4hf, fcmla_laneq, V4HF, true) \ + FCMLA_LANEQ_BUILTIN (90, v4hf, fcmla_laneq, V4HF, true) \ + FCMLA_LANEQ_BUILTIN (180, v4hf, fcmla_laneq, V4HF, true) \ + FCMLA_LANEQ_BUILTIN (270, v4hf, fcmla_laneq, V4HF, true) \ + typedef struct { const char *name; @@ -364,9 +385,22 @@ typedef struct unsigned int fcode; } aarch64_crc_builtin_datum; +/* Hold information about how to expand the FCMLA_LANEQ builtins. */ +typedef struct +{ + const char *name; + machine_mode mode; + const enum insn_code icode; + unsigned int fcode; + bool lane; +} aarch64_fcmla_laneq_builtin_datum; + #define CRC32_BUILTIN(N, M) \ AARCH64_BUILTIN_##N, +#define FCMLA_LANEQ_BUILTIN(I, N, X, M, T) \ + AARCH64_SIMD_BUILTIN_FCMLA_LANEQ##I##_##M, + #undef VAR1 #define VAR1(T, N, MAP, A) \ AARCH64_SIMD_BUILTIN_##T##_##N##A, @@ -399,6 +433,9 @@ enum aarch64_builtins AARCH64_PAUTH_BUILTIN_AUTIA1716, AARCH64_PAUTH_BUILTIN_PACIA1716, AARCH64_PAUTH_BUILTIN_XPACLRI, + /* Special cased Armv8.3-A Complex FMA by Lane quad Builtins. 
*/ + AARCH64_SIMD_FCMLA_LANEQ_BUILTIN_BASE, + AARCH64_SIMD_FCMLA_LANEQ_BUILTINS AARCH64_BUILTIN_MAX }; @@ -410,6 +447,18 @@ static aarch64_crc_builtin_datum aarch64_crc_builtin_data[] = { AARCH64_CRC32_BUILTINS }; + +#undef FCMLA_LANEQ_BUILTIN +#define FCMLA_LANEQ_BUILTIN(I, N, X, M, T) \ + {"__builtin_aarch64_fcmla_laneq"#I#N, E_##M##mode, CODE_FOR_aarch64_##X##I##N, \ + AARCH64_SIMD_BUILTIN_FCMLA_LANEQ##I##_##M, T}, + +/* This structure contains how to manage the mapping form the builtin to the + instruction to generate in the backend and how to invoke the instruction. */ +static aarch64_fcmla_laneq_builtin_datum aarch64_fcmla_lane_builtin_data[] { + AARCH64_SIMD_FCMLA_LANEQ_BUILTINS +}; + #undef CRC32_BUILTIN static GTY(()) tree aarch64_builtin_decls[AARCH64_BUILTIN_MAX]; @@ -746,6 +795,34 @@ aarch64_init_simd_builtin_scalar_types (void) static bool aarch64_simd_builtins_initialized_p = false; +/* Due to the architecture not providing lane variant of the lane instructions + for fcmla we can't use the standard simd builtin expansion code, but we + still want the majority of the validation that would normally be done. */ + +void +aarch64_init_fcmla_laneq_builtins (void) +{ + unsigned int i = 0; + + for (i = 0; i < ARRAY_SIZE (aarch64_fcmla_lane_builtin_data); ++i) + { + aarch64_fcmla_laneq_builtin_datum* d + = &aarch64_fcmla_lane_builtin_data[i]; + tree argtype = aarch64_lookup_simd_builtin_type (d->mode, qualifier_none); + machine_mode quadmode = GET_MODE_2XWIDER_MODE (d->mode).require (); + tree quadtype + = aarch64_lookup_simd_builtin_type (quadmode, qualifier_none); + tree lanetype + = aarch64_simd_builtin_std_type (SImode, qualifier_lane_pair_index); + tree ftype = build_function_type_list (argtype, argtype, argtype, + quadtype, lanetype, NULL_TREE); + tree fndecl = add_builtin_function (d->name, ftype, d->fcode, + BUILT_IN_MD, NULL, NULL_TREE); + + aarch64_builtin_decls[d->fcode] = fndecl; + } +} + void aarch64_init_simd_builtins (void) { @@ -1001,7 +1078,10 @@ aarch64_init_builtins (void) aarch64_init_fp16_types (); if (TARGET_SIMD) - aarch64_init_simd_builtins (); + { + aarch64_init_simd_builtins (); + aarch64_init_fcmla_laneq_builtins (); + } aarch64_init_crc32_builtins (); aarch64_init_builtin_rsqrt (); @@ -1031,6 +1111,7 @@ typedef enum SIMD_ARG_CONSTANT, SIMD_ARG_LANE_INDEX, SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX, + SIMD_ARG_LANE_PAIR_INDEX, SIMD_ARG_STOP } builtin_simd_arg; @@ -1102,6 +1183,22 @@ aarch64_simd_expand_args (rtx target, int icode, int have_retval, /* Keep to GCC-vector-extension lane indices in the RTL. */ op[opc] = aarch64_endian_lane_rtx (vmode, INTVAL (op[opc])); } + /* If the lane index isn't a constant then error out. */ + goto constant_arg; + + case SIMD_ARG_LANE_PAIR_INDEX: + /* Must be a previous operand into which this is an index and + index is restricted to nunits / 2. */ + gcc_assert (opc > 0); + if (CONST_INT_P (op[opc])) + { + machine_mode vmode = insn_data[icode].operand[opc - 1].mode; + unsigned int nunits + = GET_MODE_NUNITS (vmode).to_constant (); + aarch64_simd_lane_bounds (op[opc], 0, nunits / 2, exp); + /* Keep to GCC-vector-extension lane indices in the RTL. */ + op[opc] = aarch64_endian_lane_rtx (vmode, INTVAL (op[opc])); + } /* Fall through - if the lane index isn't a constant then the next case will error. 
*/ /* FALLTHRU */ @@ -1215,6 +1312,8 @@ aarch64_simd_expand_builtin (int fcode, tree exp, rtx target) if (d->qualifiers[qualifiers_k] & qualifier_lane_index) args[k] = SIMD_ARG_LANE_INDEX; + else if (d->qualifiers[qualifiers_k] & qualifier_lane_pair_index) + args[k] = SIMD_ARG_LANE_PAIR_INDEX; else if (d->qualifiers[qualifiers_k] & qualifier_struct_load_store_lane_index) args[k] = SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX; else if (d->qualifiers[qualifiers_k] & qualifier_immediate) @@ -1317,6 +1416,79 @@ aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target) return target; } +/* Expand a FCMLA lane expression EXP with code FCODE and + result going to TARGET if that is convenient. */ + +rtx +aarch64_expand_fcmla_builtin (tree exp, rtx target, int fcode) +{ + int bcode = fcode - AARCH64_SIMD_FCMLA_LANEQ_BUILTIN_BASE - 1; + aarch64_fcmla_laneq_builtin_datum* d + = &aarch64_fcmla_lane_builtin_data[bcode]; + machine_mode quadmode = GET_MODE_2XWIDER_MODE (d->mode).require (); + rtx op0 = force_reg (d->mode, expand_normal (CALL_EXPR_ARG (exp, 0))); + rtx op1 = force_reg (d->mode, expand_normal (CALL_EXPR_ARG (exp, 1))); + rtx op2 = force_reg (quadmode, expand_normal (CALL_EXPR_ARG (exp, 2))); + tree tmp = CALL_EXPR_ARG (exp, 3); + rtx lane_idx = expand_expr (tmp, NULL_RTX, VOIDmode, EXPAND_INITIALIZER); + + /* Validate that the lane index is a constant. */ + if (!CONST_INT_P (lane_idx)) + { + error ("%Kargument %d must be a constant immediate", exp, 4); + return const0_rtx; + } + + /* Validate that the index is within the expected range. */ + int nunits = GET_MODE_NUNITS (quadmode).to_constant (); + aarch64_simd_lane_bounds (lane_idx, 0, nunits / 2, exp); + + /* Keep to GCC-vector-extension lane indices in the RTL. */ + lane_idx = aarch64_endian_lane_rtx (quadmode, INTVAL (lane_idx)); + + /* Generate the correct register and mode. */ + int lane = INTVAL (lane_idx); + + if (lane < nunits / 4) + op2 = simplify_gen_subreg (d->mode, op2, quadmode, 0); + else + { + /* Select the upper 64 bits, either a V2SF or V4HF, this however + is quite messy, as the operation required even though simple + doesn't have a simple RTL pattern, and seems it's quite hard to + define using a single RTL pattern. The target generic version + gen_highpart_mode generates code that isn't optimal. */ + rtx temp1 = gen_reg_rtx (d->mode); + rtx temp2 = gen_reg_rtx (DImode); + temp1 = simplify_gen_subreg (d->mode, op2, quadmode, 0); + temp1 = simplify_gen_subreg (V2DImode, temp1, d->mode, 0); + emit_insn (gen_aarch64_get_lanev2di (temp2, temp1 , const1_rtx)); + op2 = simplify_gen_subreg (d->mode, temp2, GET_MODE (temp2), 0); + + /* And recalculate the index. */ + lane -= nunits / 4; + } + + if (!target) + target = gen_reg_rtx (d->mode); + else + target = force_reg (d->mode, target); + + rtx pat = NULL_RTX; + + if (d->lane) + pat = GEN_FCN (d->icode) (target, op0, op1, op2, + gen_int_mode (lane, SImode)); + else + pat = GEN_FCN (d->icode) (target, op0, op1, op2); + + if (!pat) + return NULL_RTX; + + emit_insn (pat); + return target; +} + /* Expand an expression EXP that calls a built-in function, with result going to TARGET if that's convenient. 
*/ rtx @@ -1395,6 +1567,16 @@ aarch64_expand_builtin (tree exp, } return target; + + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V2SF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V2SF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V2SF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V2SF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V4HF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V4HF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V4HF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V4HF: + return aarch64_expand_fcmla_builtin (exp, target, fcode); } if (fcode >= AARCH64_SIMD_BUILTIN_BASE && fcode <= AARCH64_SIMD_BUILTIN_MAX) diff --git a/gcc/config/aarch64/aarch64-c.c b/gcc/config/aarch64/aarch64-c.c index a595b53e5dbb8e69e92037e5c3c3b84a5d43c190..fcb1e80177dc549ba03b09778618a91f022777b7 100644 --- a/gcc/config/aarch64/aarch64-c.c +++ b/gcc/config/aarch64/aarch64-c.c @@ -109,6 +109,7 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) aarch64_def_or_undef (TARGET_CRC32, "__ARM_FEATURE_CRC32", pfile); aarch64_def_or_undef (TARGET_DOTPROD, "__ARM_FEATURE_DOTPROD", pfile); + aarch64_def_or_undef (TARGET_COMPLEX, "__ARM_FEATURE_COMPLEX", pfile); cpp_undef (pfile, "__AARCH64_CMODEL_TINY__"); cpp_undef (pfile, "__AARCH64_CMODEL_SMALL__"); diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 55fe876bf7aeb0011319ea9a8b1edc4812e9ded0..17bb0c4869b12ede2fc51a8f89d841ded8fac230 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -217,6 +217,25 @@ BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0) BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0) + /* Implemented by aarch64_fcadd<rot><mode>. */ + BUILTIN_VHSDF (BINOP, fcadd90, 0) + BUILTIN_VHSDF (BINOP, fcadd270, 0) + + /* Implemented by aarch64_fcmla{_lane}{q}<rot><mode>. */ + BUILTIN_VHSDF (TERNOP, fcmla0, 0) + BUILTIN_VHSDF (TERNOP, fcmla90, 0) + BUILTIN_VHSDF (TERNOP, fcmla180, 0) + BUILTIN_VHSDF (TERNOP, fcmla270, 0) + BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane0, 0) + BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane90, 0) + BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane180, 0) + BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane270, 0) + + BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane0, 0) + BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane90, 0) + BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane180, 0) + BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane270, 0) + BUILTIN_VDQ_I (SHIFTIMM, ashr, 3) VAR1 (SHIFTIMM, ashr_simd, 0, di) BUILTIN_VDQ_I (SHIFTIMM, lshr, 3) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index ef633411e312e5ee6cdf64aa8cef3dbf47cf2388..be6c27d319a1ca6fee581d8f8856a4dff8f4a060 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -419,6 +419,70 @@ } ) +;; The fcadd and fcmla patterns are made UNSPEC for the explicitly due to the +;; fact that their usage need to guarantee that the source vectors are +;; contiguous. It would be wrong to describe the operation without being able +;; to describe the permute that is also required, but even if that is done +;; the permute would have been created as a LOAD_LANES which means the values +;; in the registers are in the wrong order. 
+(define_insn "aarch64_fcadd<rot><mode>" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w")] + FCADD))] + "TARGET_COMPLEX" + "fcadd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>, #<rot>" + [(set_attr "type" "neon_fcadd")] +) + +(define_insn "aarch64_fcmla<rot><mode>" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0") + (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w") + (match_operand:VHSDF 3 "register_operand" "w")] + FCMLA)))] + "TARGET_COMPLEX" + "fcmla\t%0.<Vtype>, %2.<Vtype>, %3.<Vtype>, #<rot>" + [(set_attr "type" "neon_fcmla")] +) + + +(define_insn "aarch64_fcmla_lane<rot><mode>" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0") + (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w") + (match_operand:VHSDF 3 "register_operand" "w") + (match_operand:SI 4 "const_int_operand" "n")] + FCMLA)))] + "TARGET_COMPLEX" + "fcmla\t%0.<Vtype>, %2.<Vtype>, %3.<FCMLA_maybe_lane>, #<rot>" + [(set_attr "type" "neon_fcmla")] +) + +(define_insn "aarch64_fcmla_laneq<rot>v4hf" + [(set (match_operand:V4HF 0 "register_operand" "=w") + (plus:V4HF (match_operand:V4HF 1 "register_operand" "0") + (unspec:V4HF [(match_operand:V4HF 2 "register_operand" "w") + (match_operand:V8HF 3 "register_operand" "w") + (match_operand:SI 4 "const_int_operand" "n")] + FCMLA)))] + "TARGET_COMPLEX" + "fcmla\t%0.4h, %2.4h, %3.h[%4], #<rot>" + [(set_attr "type" "neon_fcmla")] +) + +(define_insn "aarch64_fcmlaq_lane<rot><mode>" + [(set (match_operand:VQ_HSF 0 "register_operand" "=w") + (plus:VQ_HSF (match_operand:VQ_HSF 1 "register_operand" "0") + (unspec:VQ_HSF [(match_operand:VQ_HSF 2 "register_operand" "w") + (match_operand:<VHALF> 3 "register_operand" "w") + (match_operand:SI 4 "const_int_operand" "n")] + FCMLA)))] + "TARGET_COMPLEX" + "fcmla\t%0.<Vtype>, %2.<Vtype>, %3.<FCMLA_maybe_lane>, #<rot>" + [(set_attr "type" "neon_fcmla")] +) + ;; These instructions map to the __builtins for the Dot Product operations. (define_insn "aarch64_<sur>dot<vsi2qi>" [(set (match_operand:VS 0 "register_operand" "=w") diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 2617a8c27779a8a1f5a649518d387abb64672135..2f1c75fec34b2ad71415d095ee5f9add0b0484e9 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -254,6 +254,9 @@ extern unsigned aarch64_architecture_version; /* ARMv8.3-A features. */ #define TARGET_ARMV8_3 (AARCH64_ISA_V8_3) +/* Armv8.3-a Complex number extension to AdvSIMD extensions. */ +#define TARGET_COMPLEX (TARGET_SIMD && TARGET_ARMV8_3) + /* Make sure this is always defined so we don't have to check for ifdefs but rather use normal ifs. */ #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 2fd44dd25ce6d1489e69708fd500e2b9c28fda5a..90fce333d09e2c0989737b0c9bed925869dd620c 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -33294,6 +33294,481 @@ vbcaxq_s64 (int64x2_t __a, int64x2_t __b, int64x2_t __c) return __builtin_aarch64_bcaxqv2di (__a, __b, __c); } +#pragma GCC pop_options + +/* AdvSIMD Complex numbers intrinsics. 
*/ + +#pragma GCC push_options +#pragma GCC target(("arch=armv8.3-a")) + +#pragma GCC push_options +#pragma GCC target(("+fp16")) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcadd_rot90_f16 (float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_fcadd90v4hf (__a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaddq_rot90_f16 (float16x8_t __a, float16x8_t __b) +{ + return __builtin_aarch64_fcadd90v8hf (__a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcadd_rot270_f16 (float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_fcadd270v4hf (__a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaddq_rot270_f16 (float16x8_t __a, float16x8_t __b) +{ + return __builtin_aarch64_fcadd270v8hf (__a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_fcmla0v4hf (__r, __a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return __builtin_aarch64_fcmla0v8hf (__r, __a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane0v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq0v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane0v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane90v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot90_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq90v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot90_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane90v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return __builtin_aarch64_fcmla90v8hf (__r, __a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vcmla_rot90_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_fcmla90v4hf (__r, __a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane0v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot180_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq180v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot180_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane180v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return __builtin_aarch64_fcmla180v8hf (__r, __a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot180_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_fcmla180v4hf (__r, __a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane90v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane270v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane270v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot270_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq270v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return __builtin_aarch64_fcmla270v8hf (__r, __a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot270_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_fcmla270v4hf (__r, __a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane180v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
+vcmlaq_rot180_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane180v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot270_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane270v4hf (__r, __a, __b, __index); +} +#pragma GCC pop_options + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcadd_rot90_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcadd90v2sf (__a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaddq_rot90_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fcadd90v4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaddq_rot90_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fcadd90v2df (__a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcadd_rot270_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcadd270v2sf (__a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaddq_rot270_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fcadd270v4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaddq_rot270_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fcadd270v2df (__a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcmla0v2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fcmla0v4sf (__r, __a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fcmla0v2df (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane0v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq0v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane0v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_laneq_f32 (float32x4_t __r, float32x4_t 
__a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane0v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot90_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcmla90v2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fcmla90v4sf (__r, __a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fcmla90v2df (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot90_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane90v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot90_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq90v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane90v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane90v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot180_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcmla180v2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fcmla180v4sf (__r, __a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fcmla180v2df (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot180_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane180v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot180_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq180v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b, + const int __index) +{ + return 
__builtin_aarch64_fcmlaq_lane180v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane180v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot270_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcmla270v2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fcmla270v4sf (__r, __a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fcmla270v2df (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot270_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane270v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot270_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq270v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane270v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane270v4sf (__r, __a, __b, __index); +} #pragma GCC pop_options diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 1065ea3bf79511e09648e84d100a7152fbf616ae..85fa1619ceb8c998cf57a08415a8f133acd6cf71 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -485,6 +485,12 @@ UNSPEC_COND_GE ; Used in aarch64-sve.md. UNSPEC_COND_GT ; Used in aarch64-sve.md. UNSPEC_LASTB ; Used in aarch64-sve.md. + UNSPEC_FCADD90 ; Used in aarch64-simd.md. + UNSPEC_FCADD270 ; Used in aarch64-simd.md. + UNSPEC_FCMLA ; Used in aarch64-simd.md. + UNSPEC_FCMLA90 ; Used in aarch64-simd.md. + UNSPEC_FCMLA180 ; Used in aarch64-simd.md. + UNSPEC_FCMLA270 ; Used in aarch64-simd.md. ]) ;; ------------------------------------------------------------------ @@ -1134,6 +1140,13 @@ (VNx16SI "vnx4bi") (VNx16SF "vnx4bi") (VNx8DI "vnx2bi") (VNx8DF "vnx2bi")]) +;; On AArch64 the By element instruction doesn't have a 2S variant. +;; However because the instruction always selects a pair of values +;; The normal 3SAME instruction can be used here instead. 
+(define_mode_attr FCMLA_maybe_lane [(V2SF "<Vtype>") (V4SF "<Vetype>[%4]") + (V4HF "<Vetype>[%4]") (V8HF "<Vetype>[%4]") + ]) + ;; ------------------------------------------------------------------- ;; Code Iterators ;; ------------------------------------------------------------------- @@ -1587,6 +1600,14 @@ UNSPEC_COND_EQ UNSPEC_COND_NE UNSPEC_COND_GE UNSPEC_COND_GT]) +(define_int_iterator FCADD [UNSPEC_FCADD90 + UNSPEC_FCADD270]) + +(define_int_iterator FCMLA [UNSPEC_FCMLA + UNSPEC_FCMLA90 + UNSPEC_FCMLA180 + UNSPEC_FCMLA270]) + ;; Iterators for atomic operations. (define_int_iterator ATOMIC_LDOP @@ -1848,6 +1869,13 @@ (UNSPEC_COND_MAX "fmaxnm") (UNSPEC_COND_MIN "fminnm")]) +(define_int_attr rot [(UNSPEC_FCADD90 "90") + (UNSPEC_FCADD270 "270") + (UNSPEC_FCMLA "0") + (UNSPEC_FCMLA90 "90") + (UNSPEC_FCMLA180 "180") + (UNSPEC_FCMLA270 "270")]) + (define_int_attr sve_fmla_op [(UNSPEC_COND_FMLA "fmla") (UNSPEC_COND_FMLS "fmls") (UNSPEC_COND_FNMLA "fnmla") diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md index 8e7d097ce5d56319078c3ce1e8b828b9d6b9b4dd..f8f8dd09077a5c9d3691c95c6676ee36114786e4 100644 --- a/gcc/config/arm/types.md +++ b/gcc/config/arm/types.md @@ -763,6 +763,9 @@ neon_sub_halve,\ neon_sub_halve_q,\ neon_sub_halve_narrow_q,\ +\ + neon_fcadd,\ + neon_fcmla,\ \ neon_abs,\ neon_abs_q,\ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex.c new file mode 100644 index 0000000000000000000000000000000000000000..b7c999333ed3a7aa9708bca3a0510ba754b7e4d4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex.c @@ -0,0 +1,251 @@ +/* { dg-skip-if "" { arm-*-* } } */ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_3a_complex_neon_ok } */ +/* { dg-add-options arm_v8_3a_complex_neon } */ +/* { dg-additional-options "-O2 -save-temps" } */ + +#include <arm_neon.h> + +float32x2_t +test_vcadd_rot90_f32 (float32x2_t __a, float32x2_t __b) +{ + return vcadd_rot90_f32 (__a, __b); +} + +float32x4_t +test_vcaddq_rot90_f32 (float32x4_t __a, float32x4_t __b) +{ + return vcaddq_rot90_f32 (__a, __b); +} + +#ifdef __ARM_ARCH_ISA_A64 +float64x2_t +test_vcaddq_rot90_f64 (float64x2_t __a, float64x2_t __b) +{ + return vcaddq_rot90_f64 (__a, __b); +} +#endif + +float32x2_t +test_vcadd_rot270_f32 (float32x2_t __a, float32x2_t __b) +{ + return vcadd_rot270_f32 (__a, __b); +} + +float32x4_t +test_vcaddq_rot270_f32 (float32x4_t __a, float32x4_t __b) +{ + return vcaddq_rot270_f32 (__a, __b); +} + +#ifdef __ARM_ARCH_ISA_A64 +float64x2_t +test_vcaddq_rot270_f64 (float64x2_t __a, float64x2_t __b) +{ + return vcaddq_rot270_f64 (__a, __b); +} +#endif + +float32x2_t +test_vcmla_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_f32 (__r, __a, __b); +} + +float32x4_t +test_vcmlaq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_f32 (__r, __a, __b); +} + +#ifdef __ARM_ARCH_ISA_A64 +float64x2_t +test_vcmlaq_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return vcmlaq_f64 (__r, __a, __b); +} +#endif + +float32x2_t +test_vcmla_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_lane_f32 (__r, __a, __b, 0); +} + +float32x2_t +test_vcmla_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b) +{ + return vcmla_laneq_f32 (__r, __a, __b, 1); +} + +float32x4_t +test_vcmlaq_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b) +{ + return vcmlaq_lane_f32 (__r, __a, __b, 0); 
+} + +float32x4_t +test_vcmlaq_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_laneq_f32 (__r, __a, __b, 1); +} + +float32x2_t +test_vcmla_rot90_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_rot90_f32 (__r, __a, __b); +} + +float32x4_t +test_vcmlaq_rot90_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_rot90_f32 (__r, __a, __b); +} + +#ifdef __ARM_ARCH_ISA_A64 +float64x2_t +test_vcmlaq_rot90_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return vcmlaq_rot90_f64 (__r, __a, __b); +} +#endif + +float32x2_t +test_vcmla_rot90_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_rot90_lane_f32 (__r, __a, __b, 0); +} + +float32x2_t +test_vcmla_rot90_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b) +{ + return vcmla_rot90_laneq_f32 (__r, __a, __b, 1); +} + +float32x4_t +test_vcmlaq_rot90_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b) +{ + return vcmlaq_rot90_lane_f32 (__r, __a, __b, 0); +} + +float32x4_t +test_vcmlaq_rot90_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_rot90_laneq_f32 (__r, __a, __b, 1); +} + +float32x2_t +test_vcmla_rot180_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_rot180_f32 (__r, __a, __b); +} + +float32x4_t +test_vcmlaq_rot180_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_rot180_f32 (__r, __a, __b); +} + +#ifdef __ARM_ARCH_ISA_A64 +float64x2_t +test_vcmlaq_rot180_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return vcmlaq_rot180_f64 (__r, __a, __b); +} +#endif + +float32x2_t +test_vcmla_rot180_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_rot180_lane_f32 (__r, __a, __b, 0); +} + +float32x2_t +test_vcmla_rot180_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b) +{ + return vcmla_rot180_laneq_f32 (__r, __a, __b, 1); +} + +float32x4_t +test_vcmlaq_rot180_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b) +{ + return vcmlaq_rot180_lane_f32 (__r, __a, __b, 0); +} + +float32x4_t +test_vcmlaq_rot180_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_rot180_laneq_f32 (__r, __a, __b, 1); +} + +float32x2_t +test_vcmla_rot270_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_rot270_f32 (__r, __a, __b); +} + +float32x4_t +test_vcmlaq_rot270_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_rot270_f32 (__r, __a, __b); +} + +#ifdef __ARM_ARCH_ISA_A64 +float64x2_t +test_vcmlaq_rot270_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return vcmlaq_rot270_f64 (__r, __a, __b); +} +#endif + +float32x2_t +test_vcmla_rot270_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_rot270_lane_f32 (__r, __a, __b, 0); +} + +float32x2_t +test_vcmla_rot270_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b) +{ + return vcmla_rot270_laneq_f32 (__r, __a, __b, 1); +} + +float32x4_t +test_vcmlaq_rot270_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b) +{ + return vcmlaq_rot270_lane_f32 (__r, __a, __b, 0); +} + +float32x4_t +test_vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_rot270_laneq_f32 (__r, __a, __b, 1); +} + +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.2d, v[0-9]+.2d, 
v[0-9]+.2d, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, #0} 3 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, #180} 3 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, #270} 3 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, #90} 3 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[0\], #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[0\], #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[0\], #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[0\], #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[1\], #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[1\], #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[1\], #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[1\], #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {dup\td[0-9]+, v[0-9]+.d\[1\]} 4 { target { aarch64*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex_f16.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex_f16.c new file mode 100644 index 0000000000000000000000000000000000000000..dbcebcbfba67172de25bb3ab743270cacf7c9f96 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex_f16.c @@ -0,0 +1,306 
@@ +/* { dg-skip-if "" { arm-*-* } } */ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_3a_complex_neon_ok } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */ +/* { dg-add-options arm_v8_3a_complex_neon } */ +/* { dg-additional-options "-O2 -march=armv8.3-a+fp16 -save-temps" } */ + +#include <arm_neon.h> + +float16x4_t +test_vcadd_rot90_f16 (float16x4_t __a, float16x4_t __b) +{ + return vcadd_rot90_f16 (__a, __b); +} + +float16x8_t +test_vcaddq_rot90_f16 (float16x8_t __a, float16x8_t __b) +{ + return vcaddq_rot90_f16 (__a, __b); +} + +float16x4_t +test_vcadd_rot270_f16 (float16x4_t __a, float16x4_t __b) +{ + return vcadd_rot270_f16 (__a, __b); +} + +float16x8_t +test_vcaddq_rot270_f16 (float16x8_t __a, float16x8_t __b) +{ + return vcaddq_rot270_f16 (__a, __b); +} + +float16x4_t +test_vcmla_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_f16 (__r, __a, __b); +} + +float16x8_t +test_vcmlaq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_f16 (__r, __a, __b); +} + +float16x4_t +test_vcmla_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_lane_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_laneq_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_lane_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_laneq_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_lane_f16_2 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_lane_f16 (__r, __a, __b, 1); +} + +float16x4_t +test_vcmla_laneq_f16_2 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_laneq_f16 (__r, __a, __b, 3); +} + +float16x8_t +test_vcmlaq_lane_f16_2 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_lane_f16 (__r, __a, __b, 1); +} + +float16x8_t +test_vcmlaq_laneq_f16_2 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_laneq_f16 (__r, __a, __b, 3); +} + +float16x4_t +test_vcmla_rot90_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot90_f16 (__r, __a, __b); +} + +float16x8_t +test_vcmlaq_rot90_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot90_f16 (__r, __a, __b); +} + +float16x4_t +test_vcmla_rot90_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot90_lane_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_rot90_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_rot90_laneq_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_rot90_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_rot90_lane_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_rot90_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot90_laneq_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_rot90_lane_f16_2 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot90_lane_f16 (__r, __a, __b, 1); +} + +float16x4_t +test_vcmla_rot90_laneq_f16_2 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_rot90_laneq_f16 (__r, __a, __b, 3); +} + +float16x8_t +test_vcmlaq_rot90_lane_f16_2 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_rot90_lane_f16 (__r, __a, __b, 1); +} + +float16x8_t 
+test_vcmlaq_rot90_laneq_f16_2 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot90_laneq_f16 (__r, __a, __b, 3); +} + +float16x4_t +test_vcmla_rot180_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot180_f16 (__r, __a, __b); +} + +float16x8_t +test_vcmlaq_rot180_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot180_f16 (__r, __a, __b); +} + +float16x4_t +test_vcmla_rot180_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot180_lane_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_rot180_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_rot180_laneq_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_rot180_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_rot180_lane_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_rot180_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot180_laneq_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_rot180_lane_f16_2 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot180_lane_f16 (__r, __a, __b, 1); +} + +float16x4_t +test_vcmla_rot180_laneq_f16_2 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_rot180_laneq_f16 (__r, __a, __b, 3); +} + +float16x8_t +test_vcmlaq_rot180_lane_f16_2 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_rot180_lane_f16 (__r, __a, __b, 1); +} + +float16x8_t +test_vcmlaq_rot180_laneq_f16_2 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot180_laneq_f16 (__r, __a, __b, 3); +} + +float16x4_t +test_vcmla_rot270_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot270_f16 (__r, __a, __b); +} + +float16x8_t +test_vcmlaq_rot270_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot270_f16 (__r, __a, __b); +} + +float16x4_t +test_vcmla_rot270_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot270_lane_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_rot270_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_rot270_laneq_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_rot270_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_rot270_lane_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_rot270_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot270_laneq_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_rot270_lane_f16_2 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot270_lane_f16 (__r, __a, __b, 1); +} + +float16x4_t +test_vcmla_rot270_laneq_f16_2 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_rot270_laneq_f16 (__r, __a, __b, 3); +} + +float16x8_t +test_vcmlaq_rot270_lane_f16_2 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_rot270_lane_f16 (__r, __a, __b, 1); +} + +float16x8_t +test_vcmlaq_rot270_laneq_f16_2 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot270_laneq_f16 (__r, __a, __b, 3); +} + +/* { dg-final { scan-assembler-times {dup\td[0-9]+, v[0-9]+.d\[1\]} 4 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { 
scan-assembler-times {fcadd\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[0\], #0} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[0\], #180} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[0\], #270} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[0\], #90} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[1\], #0} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[1\], #180} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[1\], #270} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[1\], #90} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[0\], #0} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[0\], #180} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[0\], #270} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[0\], #90} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[1\], #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[1\], #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[1\], #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[1\], #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[3\], #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[3\], #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { 
scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[3\], #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[3\], #90} 1 { target { aarch64*-*-* } } } } */
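
As a usage note (not part of the patch): the aarch64-c.c hunk above defines __ARM_FEATURE_COMPLEX, so user code can guard its use of the new intrinsics on that macro.  A minimal sketch, assuming -march=armv8.3-a and an illustrative function name:

#include <arm_neon.h>

#ifdef __ARM_FEATURE_COMPLEX
/* vcaddq_rot90_f32 is one of the intrinsics added by this patch; it
   adds b, rotated by 90 degrees in the Argand plane, to a.  */
float32x4_t
example_rotate90_add (float32x4_t a, float32x4_t b)
{
  return vcaddq_rot90_f32 (a, b);
}
#endif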