OK, thanks.

Richard
Stam Markianos-Wright <stam.markianos-wri...@arm.com> writes: > On 12/30/19 10:21 AM, Richard Sandiford wrote: >> Stam Markianos-Wright <stam.markianos-wri...@arm.com> writes: >>> On 12/20/19 2:13 PM, Richard Sandiford wrote: >>>> Stam Markianos-Wright <stam.markianos-wri...@arm.com> writes: >>>>> +**... >>>>> +**ret >>>>> +*/ >>>>> +int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y) >>>>> +{ >>>>> + return vusdot_s32 (r, x, y); >>>>> +} >>>>> + >>>> >>>> If we're using check-function-bodies anyway, it might be slightly more >>>> robust to compile at -O and check for the exact RA. E.g.: >>>> >>>> /* >>>> **ufoo: >>>> **usdotv0\.2s, (v1\.8b, v2\.8b|v2\.8b, v1\.8b) >>>> **ret >>>> */ >>>> >>>> Just a suggestion though -- either way is fine. >>> >>> done this too and as per our internal discussion also added one >>> xx_untied tests for usdot and one for usdot_lane >>> >>> That's one xx_untied test for each of the RTL pattern types added in >>> aarch64-simd.md. Lmk if this is ok! >>> >>> Also I found that the way we were using check-function-bodies wasn't >>> actually checking the assembler correctly, so I've changed that to: >>> +/* { dg-final { check-function-bodies "**" "" "" } } */ >>> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ >>> which seems to perform more checks >> >> Ah, OK, hadn't realised that we were cycling through optimisation >> options already. In that case, it might be better to leave out the >> -O from the dg-options and instead use: >> >> /* { dg-skip-if "" { *-*-* } { { "-fno-fat-lto-objects" } { "-O0" } } } */ >> >> (untested). >> >> It's unfortunate that we're skipping this for -O0 though. Ideally we'd >> still compile the code and just skip the dg-final. Does it work if you do: >> >> /* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ >> /* { dg-skip-if "" { *-*-* } { { "-fno-fat-lto-objects" } } } */ >> >> ? Make sure that we actually still run the check-function-bodies when >> optimisation is enabled. 
:-) > > This works! > Now we are only doing the following for O0: > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O0 (test > for > excess errors) > > whereas for other optimisation levels do all the checks: > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 (test > for > excess errors) > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 > check-function-bodies ufoo > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 > check-function-bodies ufooq > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 > check-function-bodies ufoo_lane > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 > check-function-bodies ufoo_laneq > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 > check-function-bodies ufooq_lane > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 > check-function-bodies ufooq_laneq > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 > check-function-bodies sfoo_lane > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 > check-function-bodies sfoo_laneq > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 > check-function-bodies sfooq_lane > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 > check-function-bodies sfooq_laneq > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 > check-function-bodies ufoo_untied > PASS: gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c -O1 > check-function-bodies ufooq_laneq_untied > >> >> Also, I'm an idiot. The reason I'd used (...|...) in the regexps was >> that "dot product is commutative". But of course that's not true for >> these mixed-sign ops, so the string must be: >> >> usdot v0\.2s, v1\.8b, v2\.8b >> >> The patch copied the (...|...) regexps above to the lane tests, but those >> wouldn't be commutative even if the operands had the same type. > > Ahh, makes sense now. 
Done :) > > Cheers, > Stam > >> >> Thanks, >> Richard >> > > > diff --git a/gcc/config/aarch64/aarch64-builtins.c > b/gcc/config/aarch64/aarch64-builtins.c > index > 1bd2640a1ced352de232fed1cf134b46c69b80f7..702b317d94d2fc6ebe59609727ad853f3f5cc652 > 100644 > --- a/gcc/config/aarch64/aarch64-builtins.c > +++ b/gcc/config/aarch64/aarch64-builtins.c > @@ -107,6 +107,9 @@ enum aarch64_type_qualifiers > /* Lane indices selected in pairs. - must be in range, and flipped for > bigendian. */ > qualifier_lane_pair_index = 0x800, > + /* Lane indices selected in quadtuplets. - must be in range, and flipped > for > + bigendian. */ > + qualifier_lane_quadtup_index = 0x1000, > }; > > typedef struct > @@ -173,6 +176,10 @@ > aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] > = { qualifier_unsigned, qualifier_unsigned, > qualifier_unsigned, qualifier_immediate }; > #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers) > +static enum aarch64_type_qualifiers > +aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS] > + = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none }; > +#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers) > > > static enum aarch64_type_qualifiers > @@ -191,6 +198,19 @@ > aarch64_types_quadopu_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] > qualifier_unsigned, qualifier_lane_index }; > #define TYPES_QUADOPU_LANE (aarch64_types_quadopu_lane_qualifiers) > > +static enum aarch64_type_qualifiers > +aarch64_types_quadopssus_lane_quadtup_qualifiers[SIMD_MAX_BUILTIN_ARGS] > + = { qualifier_none, qualifier_none, qualifier_unsigned, > + qualifier_none, qualifier_lane_quadtup_index }; > +#define TYPES_QUADOPSSUS_LANE_QUADTUP \ > + (aarch64_types_quadopssus_lane_quadtup_qualifiers) > +static enum aarch64_type_qualifiers > +aarch64_types_quadopsssu_lane_quadtup_qualifiers[SIMD_MAX_BUILTIN_ARGS] > + = { qualifier_none, qualifier_none, qualifier_none, > + qualifier_unsigned, qualifier_lane_quadtup_index }; > +#define 
TYPES_QUADOPSSSU_LANE_QUADTUP \ > + (aarch64_types_quadopsssu_lane_quadtup_qualifiers) > + > static enum aarch64_type_qualifiers > aarch64_types_quadopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] > = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, > @@ -1260,6 +1280,7 @@ typedef enum > SIMD_ARG_LANE_INDEX, > SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX, > SIMD_ARG_LANE_PAIR_INDEX, > + SIMD_ARG_LANE_QUADTUP_INDEX, > SIMD_ARG_STOP > } builtin_simd_arg; > > @@ -1349,9 +1370,25 @@ aarch64_simd_expand_args (rtx target, int icode, int > have_retval, > op[opc] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), > SImode); > } > - /* Fall through - if the lane index isn't a constant then > - the next case will error. */ > - /* FALLTHRU */ > + /* If the lane index isn't a constant then error out. */ > + goto constant_arg; > + case SIMD_ARG_LANE_QUADTUP_INDEX: > + /* Must be a previous operand into which this is an index and > + index is restricted to nunits / 4. */ > + gcc_assert (opc > 0); > + if (CONST_INT_P (op[opc])) > + { > + machine_mode vmode = insn_data[icode].operand[opc - 1].mode; > + unsigned int nunits > + = GET_MODE_NUNITS (vmode).to_constant (); > + aarch64_simd_lane_bounds (op[opc], 0, nunits / 4, exp); > + /* Keep to GCC-vector-extension lane indices in the RTL. */ > + int lane = INTVAL (op[opc]); > + op[opc] = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane), > + SImode); > + } > + /* If the lane index isn't a constant then error out. 
*/ > + goto constant_arg; > case SIMD_ARG_CONSTANT: > constant_arg: > if (!(*insn_data[icode].operand[opc].predicate) > @@ -1464,6 +1501,8 @@ aarch64_simd_expand_builtin (int fcode, tree exp, rtx > target) > args[k] = SIMD_ARG_LANE_INDEX; > else if (d->qualifiers[qualifiers_k] & qualifier_lane_pair_index) > args[k] = SIMD_ARG_LANE_PAIR_INDEX; > + else if (d->qualifiers[qualifiers_k] & qualifier_lane_quadtup_index) > + args[k] = SIMD_ARG_LANE_QUADTUP_INDEX; > else if (d->qualifiers[qualifiers_k] & > qualifier_struct_load_store_lane_index) > args[k] = SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX; > else if (d->qualifiers[qualifiers_k] & qualifier_immediate) > diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def > b/gcc/config/aarch64/aarch64-simd-builtins.def > index > 57fc5933b43bfc0da132342c681b8a2c14549c9c..4744dd1f6b2f20327db810277ff65c59ce5abdec > 100644 > --- a/gcc/config/aarch64/aarch64-simd-builtins.def > +++ b/gcc/config/aarch64/aarch64-simd-builtins.def > @@ -212,10 +212,15 @@ > /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>. */ > BUILTIN_VB (TERNOP, sdot, 0) > BUILTIN_VB (TERNOPU, udot, 0) > + BUILTIN_VB (TERNOP_SSUS, usdot, 0) > BUILTIN_VB (QUADOP_LANE, sdot_lane, 0) > BUILTIN_VB (QUADOPU_LANE, udot_lane, 0) > BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0) > BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0) > + BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0) > + BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0) > + BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0) > + BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0) > > /* Implemented by aarch64_fcadd<rot><mode>. 
*/ > BUILTIN_VHSDF (BINOP, fcadd90, 0) > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > 4e28cf97516df19e1d502e56c776f6b34f15c116..2306fd8b42c2215fcd6229f6fbfadb9b7f2d19ae > 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -506,6 +506,20 @@ > [(set_attr "type" "neon_dot<q>")] > ) > > +;; These instructions map to the __builtins for the armv8.6a I8MM usdot > +;; (vector) Dot Product operation. > +(define_insn "aarch64_usdot<vsi2qi>" > + [(set (match_operand:VS 0 "register_operand" "=w") > + (plus:VS > + (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w") > + (match_operand:<VSI2QI> 3 "register_operand" "w")] > + UNSPEC_USDOT) > + (match_operand:VS 1 "register_operand" "0")))] > + "TARGET_I8MM" > + "usdot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>" > + [(set_attr "type" "neon_dot<q>")] > +) > + > ;; These expands map to the Dot Product optab the vectorizer checks for. > ;; The auto-vectorizer expects a dot product builtin that also does an > ;; accumulation into the provided register. > @@ -573,6 +587,26 @@ > [(set_attr "type" "neon_dot<q>")] > ) > > +;; These instructions map to the __builtins for the armv8.6a I8MM usdot, > sudot > +;; (by element) Dot Product operations. 
> +(define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi>" > + [(set (match_operand:VS 0 "register_operand" "=w") > + (plus:VS > + (unspec:VS [(match_operand:<VS:VSI2QI> 2 "register_operand" "w") > + (match_operand:VB 3 "register_operand" "w") > + (match_operand:SI 4 "immediate_operand" "i")] > + DOTPROD_I8MM) > + (match_operand:VS 1 "register_operand" "0")))] > + "TARGET_I8MM" > + { > + int nunits = GET_MODE_NUNITS (<VB:MODE>mode).to_constant (); > + int lane = INTVAL (operands[4]); > + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane), SImode); > + return "<DOTPROD_I8MM:sur>dot\\t%0.<VS:Vtype>, %2.<VS:Vdottype>, > %3.4b[%4]"; > + } > + [(set_attr "type" "neon_dot<VS:q>")] > +) > + > (define_expand "copysign<mode>3" > [(match_operand:VHSDF 0 "register_operand") > (match_operand:VHSDF 1 "register_operand") > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index > c7425346b86b5f5310a7148c465497b53ac75bf5..5d6c65e99330a22cfe598532ecea85c1907431cb > 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ b/gcc/config/aarch64/arm_neon.h > @@ -34606,6 +34606,89 @@ vrnd64xq_f64 (float64x2_t __a) > > #pragma GCC pop_options > > +/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. 
*/ > + > +#pragma GCC push_options > +#pragma GCC target ("arch=armv8.2-a+i8mm") > + > +__extension__ extern __inline int32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) > +{ > + return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b); > +} > + > +__extension__ extern __inline int32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) > +{ > + return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b); > +} > + > +__extension__ extern __inline int32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b, const int > __index) > +{ > + return __builtin_aarch64_usdot_lanev8qi_ssuss (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, int8x16_t __b, > + const int __index) > +{ > + return __builtin_aarch64_usdot_laneqv8qi_ssuss (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdotq_lane_s32 (int32x4_t __r, uint8x16_t __a, int8x8_t __b, > + const int __index) > +{ > + return __builtin_aarch64_usdot_lanev16qi_ssuss (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b, > + const int __index) > +{ > + return __builtin_aarch64_usdot_laneqv16qi_ssuss (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vsudot_lane_s32 (int32x2_t __r, int8x8_t __a, uint8x8_t __b, const int > __index) > +{ > + return 
__builtin_aarch64_sudot_lanev8qi_sssus (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x2_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, uint8x16_t __b, > + const int __index) > +{ > + return __builtin_aarch64_sudot_laneqv8qi_sssus (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, uint8x8_t __b, > + const int __index) > +{ > + return __builtin_aarch64_sudot_lanev16qi_sssus (__r, __a, __b, __index); > +} > + > +__extension__ extern __inline int32x4_t > +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > +vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, uint8x16_t __b, > + const int __index) > +{ > + return __builtin_aarch64_sudot_laneqv16qi_sssus (__r, __a, __b, __index); > +} > + > +#pragma GCC pop_options > + > #undef __aarch64_vget_lane_any > > #undef __aarch64_vdup_lane_any > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index > e5fa31f6748ee81d4323f11544fd8edb19d9af43..9dee626c65ef19f7d1582c05611028b65b2a32a8 > 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -650,6 +650,8 @@ > UNSPEC_UMULHS ; Used in aarch64-sve2.md. > UNSPEC_UMULHRS ; Used in aarch64-sve2.md. > UNSPEC_ASRD ; Used in aarch64-sve.md. > + UNSPEC_USDOT ; Used in aarch64-simd.md. > + UNSPEC_SUDOT ; Used in aarch64-simd.md. > ]) > > ;; ------------------------------------------------------------------ > @@ -1299,6 +1301,8 @@ > > (define_mode_attr f16quad [(V2SF "") (V4SF "q")]) > > +(define_mode_attr isquadop [(V8QI "") (V16QI "q")]) > + > (define_code_attr f16mac [(plus "a") (minus "s")]) > > ;; Map smax to smin and umax to umin. 
> @@ -1859,6 +1863,8 @@ > > (define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT]) > > +(define_int_iterator DOTPROD_I8MM [UNSPEC_USDOT UNSPEC_SUDOT]) > + > (define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN > UNSPEC_SUBHN UNSPEC_RSUBHN]) > > @@ -2298,6 +2304,7 @@ > (UNSPEC_URSHL "ur") (UNSPEC_SRSHL "sr") > (UNSPEC_UQRSHL "u") (UNSPEC_SQRSHL "s") > (UNSPEC_SDOT "s") (UNSPEC_UDOT "u") > + (UNSPEC_USDOT "us") (UNSPEC_SUDOT "su") > ]) > > (define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r") > diff --git > a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c > new file mode 100755 > index > 0000000000000000000000000000000000000000..ac4f821e77143e93ecb23db71b7100b37df770f2 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c > @@ -0,0 +1,136 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ > +/* { dg-add-options arm_v8_2a_i8mm } */ > +/* { dg-additional-options "-save-temps" } */ > +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ > +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ > + > +#include <arm_neon.h> > + > +/* Unsigned-Signed Dot Product instructions. 
*/ > + > +/* > +**ufoo: > +** usdot v0\.2s, v1\.8b, v2\.8b > +** ret > +*/ > +int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y) > +{ > + return vusdot_s32 (r, x, y); > +} > + > +/* > +**ufooq: > +** usdot v0\.4s, v1\.16b, v2\.16b > +** ret > +*/ > +int32x4_t ufooq (int32x4_t r, uint8x16_t x, int8x16_t y) > +{ > + return vusdotq_s32 (r, x, y); > +} > + > +/* > +**ufoo_lane: > +** usdot v0\.2s, v1\.8b, v2\.4b\[0\] > +** ret > +*/ > +int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y) > +{ > + return vusdot_lane_s32 (r, x, y, 0); > +} > + > +/* > +**ufoo_laneq: > +** usdot v0\.2s, v1\.8b, v2\.4b\[2\] > +** ret > +*/ > +int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) > +{ > + return vusdot_laneq_s32 (r, x, y, 2); > +} > + > +/* > +**ufooq_lane: > +** usdot v0\.4s, v1\.16b, v2\.4b\[1\] > +** ret > +*/ > +int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y) > +{ > + return vusdotq_lane_s32 (r, x, y, 1); > +} > + > +/* > +**ufooq_laneq: > +** usdot v0\.4s, v1\.16b, v2\.4b\[3\] > +** ret > +*/ > +int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) > +{ > + return vusdotq_laneq_s32 (r, x, y, 3); > +} > + > + > +/* Signed-Unsigned Dot Product instructions. 
*/ > + > +/* > +**sfoo_lane: > +** sudot v0\.2s, v1\.8b, v2\.4b\[0\] > +** ret > +*/ > +int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y) > +{ > + return vsudot_lane_s32 (r, x, y, 0); > +} > + > +/* > +**sfoo_laneq: > +** sudot v0\.2s, v1\.8b, v2\.4b\[2\] > +** ret > +*/ > +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) > +{ > + return vsudot_laneq_s32 (r, x, y, 2); > +} > + > +/* > +**sfooq_lane: > +** sudot v0\.4s, v1\.16b, v2\.4b\[1\] > +** ret > +*/ > +int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y) > +{ > + return vsudotq_lane_s32 (r, x, y, 1); > +} > + > +/* > +**sfooq_laneq: > +** sudot v0\.4s, v1\.16b, v2\.4b\[3\] > +** ret > +*/ > +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) > +{ > + return vsudotq_laneq_s32 (r, x, y, 3); > +} > + > +/* > +**ufoo_untied: > +** mov v0\.8b, v1\.8b > +** usdot v0\.2s, v2\.8b, v3\.8b > +** ret > +*/ > +int32x2_t ufoo_untied (int32x2_t unused, int32x2_t r, uint8x8_t x, int8x8_t > y) > +{ > + return vusdot_s32 (r, x, y); > +} > + > +/* > +**ufooq_laneq_untied: > +** mov v0\.16b, v1\.16b > +** usdot v0\.4s, v2\.16b, v3\.4b\[3\] > +** ret > +*/ > +int32x4_t ufooq_laneq_untied (int32x2_t unused, int32x4_t r, uint8x16_t x, > int8x16_t y) > +{ > + return vusdotq_laneq_s32 (r, x, y, 3); > +} > + > diff --git > a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c > new file mode 100755 > index > 0000000000000000000000000000000000000000..96bca2356e4d5d93378d2c8de3778fe28751117e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c > @@ -0,0 +1,137 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ > +/* { dg-add-options arm_v8_2a_i8mm } */ > +/* { dg-additional-options "-mbig-endian -save-temps" } */ > +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ > +/* { dg-skip-if 
"" { *-*-* } { "-fno-fat-lto-objects" } } */ > + > +#include <arm_neon.h> > + > +/* Unsigned-Signed Dot Product instructions. */ > + > +/* > +**ufoo: > +** usdot v0\.2s, v1\.8b, v2\.8b > +** ret > +*/ > +int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y) > +{ > + return vusdot_s32 (r, x, y); > +} > + > +/* > +**ufooq: > +** usdot v0\.4s, v1\.16b, v2\.16b > +** ret > +*/ > +int32x4_t ufooq (int32x4_t r, uint8x16_t x, int8x16_t y) > +{ > + return vusdotq_s32 (r, x, y); > +} > + > +/* > +**ufoo_lane: > +** usdot v0\.2s, v1\.8b, v2\.4b\[0\] > +** ret > +*/ > +int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y) > +{ > + return vusdot_lane_s32 (r, x, y, 0); > +} > + > +/* > +**ufoo_laneq: > +** usdot v0\.2s, v1\.8b, v2\.4b\[2\] > +** ret > +*/ > +int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) > +{ > + return vusdot_laneq_s32 (r, x, y, 2); > +} > + > +/* > +**ufooq_lane: > +** usdot v0\.4s, v1\.16b, v2\.4b\[1\] > +** ret > +*/ > +int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y) > +{ > + return vusdotq_lane_s32 (r, x, y, 1); > +} > + > +/* > +**ufooq_laneq: > +** usdot v0\.4s, v1\.16b, v2\.4b\[3\] > +** ret > +*/ > +int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) > +{ > + return vusdotq_laneq_s32 (r, x, y, 3); > +} > + > + > +/* Signed-Unsigned Dot Product instructions. 
*/ > + > +/* > +**sfoo_lane: > +** sudot v0\.2s, v1\.8b, v2\.4b\[0\] > +** ret > +*/ > +int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y) > +{ > + return vsudot_lane_s32 (r, x, y, 0); > +} > + > +/* > +**sfoo_laneq: > +** sudot v0\.2s, v1\.8b, v2\.4b\[2\] > +** ret > +*/ > +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) > +{ > + return vsudot_laneq_s32 (r, x, y, 2); > +} > + > +/* > +**sfooq_lane: > +** sudot v0\.4s, v1\.16b, v2\.4b\[1\] > +** ret > +*/ > +int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y) > +{ > + return vsudotq_lane_s32 (r, x, y, 1); > +} > + > +/* > +**sfooq_laneq: > +** sudot v0\.4s, v1\.16b, v2\.4b\[3\] > +** ret > +*/ > +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) > +{ > + return vsudotq_laneq_s32 (r, x, y, 3); > +} > + > +/* > +**ufoo_untied: > +** mov v0\.8b, v1\.8b > +** usdot v0\.2s, v2\.8b, v3\.8b > +** ret > +*/ > +int32x2_t ufoo_untied (int32x2_t unused, int32x2_t r, uint8x8_t x, int8x8_t > y) > +{ > + return vusdot_s32 (r, x, y); > +} > + > +/* > +**ufooq_laneq_untied: > +** mov v0\.16b, v1\.16b > +** usdot v0\.4s, v2\.16b, v3\.4b\[3\] > +** ret > +*/ > +int32x4_t ufooq_laneq_untied (int32x2_t unused, int32x4_t r, uint8x16_t x, > int8x16_t y) > +{ > + return vusdotq_laneq_s32 (r, x, y, 3); > +} > + > + > diff --git > a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c > new file mode 100755 > index > 0000000000000000000000000000000000000000..18ecabef8dc6b99872d71c8e412b6f4b4809e901 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c > @@ -0,0 +1,31 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ > +/* { dg-add-options arm_v8_2a_i8mm } */ > +/* { dg-additional-options "--save-temps" } */ > +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ > + > +#include <arm_neon.h> 
> + > +int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y) > +{ > + /* { dg-error "lane -1 out of range 0 - 1" "" { target *-*-* } 0 } */ > + return vusdot_lane_s32 (r, x, y, -1); > +} > + > +int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) > +{ > + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ > + return vusdot_laneq_s32 (r, x, y, -1); > +} > + > +int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y) > +{ > + /* { dg-error "lane 2 out of range 0 - 1" "" { target *-*-* } 0 } */ > + return vusdotq_lane_s32 (r, x, y, 2); > +} > + > +int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) > +{ > + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ > + return vusdotq_laneq_s32 (r, x, y, 4); > +} > diff --git > a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..66c87d48694bad9624b491aec4cd1a38b75fbb95 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c > @@ -0,0 +1,31 @@ > +/* { dg-do assemble { target { aarch64*-*-* } } } */ > +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ > +/* { dg-add-options arm_v8_2a_i8mm } */ > +/* { dg-additional-options "--save-temps" } */ > +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ > + > +#include <arm_neon.h> > + > +int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y) > +{ > + /* { dg-error "lane -1 out of range 0 - 1" "" { target *-*-* } 0 } */ > + return vsudot_lane_s32 (r, x, y, -1); > +} > + > +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) > +{ > + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ > + return vsudot_laneq_s32 (r, x, y, -1); > +} > + > +int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y) > +{ > + /* { dg-error "lane 2 out of range 0 - 1" "" { target *-*-* } 0 } 
*/ > + return vsudotq_lane_s32 (r, x, y, 2); > +} > + > +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) > +{ > + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ > + return vsudotq_laneq_s32 (r, x, y, 4); > +}