This patch fixes up the vget_lane RTL patterns so that they better exploit the behaviour of their target instructions, and adds variants which keep the result in the SIMD register file.
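
For example (a sketch, not part of the patch - the instruction choice noted in the comments is my assumption based on the new constraint alternatives, not verified compiler output):

  #include <arm_neon.h>

  /* Lane value consumed in a general-purpose register: the "=r"
     alternative (UMOV) is the natural choice here.  */
  uint32_t
  lane_to_gpr (uint32x4_t v)
  {
    return vgetq_lane_u32 (v, 2);
  }

  /* Lane value consumed by further SIMD arithmetic: with the "=w"
     alternative (DUP) the value can stay in the SIMD register file.  */
  float32x2_t
  lane_stays_in_simd (float32x2_t a, float32x4_t b)
  {
    return vadd_f32 (a, vdup_n_f32 (vgetq_lane_f32 (b, 3)));
  }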
We patch up aarch64_get_lane_<un>signed. These pattern names are somewhat misleading, and the patterns are not being used to their full capacity. They currently zero/sign_extend from something of size <VEL> to something of size <VEL>, which is always going to be a no-op and should never be introduced by the combiner. More useful would be to have these patterns perform the cast they actually perform - that is to say, <VEL> to SI or DI as appropriate. So, these get renamed to aarch64_get_lane_<zero_>extend and modified to return the widened type. Sadly, this means they cannot be used purely in the SIMD register set, as there is no widen-to-32/64-bit instruction operating on that register file.

That leaves the case we had before. If we eliminate the no-op extension, we have the same pattern as aarch64_dup_lane_scalar, so eliminate that pattern - the surviving one makes more sense under the name aarch64_get_lane.

And then we fix up arm_neon.h...

As these are lane intrinsics, we should be a little careful. We are likely to use the vget_lane intrinsics in composition with other intrinsics or operations, but we must guarantee that the lane parameter is a compile-time constant. We define some internal wrapper macros in arm_neon.h, which should be used in preference to calling the raw compiler builtin.

All of this effort is required to ensure that when we use a vget_lane intrinsic, the RTL generated is simply a vec_select. This allows us to begin building other lane intrinsics as composites with __aarch64_vget_lane (see the sketch after the ChangeLog below).

We must fix the ever-troublesome scalar_intrinsics.c testcase to teach it the new names for get_lane; otherwise, tested on aarch64-none-elf with no regressions.

OK?

Thanks,
James

---

gcc/

2013-08-05  James Greenhalgh  <james.greenha...@arm.com>

	* config/aarch64/aarch64-simd-builtins.def (get_lane_signed): Remove.
	(get_lane_unsigned): Likewise.
	(dup_lane_scalar): Likewise.
	(get_lane): Enable for VALL.
	* config/aarch64/aarch64-simd.md (aarch64_dup_lane_scalar<mode>): Remove.
	(aarch64_get_lane_signed<mode>): Likewise.
	(aarch64_get_lane_unsigned<mode>): Likewise.
	(aarch64_get_lane_extend<GPI:mode><VDQQH:mode>): New.
	(aarch64_get_lane_zero_extendsi<mode>): Likewise.
	(aarch64_get_lane<mode>): Enable for all vector modes.
	(aarch64_get_lanedi): Remove misleading constraints.
	* config/aarch64/arm_neon.h (__aarch64_vget_lane_any): Define.
	(__aarch64_vget<q>_lane_<fpsu><8,16,32,64>): Likewise.
	(vget<q>_lane_<fpsu><8,16,32,64>): Use __aarch64_vget_lane macros.
	(vdup<bhsd>_lane_<su><8,16,32,64>): Likewise.
	* config/aarch64/iterators.md (VDQQH): New.
	(VDQQHS): Likewise.
	(vwcore): Likewise.

gcc/testsuite/

2013-08-05  James Greenhalgh  <james.greenha...@arm.com>

	* gcc.target/aarch64/scalar_intrinsics.c: Update expected output
	of vdup intrinsics.
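
To illustrate the composition point, a sketch (my_sum_two_lanes_s16 and use_it are hypothetical names, not part of the patch or of arm_neon.h): user code composes through the public vget_lane wrappers, since the internal __aarch64_* macros are #undef'd at the end of the header, and the lane index must still reduce to a compile-time constant after inlining.

  #include <arm_neon.h>

  /* Hypothetical composite built on vget_lane_s16, which now expands
     to a single vec_select.  With always_inline, the constant lane
     indices propagate down to the builtin, which requires an
     immediate operand.  */
  __extension__ static __inline int16_t
  __attribute__ ((__always_inline__))
  my_sum_two_lanes_s16 (int16x4_t __a, const int __i, const int __j)
  {
    return (int16_t) (vget_lane_s16 (__a, __i) + vget_lane_s16 (__a, __j));
  }

  /* Usage: the lane indices must be literal constants.  */
  int16_t
  use_it (int16x4_t a)
  {
    return my_sum_two_lanes_s16 (a, 0, 3);
  }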
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 55dead6..4046d7a 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -40,10 +40,6 @@ 10 - CODE_FOR_<name><mode>. */ BUILTIN_VD_RE (CREATE, create, 0) - BUILTIN_VQ_S (GETLANE, get_lane_signed, 0) - BUILTIN_VDQ (GETLANE, get_lane_unsigned, 0) - BUILTIN_VDQF (GETLANE, get_lane, 0) - VAR1 (GETLANE, get_lane, 0, di) BUILTIN_VDC (COMBINE, combine, 0) BUILTIN_VB (BINOP, pmul, 0) BUILTIN_VDQF (UNOP, sqrt, 2) @@ -51,6 +47,9 @@ VAR1 (UNOP, addp, 0, di) VAR1 (UNOP, clz, 2, v4si) + BUILTIN_VALL (GETLANE, get_lane, 0) + VAR1 (GETLANE, get_lane, 0, di) + BUILTIN_VD_RE (REINTERP, reinterpretdi, 0) BUILTIN_VDC (REINTERP, reinterpretv8qi, 0) BUILTIN_VDC (REINTERP, reinterpretv4hi, 0) @@ -64,7 +63,6 @@ BUILTIN_VQ (REINTERP, reinterpretv2df, 0) BUILTIN_VDQ_I (BINOP, dup_lane, 0) - BUILTIN_VDQ_I (BINOP, dup_lane_scalar, 0) /* Implemented by aarch64_<sur>q<r>shl<mode>. */ BUILTIN_VSDQ_I (BINOP, sqshl, 0) BUILTIN_VSDQ_I (BINOP, uqshl, 0) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 3c76032..9823730 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -357,20 +357,6 @@ (set_attr "simd_mode" "<MODE>")] ) -(define_insn "aarch64_dup_lane_scalar<mode>" - [(set (match_operand:<VEL> 0 "register_operand" "=w, r") - (vec_select:<VEL> - (match_operand:VDQ 1 "register_operand" "w, w") - (parallel [(match_operand:SI 2 "immediate_operand" "i, i")]) - ))] - "TARGET_SIMD" - "@ - dup\\t%<Vetype>0, %1.<Vetype>[%2] - umov\\t%<vw>0, %1.<Vetype>[%2]" - [(set_attr "simd_type" "simd_dup, simd_movgp") - (set_attr "simd_mode" "<MODE>")] -) - (define_insn "aarch64_simd_dup<mode>" [(set (match_operand:VDQF 0 "register_operand" "=w") (vec_duplicate:VDQF (match_operand:<VEL> 1 "register_operand" "w")))] @@ -2147,45 +2133,50 @@ DONE; }) -(define_insn "aarch64_get_lane_signed<mode>" - [(set (match_operand:<VEL> 0 "register_operand" "=r") - (sign_extend:<VEL> +;; Lane extraction with sign extension to general purpose register. +(define_insn "*aarch64_get_lane_extend<GPI:mode><VDQQH:mode>" + [(set (match_operand:GPI 0 "register_operand" "=r") + (sign_extend:GPI (vec_select:<VEL> - (match_operand:VQ_S 1 "register_operand" "w") + (match_operand:VDQQH 1 "register_operand" "w") (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] "TARGET_SIMD" - "smov\\t%0, %1.<Vetype>[%2]" + "smov\\t%<GPI:w>0, %1.<VDQQH:Vetype>[%2]" [(set_attr "simd_type" "simd_movgp") - (set_attr "simd_mode" "<MODE>")] + (set_attr "simd_mode" "<VDQQH:MODE>")] ) -(define_insn "aarch64_get_lane_unsigned<mode>" - [(set (match_operand:<VEL> 0 "register_operand" "=r") - (zero_extend:<VEL> +(define_insn "*aarch64_get_lane_zero_extendsi<mode>" + [(set (match_operand:SI 0 "register_operand" "=r") + (zero_extend:SI (vec_select:<VEL> - (match_operand:VDQ 1 "register_operand" "w") + (match_operand:VDQQH 1 "register_operand" "w") (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] "TARGET_SIMD" - "umov\\t%<vw>0, %1.<Vetype>[%2]" + "umov\\t%w0, %1.<Vetype>[%2]" [(set_attr "simd_type" "simd_movgp") (set_attr "simd_mode" "<MODE>")] ) +;; Lane extraction of a value, neither sign nor zero extension +;; is guaranteed so upper bits should be considered undefined. 
(define_insn "aarch64_get_lane<mode>" - [(set (match_operand:<VEL> 0 "register_operand" "=w") + [(set (match_operand:<VEL> 0 "register_operand" "=r, w") (vec_select:<VEL> - (match_operand:VDQF 1 "register_operand" "w") - (parallel [(match_operand:SI 2 "immediate_operand" "i")])))] + (match_operand:VALL 1 "register_operand" "w, w") + (parallel [(match_operand:SI 2 "immediate_operand" "i, i")])))] "TARGET_SIMD" - "mov\\t%0.<Vetype>[0], %1.<Vetype>[%2]" - [(set_attr "simd_type" "simd_ins") + "@ + umov\\t%<vwcore>0, %1.<Vetype>[%2] + dup\\t%<Vetype>0, %1.<Vetype>[%2]" + [(set_attr "simd_type" "simd_movgp, simd_dup") (set_attr "simd_mode" "<MODE>")] ) (define_expand "aarch64_get_lanedi" - [(match_operand:DI 0 "register_operand" "=r") - (match_operand:DI 1 "register_operand" "w") - (match_operand:SI 2 "immediate_operand" "i")] + [(match_operand:DI 0 "register_operand") + (match_operand:DI 1 "register_operand") + (match_operand:SI 2 "immediate_operand")] "TARGET_SIMD" { aarch64_simd_lane_bounds (operands[2], 0, 1); diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 99cf123..e4e2110 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -446,7 +446,66 @@ typedef struct poly16x8x4_t poly16x8_t val[4]; } poly16x8x4_t; - +/* vget_lane internal macros. */ + +#define __aarch64_vget_lane_any(__size, __cast_ret, __cast_a, __a, __b) \ + (__cast_ret \ + __builtin_aarch64_get_lane##__size (__cast_a __a, __b)) + +#define __aarch64_vget_lane_f32(__a, __b) \ + __aarch64_vget_lane_any (v2sf, , , __a, __b) +#define __aarch64_vget_lane_f64(__a, __b) (__a) + +#define __aarch64_vget_lane_p8(__a, __b) \ + __aarch64_vget_lane_any (v8qi, (poly8_t), (int8x8_t), __a, __b) +#define __aarch64_vget_lane_p16(__a, __b) \ + __aarch64_vget_lane_any (v4hi, (poly16_t), (int16x4_t), __a, __b) + +#define __aarch64_vget_lane_s8(__a, __b) \ + __aarch64_vget_lane_any (v8qi, , ,__a, __b) +#define __aarch64_vget_lane_s16(__a, __b) \ + __aarch64_vget_lane_any (v4hi, , ,__a, __b) +#define __aarch64_vget_lane_s32(__a, __b) \ + __aarch64_vget_lane_any (v2si, , ,__a, __b) +#define __aarch64_vget_lane_s64(__a, __b) (__a) + +#define __aarch64_vget_lane_u8(__a, __b) \ + __aarch64_vget_lane_any (v8qi, (uint8_t), (int8x8_t), __a, __b) +#define __aarch64_vget_lane_u16(__a, __b) \ + __aarch64_vget_lane_any (v4hi, (uint16_t), (int16x4_t), __a, __b) +#define __aarch64_vget_lane_u32(__a, __b) \ + __aarch64_vget_lane_any (v2si, (uint32_t), (int32x2_t), __a, __b) +#define __aarch64_vget_lane_u64(__a, __b) (__a) + +#define __aarch64_vgetq_lane_f32(__a, __b) \ + __aarch64_vget_lane_any (v4sf, , , __a, __b) +#define __aarch64_vgetq_lane_f64(__a, __b) \ + __aarch64_vget_lane_any (v2df, , , __a, __b) + +#define __aarch64_vgetq_lane_p8(__a, __b) \ + __aarch64_vget_lane_any (v16qi, (poly8_t), (int8x16_t), __a, __b) +#define __aarch64_vgetq_lane_p16(__a, __b) \ + __aarch64_vget_lane_any (v8hi, (poly16_t), (int16x8_t), __a, __b) + +#define __aarch64_vgetq_lane_s8(__a, __b) \ + __aarch64_vget_lane_any (v16qi, , ,__a, __b) +#define __aarch64_vgetq_lane_s16(__a, __b) \ + __aarch64_vget_lane_any (v8hi, , ,__a, __b) +#define __aarch64_vgetq_lane_s32(__a, __b) \ + __aarch64_vget_lane_any (v4si, , ,__a, __b) +#define __aarch64_vgetq_lane_s64(__a, __b) \ + __aarch64_vget_lane_any (v2di, , ,__a, __b) + +#define __aarch64_vgetq_lane_u8(__a, __b) \ + __aarch64_vget_lane_any (v16qi, (uint8_t), (int8x16_t), __a, __b) +#define __aarch64_vgetq_lane_u16(__a, __b) \ + __aarch64_vget_lane_any (v8hi, (uint16_t), 
(int16x8_t), __a, __b) +#define __aarch64_vgetq_lane_u32(__a, __b) \ + __aarch64_vget_lane_any (v4si, (uint32_t), (int32x4_t), __a, __b) +#define __aarch64_vgetq_lane_u64(__a, __b) \ + __aarch64_vget_lane_any (v2di, (uint64_t), (int64x2_t), __a, __b) + +/* vadd */ __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) vadd_s8 (int8x8_t __a, int8x8_t __b) { @@ -2307,155 +2366,156 @@ vcreate_p16 (uint64_t __a) return (poly16x4_t) __a; } +/* vget_lane */ + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vget_lane_f32 (float32x2_t __a, const int __b) +{ + return __aarch64_vget_lane_f32 (__a, __b); +} + +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vget_lane_f64 (float64x1_t __a, const int __b) +{ + return __aarch64_vget_lane_f64 (__a, __b); +} + +__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) +vget_lane_p8 (poly8x8_t __a, const int __b) +{ + return __aarch64_vget_lane_p8 (__a, __b); +} + +__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) +vget_lane_p16 (poly16x4_t __a, const int __b) +{ + return __aarch64_vget_lane_p16 (__a, __b); +} + __extension__ static __inline int8_t __attribute__ ((__always_inline__)) vget_lane_s8 (int8x8_t __a, const int __b) { - return (int8_t) __builtin_aarch64_get_lane_signedv8qi (__a, __b); + return __aarch64_vget_lane_s8 (__a, __b); } __extension__ static __inline int16_t __attribute__ ((__always_inline__)) vget_lane_s16 (int16x4_t __a, const int __b) { - return (int16_t) __builtin_aarch64_get_lane_signedv4hi (__a, __b); + return __aarch64_vget_lane_s16 (__a, __b); } __extension__ static __inline int32_t __attribute__ ((__always_inline__)) vget_lane_s32 (int32x2_t __a, const int __b) { - return (int32_t) __builtin_aarch64_get_lane_signedv2si (__a, __b); + return __aarch64_vget_lane_s32 (__a, __b); } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vget_lane_f32 (float32x2_t __a, const int __b) +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vget_lane_s64 (int64x1_t __a, const int __b) { - return (float32_t) __builtin_aarch64_get_lanev2sf (__a, __b); + return __aarch64_vget_lane_s64 (__a, __b); } __extension__ static __inline uint8_t __attribute__ ((__always_inline__)) vget_lane_u8 (uint8x8_t __a, const int __b) { - return (uint8_t) __builtin_aarch64_get_lane_unsignedv8qi ((int8x8_t) __a, - __b); + return __aarch64_vget_lane_u8 (__a, __b); } __extension__ static __inline uint16_t __attribute__ ((__always_inline__)) vget_lane_u16 (uint16x4_t __a, const int __b) { - return (uint16_t) __builtin_aarch64_get_lane_unsignedv4hi ((int16x4_t) __a, - __b); + return __aarch64_vget_lane_u16 (__a, __b); } __extension__ static __inline uint32_t __attribute__ ((__always_inline__)) vget_lane_u32 (uint32x2_t __a, const int __b) { - return (uint32_t) __builtin_aarch64_get_lane_unsignedv2si ((int32x2_t) __a, - __b); + return __aarch64_vget_lane_u32 (__a, __b); } -__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) -vget_lane_p8 (poly8x8_t __a, const int __b) +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vget_lane_u64 (uint64x1_t __a, const int __b) { - return (poly8_t) __builtin_aarch64_get_lane_unsignedv8qi ((int8x8_t) __a, - __b); + return __aarch64_vget_lane_u64 (__a, __b); } -__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) -vget_lane_p16 (poly16x4_t __a, const int __b) +/* vgetq_lane */ + +__extension__ static 
__inline float32_t __attribute__ ((__always_inline__)) +vgetq_lane_f32 (float32x4_t __a, const int __b) { - return (poly16_t) __builtin_aarch64_get_lane_unsignedv4hi ((int16x4_t) __a, - __b); + return __aarch64_vgetq_lane_f32 (__a, __b); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vget_lane_s64 (int64x1_t __a, const int __b) +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vgetq_lane_f64 (float64x2_t __a, const int __b) { - return (int64_t) __builtin_aarch64_get_lanedi (__a, __b); + return __aarch64_vgetq_lane_f64 (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vget_lane_u64 (uint64x1_t __a, const int __b) +__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) +vgetq_lane_p8 (poly8x16_t __a, const int __b) +{ + return __aarch64_vgetq_lane_p8 (__a, __b); +} + +__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) +vgetq_lane_p16 (poly16x8_t __a, const int __b) { - return (uint64_t) __builtin_aarch64_get_lanedi ((int64x1_t) __a, __b); + return __aarch64_vgetq_lane_p16 (__a, __b); } __extension__ static __inline int8_t __attribute__ ((__always_inline__)) vgetq_lane_s8 (int8x16_t __a, const int __b) { - return (int8_t) __builtin_aarch64_get_lane_signedv16qi (__a, __b); + return __aarch64_vgetq_lane_s8 (__a, __b); } __extension__ static __inline int16_t __attribute__ ((__always_inline__)) vgetq_lane_s16 (int16x8_t __a, const int __b) { - return (int16_t) __builtin_aarch64_get_lane_signedv8hi (__a, __b); + return __aarch64_vgetq_lane_s16 (__a, __b); } __extension__ static __inline int32_t __attribute__ ((__always_inline__)) vgetq_lane_s32 (int32x4_t __a, const int __b) { - return (int32_t) __builtin_aarch64_get_lane_signedv4si (__a, __b); -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vgetq_lane_f32 (float32x4_t __a, const int __b) -{ - return (float32_t) __builtin_aarch64_get_lanev4sf (__a, __b); + return __aarch64_vgetq_lane_s32 (__a, __b); } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vgetq_lane_f64 (float64x2_t __a, const int __b) +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vgetq_lane_s64 (int64x2_t __a, const int __b) { - return (float64_t) __builtin_aarch64_get_lanev2df (__a, __b); + return __aarch64_vgetq_lane_s64 (__a, __b); } __extension__ static __inline uint8_t __attribute__ ((__always_inline__)) vgetq_lane_u8 (uint8x16_t __a, const int __b) { - return (uint8_t) __builtin_aarch64_get_lane_unsignedv16qi ((int8x16_t) __a, - __b); + return __aarch64_vgetq_lane_u8 (__a, __b); } __extension__ static __inline uint16_t __attribute__ ((__always_inline__)) vgetq_lane_u16 (uint16x8_t __a, const int __b) { - return (uint16_t) __builtin_aarch64_get_lane_unsignedv8hi ((int16x8_t) __a, - __b); + return __aarch64_vgetq_lane_u16 (__a, __b); } __extension__ static __inline uint32_t __attribute__ ((__always_inline__)) vgetq_lane_u32 (uint32x4_t __a, const int __b) { - return (uint32_t) __builtin_aarch64_get_lane_unsignedv4si ((int32x4_t) __a, - __b); -} - -__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) -vgetq_lane_p8 (poly8x16_t __a, const int __b) -{ - return (poly8_t) __builtin_aarch64_get_lane_unsignedv16qi ((int8x16_t) __a, - __b); -} - -__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) -vgetq_lane_p16 (poly16x8_t __a, const int __b) -{ - return (poly16_t) __builtin_aarch64_get_lane_unsignedv8hi 
((int16x8_t) __a, - __b); -} - -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vgetq_lane_s64 (int64x2_t __a, const int __b) -{ - return __builtin_aarch64_get_lane_unsignedv2di (__a, __b); + return __aarch64_vgetq_lane_u32 (__a, __b); } __extension__ static __inline uint64_t __attribute__ ((__always_inline__)) vgetq_lane_u64 (uint64x2_t __a, const int __b) { - return (uint64_t) __builtin_aarch64_get_lane_unsignedv2di ((int64x2_t) __a, - __b); + return __aarch64_vgetq_lane_u64 (__a, __b); } +/* vreinterpret */ + __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) vreinterpret_p8_s8 (int8x8_t __a) { @@ -6724,18 +6784,6 @@ vget_high_u64 (uint64x2_t a) return result; } -#define vget_lane_f64(a, b) \ - __extension__ \ - ({ \ - float64x1_t a_ = (a); \ - float64_t result; \ - __asm__ ("umov %x0, %1.d[%2]" \ - : "=r"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vget_low_f32 (float32x4_t a) { @@ -19732,49 +19780,49 @@ vcvtpq_u64_f64 (float64x2_t __a) __extension__ static __inline int8x1_t __attribute__ ((__always_inline__)) vdupb_lane_s8 (int8x16_t a, int const b) { - return __builtin_aarch64_dup_lane_scalarv16qi (a, b); + return __aarch64_vgetq_lane_s8 (a, b); } __extension__ static __inline uint8x1_t __attribute__ ((__always_inline__)) vdupb_lane_u8 (uint8x16_t a, int const b) { - return (uint8x1_t) __builtin_aarch64_dup_lane_scalarv16qi ((int8x16_t) a, b); + return __aarch64_vgetq_lane_u8 (a, b); } __extension__ static __inline int16x1_t __attribute__ ((__always_inline__)) vduph_lane_s16 (int16x8_t a, int const b) { - return __builtin_aarch64_dup_lane_scalarv8hi (a, b); + return __aarch64_vgetq_lane_s16 (a, b); } __extension__ static __inline uint16x1_t __attribute__ ((__always_inline__)) vduph_lane_u16 (uint16x8_t a, int const b) { - return (uint16x1_t) __builtin_aarch64_dup_lane_scalarv8hi ((int16x8_t) a, b); + return __aarch64_vgetq_lane_u16 (a, b); } __extension__ static __inline int32x1_t __attribute__ ((__always_inline__)) vdups_lane_s32 (int32x4_t a, int const b) { - return __builtin_aarch64_dup_lane_scalarv4si (a, b); + return __aarch64_vgetq_lane_s32 (a, b); } __extension__ static __inline uint32x1_t __attribute__ ((__always_inline__)) vdups_lane_u32 (uint32x4_t a, int const b) { - return (uint32x1_t) __builtin_aarch64_dup_lane_scalarv4si ((int32x4_t) a, b); + return __aarch64_vgetq_lane_u32 (a, b); } __extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) vdupd_lane_s64 (int64x2_t a, int const b) { - return __builtin_aarch64_dup_lane_scalarv2di (a, b); + return __aarch64_vgetq_lane_s64 (a, b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vdupd_lane_u64 (uint64x2_t a, int const b) { - return (uint64x1_t) __builtin_aarch64_dup_lane_scalarv2di ((int64x2_t) a, b); + return __aarch64_vgetq_lane_u64 (a, b); } /* vld1 */ @@ -25581,4 +25629,31 @@ __INTERLEAVE_LIST (zip) /* End of optimal implementations in approved order. 
*/ +#undef __aarch64_vget_lane_any +#undef __aarch64_vget_lane_f32 +#undef __aarch64_vget_lane_f64 +#undef __aarch64_vget_lane_p8 +#undef __aarch64_vget_lane_p16 +#undef __aarch64_vget_lane_s8 +#undef __aarch64_vget_lane_s16 +#undef __aarch64_vget_lane_s32 +#undef __aarch64_vget_lane_s64 +#undef __aarch64_vget_lane_u8 +#undef __aarch64_vget_lane_u16 +#undef __aarch64_vget_lane_u32 +#undef __aarch64_vget_lane_u64 + +#undef __aarch64_vgetq_lane_f32 +#undef __aarch64_vgetq_lane_f64 +#undef __aarch64_vgetq_lane_p8 +#undef __aarch64_vgetq_lane_p16 +#undef __aarch64_vgetq_lane_s8 +#undef __aarch64_vgetq_lane_s16 +#undef __aarch64_vgetq_lane_s32 +#undef __aarch64_vgetq_lane_s64 +#undef __aarch64_vgetq_lane_u8 +#undef __aarch64_vgetq_lane_u16 +#undef __aarch64_vgetq_lane_u32 +#undef __aarch64_vgetq_lane_u64 + #endif diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 3ec889f..37b6cbc 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -134,9 +134,15 @@ ;; Vector modes except double int. (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF]) +;; Vector modes for Q and H types. +(define_mode_iterator VDQQH [V8QI V16QI V4HI V8HI]) + ;; Vector modes for H and S types. (define_mode_iterator VDQHS [V4HI V8HI V2SI V4SI]) +;; Vector modes for Q, H and S types. +(define_mode_iterator VDQQHS [V8QI V16QI V4HI V8HI V2SI V4SI]) + ;; Vector and scalar integer modes for H and S (define_mode_iterator VSDQ_HSI [V4HI V8HI V2SI V4SI HI SI]) @@ -453,6 +459,15 @@ (V2SF "s") (V4SF "s") (V2DF "d")]) +;; Corresponding core element mode for each vector mode. This is a +;; variation on <vw> mapping FP modes to GP regs. +(define_mode_attr vwcore [(V8QI "w") (V16QI "w") + (V4HI "w") (V8HI "w") + (V2SI "w") (V4SI "w") + (DI "x") (V2DI "x") + (V2SF "w") (V4SF "w") + (V2DF "x")]) + ;; Double vector types for ALLX. (define_mode_attr Vallxd [(QI "8b") (HI "4h") (SI "2s")]) diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c index 3d902f6..d84bfeb 100644 --- a/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c +++ b/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c @@ -193,7 +193,7 @@ test_vcltzd_s64 (int64x1_t a) return res; } -/* { dg-final { scan-assembler-times "aarch64_dup_lane_scalarv16qi" 2 } } */ +/* { dg-final { scan-assembler-times "aarch64_get_lanev16qi" 2 } } */ int8x1_t test_vdupb_lane_s8 (int8x16_t a) @@ -207,7 +207,7 @@ test_vdupb_lane_u8 (uint8x16_t a) return vdupb_lane_u8 (a, 2); } -/* { dg-final { scan-assembler-times "aarch64_dup_lane_scalarv8hi" 2 } } */ +/* { dg-final { scan-assembler-times "aarch64_get_lanev8hi" 2 } } */ int16x1_t test_vduph_lane_s16 (int16x8_t a) @@ -221,7 +221,7 @@ test_vduph_lane_u16 (uint16x8_t a) return vduph_lane_u16 (a, 2); } -/* { dg-final { scan-assembler-times "aarch64_dup_lane_scalarv4si" 2 } } */ +/* { dg-final { scan-assembler-times "aarch64_get_lanev4si" 2 } } */ int32x1_t test_vdups_lane_s32 (int32x4_t a) @@ -235,7 +235,7 @@ test_vdups_lane_u32 (uint32x4_t a) return vdups_lane_u32 (a, 2); } -/* { dg-final { scan-assembler-times "aarch64_dup_lane_scalarv2di" 2 } } */ +/* { dg-final { scan-assembler-times "aarch64_get_lanev2di" 2 } } */ int64x1_t test_vdupd_lane_s64 (int64x2_t a)