Hi,

This is a ping for: https://gcc.gnu.org/ml/gcc-patches/2015-03/msg00772.html

Regtested with aarch64-linux-gnu on QEMU. The patch also shows no regressions for the aarch64_be-linux-gnu big-endian target. OK for the trunk?
Thanks.
Jiang jiji

----------
Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics

Hi Kyrill,

Thank you for your suggestion. I have fixed it and regtested with aarch64-linux-gnu on QEMU. The patch also shows no regressions for the aarch64_be-linux-gnu big-endian target. OK for the trunk?

Thanks.
Jiang jiji

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 221393)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,38 @@
+2015-03-14  Felix Yang  <felix.y...@huawei.com>
+	    Jiji Jiang  <jiangj...@huawei.com>
+
+	* config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>,
+	aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>,
+	aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>,
+	aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal,
+	aarch64_<su>mull_laneq<mode>, aarch64_<su>mull2_laneq<mode>_internal,
+	aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>,
+	aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>,
+	aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>,
+	aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
+	* config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
+	vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
+	umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
+	umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
+	smull2_lane): New builtins.
+	* config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
+	vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
+	vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
+	vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
+	vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
+	vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
+	vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
+	vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
+	vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
+	vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
+	vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
+	vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
+	vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
+	vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
+	using builtin functions.
+	* config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
+	VDQF_Q): New unspec and int iterator.
+ 2015-03-12 Kyrylo Tkachov <kyrylo.tkac...@arm.com> PR rtl-optimization/65235 Index: gcc/config/aarch64/arm_neon.h =================================================================== --- gcc/config/aarch64/arm_neon.h (revision 221393) +++ gcc/config/aarch64/arm_neon.h (working copy) @@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmul_n_f32 (float32x2_t a, float32_t b) -{ - float32x2_t result; - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmul_n_s16 (int16x4_t a, int16_t b) -{ - int16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmul_n_s32 (int32x2_t a, int32_t b) -{ - int32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmul_n_u16 (uint16x4_t a, uint16_t b) -{ - uint16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmul_n_u32 (uint32x2_t a, uint32_t b) -{ - uint32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmull_high_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x4_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_lane_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x2_t b_ = (b); \ - int32x4_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_lane_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x4_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_lane_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x2_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No 
clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmull_high_n_s16 (int16x8_t a, int16_t b) -{ - int32x4_t result; - __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmull_high_n_s32 (int32x4_t a, int32_t b) -{ - int64x2_t result; - __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmull_high_n_u16 (uint16x8_t a, uint16_t b) -{ - uint32x4_t result; - __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmull_high_n_u32 (uint32x4_t a, uint32_t b) -{ - uint64x2_t result; - __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vmull_high_p8 (poly8x16_t a, poly8x16_t b) -{ - poly16x8_t result; - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmull_high_s8 (int8x16_t a, int8x16_t b) -{ - int16x8_t result; - __asm__ ("smull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmull_high_s16 (int16x8_t a, int16x8_t b) -{ - int32x4_t result; - __asm__ ("smull2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmull_high_s32 (int32x4_t a, int32x4_t b) -{ - int64x2_t result; - __asm__ ("smull2 %0.2d,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmull_high_u8 (uint8x16_t a, uint8x16_t b) -{ - uint16x8_t result; - __asm__ ("umull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmull_high_u16 (uint16x8_t a, uint16x8_t b) -{ - uint32x4_t result; - __asm__ ("umull2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmull_high_u32 (uint32x4_t a, uint32x4_t b) -{ - uint64x2_t result; - __asm__ ("umull2 %0.2d,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmull_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_lane_s32(a, b, c) \ - 
__extension__ \ - ({ \ - int32x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_lane_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_lane_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_laneq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_laneq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_laneq_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_laneq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmull_n_s16 (int16x4_t a, int16_t b) -{ - int32x4_t result; - __asm__ ("smull %0.4s,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmull_n_s32 (int32x2_t a, int32_t b) -{ - int64x2_t result; - __asm__ ("smull %0.2d,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmull_n_u16 (uint16x4_t a, uint16_t b) -{ - uint32x4_t result; - __asm__ ("umull %0.4s,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmull_n_u32 (uint32x2_t a, uint32_t b) -{ - uint64x2_t result; - __asm__ ("umull %0.2d,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vmull_p8 (poly8x8_t a, poly8x8_t b) -{ - poly16x8_t result; - __asm__ ("pmull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmull_s8 (int8x8_t a, int8x8_t b) -{ - int16x8_t result; - __asm__ ("smull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmull_s16 (int16x4_t a, int16x4_t b) -{ - int32x4_t 
result; - __asm__ ("smull %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmull_s32 (int32x2_t a, int32x2_t b) -{ - int64x2_t result; - __asm__ ("smull %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmull_u8 (uint8x8_t a, uint8x8_t b) -{ - uint16x8_t result; - __asm__ ("umull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmull_u16 (uint16x4_t a, uint16x4_t b) -{ - uint32x4_t result; - __asm__ ("umull %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmull_u32 (uint32x2_t a, uint32x2_t b) -{ - uint64x2_t result; - __asm__ ("umull %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmulq_n_f32 (float32x4_t a, float32_t b) -{ - float32x4_t result; - __asm__ ("fmul %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmulq_n_f64 (float64x2_t a, float64_t b) -{ - float64x2_t result; - __asm__ ("fmul %0.2d,%1.2d,%2.d[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmulq_n_s16 (int16x8_t a, int16_t b) -{ - int16x8_t result; - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmulq_n_s32 (int32x4_t a, int32_t b) -{ - int32x4_t result; - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmulq_n_u16 (uint16x8_t a, uint16_t b) -{ - uint16x8_t result; - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmulq_n_u32 (uint32x4_t a, uint32_t b) -{ - uint32x4_t result; - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmulx_f32 (float32x2_t a, float32x2_t b) -{ - float32x2_t result; - __asm__ ("fmulx %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmulx_lane_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vmulxd_f64 (float64_t a, float64_t b) -{ - float64_t result; - __asm__ ("fmulx %d0, %d1, %d2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ 
static __inline float32x4_t __attribute__ ((__always_inline__)) -vmulxq_f32 (float32x4_t a, float32x4_t b) -{ - float32x4_t result; - __asm__ ("fmulx %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmulxq_f64 (float64x2_t a, float64x2_t b) -{ - float64x2_t result; - __asm__ ("fmulx %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmulxq_lane_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmulxq_lane_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x2_t b_ = (b); \ - float64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vmulxs_f32 (float32_t a, float32_t b) -{ - float32_t result; - __asm__ ("fmulx %s0, %s1, %s2" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) vmvn_p8 (poly8x8_t a) { @@ -18695,6 +18030,380 @@ vmul_n_f64 (float64x1_t __a, float64_t __b) return (float64x1_t) { vget_lane_f64 (__a, 0) * __b }; } +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmul_n_f32 (float32x2_t __a, float32_t __b) +{ + return __builtin_aarch64_mul_nv2sf (__a, __b); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmul_n_s16 (int16x4_t __a, int16_t __b) +{ + return __builtin_aarch64_mul_nv4hi (__a, __b); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmul_n_s32 (int32x2_t __a, int32_t __b) +{ + return __builtin_aarch64_mul_nv2si (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmul_n_u16 (uint16x4_t __a, uint16_t __b) +{ + return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a, + (int16_t)__b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmul_n_u32 (uint32x2_t __a, uint32_t __b) +{ + return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a, + (int32_t)__b); +} + +/* vmulq_n */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmulq_n_f32 (float32x4_t __a, float32_t __b) +{ + return __builtin_aarch64_mul_nv4sf (__a, __b); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmulq_n_f64 (float64x2_t __a, float64_t __b) +{ + return __builtin_aarch64_mul_nv2df (__a, __b); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmulq_n_s16 (int16x8_t __a, int16_t __b) +{ + return __builtin_aarch64_mul_nv8hi (__a, __b); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmulq_n_s32 (int32x4_t __a, int32_t __b) +{ + return __builtin_aarch64_mul_nv4si (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmulq_n_u16 (uint16x8_t __a, uint16_t __b) +{ + return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a, + (int16_t)__b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmulq_n_u32 (uint32x4_t 
__a, uint32_t __b) +{ + return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a, + (int32_t)__b); +} + +/* vmull_high_lane */ + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) +{ + return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) +{ + return __builtin_aarch64_smull2_lanev4si (__a, __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c) +{ + return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a, + (int16x4_t) __b, + __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c) +{ + return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a, + (int32x2_t) __b, + __c); +} + +/* vmull_high_laneq */ + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +{ + return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a, + (int16x8_t)__b, + __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +{ + return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a, + (int32x4_t) __b, + __c); +} + +/* vmull_high_n */ + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_high_n_s16 (int16x8_t __a, int16_t __b) +{ + return __builtin_aarch64_smull2_nv8hi (__a, __b); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_high_n_s32 (int32x4_t __a, int32_t __b) +{ + return __builtin_aarch64_smull2_nv4si (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_high_n_u16 (uint16x8_t __a, uint16_t __b) +{ + return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_high_n_u32 (uint32x4_t __a, uint32_t __b) +{ + return __builtin_aarch64_umull2_nv4si_uuu (__a, __b); +} + +/* vmull_high */ + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vmull_high_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return __builtin_aarch64_pmull2v16qi_ppp (__a, __b); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmull_high_s8 (int8x16_t __a, int8x16_t __b) +{ + return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_high_s16 (int16x8_t __a, int16x8_t __b) +{ + return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_high_s32 (int32x4_t __a, int32x4_t __b) +{ + return 
__builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmull_high_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_high_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_high_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b); +} + +/* vmull_lane */ + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return __builtin_aarch64_smull_lanev4hi (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return __builtin_aarch64_smull_lanev2si (__a, __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const unsigned int __c) +{ + return __builtin_aarch64_umull_lanev4hi_uuuu (__a, __b, __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const unsigned int __c) +{ + return __builtin_aarch64_umull_lanev2si_uuuu (__a, __b, __c); +} + +/* vmull_laneq */ + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_smull_laneqv4hi (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_smull_laneqv2si (__a, __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const unsigned int __c) +{ + return __builtin_aarch64_umull_laneqv4hi_uuuu (__a, __b, __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const unsigned int __c) +{ + return __builtin_aarch64_umull_laneqv2si_uuuu (__a, __b, __c); +} + +/* vmull_n */ + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_n_s16 (int16x4_t __a, int16_t __b) +{ + return __builtin_aarch64_smull_nv4hi (__a, __b); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_n_s32 (int32x2_t __a, int32_t __b) +{ + return __builtin_aarch64_smull_nv2si (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_n_u16 (uint16x4_t __a, uint16_t __b) +{ + return __builtin_aarch64_umull_nv4hi_uuu (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_n_u32 (uint32x2_t __a, uint32_t __b) +{ + return __builtin_aarch64_umull_nv2si_uuu (__a, __b); +} + +/* vmull */ +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vmull_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return __builtin_aarch64_pmullv8qi_ppp (__a, __b); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmull_s8 (int8x8_t __a, int8x8_t __b) +{ + return __builtin_aarch64_smullv8qi (__a, __b); +} + +__extension__ 
static __inline int32x4_t __attribute__ ((__always_inline__)) +vmull_s16 (int16x4_t __a, int16x4_t __b) +{ + return __builtin_aarch64_smullv4hi (__a, __b); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmull_s32 (int32x2_t __a, int32x2_t __b) +{ + return __builtin_aarch64_smullv2si (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmull_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return __builtin_aarch64_umullv8qi_uuu (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmull_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return __builtin_aarch64_umullv4hi_uuu (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmull_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return __builtin_aarch64_umullv2si_uuu (__a, __b); +} + +/* vmulx */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmulx_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fmulxv2sf (__a, __b); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmulx_lane_f32 (float32x2_t __a, float32x4_t __b, const int __c) +{ + return __builtin_aarch64_fmulx_lanev2sf (__a, __b, __c); +} + + +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vmulxd_f64 (float64_t __a, float64_t __b) +{ + return __builtin_aarch64_fmulxdf (__a, __b); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmulxq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fmulxv4sf (__a, __b); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmulxq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fmulxv2df (__a, __b); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmulxq_lane_f32 (float32x4_t __a, float32x4_t __b, const int __c) +{ + return __builtin_aarch64_fmulx_lanev4sf (__a, __b, __c); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmulxq_lane_f64 (float64x2_t __a, float64x2_t __b, const int __c) +{ + return __builtin_aarch64_fmulx_lanev2df (__a, __b, __c); +} + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vmulxs_f32 (float32_t __a, float32_t __b) +{ + return __builtin_aarch64_fmulxsf (__a, __b); +} + /* vmulq_lane */ __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) Index: gcc/config/aarch64/iterators.md =================================================================== --- gcc/config/aarch64/iterators.md (revision 221393) +++ gcc/config/aarch64/iterators.md (working copy) @@ -277,6 +277,8 @@ UNSPEC_SHA256SU1 ; Used in aarch64-simd.md. UNSPEC_PMULL ; Used in aarch64-simd.md. UNSPEC_PMULL2 ; Used in aarch64-simd.md. + UNSPEC_FMULX ; Used in aarch64-simd.md. + UNSPEC_FMULX_LANE ; Used in aarch64-simd.md. UNSPEC_REV_REGLIST ; Used in aarch64-simd.md. ]) @@ -468,6 +470,9 @@ ) +(define_mode_attr VDQF_Q [(V2SF "V4SF") (V4SF "V4SF") + (V2DF "V2DF")]) + ;; Widened mode register suffixes for VD_BHSI/VQW. 
(define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s") (V2SI "2d") (V16QI "8h") Index: gcc/config/aarch64/aarch64-simd.md =================================================================== --- gcc/config/aarch64/aarch64-simd.md (revision 221393) +++ gcc/config/aarch64/aarch64-simd.md (working copy) @@ -1400,6 +1400,252 @@ } ) +(define_insn "aarch64_mul_n<mode>" + [(set (match_operand:VMUL 0 "register_operand" "=w") + (mult:VMUL + (match_operand:VMUL 1 "register_operand" "w") + (vec_duplicate:VMUL + (match_operand:<VEL> 2 "register_operand" "<h_con>"))))] + "TARGET_SIMD" + "<f>mul\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]" + [(set_attr "type" "neon_mul_<Vetype>_long")] +) + +(define_insn "aarch64_<su>mull_n<mode>" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> + (match_operand:VD_HSI 1 "register_operand" "w")) + (ANY_EXTEND:<VWIDE> + (vec_duplicate:VD_HSI + (match_operand:<VEL> 2 "register_operand" "<vwx>")))))] + "TARGET_SIMD" + "<su>mull\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]" + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_insn "aarch64_<su>mull<mode>" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> + (match_operand:VD_BHSI 1 "register_operand" "w")) + (ANY_EXTEND:<VWIDE> + (match_operand:VD_BHSI 2 "register_operand" "w"))))] + "TARGET_SIMD" + "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>" + [(set_attr "type" "neon_mul_<Vetype>_long")] +) + +(define_insn "aarch64_simd_<su>mull2_n<mode>" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> (ANY_EXTEND:<VWIDE> (vec_select:<VHALF> + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" ""))) + (ANY_EXTEND:<VWIDE> (vec_duplicate:<VHALF> + (match_operand:<VEL> 2 "register_operand" "<vw>")))))] + "TARGET_SIMD" + "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]" + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_expand "aarch64_<su>mull2_n<mode>" + [(match_operand:<VWIDE> 0 "register_operand" "") + (ANY_EXTEND:<VWIDE> (match_operand:VQ_HSI 1 "register_operand" "")) + (match_operand:<VEL> 2 "register_operand" "")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + emit_insn (gen_aarch64_simd_<su>mull2_n<mode> (operands[0], + operands[1], + operands[2], p)); + DONE; + + } +) + +(define_insn "aarch64_<su>mull_lane<mode>" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> + (match_operand:VD_HSI 1 "register_operand" "w")) + (ANY_EXTEND:<VWIDE> + (vec_duplicate:VD_HSI + (vec_select:<VEL> + (match_operand:<VCOND> 2 "register_operand" "<vwx>") + (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))] + "TARGET_SIMD" + { + operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3]))); + return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]"; + } + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_insn "aarch64_<su>mull_laneq<mode>" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> + (match_operand:VD_HSI 1 "register_operand" "w")) + (ANY_EXTEND:<VWIDE> + (vec_duplicate:VD_HSI + (vec_select:<VEL> + (match_operand:<VCONQ> 2 "register_operand" "<vwx>") + (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))] + "TARGET_SIMD" + { + operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3]))); + return "<su>mull\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]"; + } + [(set_attr 
"type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_insn "aarch64_<su>mull2_lane<mode>_internal" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> + (vec_select:<VHALF> + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) + (ANY_EXTEND:<VWIDE> + (vec_duplicate:<VHALF> + (vec_select:<VEL> + (match_operand:<VCOND> 2 "register_operand" "<vwx>") + (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))] + "TARGET_SIMD" + { + operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3]))); + return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]"; + } + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_insn "aarch64_<su>mull2_laneq<mode>_internal" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (mult:<VWIDE> + (ANY_EXTEND:<VWIDE> + (vec_select:<VHALF> + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:VQ_HSI 4 "vect_par_cnst_hi_half" ""))) + (ANY_EXTEND:<VWIDE> + (vec_duplicate:<VHALF> + (vec_select:<VEL> + (match_operand:<VCONQ> 2 "register_operand" "<vwx>") + (parallel [(match_operand:SI 3 "immediate_operand" "i")]))))))] + "TARGET_SIMD" + { + operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3]))); + return "<su>mull2\\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[%3]"; + } + [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] +) + +(define_expand "aarch64_smull2_lane<mode>" + [(match_operand:<VWIDE> 0 "register_operand" "=w") + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:<VCOND> 2 "register_operand" "<vwx>") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_SIMD" +{ + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + emit_insn (gen_aarch64_smull2_lane<mode>_internal (operands[0], operands[1], + operands[2], operands[3], + p)); + DONE; +}) + +(define_expand "aarch64_umull2_lane<mode>" + [(match_operand:<VWIDE> 0 "register_operand" "=w") + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:<VCOND> 2 "register_operand" "<vwx>") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_SIMD" +{ + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + emit_insn (gen_aarch64_umull2_lane<mode>_internal (operands[0], operands[1], + operands[2], operands[3], + p)); + DONE; +}) + +(define_expand "aarch64_smull2_laneq<mode>" + [(match_operand:<VWIDE> 0 "register_operand" "=w") + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:<VCONQ> 2 "register_operand" "<vwx>") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_SIMD" +{ + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + emit_insn (gen_aarch64_smull2_laneq<mode>_internal (operands[0], operands[1], + operands[2], operands[3], + p)); + DONE; +}) + +(define_expand "aarch64_umull2_laneq<mode>" + [(match_operand:<VWIDE> 0 "register_operand" "=w") + (match_operand:VQ_HSI 1 "register_operand" "w") + (match_operand:<VCONQ> 2 "register_operand" "<vwx>") + (match_operand:SI 3 "immediate_operand" "i")] + "TARGET_SIMD" +{ + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + emit_insn (gen_aarch64_umull2_laneq<mode>_internal (operands[0], operands[1], + operands[2], operands[3], + p)); + DONE; +}) + +(define_insn "aarch64_fmulx<mode>" + [(set (match_operand:VDQF 0 "register_operand" "=w") + (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w") + (match_operand:VDQF 2 "register_operand" "w")] + UNSPEC_FMULX))] + "TARGET_SIMD" + "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vtype>" + 
[(set_attr "type" "neon_mul_s")] +) + +(define_insn "aarch64_fmulx<mode>" + [(set (match_operand:GPF 0 "register_operand" "=w") + (unspec:GPF [(match_operand:GPF 1 "register_operand" "w") + (match_operand:GPF 2 "register_operand" "w")] + UNSPEC_FMULX))] + "TARGET_SIMD" + "fmulx\\t%<s>0, %<s>1, %<s>2" + [(set_attr "type" "neon_mul_s")] +) + +(define_insn "aarch64_fmulx_lane<mode>" + [(set (match_operand:VDQF 0 "register_operand" "=w") + (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w") + (match_operand:<VDQF_Q> 2 "register_operand" "w") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_FMULX_LANE))] + "TARGET_SIMD" + "fmulx\\t%0.<vtype>, %1.<vtype>, %2.<vetype>[%3]" + [(set_attr "type" "neon_mul_s")] +) + +(define_insn "aarch64_pmull2v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=w") + (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "w") + (match_operand:V16QI 2 "register_operand" "w")] + UNSPEC_PMULL2))] + "TARGET_SIMD" + "pmull2\\t%0.8h, %1.16b, %2.16b" + [(set_attr "type" "neon_mul_b_long")] +) + +(define_insn "aarch64_pmullv8qi" + [(set (match_operand:V8HI 0 "register_operand" "=w") + (unspec:V8HI [(match_operand:V8QI 1 "register_operand" "w") + (match_operand:V8QI 2 "register_operand" "w")] + UNSPEC_PMULL))] + "TARGET_SIMD" + "pmull\\t%0.8h, %1.8b, %2.8b" + [(set_attr "type" "neon_mul_b_long")] +) + ;; FP vector operations. ;; AArch64 AdvSIMD supports single-precision (32-bit) and ;; double-precision (64-bit) floating-point data types and arithmetic as Index: gcc/config/aarch64/aarch64-simd-builtins.def =================================================================== --- gcc/config/aarch64/aarch64-simd-builtins.def (revision 221393) +++ gcc/config/aarch64/aarch64-simd-builtins.def (working copy) @@ -187,6 +187,39 @@ BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0) BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0) + /* Implemented by vec_widen_<su>mult_hi_<mode>. */ + BUILTIN_VQW (BINOP, vec_widen_smult_hi_, 10) + BUILTIN_VQW (BINOPU, vec_widen_umult_hi_, 10) + /* Implemented by aarch64_<su>mull<mode>. */ + BUILTIN_VD_BHSI (BINOPU, umull, 0) + BUILTIN_VD_BHSI (BINOP, smull, 0) + /* Implemented by aarch64_<su>mull_n<mode>. */ + BUILTIN_VD_HSI (BINOP, smull_n, 0) + BUILTIN_VD_HSI (BINOPU, umull_n, 0) + /* Implemented by aarch64_mul_n<mode>. */ + BUILTIN_VMUL (BINOP, mul_n, 0) + /* Implemented by aarch64_<su>mull2_n<mode>. */ + BUILTIN_VQ_HSI (BINOP, smull2_n, 0) + BUILTIN_VQ_HSI (BINOPU, umull2_n, 0) + /* Implemented by aarch64_<su>mull_lane<q><mode>. */ + BUILTIN_VD_HSI (TERNOP, smull_lane, 0) + BUILTIN_VD_HSI (TERNOPU, umull_lane, 0) + BUILTIN_VD_HSI (TERNOP, smull_laneq, 0) + BUILTIN_VD_HSI (TERNOPU, umull_laneq, 0) + /* Implemented by aarch64_<su>mull2_lane<q><mode>. */ + BUILTIN_VQ_HSI (TERNOP, smull2_lane, 0) + BUILTIN_VQ_HSI (TERNOP_LANE, umull2_lane, 0) + BUILTIN_VQ_HSI (TERNOP, smull2_laneq, 0) + BUILTIN_VQ_HSI (TERNOP_LANE, umull2_laneq, 0) + /* Implemented by aarch64_fmulx<mode>. */ + BUILTIN_VDQF (BINOP, fmulx, 0) + BUILTIN_GPF (BINOP, fmulx, 0) + BUILTIN_VDQF (BINOP, fmulx_lane, 0) + + /* Implemented by aarch64_pmull<2><mode>.*/ + VAR1 (BINOPP, pmull, 0, v8qi) + VAR1 (BINOPP, pmull2, 0, v16qi) + BUILTIN_VSDQ_I_DI (BINOP, ashl, 3) /* Implemented by aarch64_<sur>shl<mode>. */ BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)