Hi, This patch replaces builtins for vdup_n and vmov_n. The patch results in regression for pr51534.c. Consider following function:
uint8x8_t f1 (uint8x8_t a) { return vcgt_u8(a, vdup_n_u8(0)); } code-gen before patch: f1: vmov.i32 d16, #0 @ v8qi vcgt.u8 d0, d0, d16 bx lr code-gen after patch: f1: vceq.i8 d0, d0, #0 vmvn d0, d0 bx lr I am not sure which one is better tho ? Also, this patch regressed bf16_dup.c on arm-linux-gnueabi, which is due to a missed opt in lowering. I had filed it as PR98435, and posted a fix for it here: https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572648.html Thanks, Prathamesh
2021-06-24 Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> PR target/66791 * gcc/config/arm/arm_neon.h (vdup_n_s8): Replace call to builtin with constructor. (vdup_n_s16): Likewise. (vdup_n_s32): Likewise. (vdup_n_s64): Likewise. (vdup_n_u8): Likewise. (vdup_n_u16): Likewise. (vdup_n_u32): Likewise. (vdup_n_u64): Likewise. (vdup_n_p8): Likewise. (vdup_n_p16): Likewise. (vdup_n_p64): Likewise. (vdup_n_f16): Likewise. (vdup_n_f32): Likewise. (vdupq_n_s8): Likewise. (vdupq_n_s16): Likewise. (vdupq_n_s32): Likewise. (vdupq_n_s64): Likewise. (vdupq_n_u8): Likewise. (vdupq_n_u16): Likewise. (vdupq_n_u32): Likewise. (vdupq_n_u64): Likewise. (vdupq_n_p8): Likewise. (vdupq_n_p16): Likewise. (vdupq_n_p64): Likewise. (vdupq_n_f16): Likewise. (vdupq_n_f32): Likewise. (vmov_n_s8): Replace call to builtin with call to corresponding vdup intrinsic. (vmov_n_s16): Likewise. (vmov_n_s32): Likewise. (vmov_n_s64): Likewise. (vmov_n_u8): Likewise. (vmov_n_u16): Likewise. (vmov_n_u32): Likewise. (vmov_n_u64): Likewise. (vmov_n_p8): Likewise. (vmov_n_p16): Likewise. (vmov_n_f16): Likewise. (vmov_n_f32): Likewise. (vmovq_n_s8): Likewise. (vmovq_n_s16): Likewise. (vmovq_n_s32): Likewise. (vmovq_n_s64): Likewise. (vmovq_n_u8): Likewise. (vmovq_n_u16): Likewise. (vmovq_n_u32): Likewise. (vmovq_n_u64): Likewise. (vmovq_n_p8): Likewise. (vmovq_n_p16): Likewise. (vmovq_n_f16): Likewise. (vmovq_n_f32): Likewise. * config/arm/arm_neon_builtins.def: Remove entries for vdup_n. diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 3efcfa45229..bf26cd49d53 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -6625,63 +6625,63 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_s8 (int8_t __a) { - return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); + return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_s16 (int16_t __a) { - return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); + return (int16x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_s32 (int32_t __a) { - return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a); + return (int32x2_t) {__a, __a}; } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_f32 (float32_t __a) { - return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a); + return (float32x2_t) {__a, __a}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_u8 (uint8_t __a) { - return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); + return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_u16 (uint16_t __a) { - return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); + return (uint16x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_u32 (uint32_t __a) { - return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a); + return (uint32x2_t) {__a, __a}; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_p8 (poly8_t __a) { - return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); + return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_p16 (poly16_t __a) { - return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); + return (poly16x4_t) {__a, __a, __a, __a}; } #pragma GCC push_options @@ -6690,7 +6690,7 @@ __extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_p64 (poly64_t __a) { - return (poly64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); + return (poly64x1_t) {__a}; } #pragma GCC pop_options @@ -6698,14 +6698,14 @@ __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_s64 (int64_t __a) { - return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); + return (int64x1_t) {__a}; } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_u64 (uint64_t __a) { - return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); + return (uint64x1_t) {__a}; } #pragma GCC push_options @@ -6714,7 +6714,7 @@ __extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_p64 (poly64_t __a) { - return (poly64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); + return (poly64x2_t) {__a, __a}; } #pragma GCC pop_options @@ -6722,231 +6722,234 @@ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_s8 (int8_t __a) { - return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); + return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_s16 (int16_t __a) { - return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); + return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_s32 (int32_t __a) { - return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); + return (int32x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_f32 (float32_t __a) { - return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a); + return (float32x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_u8 (uint8_t __a) { - return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); + return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_u16 (uint16_t __a) { - return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); + return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_u32 (uint32_t __a) { - return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); + return (uint32x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_p8 (poly8_t __a) { - return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); + return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_p16 (poly16_t __a) { - return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); + return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_s64 (int64_t __a) { - return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); + return (int64x2_t) {__a, __a}; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_u64 (uint64_t __a) { - return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); + return (uint64x2_t) {__a, __a}; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_s8 (int8_t __a) { - return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); + return vdup_n_s8 (__a); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_s16 (int16_t __a) { - return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); + return vdup_n_s16 (__a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_s32 (int32_t __a) { - return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a); + return vdup_n_s32 (__a); } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_f32 (float32_t __a) { - return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a); + return vdup_n_f32 (__a); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_u8 (uint8_t __a) { - return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); + return vdup_n_u8 (__a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_u16 (uint16_t __a) { - return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); + return vdup_n_u16 (__a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_u32 (uint32_t __a) { - return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a); + return vdup_n_u32 (__a); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_p8 (poly8_t __a) { - return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a); + return vdup_n_p8 (__a); } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_p16 (poly16_t __a) { - return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a); + return vdup_n_p16 (__a); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_s64 (int64_t __a) { - return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); + return vdup_n_s64 (__a); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_u64 (uint64_t __a) { - return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a); + return vdup_n_u64 (__a); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_s8 (int8_t __a) { - return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); + return vdupq_n_s8 (__a); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_s16 (int16_t __a) { - return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); + return vdupq_n_s16 (__a); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_s32 (int32_t __a) { - return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); + return vdupq_n_s32 (__a); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_f32 (float32_t __a) { - return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a); + return vdupq_n_f32 (__a); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_u8 (uint8_t __a) { - return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); + return vdupq_n_u8 (__a); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_u16 (uint16_t __a) { - return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); + return vdupq_n_u16 (__a); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_u32 (uint32_t __a) { - return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); + return vdupq_n_u32 (__a); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_p8 (poly8_t __a) { - return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a); + return vdupq_n_p8 (__a); } __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_p16 (poly16_t __a) { - return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a); + return vdupq_n_p16 (__a); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_s64 (int64_t __a) { - return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); + return vdupq_n_s64 (__a); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_u64 (uint64_t __a) { - return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a); + return vdupq_n_u64 (__a); } __extension__ extern __inline int8x8_t @@ -17879,14 +17882,14 @@ __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_f16 (float16_t __a) { - return __builtin_neon_vdup_nv4hf (__a); + return (float16x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_f16 (float16_t __a) { - return __builtin_neon_vdup_nv8hf (__a); + return (float16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline float16x4_t @@ -17921,14 +17924,14 @@ __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmov_n_f16 (float16_t __a) { - return __builtin_neon_vdup_nv4hf (__a); + return vdup_n_f16 (__a); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovq_n_f16 (float16_t __a) { - return __builtin_neon_vdup_nv8hf (__a); + return vdupq_n_f16 (__a); } __extension__ extern __inline float16x4_t @@ -18852,14 +18855,14 @@ __extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdup_n_bf16 (bfloat16_t __a) { - return __builtin_neon_vdup_nv4bf (__a); + return (bfloat16x4_t) {__a, __a, __a, __a}; } __extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vdupq_n_bf16 (bfloat16_t __a) { - return __builtin_neon_vdup_nv8bf (__a); + return (bfloat16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline bfloat16x4_t diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index ae104d5ba1b..a233e9bbd9e 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -214,9 +214,6 @@ VAR10 (GETLANE, vget_lane, VAR6 (GETLANE, vget_laneu, v8qi, v4hi, v2si, v16qi, v8hi, v4si) VAR10 (SETLANE, vset_lane, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) -VAR10 (UNOP, vdup_n, - v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) -VAR4 (UNOP, vdup_n, v8hf, v4hf, v8bf, v4bf) VAR10 (GETLANE, vdup_lane, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) VAR4 (GETLANE, vdup_lane, v8hf, v4hf, v8bf, v4bf)