Hi,
The attached patch removes calls to builtins from the vshl_n intrinsics, replacing them with the left shift operator. The patch passes bootstrap+test on arm-linux-gnueabihf.
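Just to illustrate the change (this example is mine, not part of the patch; the function name and shift count are made up), a caller like the following now has the shift expanded inline instead of going through a builtin:

#include <arm_neon.h>

/* Hypothetical example: after the patch, vshl_n_s8 here expands to
   v << 3 rather than to (int8x8_t)__builtin_neon_vshl_nv8qi (v, 3),
   i.e. the shift-by-immediate is expressed directly as a vector <<.  */
int8x8_t
shift_by_3 (int8x8_t v)
{
  return vshl_n_s8 (v, 3);
}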
However, I noticed that the patch causes 3 extra register spills for vshl_n.c when using << instead of the builtin. Could that perhaps be due to inlining of the intrinsics? Before the patch, the shift operation was performed by a call to __builtin_neon_vshl<type> (__a, __b), and now it is expanded inline to __a << __b, which might result in increased register pressure?

Thanks,
Prathamesh
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 41b596b5fc6..f5c85eb43e7 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -4887,112 +4887,112 @@ __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_s8 (int8x8_t __a, const int __b)
 {
-  return (int8x8_t)__builtin_neon_vshl_nv8qi (__a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_s16 (int16x4_t __a, const int __b)
 {
-  return (int16x4_t)__builtin_neon_vshl_nv4hi (__a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_s32 (int32x2_t __a, const int __b)
 {
-  return (int32x2_t)__builtin_neon_vshl_nv2si (__a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline int64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_s64 (int64x1_t __a, const int __b)
 {
-  return (int64x1_t)__builtin_neon_vshl_ndi (__a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_u8 (uint8x8_t __a, const int __b)
 {
-  return (uint8x8_t)__builtin_neon_vshl_nv8qi ((int8x8_t) __a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_u16 (uint16x4_t __a, const int __b)
 {
-  return (uint16x4_t)__builtin_neon_vshl_nv4hi ((int16x4_t) __a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_u32 (uint32x2_t __a, const int __b)
 {
-  return (uint32x2_t)__builtin_neon_vshl_nv2si ((int32x2_t) __a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshl_n_u64 (uint64x1_t __a, const int __b)
 {
-  return (uint64x1_t)__builtin_neon_vshl_ndi ((int64x1_t) __a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_s8 (int8x16_t __a, const int __b)
 {
-  return (int8x16_t)__builtin_neon_vshl_nv16qi (__a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_s16 (int16x8_t __a, const int __b)
 {
-  return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_s32 (int32x4_t __a, const int __b)
 {
-  return (int32x4_t)__builtin_neon_vshl_nv4si (__a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_s64 (int64x2_t __a, const int __b)
 {
-  return (int64x2_t)__builtin_neon_vshl_nv2di (__a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_u8 (uint8x16_t __a, const int __b)
 {
-  return (uint8x16_t)__builtin_neon_vshl_nv16qi ((int8x16_t) __a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_u16 (uint16x8_t __a, const int __b)
 {
-  return (uint16x8_t)__builtin_neon_vshl_nv8hi ((int16x8_t) __a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_u32 (uint32x4_t __a, const int __b)
 {
-  return (uint32x4_t)__builtin_neon_vshl_nv4si ((int32x4_t) __a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshlq_n_u64 (uint64x2_t __a, const int __b)
 {
-  return (uint64x2_t)__builtin_neon_vshl_nv2di ((int64x2_t) __a, __b);
+  return __a << __b;
 }
 
 __extension__ extern __inline int8x8_t
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index 70438ac1848..ea6bd43a035 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -103,7 +103,6 @@ VAR3 (BINOP_IMM, vqrshrns_n, v8hi, v4si, v2di)
 VAR3 (BINOP_IMM, vqrshrnu_n, v8hi, v4si, v2di)
 VAR3 (BINOP_IMM, vqshrun_n, v8hi, v4si, v2di)
 VAR3 (BINOP_IMM, vqrshrun_n, v8hi, v4si, v2di)
-VAR8 (BINOP_IMM, vshl_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR8 (BINOP_IMM, vqshl_s_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR8 (BINOP_IMM, vqshl_u_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)
 VAR8 (BINOP_IMM, vqshlu_n, v8qi, v4hi, v2si, di, v16qi, v8hi, v4si, v2di)