On Mon, May 20, 2024 at 2:57 AM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Pengxuan Zheng <quic_pzh...@quicinc.com> writes:
> > This patch folds vget_low_* intrinsics to BIT_FIELD_REF to open up more
> > optimization opportunities for gimple optimizers.
> >
> > While we are here, we also remove the vget_low_* definitions from
> > arm_neon.h and use the new intrinsics framework.
> >
> > PR target/102171
> >
> > gcc/ChangeLog:
> >
> >         * config/aarch64/aarch64-builtins.cc (AARCH64_SIMD_VGET_LOW_BUILTINS):
> >         New macro to create definitions for all vget_low intrinsics.
> >         (VGET_LOW_BUILTIN): Likewise.
> >         (enum aarch64_builtins): Add vget_low function codes.
> >         (aarch64_general_fold_builtin): Fold vget_low calls.
> >         * config/aarch64/aarch64-simd-builtins.def: Delete vget_low builtins.
> >         * config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Delete.
> >         (aarch64_vget_lo_halfv8bf): Likewise.
> >         * config/aarch64/arm_neon.h (__attribute__): Delete.
> >         (vget_low_f16): Likewise.
> >         (vget_low_f32): Likewise.
> >         (vget_low_f64): Likewise.
> >         (vget_low_p8): Likewise.
> >         (vget_low_p16): Likewise.
> >         (vget_low_p64): Likewise.
> >         (vget_low_s8): Likewise.
> >         (vget_low_s16): Likewise.
> >         (vget_low_s32): Likewise.
> >         (vget_low_s64): Likewise.
> >         (vget_low_u8): Likewise.
> >         (vget_low_u16): Likewise.
> >         (vget_low_u32): Likewise.
> >         (vget_low_u64): Likewise.
> >         (vget_low_bf16): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/aarch64/pr113573.c: Replace __builtin_aarch64_get_lowv8hi
> >         with vget_low_s16.
> >         * gcc.target/aarch64/vget_low_2.c: New test.
> >         * gcc.target/aarch64/vget_low_2_be.c: New test.
>
> Ok, thanks.  I suppose the patch has the side effect of allowing
> vget_low_bf16 to be called without +bf16.  IMO that's the correct
> behaviour though, and is consistent with how we handle reinterprets.
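To make the effect of the fold concrete, here is a minimal sketch (the
function and SSA names below are illustrative, not taken from the patch;
the BIT_FIELD_REF form is what the new tests scan for):

    #include <arm_neon.h>

    int16x4_t
    low_half (int16x8_t v)
    {
      return vget_low_s16 (v);
    }

Compiled with -O2 -fdump-tree-optimized on little-endian aarch64, the
call should now fold in gimple to something like:

      _2 = BIT_FIELD_REF <v_1(D), 64, 0>;

instead of surviving as an opaque __builtin_aarch64_get_lowv8hi call
that the gimple optimizers cannot see through.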
Pushed as r15-697-ga2e4fe5a53cf75cd055f64e745ebd51253e42254 .

Thanks,
Andrew

>
> Richard
>
> > Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
> > ---
> >  gcc/config/aarch64/aarch64-builtins.cc        |  60 ++++++++++
> >  gcc/config/aarch64/aarch64-simd-builtins.def  |   5 +-
> >  gcc/config/aarch64/aarch64-simd.md            |  23 +---
> >  gcc/config/aarch64/arm_neon.h                 | 105 ------------------
> >  gcc/testsuite/gcc.target/aarch64/pr113573.c   |   2 +-
> >  gcc/testsuite/gcc.target/aarch64/vget_low_2.c |  30 +++++
> >  .../gcc.target/aarch64/vget_low_2_be.c        |  31 ++++++
> >  7 files changed, 124 insertions(+), 132 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
> > index 75d21de1401..4afe7c86ae3 100644
> > --- a/gcc/config/aarch64/aarch64-builtins.cc
> > +++ b/gcc/config/aarch64/aarch64-builtins.cc
> > @@ -658,6 +658,23 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
> >    VREINTERPRET_BUILTINS \
> >    VREINTERPRETQ_BUILTINS
> >
> > +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> > +  VGET_LOW_BUILTIN(f16) \
> > +  VGET_LOW_BUILTIN(f32) \
> > +  VGET_LOW_BUILTIN(f64) \
> > +  VGET_LOW_BUILTIN(p8) \
> > +  VGET_LOW_BUILTIN(p16) \
> > +  VGET_LOW_BUILTIN(p64) \
> > +  VGET_LOW_BUILTIN(s8) \
> > +  VGET_LOW_BUILTIN(s16) \
> > +  VGET_LOW_BUILTIN(s32) \
> > +  VGET_LOW_BUILTIN(s64) \
> > +  VGET_LOW_BUILTIN(u8) \
> > +  VGET_LOW_BUILTIN(u16) \
> > +  VGET_LOW_BUILTIN(u32) \
> > +  VGET_LOW_BUILTIN(u64) \
> > +  VGET_LOW_BUILTIN(bf16)
> > +
> >  typedef struct
> >  {
> >    const char *name;
> > @@ -697,6 +714,9 @@ typedef struct
> >  #define VREINTERPRET_BUILTIN(A, B, L) \
> >    AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B,
> >
> > +#define VGET_LOW_BUILTIN(A) \
> > +  AARCH64_SIMD_BUILTIN_VGET_LOW_##A,
> > +
> >  #undef VAR1
> >  #define VAR1(T, N, MAP, FLAG, A) \
> >    AARCH64_SIMD_BUILTIN_##T##_##N##A,
> > @@ -732,6 +752,7 @@ enum aarch64_builtins
> >    AARCH64_CRC32_BUILTIN_MAX,
> >    /* SIMD intrinsic builtins.  */
> >    AARCH64_SIMD_VREINTERPRET_BUILTINS
> > +  AARCH64_SIMD_VGET_LOW_BUILTINS
> >    /* ARMv8.3-A Pointer Authentication Builtins.  */
> >    AARCH64_PAUTH_BUILTIN_AUTIA1716,
> >    AARCH64_PAUTH_BUILTIN_PACIA1716,
> > @@ -823,8 +844,37 @@ static aarch64_fcmla_laneq_builtin_datum aarch64_fcmla_lane_builtin_data[] = {
> >        && SIMD_INTR_QUAL(A) == SIMD_INTR_QUAL(B) \
> >    },
> >
> > +#undef VGET_LOW_BUILTIN
> > +#define VGET_LOW_BUILTIN(A) \
> > +  {"vget_low_" #A, \
> > +   AARCH64_SIMD_BUILTIN_VGET_LOW_##A, \
> > +   2, \
> > +   { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \
> > +   { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \
> > +   FLAG_AUTO_FP, \
> > +   false \
> > +  },
> > +
> > +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> > +  VGET_LOW_BUILTIN(f16) \
> > +  VGET_LOW_BUILTIN(f32) \
> > +  VGET_LOW_BUILTIN(f64) \
> > +  VGET_LOW_BUILTIN(p8) \
> > +  VGET_LOW_BUILTIN(p16) \
> > +  VGET_LOW_BUILTIN(p64) \
> > +  VGET_LOW_BUILTIN(s8) \
> > +  VGET_LOW_BUILTIN(s16) \
> > +  VGET_LOW_BUILTIN(s32) \
> > +  VGET_LOW_BUILTIN(s64) \
> > +  VGET_LOW_BUILTIN(u8) \
> > +  VGET_LOW_BUILTIN(u16) \
> > +  VGET_LOW_BUILTIN(u32) \
> > +  VGET_LOW_BUILTIN(u64) \
> > +  VGET_LOW_BUILTIN(bf16)
> > +
> >  static const aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[] = {
> >    AARCH64_SIMD_VREINTERPRET_BUILTINS
> > +  AARCH64_SIMD_VGET_LOW_BUILTINS
> >  };
> >
> >
> > @@ -3216,6 +3266,9 @@ aarch64_fold_builtin_lane_check (tree arg0, tree arg1, tree arg2)
> >  #define VREINTERPRET_BUILTIN(A, B, L) \
> >    case AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B:
> >
> > +#undef VGET_LOW_BUILTIN
> > +#define VGET_LOW_BUILTIN(A) \
> > +  case AARCH64_SIMD_BUILTIN_VGET_LOW_##A:
> >
> >  /* Try to fold a call to the built-in function with subcode FCODE.  The
> >     function is passed the N_ARGS arguments in ARGS and it returns a value
> > @@ -3235,6 +3288,13 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
> >        return fold_build1 (FLOAT_EXPR, type, args[0]);
> >      AARCH64_SIMD_VREINTERPRET_BUILTINS
> >        return fold_build1 (VIEW_CONVERT_EXPR, type, args[0]);
> > +    AARCH64_SIMD_VGET_LOW_BUILTINS
> > +      {
> > +        auto pos = BYTES_BIG_ENDIAN ? 64 : 0;
> > +
> > +        return fold_build3 (BIT_FIELD_REF, type, args[0], bitsize_int (64),
> > +                            bitsize_int (pos));
> > +      }
> >      case AARCH64_SIMD_BUILTIN_LANE_CHECK:
> >        gcc_assert (n_args == 3);
> >        if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
> > diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> > index da16f602a55..a9f0558f8b6 100644
> > --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> > +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> > @@ -65,8 +65,6 @@
> >    BUILTIN_VS (UNOP, ctz, 2, NONE)
> >    BUILTIN_VB (UNOP, popcount, 2, NONE)
> >
> > -  /* Implemented by aarch64_get_low<mode>.  */
> > -  BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP)
> >    /* Implemented by aarch64_get_high<mode>.  */
> >    BUILTIN_VQMOV (UNOP, get_high, 0, AUTO_FP)
> >
> > @@ -960,8 +958,7 @@
> >    VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, FP, v4sf)
> >    VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, FP, v4sf)
> >
> > -  /* Implemented by aarch64_vget_lo/hi_halfv8bf.  */
> > -  VAR1 (UNOP, vget_lo_half, 0, AUTO_FP, v8bf)
> > +  /* Implemented by aarch64_vget_hi_halfv8bf.  */
> >    VAR1 (UNOP, vget_hi_half, 0, AUTO_FP, v8bf)
> >
> >    /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
> > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> > index f8bb973a278..5a28a8e3c6a 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -288,17 +288,6 @@ (define_expand "aarch64_get_half<mode>"
> >    }
> >  )
> >
> > -(define_expand "aarch64_get_low<mode>"
> > -  [(match_operand:<VHALF> 0 "register_operand")
> > -   (match_operand:VQMOV 1 "register_operand")]
> > -  "TARGET_FLOAT"
> > -  {
> > -    rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
> > -    emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo));
> > -    DONE;
> > -  }
> > -)
> > -
> >  (define_expand "aarch64_get_high<mode>"
> >    [(match_operand:<VHALF> 0 "register_operand")
> >     (match_operand:VQMOV 1 "register_operand")]
> > @@ -9774,17 +9763,7 @@ (define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
> >    [(set_attr "type" "neon_dot<VDQSF:q>")]
> >  )
> >
> > -;; vget_low/high_bf16
> > -(define_expand "aarch64_vget_lo_halfv8bf"
> > -  [(match_operand:V4BF 0 "register_operand")
> > -   (match_operand:V8BF 1 "register_operand")]
> > -  "TARGET_BF16_SIMD"
> > -{
> > -  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false);
> > -  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
> > -  DONE;
> > -})
> > -
> > +;; vget_high_bf16
> >  (define_expand "aarch64_vget_hi_halfv8bf"
> >    [(match_operand:V4BF 0 "register_operand")
> >     (match_operand:V8BF 1 "register_operand")]
> > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> > index 0ee325dccad..92c2c5361cd 100644
> > --- a/gcc/config/aarch64/arm_neon.h
> > +++ b/gcc/config/aarch64/arm_neon.h
> > @@ -3027,104 +3027,6 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
> >    return __aarch64_vset_lane_any (__elem, __vec, __index);
> >  }
> >
> > -__extension__ extern __inline float16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_f16 (float16x8_t __a)
> > -{
> > -  return __builtin_aarch64_get_lowv8hf (__a);
> > -}
> > -
> > -__extension__ extern __inline float32x2_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_f32 (float32x4_t __a)
> > -{
> > -  return __builtin_aarch64_get_lowv4sf (__a);
> > -}
> > -
> > -__extension__ extern __inline float64x1_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_f64 (float64x2_t __a)
> > -{
> > -  return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)};
> > -}
> > -
> > -__extension__ extern __inline poly8x8_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_p8 (poly8x16_t __a)
> > -{
> > -  return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
> > -}
> > -
> > -__extension__ extern __inline poly16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_p16 (poly16x8_t __a)
> > -{
> > -  return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
> > -}
> > -
> > -__extension__ extern __inline poly64x1_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_p64 (poly64x2_t __a)
> > -{
> > -  return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a);
> > -}
> > -
> > -__extension__ extern __inline int8x8_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_s8 (int8x16_t __a)
> > -{
> > -  return __builtin_aarch64_get_lowv16qi (__a);
> > -}
> > -
> > -__extension__ extern __inline int16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_s16 (int16x8_t __a)
> > -{
> > -  return __builtin_aarch64_get_lowv8hi (__a);
> > -}
> > -
> > -__extension__ extern __inline int32x2_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_s32 (int32x4_t __a)
> > -{
> > -  return __builtin_aarch64_get_lowv4si (__a);
> > -}
> > -
> > -__extension__ extern __inline int64x1_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_s64 (int64x2_t __a)
> > -{
> > -  return (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)};
> > -}
> > -
> > -__extension__ extern __inline uint8x8_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_u8 (uint8x16_t __a)
> > -{
> > -  return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
> > -}
> > -
> > -__extension__ extern __inline uint16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_u16 (uint16x8_t __a)
> > -{
> > -  return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
> > -}
> > -
> > -__extension__ extern __inline uint32x2_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_u32 (uint32x4_t __a)
> > -{
> > -  return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a);
> > -}
> > -
> > -__extension__ extern __inline uint64x1_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_u64 (uint64x2_t __a)
> > -{
> > -  return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)};
> > -}
> > -
> >  __extension__ extern __inline float16x4_t
> >  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >  vget_high_f16 (float16x8_t __a)
> > @@ -28479,13 +28381,6 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
> >    return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
> >  }
> >
> > -__extension__ extern __inline bfloat16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_bf16 (bfloat16x8_t __a)
> > -{
> > -  return __builtin_aarch64_vget_lo_halfv8bf (__a);
> > -}
> > -
> >  __extension__ extern __inline bfloat16x4_t
> >  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >  vget_high_bf16 (bfloat16x8_t __a)
> > diff --git a/gcc/testsuite/gcc.target/aarch64/pr113573.c b/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > index a8e445c6e19..fc8607f7218 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > @@ -26,7 +26,7 @@ void jsimd_extbgrx_ycc_convert_neon() {
> >    int y_l = vmull_laneq_u16(r);
> >    uint16x8_t __a = g;
> >    jsimd_extbgrx_ycc_convert_neon___trans_tmp_2 =
> > -      (uint16x4_t)__builtin_aarch64_get_lowv8hi((int16x8_t)__a);
> > +      (uint16x4_t)vget_low_s16((int16x8_t)__a);
> >    __a = b;
> >    int cb_l = scaled_128_5;
> >    int cb_h = scaled_128_5;
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > new file mode 100644
> > index 00000000000..44414e1c043
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -fdump-tree-optimized" } */
> > +
> > +#include <arm_neon.h>
> > +
> > +#define VARIANTS \
> > +VARIANT (uint8x8_t, uint8x16_t, u8) \
> > +VARIANT (uint16x4_t, uint16x8_t, u16) \
> > +VARIANT (uint32x2_t, uint32x4_t, u32) \
> > +VARIANT (uint64x1_t, uint64x2_t, u64) \
> > +VARIANT (int8x8_t, int8x16_t, s8) \
> > +VARIANT (int16x4_t, int16x8_t, s16) \
> > +VARIANT (int32x2_t, int32x4_t, s32) \
> > +VARIANT (int64x1_t, int64x2_t, s64) \
> > +VARIANT (float16x4_t, float16x8_t, f16) \
> > +VARIANT (float32x2_t, float32x4_t, f32) \
> > +VARIANT (float64x1_t, float64x2_t, f64) \
> > +VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
> > +
> > +/* vget_low_* intrinsics should become BIT_FIELD_REF.  */
> > +#define VARIANT(TYPE64, TYPE128, SUFFIX) \
> > +TYPE64 \
> > +test_vget_low_##SUFFIX (TYPE128 vec) \
> > +{ \
> > +  return vget_low_##SUFFIX (vec); \
> > +}
> > +
> > +VARIANTS
> > +
> > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 0>" 12 "optimized" } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > new file mode 100644
> > index 00000000000..c3f4c4f0e0d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > @@ -0,0 +1,31 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target stdint_types_mbig_endian } */
> > +/* { dg-options "-O3 -fdump-tree-optimized -mbig-endian" } */
> > +
> > +#include <arm_neon.h>
> > +
> > +#define VARIANTS \
> > +VARIANT (uint8x8_t, uint8x16_t, u8) \
> > +VARIANT (uint16x4_t, uint16x8_t, u16) \
> > +VARIANT (uint32x2_t, uint32x4_t, u32) \
> > +VARIANT (uint64x1_t, uint64x2_t, u64) \
> > +VARIANT (int8x8_t, int8x16_t, s8) \
> > +VARIANT (int16x4_t, int16x8_t, s16) \
> > +VARIANT (int32x2_t, int32x4_t, s32) \
> > +VARIANT (int64x1_t, int64x2_t, s64) \
> > +VARIANT (float16x4_t, float16x8_t, f16) \
> > +VARIANT (float32x2_t, float32x4_t, f32) \
> > +VARIANT (float64x1_t, float64x2_t, f64) \
> > +VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
> > +
> > +/* vget_low_* intrinsics should become BIT_FIELD_REF.  */
> > +#define VARIANT(TYPE64, TYPE128, SUFFIX) \
> > +TYPE64 \
> > +test_vget_low_##SUFFIX (TYPE128 vec) \
> > +{ \
> > +  return vget_low_##SUFFIX (vec); \
> > +}
> > +
> > +VARIANTS
> > +
> > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 64>" 12 "optimized" } } */
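One closing note on why the two tests scan for different bit positions:
the fold computes the position as BYTES_BIG_ENDIAN ? 64 : 0, so the same
intrinsic becomes a different BIT_FIELD_REF depending on endianness.  A
short summary of my reading of the fold (SSA names illustrative):

    /* Little-endian: the low half is the first 64 bits of the value,
       so vget_low_* folds to:  */
    _2 = BIT_FIELD_REF <v_1(D), 64, 0>;

    /* Big-endian (-mbig-endian): GCC numbers vector lanes from the
       most significant end of the register view, so the same low half
       sits at bit position 64 and the fold instead produces:  */
    _2 = BIT_FIELD_REF <v_1(D), 64, 64>;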