On Mon, May 20, 2024 at 2:57 AM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Pengxuan Zheng <quic_pzh...@quicinc.com> writes:
> > This patch folds vget_low_* intrinsics to BIT_FIELD_REF to open up more
> > optimization opportunities for the gimple optimizers.
> >
> > While we are here, we also remove the vget_low_* definitions from
> > arm_neon.h and use the new intrinsics framework.
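> >
> > For example, a call like the following (a minimal sketch; the function
> > name is just illustrative):
> >
> >   #include <arm_neon.h>
> >
> >   int16x4_t
> >   low_half (int16x8_t v)
> >   {
> >     return vget_low_s16 (v);
> >   }
> >
> > now folds to BIT_FIELD_REF <v, 64, 0> (or <v, 64, 64> on big-endian)
> > in gimple, where it can simplify together with surrounding code.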
> >
> >         PR target/102171
> >
> > gcc/ChangeLog:
> >
> >       * config/aarch64/aarch64-builtins.cc (AARCH64_SIMD_VGET_LOW_BUILTINS):
> >       New macro to create definitions for all vget_low intrinsics.
> >       (VGET_LOW_BUILTIN): Likewise.
> >       (enum aarch64_builtins): Add vget_low function codes.
> >       (aarch64_general_fold_builtin): Fold vget_low calls.
> >       * config/aarch64/aarch64-simd-builtins.def: Delete vget_low builtins.
> >       * config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Delete.
> >       (aarch64_vget_lo_halfv8bf): Likewise.
> >       * config/aarch64/arm_neon.h (__attribute__): Delete.
> >       (vget_low_f16): Likewise.
> >       (vget_low_f32): Likewise.
> >       (vget_low_f64): Likewise.
> >       (vget_low_p8): Likewise.
> >       (vget_low_p16): Likewise.
> >       (vget_low_p64): Likewise.
> >       (vget_low_s8): Likewise.
> >       (vget_low_s16): Likewise.
> >       (vget_low_s32): Likewise.
> >       (vget_low_s64): Likewise.
> >       (vget_low_u8): Likewise.
> >       (vget_low_u16): Likewise.
> >       (vget_low_u32): Likewise.
> >       (vget_low_u64): Likewise.
> >       (vget_low_bf16): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/aarch64/pr113573.c: Replace __builtin_aarch64_get_lowv8hi
> >       with vget_low_s16.
> >       * gcc.target/aarch64/vget_low_2.c: New test.
> >       * gcc.target/aarch64/vget_low_2_be.c: New test.
>
> Ok, thanks.  I suppose the patch has the side effect of allowing
> vget_low_bf16 to be called without +bf16.  IMO that's the correct
> behaviour though, and is consistent with how we handle reinterprets.
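>
> For instance, if I'm reading the fold right, a sketch like this should
> now be accepted without +bf16, just as the corresponding vreinterpret
> forms are:
>
>   #include <arm_neon.h>
>
>   bfloat16x4_t
>   low_bf16 (bfloat16x8_t x)
>   {
>     return vget_low_bf16 (x);
>   }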

Pushed as r15-697-ga2e4fe5a53cf75cd055f64e745ebd51253e42254.

Thanks,
Andrew

>
> Richard
>
> > Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
> > ---
> >  gcc/config/aarch64/aarch64-builtins.cc        |  60 ++++++++++
> >  gcc/config/aarch64/aarch64-simd-builtins.def  |   5 +-
> >  gcc/config/aarch64/aarch64-simd.md            |  23 +---
> >  gcc/config/aarch64/arm_neon.h                 | 105 ------------------
> >  gcc/testsuite/gcc.target/aarch64/pr113573.c   |   2 +-
> >  gcc/testsuite/gcc.target/aarch64/vget_low_2.c |  30 +++++
> >  .../gcc.target/aarch64/vget_low_2_be.c        |  31 ++++++
> >  7 files changed, 124 insertions(+), 132 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
> > index 75d21de1401..4afe7c86ae3 100644
> > --- a/gcc/config/aarch64/aarch64-builtins.cc
> > +++ b/gcc/config/aarch64/aarch64-builtins.cc
> > @@ -658,6 +658,23 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
> >    VREINTERPRET_BUILTINS \
> >    VREINTERPRETQ_BUILTINS
> >
> > +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> > +  VGET_LOW_BUILTIN(f16) \
> > +  VGET_LOW_BUILTIN(f32) \
> > +  VGET_LOW_BUILTIN(f64) \
> > +  VGET_LOW_BUILTIN(p8) \
> > +  VGET_LOW_BUILTIN(p16) \
> > +  VGET_LOW_BUILTIN(p64) \
> > +  VGET_LOW_BUILTIN(s8) \
> > +  VGET_LOW_BUILTIN(s16) \
> > +  VGET_LOW_BUILTIN(s32) \
> > +  VGET_LOW_BUILTIN(s64) \
> > +  VGET_LOW_BUILTIN(u8) \
> > +  VGET_LOW_BUILTIN(u16) \
> > +  VGET_LOW_BUILTIN(u32) \
> > +  VGET_LOW_BUILTIN(u64) \
> > +  VGET_LOW_BUILTIN(bf16)
> > +
> >  typedef struct
> >  {
> >    const char *name;
> > @@ -697,6 +714,9 @@ typedef struct
> >  #define VREINTERPRET_BUILTIN(A, B, L) \
> >    AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B,
> >
> > +#define VGET_LOW_BUILTIN(A) \
> > +  AARCH64_SIMD_BUILTIN_VGET_LOW_##A,
> > +
> >  #undef VAR1
> >  #define VAR1(T, N, MAP, FLAG, A) \
> >    AARCH64_SIMD_BUILTIN_##T##_##N##A,
> > @@ -732,6 +752,7 @@ enum aarch64_builtins
> >    AARCH64_CRC32_BUILTIN_MAX,
> >    /* SIMD intrinsic builtins.  */
> >    AARCH64_SIMD_VREINTERPRET_BUILTINS
> > +  AARCH64_SIMD_VGET_LOW_BUILTINS
> >    /* ARMv8.3-A Pointer Authentication Builtins.  */
> >    AARCH64_PAUTH_BUILTIN_AUTIA1716,
> >    AARCH64_PAUTH_BUILTIN_PACIA1716,
> > @@ -823,8 +844,37 @@ static aarch64_fcmla_laneq_builtin_datum aarch64_fcmla_lane_builtin_data[] = {
> >       && SIMD_INTR_QUAL(A) == SIMD_INTR_QUAL(B) \
> >    },
> >
> > +#undef VGET_LOW_BUILTIN
> > +#define VGET_LOW_BUILTIN(A) \
> > +  {"vget_low_" #A, \
> > +   AARCH64_SIMD_BUILTIN_VGET_LOW_##A, \
> > +   2, \
> > +   { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \
> > +   { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \
> > +   FLAG_AUTO_FP, \
> > +   false \
> > +  },
> > +
> > +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> > +  VGET_LOW_BUILTIN(f16) \
> > +  VGET_LOW_BUILTIN(f32) \
> > +  VGET_LOW_BUILTIN(f64) \
> > +  VGET_LOW_BUILTIN(p8) \
> > +  VGET_LOW_BUILTIN(p16) \
> > +  VGET_LOW_BUILTIN(p64) \
> > +  VGET_LOW_BUILTIN(s8) \
> > +  VGET_LOW_BUILTIN(s16) \
> > +  VGET_LOW_BUILTIN(s32) \
> > +  VGET_LOW_BUILTIN(s64) \
> > +  VGET_LOW_BUILTIN(u8) \
> > +  VGET_LOW_BUILTIN(u16) \
> > +  VGET_LOW_BUILTIN(u32) \
> > +  VGET_LOW_BUILTIN(u64) \
> > +  VGET_LOW_BUILTIN(bf16)
> > +
> >  static const aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[] = {
> >    AARCH64_SIMD_VREINTERPRET_BUILTINS
> > +  AARCH64_SIMD_VGET_LOW_BUILTINS
> >  };
> >
> >
> > @@ -3216,6 +3266,9 @@ aarch64_fold_builtin_lane_check (tree arg0, tree arg1, tree arg2)
> >  #define VREINTERPRET_BUILTIN(A, B, L) \
> >    case AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B:
> >
> > +#undef VGET_LOW_BUILTIN
> > +#define VGET_LOW_BUILTIN(A) \
> > +  case AARCH64_SIMD_BUILTIN_VGET_LOW_##A:
> >
> >  /* Try to fold a call to the built-in function with subcode FCODE.  The
> >     function is passed the N_ARGS arguments in ARGS and it returns a value
> > @@ -3235,6 +3288,13 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
> >       return fold_build1 (FLOAT_EXPR, type, args[0]);
> >        AARCH64_SIMD_VREINTERPRET_BUILTINS
> >       return fold_build1 (VIEW_CONVERT_EXPR, type, args[0]);
> > +      AARCH64_SIMD_VGET_LOW_BUILTINS
> > +        {
> > +          auto pos = BYTES_BIG_ENDIAN ? 64 : 0;
> > +
> > +          return fold_build3 (BIT_FIELD_REF, type, args[0], bitsize_int (64),
> > +                              bitsize_int (pos));
> > +        }
> >        case AARCH64_SIMD_BUILTIN_LANE_CHECK:
> >       gcc_assert (n_args == 3);
> >       if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
> > diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> > index da16f602a55..a9f0558f8b6 100644
> > --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> > +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> > @@ -65,8 +65,6 @@
> >    BUILTIN_VS (UNOP, ctz, 2, NONE)
> >    BUILTIN_VB (UNOP, popcount, 2, NONE)
> >
> > -  /* Implemented by aarch64_get_low<mode>.  */
> > -  BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP)
> >    /* Implemented by aarch64_get_high<mode>.  */
> >    BUILTIN_VQMOV (UNOP, get_high, 0, AUTO_FP)
> >
> > @@ -960,8 +958,7 @@
> >    VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, FP, v4sf)
> >    VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, FP, v4sf)
> >
> > -  /* Implemented by aarch64_vget_lo/hi_halfv8bf.  */
> > -  VAR1 (UNOP, vget_lo_half, 0, AUTO_FP, v8bf)
> > +  /* Implemented by aarch64_vget_hi_halfv8bf.  */
> >    VAR1 (UNOP, vget_hi_half, 0, AUTO_FP, v8bf)
> >
> >    /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
> > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> > index f8bb973a278..5a28a8e3c6a 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -288,17 +288,6 @@ (define_expand "aarch64_get_half<mode>"
> >    }
> >  )
> >
> > -(define_expand "aarch64_get_low<mode>"
> > -  [(match_operand:<VHALF> 0 "register_operand")
> > -   (match_operand:VQMOV 1 "register_operand")]
> > -  "TARGET_FLOAT"
> > -  {
> > -    rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
> > -    emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo));
> > -    DONE;
> > -  }
> > -)
> > -
> >  (define_expand "aarch64_get_high<mode>"
> >    [(match_operand:<VHALF> 0 "register_operand")
> >     (match_operand:VQMOV 1 "register_operand")]
> > @@ -9774,17 +9763,7 @@ (define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
> >    [(set_attr "type" "neon_dot<VDQSF:q>")]
> >  )
> >
> > -;; vget_low/high_bf16
> > -(define_expand "aarch64_vget_lo_halfv8bf"
> > -  [(match_operand:V4BF 0 "register_operand")
> > -   (match_operand:V8BF 1 "register_operand")]
> > -  "TARGET_BF16_SIMD"
> > -{
> > -  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false);
> > -  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
> > -  DONE;
> > -})
> > -
> > +;; vget_high_bf16
> >  (define_expand "aarch64_vget_hi_halfv8bf"
> >    [(match_operand:V4BF 0 "register_operand")
> >     (match_operand:V8BF 1 "register_operand")]
> > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> > index 0ee325dccad..92c2c5361cd 100644
> > --- a/gcc/config/aarch64/arm_neon.h
> > +++ b/gcc/config/aarch64/arm_neon.h
> > @@ -3027,104 +3027,6 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
> >    return __aarch64_vset_lane_any (__elem, __vec, __index);
> >  }
> >
> > -__extension__ extern __inline float16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_f16 (float16x8_t __a)
> > -{
> > -  return __builtin_aarch64_get_lowv8hf (__a);
> > -}
> > -
> > -__extension__ extern __inline float32x2_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_f32 (float32x4_t __a)
> > -{
> > -  return __builtin_aarch64_get_lowv4sf (__a);
> > -}
> > -
> > -__extension__ extern __inline float64x1_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_f64 (float64x2_t __a)
> > -{
> > -  return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)};
> > -}
> > -
> > -__extension__ extern __inline poly8x8_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_p8 (poly8x16_t __a)
> > -{
> > -  return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
> > -}
> > -
> > -__extension__ extern __inline poly16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_p16 (poly16x8_t __a)
> > -{
> > -  return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
> > -}
> > -
> > -__extension__ extern __inline poly64x1_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_p64 (poly64x2_t __a)
> > -{
> > -  return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a);
> > -}
> > -
> > -__extension__ extern __inline int8x8_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_s8 (int8x16_t __a)
> > -{
> > -  return  __builtin_aarch64_get_lowv16qi (__a);
> > -}
> > -
> > -__extension__ extern __inline int16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_s16 (int16x8_t __a)
> > -{
> > -  return  __builtin_aarch64_get_lowv8hi (__a);
> > -}
> > -
> > -__extension__ extern __inline int32x2_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_s32 (int32x4_t __a)
> > -{
> > -  return  __builtin_aarch64_get_lowv4si (__a);
> > -}
> > -
> > -__extension__ extern __inline int64x1_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_s64 (int64x2_t __a)
> > -{
> > -  return  (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)};
> > -}
> > -
> > -__extension__ extern __inline uint8x8_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_u8 (uint8x16_t __a)
> > -{
> > -  return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
> > -}
> > -
> > -__extension__ extern __inline uint16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_u16 (uint16x8_t __a)
> > -{
> > -  return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
> > -}
> > -
> > -__extension__ extern __inline uint32x2_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_u32 (uint32x4_t __a)
> > -{
> > -  return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a);
> > -}
> > -
> > -__extension__ extern __inline uint64x1_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_u64 (uint64x2_t __a)
> > -{
> > -  return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)};
> > -}
> > -
> >  __extension__ extern __inline float16x4_t
> >  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >  vget_high_f16 (float16x8_t __a)
> > @@ -28479,13 +28381,6 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
> >    return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
> >  }
> >
> > -__extension__ extern __inline bfloat16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_bf16 (bfloat16x8_t __a)
> > -{
> > -  return __builtin_aarch64_vget_lo_halfv8bf (__a);
> > -}
> > -
> >  __extension__ extern __inline bfloat16x4_t
> >  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >  vget_high_bf16 (bfloat16x8_t __a)
> > diff --git a/gcc/testsuite/gcc.target/aarch64/pr113573.c b/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > index a8e445c6e19..fc8607f7218 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > @@ -26,7 +26,7 @@ void jsimd_extbgrx_ycc_convert_neon() {
> >        int y_l = vmull_laneq_u16(r);
> >        uint16x8_t __a = g;
> >        jsimd_extbgrx_ycc_convert_neon___trans_tmp_2 =
> > -          (uint16x4_t)__builtin_aarch64_get_lowv8hi((int16x8_t)__a);
> > +          (uint16x4_t)vget_low_s16((int16x8_t)__a);
> >        __a = b;
> >        int cb_l = scaled_128_5;
> >        int cb_h = scaled_128_5;
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > new file mode 100644
> > index 00000000000..44414e1c043
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -fdump-tree-optimized" } */
> > +
> > +#include <arm_neon.h>
> > +
> > +#define VARIANTS                             \
> > +VARIANT (uint8x8_t, uint8x16_t, u8)          \
> > +VARIANT (uint16x4_t, uint16x8_t, u16)                \
> > +VARIANT (uint32x2_t, uint32x4_t, u32)                \
> > +VARIANT (uint64x1_t, uint64x2_t, u64)                \
> > +VARIANT (int8x8_t, int8x16_t, s8)            \
> > +VARIANT (int16x4_t, int16x8_t, s16)          \
> > +VARIANT (int32x2_t, int32x4_t, s32)          \
> > +VARIANT (int64x1_t, int64x2_t, s64)          \
> > +VARIANT (float16x4_t, float16x8_t, f16)              \
> > +VARIANT (float32x2_t, float32x4_t, f32)              \
> > +VARIANT (float64x1_t, float64x2_t, f64)              \
> > +VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
> > +
> > +/* vget_low_* intrinsics should become BIT_FIELD_REF. */
> > +#define VARIANT(TYPE64, TYPE128, SUFFIX)     \
> > +TYPE64                                               \
> > +test_vget_low_##SUFFIX (TYPE128 vec)         \
> > +{                                            \
> > +  return vget_low_##SUFFIX (vec);            \
> > +}
> > +
> > +VARIANTS
> > +
> > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 0>" 12 "optimized" } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > new file mode 100644
> > index 00000000000..c3f4c4f0e0d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > @@ -0,0 +1,31 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target stdint_types_mbig_endian } */
> > +/* { dg-options "-O3 -fdump-tree-optimized -mbig-endian" } */
> > +
> > +#include <arm_neon.h>
> > +
> > +#define VARIANTS                             \
> > +VARIANT (uint8x8_t, uint8x16_t, u8)          \
> > +VARIANT (uint16x4_t, uint16x8_t, u16)                \
> > +VARIANT (uint32x2_t, uint32x4_t, u32)                \
> > +VARIANT (uint64x1_t, uint64x2_t, u64)                \
> > +VARIANT (int8x8_t, int8x16_t, s8)            \
> > +VARIANT (int16x4_t, int16x8_t, s16)          \
> > +VARIANT (int32x2_t, int32x4_t, s32)          \
> > +VARIANT (int64x1_t, int64x2_t, s64)          \
> > +VARIANT (float16x4_t, float16x8_t, f16)              \
> > +VARIANT (float32x2_t, float32x4_t, f32)              \
> > +VARIANT (float64x1_t, float64x2_t, f64)              \
> > +VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
> > +
> > +/* vget_low_* intrinsics should become BIT_FIELD_REF. */
> > +#define VARIANT(TYPE64, TYPE128, SUFFIX)     \
> > +TYPE64                                               \
> > +test_vget_low_##SUFFIX (TYPE128 vec)         \
> > +{                                            \
> > +  return vget_low_##SUFFIX (vec);            \
> > +}
> > +
> > +VARIANTS
> > +
> > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 64>" 12 "optimized" } } */
