Re: [PATCH] aarch64: enable mixed-types for aarch64 simdclones

Richard Sandiford Wed, 29 Nov 2023 07:47:25 -0800

Sorry for the very slow review on this.  It LGTM apart from some
minor comments below:


"Andre Vieira (lists)" <andre.simoesdiasvie...@arm.com> writes:
> Hey,
>
> Just a minor update to the patch, I had missed the libgomp testsuite, so 
> had to make some adjustments there too.
>
> gcc/ChangeLog:
>
>          * config/aarch64/aarch64.cc (lane_size): New function.
>          (aarch64_simd_clone_compute_vecsize_and_simdlen): Determine 
> simdlen according to NDS rule
>          and reject combination of simdlen and types that lead to 
> vectors larger than 128bits.
>
> gcc/testsuite/ChangeLog:
>
>          * lib/target-supports.exp: Add aarch64 targets to vect_simd_clones.
>          * c-c++-common/gomp/declare-variant-14.c: Adapt test for aarch64.
>          * c-c++-common/gomp/pr60823-1.c: Likewise.
>          * c-c++-common/gomp/pr60823-2.c: Likewise.
>          * c-c++-common/gomp/pr60823-3.c: Likewise.
>          * g++.dg/gomp/attrs-10.C: Likewise.
>          * g++.dg/gomp/declare-simd-1.C: Likewise.
>          * g++.dg/gomp/declare-simd-3.C: Likewise.
>          * g++.dg/gomp/declare-simd-4.C: Likewise.
>          * g++.dg/gomp/declare-simd-7.C: Likewise.
>          * g++.dg/gomp/declare-simd-8.C: Likewise.
>          * g++.dg/gomp/pr88182.C: Likewise.
>          * gcc.dg/declare-simd.c: Likewise.
>          * gcc.dg/gomp/declare-simd-1.c: Likewise.
>          * gcc.dg/gomp/declare-simd-3.c: Likewise.
>          * gcc.dg/gomp/pr87887-1.c: Likewise.
>          * gcc.dg/gomp/pr87895-1.c: Likewise.
>          * gcc.dg/gomp/pr89246-1.c: Likewise.
>          * gcc.dg/gomp/pr99542.c: Likewise.
>          * gcc.dg/gomp/simd-clones-2.c: Likewise.
>          * gcc.dg/gcc.dg/vect/vect-simd-clone-1.c: Likewise.
>          * gcc.dg/gcc.dg/vect/vect-simd-clone-2.c: Likewise.
>          * gcc.dg/gcc.dg/vect/vect-simd-clone-4.c: Likewise.
>          * gcc.dg/gcc.dg/vect/vect-simd-clone-5.c: Likewise.
>          * gcc.dg/gcc.dg/vect/vect-simd-clone-8.c: Likewise.
>          * gfortran.dg/gomp/declare-simd-2.f90: Likewise.
>          * gfortran.dg/gomp/declare-simd-coarray-lib.f90: Likewise.
>          * gfortran.dg/gomp/declare-variant-14.f90: Likewise.
>          * gfortran.dg/gomp/pr79154-1.f90: Likewise.
>          * gfortran.dg/gomp/pr83977.f90: Likewise.
>
> libgomp/testsuite/ChangeLog:
>
>          * libgomp.c/declare-variant-1.c: Adapt test for aarch64.
>          * libgomp.fortran/declare-simd-1.f90: Likewise.
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 
> 9fbfc548a891f5d11940c6fd3c49a14bfbdec886..37507f091c2a6154fa944c3a9fad6a655ab5d5a1
>  100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -27414,33 +27414,62 @@ supported_simd_type (tree t)
>    return false;
>  }
>  
> -/* Return true for types that currently are supported as SIMD return
> -   or argument types.  */
> +/* Determine the lane size for the clone argument/return type.  This follows
> +   the LS(P) rule in the VFABIA64.  */
>  
> -static bool
> -currently_supported_simd_type (tree t, tree b)
> +static unsigned
> +lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
>  {
> -  if (COMPLEX_FLOAT_TYPE_P (t))
> -    return false;
> +  gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
>  
> -  if (TYPE_SIZE (t) != TYPE_SIZE (b))
> -    return false;
> +  /* For non map-to-vector types that are pointers we use the element type it
> +     points to.  */
> +  if (POINTER_TYPE_P (type))
> +    switch (clone_arg_type)
> +      {
> +      default:
> +     break;
> +      case SIMD_CLONE_ARG_TYPE_UNIFORM:
> +      case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
> +      case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
> +     type = TREE_TYPE (type);
> +     break;
> +      }
>  
> -  return supported_simd_type (t);
> +  /* For types (or types pointers of non map-to-vector types point to) that 
> are
> +     integers or floating point, we use their size if they are 1, 2, 4 or 8.
> +   */
> +  if (INTEGRAL_TYPE_P (type)
> +      || SCALAR_FLOAT_TYPE_P (type))
> +      switch (TYPE_PRECISION (type) / BITS_PER_UNIT)
> +     {
> +     default:
> +       break;
> +     case 1:
> +     case 2:
> +     case 4:
> +     case 8:
> +       return TYPE_PRECISION (type);
> +     }

The formatting looks a bit off here.  The switch should be indented by
4 columns and the { by 6.

> +  /* For any other we use the size of uintptr_t.  For map-to-vector types 
> that
> +     are pointers, using the size of uintptr_t is the same as using the size 
> of
> +     their type, seeing all pointers are the same size as uintptr_t.  */
> +  return POINTER_SIZE;
>  }
>  
> +
>  /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */
>  
>  static int
>  aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>                                       struct cgraph_simd_clone *clonei,
> -                                     tree base_type, int num,
> -                                     bool explicit_p)
> +                                     tree base_type ATTRIBUTE_UNUSED,
> +                                     int num, bool explicit_p)
>  {
>    tree t, ret_type;
> -  unsigned int elt_bits, count;
> +  unsigned int nds_elt_bits;
> +  int count;
>    unsigned HOST_WIDE_INT const_simdlen;
> -  poly_uint64 vec_bits;
>  
>    if (!TARGET_SIMD)
>      return 0;
> @@ -27460,80 +27489,135 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
> (struct cgraph_node *node,
>      }
>  
>    ret_type = TREE_TYPE (TREE_TYPE (node->decl));
> +  /* According to AArch64's Vector ABI the type that determines the simdlen 
> is
> +     the narrowest of types, so we ignore base_type for AArch64.  */
>    if (TREE_CODE (ret_type) != VOID_TYPE
> -      && !currently_supported_simd_type (ret_type, base_type))
> +      && !supported_simd_type (ret_type))
>      {
>        if (!explicit_p)
>       ;
> -      else if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
> -     warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
> -                 "GCC does not currently support mixed size types "
> -                 "for %<simd%> functions");
> -      else if (supported_simd_type (ret_type))
> +      else if (COMPLEX_FLOAT_TYPE_P (ret_type))
>       warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
>                   "GCC does not currently support return type %qT "
> -                 "for %<simd%> functions", ret_type);
> +                 "for simd", ret_type);
>        else
>       warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
> -                 "unsupported return type %qT for %<simd%> functions",
> +                 "unsupported return type %qT for simd",
>                   ret_type);
>        return 0;
>      }
>  
> +  auto_vec<std::pair <tree, unsigned int>> vec_elts (clonei->nargs + 1);
> +
> +  /* We are looking for the NDS type here according to the VFABIA64.  */
> +  if (TREE_CODE (ret_type) != VOID_TYPE)
> +    {
> +      nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
> +      vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
> +    }
> +  else
> +    nds_elt_bits = POINTER_SIZE;
> +
>    int i;
>    tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
>    bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
> -
>    for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 
> 0;
>         t && t != void_list_node; t = TREE_CHAIN (t), i++)
>      {
>        tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
> -
>        if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
> -       && !currently_supported_simd_type (arg_type, base_type))
> +       && !supported_simd_type (arg_type))
>       {
>         if (!explicit_p)
>           ;
> -       else if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
> +       else if (COMPLEX_FLOAT_TYPE_P (ret_type))
>           warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
> -                     "GCC does not currently support mixed size types "
> -                     "for %<simd%> functions");
> +                     "GCC does not currently support argument type %qT "
> +                     "for simd", arg_type);
>         else
>           warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
> -                     "GCC does not currently support argument type %qT "
> -                     "for %<simd%> functions", arg_type);
> +                     "unsupported argument type %qT for simd",
> +                     arg_type);
>         return 0;
>       }
> +      unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
> +      if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
> +     vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
> +      if (nds_elt_bits > lane_bits)
> +     nds_elt_bits = lane_bits;
>      }
>  
>    clonei->vecsize_mangle = 'n';
>    clonei->mask_mode = VOIDmode;
> -  elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
> +  poly_uint64 simdlen;
> +  auto_vec<poly_uint64> simdlens (2);
> +  /* Keep track of the possible simdlens the clones of this function can 
> have,
> +     and check them later to see if we support them.  */
>    if (known_eq (clonei->simdlen, 0U))
>      {
> -      count = 2;
> -      vec_bits = (num == 0 ? 64 : 128);
> -      clonei->simdlen = exact_div (vec_bits, elt_bits);
> +      simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
> +      simdlens.safe_push (simdlen);
> +      simdlens.safe_push (simdlen * 2);
>      }
>    else
> +    simdlens.safe_push (clonei->simdlen);
> +
> +  clonei->vecsize_int = 0;
> +  clonei->vecsize_float = 0;
> +
> +  /* We currently do not support generating simdclones where vector arguments
> +     do not fit into a single vector register, i.e. vector types that are 
> more
> +     than 128-bits large.  This is because of how we currently represent such
> +     types in ACLE, where we use a struct to allow us to pass them as 
> arguments
> +     and return.
> +     Hence why we have to check whether the simdlens available for this
> +     simdclone would cause a vector type to be larger than 128-bits, and 
> reject
> +     such a clone.  */
> +  count = simdlens.length ();
> +  int j = 0;
> +  while (j < count && !simdlens.is_empty ())

Think this would be clearer as:

  while (j < simdlens.length ())

with count only being used after the loop (or maybe being dropped
altogether).

> +    {
> +      bool remove_simdlen = false;
> +      for (auto elt : vec_elts)
> +     if (known_gt (simdlens[j] * elt.second, 128U))
> +       {
> +         /* Don't issue a warning for every simdclone when there is no
> +            specific simdlen clause.  */
> +         if (explicit_p && known_ne (clonei->simdlen, 0U))

The inverse of known_eq is maybe_ne rather than known_ne.

> +           warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
> +                       "GCC does not currently support simdlen %wd for "
> +                       "type %qT",
> +                       constant_lower_bound (simdlens[j]), elt.first);
> +         remove_simdlen = true;
> +         break;
> +       }
> +      if (remove_simdlen)
> +     {
> +       count--;
> +       simdlens.ordered_remove (j);
> +     }
> +      else
> +     j++;
> +    }
> +
> +  count = simdlens.length ();
> +  if (count == 0)
>      {
> -      count = 1;
> -      vec_bits = clonei->simdlen * elt_bits;
> -      /* For now, SVE simdclones won't produce illegal simdlen, So only check
> -      const simdlens here.  */
> -      if (clonei->simdlen.is_constant (&const_simdlen)
> -       && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
> +      if (explicit_p && known_eq (clonei->simdlen, 0U))
>       {
> -       if (explicit_p)
> -         warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
> -                     "GCC does not currently support simdlen %wd for "
> -                     "type %qT",
> -                     const_simdlen, base_type);
> -       return 0;
> +       /* Warn the user if we can't generate any simdclone.  */
> +       simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
> +       warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
> +                   "GCC does not currently support a simdclone with simdlens"
> +                   " %wd and %wd for these types.",
> +                   constant_lower_bound (simdlen),
> +                   constant_lower_bound (simdlen*2));
>       }
> +      return 0;
>      }
> -  clonei->vecsize_int = vec_bits;
> -  clonei->vecsize_float = vec_bits;
> +
> +  gcc_assert (num < count);
> +  clonei->simdlen = simdlens[num];
>    return count;
>  }
>  
> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c 
> b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..abb128ffc9cd2c1353b99eb38aae72377746e6d6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-options "-fopenmp-simd" } */
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +#pragma omp declare simd
> +short __attribute__ ((const)) f00 (short a , char b)
> +{
> +  return a + b;
> +}
> +/* { dg-final { scan-assembler {_ZGVnN8vv_f00:} } } */
> +/* { dg-final { scan-assembler {_ZGVnM8vv_f00:} } } */
> +
> +#pragma omp declare simd notinbranch
> +short __attribute__ ((const)) f01 (int a , short b)
> +{
> +  return a + b;
> +}
> +/* { dg-final { scan-assembler {_ZGVnN4vv_f01:} } } */

Might be worth adding:

/* { dg-final { scan-assembler-not {_ZGVnM4vv_f01:} } } */

Same for the others with inbranch/notinbranch.

The changes to the existing tests look reasonable, but I didn't go
through them in much detail.  I suppose we'll need to make a note
to revisit those tests once we add the missing support for multi-register
vectors, but I don't think there's any better record of what needs to be
done than the patch itself.  Adding comments to every single test isn't
likely to help.

I also went through the examples in the spec (and found a couple of
bugs there).  I agree the patch implements them correctly.

So OK with the changes above, thanks, and sorry again for the delay.

Richard

> +
> +#pragma omp declare simd linear(b) inbranch
> +int __attribute__ ((const)) f02 (int a, short *b)
> +{
> +  return a + *b;
> +}
> +/* { dg-final { scan-assembler {_ZGVnM4vl2_f02:} } } */
> +
> +#pragma omp declare simd uniform(a) notinbranch
> +void f03 (char b, int a)
> +{
> +}
> +/* { dg-final { scan-assembler {_ZGVnN8vu_f03:} } } */
> +/* { dg-final { scan-assembler {_ZGVnN16vu_f03:} } } */
> +
> +#pragma omp declare simd simdlen(2)
> +float f04 (double a)
> +{
> +  return (float) a;
> +}
> +/* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */
> +/* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */
> +
> +#pragma omp declare simd uniform(a) linear (b)
> +void f05 (short a, short *b, short c)
> +{
> +  *b += a + c;
> +}
> +
> +/* { dg-final { scan-assembler {_ZGVnN4ul2v_f05:} } } */
> +/* { dg-final { scan-assembler {_ZGVnN4ul2v_f05:} } } */
> +/* { dg-final { scan-assembler {_ZGVnM8ul2v_f05:} } } */
> +/* { dg-final { scan-assembler {_ZGVnM8ul2v_f05:} } } */
> +#ifdef __cplusplus
> +}
> +#endif
> +
> [...]

Re: [PATCH] aarch64: enable mixed-types for aarch64 simdclones

Reply via email to