Sorry for the very slow review on this. It LGTM apart from some minor comments below:
"Andre Vieira (lists)" <andre.simoesdiasvie...@arm.com> writes: > Hey, > > Just a minor update to the patch, I had missed the libgomp testsuite, so > had to make some adjustments there too. > > gcc/ChangeLog: > > * config/aarch64/aarch64.cc (lane_size): New function. > (aarch64_simd_clone_compute_vecsize_and_simdlen): Determine > simdlen according to NDS rule > and reject combination of simdlen and types that lead to > vectors larger than 128bits. > > gcc/testsuite/ChangeLog: > > * lib/target-supports.exp: Add aarch64 targets to vect_simd_clones. > * c-c++-common/gomp/declare-variant-14.c: Adapt test for aarch64. > * c-c++-common/gomp/pr60823-1.c: Likewise. > * c-c++-common/gomp/pr60823-2.c: Likewise. > * c-c++-common/gomp/pr60823-3.c: Likewise. > * g++.dg/gomp/attrs-10.C: Likewise. > * g++.dg/gomp/declare-simd-1.C: Likewise. > * g++.dg/gomp/declare-simd-3.C: Likewise. > * g++.dg/gomp/declare-simd-4.C: Likewise. > * g++.dg/gomp/declare-simd-7.C: Likewise. > * g++.dg/gomp/declare-simd-8.C: Likewise. > * g++.dg/gomp/pr88182.C: Likewise. > * gcc.dg/declare-simd.c: Likewise. > * gcc.dg/gomp/declare-simd-1.c: Likewise. > * gcc.dg/gomp/declare-simd-3.c: Likewise. > * gcc.dg/gomp/pr87887-1.c: Likewise. > * gcc.dg/gomp/pr87895-1.c: Likewise. > * gcc.dg/gomp/pr89246-1.c: Likewise. > * gcc.dg/gomp/pr99542.c: Likewise. > * gcc.dg/gomp/simd-clones-2.c: Likewise. > * gcc.dg/gcc.dg/vect/vect-simd-clone-1.c: Likewise. > * gcc.dg/gcc.dg/vect/vect-simd-clone-2.c: Likewise. > * gcc.dg/gcc.dg/vect/vect-simd-clone-4.c: Likewise. > * gcc.dg/gcc.dg/vect/vect-simd-clone-5.c: Likewise. > * gcc.dg/gcc.dg/vect/vect-simd-clone-8.c: Likewise. > * gfortran.dg/gomp/declare-simd-2.f90: Likewise. > * gfortran.dg/gomp/declare-simd-coarray-lib.f90: Likewise. > * gfortran.dg/gomp/declare-variant-14.f90: Likewise. > * gfortran.dg/gomp/pr79154-1.f90: Likewise. > * gfortran.dg/gomp/pr83977.f90: Likewise. > > libgomp/testsuite/ChangeLog: > > * libgomp.c/declare-variant-1.c: Adapt test for aarch64. 
> * libgomp.fortran/declare-simd-1.f90: Likewise. > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index > 9fbfc548a891f5d11940c6fd3c49a14bfbdec886..37507f091c2a6154fa944c3a9fad6a655ab5d5a1 > 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -27414,33 +27414,62 @@ supported_simd_type (tree t) > return false; > } > > -/* Return true for types that currently are supported as SIMD return > - or argument types. */ > +/* Determine the lane size for the clone argument/return type. This follows > + the LS(P) rule in the VFABIA64. */ > > -static bool > -currently_supported_simd_type (tree t, tree b) > +static unsigned > +lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type) > { > - if (COMPLEX_FLOAT_TYPE_P (t)) > - return false; > + gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK); > > - if (TYPE_SIZE (t) != TYPE_SIZE (b)) > - return false; > + /* For non map-to-vector types that are pointers we use the element type it > + points to. */ > + if (POINTER_TYPE_P (type)) > + switch (clone_arg_type) > + { > + default: > + break; > + case SIMD_CLONE_ARG_TYPE_UNIFORM: > + case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP: > + case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP: > + type = TREE_TYPE (type); > + break; > + } > > - return supported_simd_type (t); > + /* For types (or types pointers of non map-to-vector types point to) that > are > + integers or floating point, we use their size if they are 1, 2, 4 or 8. > + */ > + if (INTEGRAL_TYPE_P (type) > + || SCALAR_FLOAT_TYPE_P (type)) > + switch (TYPE_PRECISION (type) / BITS_PER_UNIT) > + { > + default: > + break; > + case 1: > + case 2: > + case 4: > + case 8: > + return TYPE_PRECISION (type); > + } The formatting looks a bit off here. The switch should be indented by 4 columns and the { by 6. > + /* For any other we use the size of uintptr_t. 
For map-to-vector types > that > + are pointers, using the size of uintptr_t is the same as using the size > of > + their type, seeing all pointers are the same size as uintptr_t. */ > + return POINTER_SIZE; > } > > + > /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */ > > static int > aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, > struct cgraph_simd_clone *clonei, > - tree base_type, int num, > - bool explicit_p) > + tree base_type ATTRIBUTE_UNUSED, > + int num, bool explicit_p) > { > tree t, ret_type; > - unsigned int elt_bits, count; > + unsigned int nds_elt_bits; > + int count; > unsigned HOST_WIDE_INT const_simdlen; > - poly_uint64 vec_bits; > > if (!TARGET_SIMD) > return 0; > @@ -27460,80 +27489,135 @@ aarch64_simd_clone_compute_vecsize_and_simdlen > (struct cgraph_node *node, > } > > ret_type = TREE_TYPE (TREE_TYPE (node->decl)); > + /* According to AArch64's Vector ABI the type that determines the simdlen > is > + the narrowest of types, so we ignore base_type for AArch64. 
*/ > if (TREE_CODE (ret_type) != VOID_TYPE > - && !currently_supported_simd_type (ret_type, base_type)) > + && !supported_simd_type (ret_type)) > { > if (!explicit_p) > ; > - else if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type)) > - warning_at (DECL_SOURCE_LOCATION (node->decl), 0, > - "GCC does not currently support mixed size types " > - "for %<simd%> functions"); > - else if (supported_simd_type (ret_type)) > + else if (COMPLEX_FLOAT_TYPE_P (ret_type)) > warning_at (DECL_SOURCE_LOCATION (node->decl), 0, > "GCC does not currently support return type %qT " > - "for %<simd%> functions", ret_type); > + "for simd", ret_type); > else > warning_at (DECL_SOURCE_LOCATION (node->decl), 0, > - "unsupported return type %qT for %<simd%> functions", > + "unsupported return type %qT for simd", > ret_type); > return 0; > } > > + auto_vec<std::pair <tree, unsigned int>> vec_elts (clonei->nargs + 1); > + > + /* We are looking for the NDS type here according to the VFABIA64. */ > + if (TREE_CODE (ret_type) != VOID_TYPE) > + { > + nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type); > + vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits)); > + } > + else > + nds_elt_bits = POINTER_SIZE; > + > int i; > tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl)); > bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE); > - > for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = > 0; > t && t != void_list_node; t = TREE_CHAIN (t), i++) > { > tree arg_type = decl_arg_p ? 
TREE_TYPE (t) : TREE_VALUE (t); > - > if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM > - && !currently_supported_simd_type (arg_type, base_type) > + && !supported_simd_type (arg_type)) > { > if (!explicit_p) > ; > - else if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type)) > + else if (COMPLEX_FLOAT_TYPE_P (ret_type)) > warning_at (DECL_SOURCE_LOCATION (node->decl), 0, > - "GCC does not currently support mixed size types " > - "for %<simd%> functions"); > + "GCC does not currently support argument type %qT " > + "for simd", arg_type); > else > warning_at (DECL_SOURCE_LOCATION (node->decl), 0, > - "GCC does not currently support argument type %qT " > - "for %<simd%> functions", arg_type); > + "unsupported argument type %qT for simd", > + arg_type); > return 0; > } Also, this COMPLEX_FLOAT_TYPE_P check looks like it should be testing arg_type rather than ret_type — as written it inspects the return type while diagnosing (and printing) the argument type. > + unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type); > + if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR) > + vec_elts.safe_push (std::make_pair (arg_type, lane_bits)); > + if (nds_elt_bits > lane_bits) > + nds_elt_bits = lane_bits; > } > > clonei->vecsize_mangle = 'n'; > clonei->mask_mode = VOIDmode; > - elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type)); > + poly_uint64 simdlen; > + auto_vec<poly_uint64> simdlens (2); > + /* Keep track of the possible simdlens the clones of this function can > have, > + and check them later to see if we support them. */ > if (known_eq (clonei->simdlen, 0U)) > { > - count = 2; > - vec_bits = (num == 0 ? 64 : 128); > - clonei->simdlen = exact_div (vec_bits, elt_bits); > + simdlen = exact_div (poly_uint64 (64), nds_elt_bits); > + simdlens.safe_push (simdlen); > + simdlens.safe_push (simdlen * 2); > } > else > + simdlens.safe_push (clonei->simdlen); > + > + clonei->vecsize_int = 0; > + clonei->vecsize_float = 0; > + > + /* We currently do not support generating simdclones where vector arguments > + do not fit into a single vector register, i.e. vector types that are > more > + than 128-bits large. 
This is because of how we currently represent such > + types in ACLE, where we use a struct to allow us to pass them as > arguments > + and return. > + Hence why we have to check whether the simdlens available for this > + simdclone would cause a vector type to be larger than 128-bits, and > reject > + such a clone. */ > + count = simdlens.length (); > + int j = 0; > + while (j < count && !simdlens.is_empty ()) Think this would be clearer as: while (j < simdlens.length ()) with count only being used after the loop (or maybe being dropped altogether). > + { > + bool remove_simdlen = false; > + for (auto elt : vec_elts) > + if (known_gt (simdlens[j] * elt.second, 128U)) > + { > + /* Don't issue a warning for every simdclone when there is no > + specific simdlen clause. */ > + if (explicit_p && known_ne (clonei->simdlen, 0U)) The inverse of known_eq is maybe_ne rather than known_ne. > + warning_at (DECL_SOURCE_LOCATION (node->decl), 0, > + "GCC does not currently support simdlen %wd for " > + "type %qT", > + constant_lower_bound (simdlens[j]), elt.first); > + remove_simdlen = true; > + break; > + } > + if (remove_simdlen) > + { > + count--; > + simdlens.ordered_remove (j); > + } > + else > + j++; > + } > + > + count = simdlens.length (); > + if (count == 0) > { > - count = 1; > - vec_bits = clonei->simdlen * elt_bits; > - /* For now, SVE simdclones won't produce illegal simdlen, So only check > - const simdlens here. */ > - if (clonei->simdlen.is_constant (&const_simdlen) > - && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U)) > + if (explicit_p && known_eq (clonei->simdlen, 0U)) > { > - if (explicit_p) > - warning_at (DECL_SOURCE_LOCATION (node->decl), 0, > - "GCC does not currently support simdlen %wd for " > - "type %qT", > - const_simdlen, base_type); > - return 0; > + /* Warn the user if we can't generate any simdclone. 
*/ > + simdlen = exact_div (poly_uint64 (64), nds_elt_bits); > + warning_at (DECL_SOURCE_LOCATION (node->decl), 0, > + "GCC does not currently support a simdclone with simdlens" > + " %wd and %wd for these types.", > + constant_lower_bound (simdlen), > + constant_lower_bound (simdlen*2)); > } > + return 0; > } > - clonei->vecsize_int = vec_bits; > - clonei->vecsize_float = vec_bits; > + > + gcc_assert (num < count); > + clonei->simdlen = simdlens[num]; > return count; > } > > [...] > diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c > b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..abb128ffc9cd2c1353b99eb38aae72377746e6d6 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c > @@ -0,0 +1,56 @@ > +/* { dg-do compile } */ > +/* { dg-options "-fopenmp-simd" } */ > +#ifdef __cplusplus > +extern "C" { > +#endif > +#pragma omp declare simd > +short __attribute__ ((const)) f00 (short a , char b) > +{ > + return a + b; > +} > +/* { dg-final { scan-assembler {_ZGVnN8vv_f00:} } } */ > +/* { dg-final { scan-assembler {_ZGVnM8vv_f00:} } } */ > + > +#pragma omp declare simd notinbranch > +short __attribute__ ((const)) f01 (int a , short b) > +{ > + return a + b; > +} > +/* { dg-final { scan-assembler {_ZGVnN4vv_f01:} } } */ Might be worth adding: /* { dg-final { scan-assembler-not {_ZGVnM4vv_f01:} } } */ Same for the others with inbranch/notinbranch. The changes to the existing tests look reasonable, but I didn't go through them in much detail. I suppose we'll need to make a note to revisit those tests once we add the missing support for multi-register vectors, but I don't think there's any better record of what needs to be done than the patch itself. Adding comments to every single test isn't likely to help. I also went through the examples in the spec (and found a couple of bugs there). I agree the patch implements them correctly. 
So OK with the changes above, thanks, and sorry again for the delay. Richard > + > +#pragma omp declare simd linear(b) inbranch > +int __attribute__ ((const)) f02 (int a, short *b) > +{ > + return a + *b; > +} > +/* { dg-final { scan-assembler {_ZGVnM4vl2_f02:} } } */ > + > +#pragma omp declare simd uniform(a) notinbranch > +void f03 (char b, int a) > +{ > +} > +/* { dg-final { scan-assembler {_ZGVnN8vu_f03:} } } */ > +/* { dg-final { scan-assembler {_ZGVnN16vu_f03:} } } */ > + > +#pragma omp declare simd simdlen(2) > +float f04 (double a) > +{ > + return (float) a; > +} > +/* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */ > +/* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */ > + > +#pragma omp declare simd uniform(a) linear (b) > +void f05 (short a, short *b, short c) > +{ > + *b += a + c; > +} > + > +/* { dg-final { scan-assembler {_ZGVnN4ul2v_f05:} } } */ > +/* { dg-final { scan-assembler {_ZGVnN4ul2v_f05:} } } */ > +/* { dg-final { scan-assembler {_ZGVnM8ul2v_f05:} } } */ > +/* { dg-final { scan-assembler {_ZGVnM8ul2v_f05:} } } */ > +#ifdef __cplusplus > +} > +#endif > + > [...]