This patch finalizes adding support for the generation of SVE simd clones when
no simdlen is provided, following the ABI rules where the widest data type
determines the minimum amount of elements in a length agnostic vector.

gcc/ChangeLog:

        * config/aarch64/aarch64-protos.h (add_sve_type_attribute): Declare.
        * config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): Make
        visibility global and support use for non_acle types.
        * config/aarch64/aarch64.cc
        (aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd clone
        when no simdlen is provided, according to ABI rules.
        (simd_clone_adjust_sve_vector_type): New helper function.
        (aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones
        and modify types to use SVE types.
        * omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
        (simd_clone_adjust): Adapt safelen check to be compatible with VLA
        simdlen.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.
        * gcc.target/aarch64/vect-simd-clone-1.c: New test.
---
 gcc/config/aarch64/aarch64-protos.h           |   2 +
 gcc/config/aarch64/aarch64-sve-builtins.cc    |   6 +-
 gcc/config/aarch64/aarch64.cc                 | 181 +++++++++++++++---
 gcc/omp-simd-clone.cc                         |  13 +-
 .../gcc.target/aarch64/declare-simd-2.c       |   1 +
 .../gcc.target/aarch64/vect-simd-clone-1.c    | 137 +++++++++++++
 6 files changed, 306 insertions(+), 34 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 9be64913091..9b97b5cad40 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1039,6 +1039,8 @@ namespace aarch64_sve {
 #ifdef GCC_TARGET_H
   bool verify_type_context (location_t, type_context_kind, const_tree, bool);
 #endif
+ void add_sve_type_attribute (tree, unsigned int, unsigned int,
+                             const char *, const char *);
 }
 
 extern void aarch64_split_combinev16qi (rtx operands[3]);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc 
b/gcc/config/aarch64/aarch64-sve-builtins.cc
index ef14f8cd39d..bd6e61c028c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -951,14 +951,16 @@ static GTY(()) hash_map<tree, registered_function *> 
*overload_names[2];
 /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
    and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
    mangling of the type.  ACLE_NAME is the <arm_sve.h> name of the type.  */
-static void
+void
 add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
                        const char *mangled_name, const char *acle_name)
 {
   tree mangled_name_tree
     = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
+  tree acle_name_tree
+    = (acle_name ? get_identifier (acle_name) : NULL_TREE);
 
-  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
+  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
   value = tree_cons (NULL_TREE, mangled_name_tree, value);
   value = tree_cons (NULL_TREE, size_int (num_pr), value);
   value = tree_cons (NULL_TREE, size_int (num_zr), value);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 1e51d0e3604..324e7825b00 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -29022,7 +29022,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
                                        int num, bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int nds_elt_bits;
+  unsigned int nds_elt_bits, wds_elt_bits;
   unsigned HOST_WIDE_INT const_simdlen;
 
   if (!TARGET_SIMD)
@@ -29067,10 +29067,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
   if (TREE_CODE (ret_type) != VOID_TYPE)
     {
       nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
+      wds_elt_bits = nds_elt_bits;
       vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
     }
   else
-    nds_elt_bits = POINTER_SIZE;
+    {
+      nds_elt_bits = POINTER_SIZE;
+      wds_elt_bits = 0;
+    }
 
   int i;
   tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
@@ -29078,44 +29082,65 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
   for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
        t && t != void_list_node; t = TREE_CHAIN (t), i++)
     {
-      tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
+      tree type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
       if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
-         && !supported_simd_type (arg_type))
+         && !supported_simd_type (type))
        {
          if (!explicit_p)
            ;
-         else if (COMPLEX_FLOAT_TYPE_P (ret_type))
+         else if (COMPLEX_FLOAT_TYPE_P (type))
            warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                        "GCC does not currently support argument type %qT "
-                       "for simd", arg_type);
+                       "for simd", type);
          else
            warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                        "unsupported argument type %qT for simd",
-                       arg_type);
+                       type);
          return 0;
        }
-      unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
+      unsigned lane_bits = lane_size (clonei->args[i].arg_type, type);
       if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
-       vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
+       vec_elts.safe_push (std::make_pair (type, lane_bits));
       if (nds_elt_bits > lane_bits)
        nds_elt_bits = lane_bits;
+      if (wds_elt_bits < lane_bits)
+       wds_elt_bits = lane_bits;
     }
 
-  clonei->vecsize_mangle = 'n';
+  /* If we could not determine the WDS type from available parameters/return,
+     then fallback to using uintptr_t.  */
+  if (wds_elt_bits == 0)
+    wds_elt_bits = POINTER_SIZE;
+
   clonei->mask_mode = VOIDmode;
   poly_uint64 simdlen;
-  auto_vec<poly_uint64> simdlens (2);
+  typedef struct
+    {
+      poly_uint64 len;
+      char mangle;
+    } aarch64_clone_info;
+  auto_vec<aarch64_clone_info> clones (3);
+
   /* Keep track of the possible simdlens the clones of this function can have,
      and check them later to see if we support them.  */
   if (known_eq (clonei->simdlen, 0U))
     {
       simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
       if (maybe_ne (simdlen, 1U))
-       simdlens.safe_push (simdlen);
-      simdlens.safe_push (simdlen * 2);
+       clones.safe_push ({simdlen, 'n'});
+      clones.safe_push ({simdlen * 2, 'n'});
+      /* Only create a SVE simd clone if we aren't dealing with an unprototyped
+        function.
+        We have also disabled support for creating SVE simdclones for functions
+        with function bodies and any simdclones when -msve-vector-bits is used.
+        TODO: add support for these.  */
+      if (prototype_p (TREE_TYPE (node->decl))
+         && !node->definition
+         && !aarch64_sve_vg.is_constant ())
+       clones.safe_push ({exact_div (BITS_PER_SVE_VECTOR, wds_elt_bits), 's'});
     }
   else
-    simdlens.safe_push (clonei->simdlen);
+    clones.safe_push ({clonei->simdlen, 'n'});
 
   clonei->vecsize_int = 0;
   clonei->vecsize_float = 0;
@@ -29129,11 +29154,12 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
      simdclone would cause a vector type to be larger than 128-bits, and reject
      such a clone.  */
   unsigned j = 0;
-  while (j < simdlens.length ())
+  while (j < clones.length ())
     {
       bool remove_simdlen = false;
       for (auto elt : vec_elts)
-       if (known_gt (simdlens[j] * elt.second, 128U))
+       if (clones[j].mangle == 'n'
+           && known_gt (clones[j].len * elt.second, 128U))
          {
            /* Don't issue a warning for every simdclone when there is no
               specific simdlen clause.  */
@@ -29141,18 +29167,17 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
              warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
                          "GCC does not currently support simdlen %wd for "
                          "type %qT",
-                         constant_lower_bound (simdlens[j]), elt.first);
+                         constant_lower_bound (clones[j].len), elt.first);
            remove_simdlen = true;
            break;
          }
       if (remove_simdlen)
-       simdlens.ordered_remove (j);
+       clones.ordered_remove (j);
       else
        j++;
     }
 
-
-  int count = simdlens.length ();
+  int count = clones.length ();
   if (count == 0)
     {
       if (explicit_p && known_eq (clonei->simdlen, 0U))
@@ -29169,21 +29194,118 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
     }
 
   gcc_assert (num < count);
-  clonei->simdlen = simdlens[num];
+  clonei->simdlen = clones[num].len;
+  clonei->vecsize_mangle = clones[num].mangle;
+  /* SVE simdclones always have a Mask, so set inbranch to 1.  */
+  if (clonei->vecsize_mangle == 's')
+    clonei->inbranch = 1;
   return count;
 }
 
-/* Implement TARGET_SIMD_CLONE_ADJUST.  */
+/* Helper function to adjust a SVE vector type of a SVE simd clone.  Returns a
+   SVE vector type based on the element type of the vector TYPE, with SIMDLEN
+   number of elements.  If IS_MASK, returns a SVE mask type appropriate for use
+   with the SVE type it would otherwise return.  */
+
+static tree
+simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 
simdlen)
+{
+  unsigned int num_zr = 0;
+  unsigned int num_pr = 0;
+  machine_mode vector_mode;
+  type = TREE_TYPE (type);
+  scalar_mode scalar_m = SCALAR_TYPE_MODE (type);
+  vector_mode = aarch64_sve_data_mode (scalar_m, simdlen).require ();
+  type = build_vector_type_for_mode (type, vector_mode);
+  if (is_mask)
+    {
+      type = truth_type_for (type);
+      num_pr = 1;
+    }
+  else
+    num_zr = 1;
 
+  /* We create new types here with the SVE type attribute instead of using ACLE
+     types as we need to support unpacked vectors which aren't available as
+     ACLE SVE types.  */
+  type = build_distinct_type_copy (type);
+  aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL, NULL);
+  return type;
+}
+
+/* Implement TARGET_SIMD_CLONE_ADJUST.  */
 static void
 aarch64_simd_clone_adjust (struct cgraph_node *node)
 {
-  /* Add aarch64_vector_pcs target attribute to SIMD clones so they
-     use the correct ABI.  */
-
   tree t = TREE_TYPE (node->decl);
-  TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
-                                       TYPE_ATTRIBUTES (t));
+  cl_target_option cur_target;
+  bool m_old_have_regs_of_mode[MAX_MACHINE_MODE];
+
+  if (node->simdclone->vecsize_mangle == 's')
+    {
+      /* This is additive and has no effect if SVE, or a superset thereof, is
+        already enabled.  */
+      tree target = build_string (strlen ("+sve") + 1, "+sve");
+      if (!aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0))
+       gcc_unreachable ();
+      cl_target_option_save (&cur_target, &global_options, 
&global_options_set);
+      tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl);
+      cl_target_option_restore (&global_options, &global_options_set,
+                               TREE_TARGET_OPTION (new_target));
+      aarch64_override_options_internal (&global_options);
+      memcpy (m_old_have_regs_of_mode, have_regs_of_mode,
+             sizeof (have_regs_of_mode));
+      for (int i = 0; i < NUM_MACHINE_MODES; ++i)
+       if (aarch64_sve_mode_p ((machine_mode) i))
+         have_regs_of_mode[i] = true;
+    }
+  else
+    {
+       /* Add aarch64_vector_pcs target attribute to SIMD clones so they
+          use the correct ABI.  */
+       TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
+                                             TYPE_ATTRIBUTES (t));
+    }
+  cgraph_simd_clone *sc = node->simdclone;
+
+  for (unsigned i = 0; i < sc->nargs; ++i)
+    {
+      bool is_mask = false;
+      tree type;
+      switch (sc->args[i].arg_type)
+       {
+       case SIMD_CLONE_ARG_TYPE_MASK:
+         is_mask = true;
+         gcc_fallthrough ();
+       case SIMD_CLONE_ARG_TYPE_VECTOR:
+       case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
+       case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
+         type = sc->args[i].vector_type;
+         gcc_assert (VECTOR_TYPE_P (type));
+         if (node->simdclone->vecsize_mangle == 's')
+           type = simd_clone_adjust_sve_vector_type (type, is_mask,
+                                                     sc->simdlen);
+         else if (is_mask)
+           type = truth_type_for (type);
+         sc->args[i].vector_type = type;
+         break;
+       default:
+         continue;
+       }
+    }
+  if (node->simdclone->vecsize_mangle == 's')
+    {
+      tree ret_type = TREE_TYPE (t);
+      if (VECTOR_TYPE_P (ret_type))
+       TREE_TYPE (t)
+         = simd_clone_adjust_sve_vector_type (ret_type, false,
+                                              node->simdclone->simdlen);
+      /* Restore current options.  */
+      cl_target_option_restore (&global_options, &global_options_set, 
&cur_target);
+      aarch64_override_options_internal (&global_options);
+      memcpy (have_regs_of_mode, m_old_have_regs_of_mode,
+             sizeof (have_regs_of_mode));
+    }
 }
 
 /* Implement TARGET_SIMD_CLONE_USABLE.  */
@@ -29197,6 +29319,11 @@ aarch64_simd_clone_usable (struct cgraph_node *node, 
machine_mode vector_mode)
       if (!TARGET_SIMD || aarch64_sve_mode_p (vector_mode))
        return -1;
       return 0;
+    case 's':
+      if (!TARGET_SVE
+         || !aarch64_sve_mode_p (vector_mode))
+       return -1;
+      return 0;
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 14a37f70948..294a10e87ac 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -542,9 +542,12 @@ simd_clone_mangle (struct cgraph_node *node,
   pp_string (&pp, "_ZGV");
   pp_character (&pp, vecsize_mangle);
   pp_character (&pp, mask);
-  /* For now, simdlen is always constant, while variable simdlen pp 'n'.  */
-  unsigned int len = simdlen.to_constant ();
-  pp_decimal_int (&pp, (len));
+
+  unsigned HOST_WIDE_INT len;
+  if (simdlen.is_constant (&len))
+    pp_decimal_int (&pp, (int) (len));
+  else
+    pp_character (&pp, 'x');
 
   for (n = 0; n < clone_info->nargs; ++n)
     {
@@ -1534,8 +1537,8 @@ simd_clone_adjust (struct cgraph_node *node)
         below).  */
       loop = alloc_loop ();
       cfun->has_force_vectorize_loops = true;
-      /* For now, simlen is always constant.  */
-      loop->safelen = node->simdclone->simdlen.to_constant ();
+      /* We can assert that safelen is the 'minimum' simdlen.  */
+      loop->safelen = constant_lower_bound (node->simdclone->simdlen);
       loop->force_vectorize = true;
       loop->header = body_bb;
     }
diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c 
b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
index e2e80f0c663..2f4d3a866e5 100644
--- a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
+++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
@@ -43,6 +43,7 @@ float f04 (double a)
 }
 /* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */
 /* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */
+/* { dg-final { scan-assembler-not {_ZGVs[0-9a-z]*_f04:} } } */
 
 #pragma omp declare simd uniform(a) linear (b)
 void f05 (short a, short *b, short c)
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c 
b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c
new file mode 100644
index 00000000000..b33541cc329
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c
@@ -0,0 +1,137 @@
+/* { dg-do compile }  */
+/* { dg-options "-std=c99" } */
+/* { dg-additional-options "-O3 -march=armv8-a" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*  Ensure correct creation of SVE Vector-length agnostic (VLA SVE) vector
+    function calls from scalar versions in accordance with the Vector Function
+    Application Binary Interface Specification for AArch64 (AAVPCS).
+
+  We check for correctness in:
+    - Vector function name mangling, with the grammar:
+
+      vector name := prefix  "_" name
+      prefix := "_ZGV" isa mask <len> <parameters>
+
+      Whereby:
+      - <isa>  := "s" for SVE
+      - <mask> := "M" for Mask
+      - <len>  := "x" for VLA SVE
+
+      resulting in:
+      <prefix> := "_ZGVsMx" <parameters>
+
+      with each vector parameter contributing a "v" to the prefix.
+
+    - Parameter and return value mapping:
+      - Unless marked with uniform or linear OpenMP clauses, parameters and
+        return values are expected to map to vectors.
+      - Where the lane-size of a parameter is less than the widest data size
+        for a given function, the resulting vector should be unpacked and
+        populated via use extending loads.
+
+    - Finally, we also make sure we can correctly generate calls to the same
+      function, differing only in the target architecture (i.e. SVE vs SIMD),
+      ensuring that each call points to the correctly-mangled vector function
+      and employs the correct ABI.  For example, for `fn' we may expect:
+
+       for #pragma GCC target("+sve"): _ZGVsMxvv_fn
+       for #pragma GCC target("+simd): _ZGVnN4vv_fn */
+
+#pragma GCC target ("+sve")
+/* scan-assembler {_ZGVsMxv_fn0} } } */
+extern int __attribute__ ((simd, const)) fn0 (int);
+void test_fn0 (int *a, int *b, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] += fn0 (b[i]);
+}
+
+/*
+** test_fn1:
+** ...
+**     ld1w    z1\.s, p7/z, \[x22, x19, lsl 2\]
+**     ld1h    z0\.s, p7/z, \[x23, x19, lsl 1\]
+**     mov     p0\.b, p7\.b
+**     bl      _ZGVsMxvv_fn1
+**     st1w    z0\.s, p7, \[x21, x19, lsl 2\]
+** ...
+*/
+extern int __attribute__ ((simd, const)) fn1 (short, int);
+void test_fn1 (int *a, int *b, short *c, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] = fn1 (c[i], b[i]);
+}
+
+/*
+** test_fn2:
+** ...
+**     ld1w    z1\.s, p7/z, \[x23, x19, lsl 2\]
+**     ld1h    z0\.s, p7/z, \[x22, x19, lsl 1\]
+**     mov     p0\.b, p7.b
+**     bl      _ZGVsMxvv_fn2
+**     st1h    z0\.s, p7, \[x21, x19, lsl 1\]
+** ...
+*/
+extern short __attribute__ ((simd, const)) fn2 (short, int);
+void test_fn2 (short *a, int *b, short *c, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] = fn2 (c[i], b[i]);
+}
+
+/*
+** test_fn3:
+** ...
+**     ld1b    z23\.s, p7/z, \[x22, x19\]
+**     ld1w    z0\.s, p7/z, \[x23, x19, lsl 2\]
+**     ptrue   p6\.b, all
+**     mov     p0\.b, p7\.b
+**     mov     z1\.d, z23\.d
+**     uxtb    z23\.h, p6/m, z23\.h
+**     bl      _ZGVsMxvv_fn3
+**     uxtb    z0\.h, p6/m, z0\.h
+**     add     z0\.h, z0\.h, z23\.h
+**     uxth    z0\.s, p6/m, z0\.s
+**     st1w    z0\.s, p7, \[x20, x19, lsl 2\]
+** ...
+*/
+extern char __attribute__ ((simd, const)) fn3 (int, char);
+void test_fn3 (int *a, int *b, char *c, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
+}
+
+/*
+** test_fn4:
+** ...
+**     ld1h    z23\.s, p7/z, \[x23, x19, lsl 1\]
+**     ld1w    z0\.s, p7/z, \[x22, x19, lsl 2\]
+**     mov     p0\.b, p7\.b
+**     mov     z1\.d, z23\.d
+**     ptrue   p6\.b, all
+**     bl      _ZGVsMxvv_fn4
+**     sxth    z23\.s, p6/m, z23\.s
+**     sxth    z0\.s, p6/m, z0\.s
+**     add     z0\.s, z0\.s, z23\.s
+**     st1w    z0\.s, p7, \[x21, x19, lsl 2\]
+** ...
+*/
+extern short __attribute__ ((simd, const)) fn4 (int, short);
+void test_fn4 (int *a, int *b, short *c, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] = (int) (fn4 (b[i], c[i]) + c[i]);
+}
+
+#pragma GCC reset_options
+#pragma GCC target ("+simd")
+/* scan-assembler {_ZGVnN4vv_fn4} } } */
+extern short __attribute__ ((simd, const)) fn4 (int, short);
+void test_fn5 (int *a, int *b, short *c, int n)
+{
+  for (int i = 0; i < n; ++i)
+    a[i] = (int) (fn4 (b[i], c[i]) + c[i]);
+}
-- 
2.34.1

Reply via email to