On Mon, 27 Dec 2021 at 15:54, Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> wrote:
>
> On Fri, 17 Dec 2021 at 17:03, Richard Sandiford
> <richard.sandif...@arm.com> wrote:
> >
> > Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
> > > Hi,
> > > The patch folds:
> > > lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
> > > into:
> > > lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ... }>
> > > and expands above vec_perm_expr using aarch64_expand_sve_dupq.
> > >
> > > With patch, for following test:
> > > #include <arm_sve.h>
> > > #include <arm_neon.h>
> > >
> > > svint32_t
> > > foo (int32x4_t x)
> > > {
> > >   return svld1rq (svptrue_b8 (), &x[0]);
> > > }
> > >
> > > it generates following code:
> > > foo:
> > > .LFB4350:
> > >         dup     z0.q, z0.q[0]
> > >         ret
> > >
> > > and passes bootstrap+test on aarch64-linux-gnu.
> > > But I am not sure if the changes to aarch64_evpc_sve_tbl
> > > are correct.
> >
> > Just in case: I was only using int32x4_t in the PR as an example.
> > The same thing should work for all element types.
> >
> > >
> > > Thanks,
> > > Prathamesh
> > >
> > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > index 02e42a71e5e..e21bbec360c 100644
> > > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > @@ -1207,6 +1207,56 @@ public:
> > >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> > >      return e.use_contiguous_load_insn (icode);
> > >    }
> > > +
> > > +  gimple *
> > > +  fold (gimple_folder &f) const OVERRIDE
> > > +  {
> > > +    tree arg0 = gimple_call_arg (f.call, 0);
> > > +    tree arg1 = gimple_call_arg (f.call, 1);
> > > +
> > > +    /* Transform:
> > > +       lhs = svld1rq ({-1, -1, ... }, &v[0])
> > > +       into:
> > > +       lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ...}>.
> > > +       on little endian target. */
> > > +
> > > +    if (!BYTES_BIG_ENDIAN
> > > +        && integer_all_onesp (arg0)
> > > +        && TREE_CODE (arg1) == ADDR_EXPR)
> > > +      {
> > > +        tree t = TREE_OPERAND (arg1, 0);
> > > +        if (TREE_CODE (t) == ARRAY_REF)
> > > +          {
> > > +            tree index = TREE_OPERAND (t, 1);
> > > +            t = TREE_OPERAND (t, 0);
> > > +            if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
> > > +              {
> > > +                t = TREE_OPERAND (t, 0);
> > > +                tree vectype = TREE_TYPE (t);
> > > +                if (VECTOR_TYPE_P (vectype)
> > > +                    && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
> > > +                    && wi::to_wide (TYPE_SIZE (vectype)) == 128)
> > > +                  {
> >
> > Since this is quite a specific pattern match, and since we now lower
> > arm_neon.h vld1* to normal gimple accesses, I think we should try the
> > “more generally” approach mentioned in the PR and see what the fallout
> > is. That is, keep:
> >
> >   if (!BYTES_BIG_ENDIAN
> >       && integer_all_onesp (arg0)
> >
> > If those conditions pass, create an Advanced SIMD access at address arg1,
> > using similar code to the handling of:
> >
> >   BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
> >   BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
> >   BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
> >
> > in aarch64_general_gimple_fold_builtin. (Would be good to move the
> > common code to aarch64.c so that both files can use it.)
> >
> > > +                    tree lhs = gimple_call_lhs (f.call);
> > > +                    tree lhs_type = TREE_TYPE (lhs);
> > > +                    int source_nelts = TYPE_VECTOR_SUBPARTS
> > > (vectype).to_constant ();
> > > +                    vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type),
> > > source_nelts, 1);
> > > +                    for (int i = 0; i < source_nelts; i++)
> > > +                      sel.quick_push (i);
> > > +
> > > +                    vec_perm_indices indices (sel, 1, source_nelts);
> > > +                    if (!can_vec_perm_const_p (TYPE_MODE (lhs_type),
> > > indices))
> > > +                      return NULL;
> >
> > I don't think we need to check this: it should always be true.
> > Probably worth keeping as a gcc_checking_assert though.
> >
> > > +
> > > +                    tree mask = vec_perm_indices_to_tree (lhs_type,
> > > indices);
> > > +                    return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t,
> > > mask);
> > > +                  }
> > > +              }
> > > +          }
> > > +      }
> > > +
> > > +    return NULL;
> > > +  }
> > > };
> > >
> > > class svld1ro_impl : public load_replicate
> > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > > index f07330cff4f..af27f550be3 100644
> > > --- a/gcc/config/aarch64/aarch64.c
> > > +++ b/gcc/config/aarch64/aarch64.c
> > > @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d
> > > *d)
> > >
> > >   machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
> > >   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
> > > +
> > >   if (d->one_vector_p)
> > > -    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode,
> > > sel));
> > > +    {
> > > +      bool use_dupq = false;
> > > +      /* Check if sel is dup vector with encoded elements {0, 1, 2, ...
> > > nelts} */
> > > +      if (GET_CODE (sel) == CONST_VECTOR
> > > +          && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
> > > +          && CONST_VECTOR_DUPLICATE_P (sel))
> > > +        {
> > > +          unsigned nelts = const_vector_encoded_nelts (sel);
> > > +          unsigned i;
> > > +          for (i = 0; i < nelts; i++)
> > > +            {
> > > +              rtx elem = CONST_VECTOR_ENCODED_ELT(sel, i);
> > > +              if (!(CONST_INT_P (elem) && INTVAL(elem) == i))
> > > +                break;
> > > +            }
> > > +          if (i == nelts)
> > > +            use_dupq = true;
> > > +        }
> > > +
> > > +      if (use_dupq)
> > > +        aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> > > +      else
> > > +        emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode,
> > > sel));
> > > +    }
> >
> > This shouldn't be a TBL but a new operation, handled by its own
> > aarch64_evpc_sve_* routine. The check for the mask should then
> > be done on d->perm, to detect whether the permutation is one
> > that the new routine supports.
> >
> > I think the requirements are:
> >
> > - !BYTES_BIG_ENDIAN
> > - the source must be an Advanced SIMD vector
> > - the destination must be an SVE vector
> > - the permutation must be a duplicate (tested in the code above)
> > - the number of “patterns” in the permutation must equal the number of
> >   source elements
> > - element X of the permutation must equal X (tested in the code above)
> >
> > The existing aarch64_evpc_* routines expect the source and target modes
> > to be the same, so we should only call them when that's true.
> Hi Richard,
> Thanks for the suggestions, and sorry for late reply.
> Does the following patch look OK (sans the refactoring of building mem_ref) ?
> Passes bootstrap+test on aarch64-linux-gnu.

Hi Richard,
Since stage-1 has reopened, does the attached patch look OK to commit ?
Thanks,
Prathamesh

>
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Richard
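
(For reference, this is roughly the transformation the fold in the attached
patch performs, shown on the f2 function from the new testcase. The SSA
names and the exact dump syntax below are illustrative rather than an actual
-fdump-tree dump. On little endian, the call

    lhs_1 = svld1rq_s32 ({ -1, -1, ... }, x_2(D));

becomes an Advanced SIMD load followed by a variable-length permute:

    tmp_3 = MEM <int32x4_t> [(int * {ref-all}) x_2(D)];
    lhs_1 = VEC_PERM_EXPR <tmp_3, tmp_3, { 0, 1, 2, 3, ... }>;

The VEC_PERM_EXPR is then matched by the new aarch64_evpc_sve_dup routine
and expanded via aarch64_expand_sve_dupq, which is what produces the
dup z0.q, z0.q[0] that the testcase scans for.)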
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index c21476d7ae9..cfcd9117ce3 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -47,6 +47,7 @@ #include "stringpool.h" #include "attribs.h" #include "gimple-fold.h" +#include "aarch64-builtins.h" #define v8qi_UP E_V8QImode #define v8di_UP E_V8DImode @@ -128,46 +129,6 @@ #define SIMD_MAX_BUILTIN_ARGS 5 -enum aarch64_type_qualifiers -{ - /* T foo. */ - qualifier_none = 0x0, - /* unsigned T foo. */ - qualifier_unsigned = 0x1, /* 1 << 0 */ - /* const T foo. */ - qualifier_const = 0x2, /* 1 << 1 */ - /* T *foo. */ - qualifier_pointer = 0x4, /* 1 << 2 */ - /* Used when expanding arguments if an operand could - be an immediate. */ - qualifier_immediate = 0x8, /* 1 << 3 */ - qualifier_maybe_immediate = 0x10, /* 1 << 4 */ - /* void foo (...). */ - qualifier_void = 0x20, /* 1 << 5 */ - /* Some patterns may have internal operands, this qualifier is an - instruction to the initialisation code to skip this operand. */ - qualifier_internal = 0x40, /* 1 << 6 */ - /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum - rather than using the type of the operand. */ - qualifier_map_mode = 0x80, /* 1 << 7 */ - /* qualifier_pointer | qualifier_map_mode */ - qualifier_pointer_map_mode = 0x84, - /* qualifier_const | qualifier_pointer | qualifier_map_mode */ - qualifier_const_pointer_map_mode = 0x86, - /* Polynomial types. */ - qualifier_poly = 0x100, - /* Lane indices - must be in range, and flipped for bigendian. */ - qualifier_lane_index = 0x200, - /* Lane indices for single lane structure loads and stores. */ - qualifier_struct_load_store_lane_index = 0x400, - /* Lane indices selected in pairs. - must be in range, and flipped for - bigendian. */ - qualifier_lane_pair_index = 0x800, - /* Lane indices selected in quadtuplets. - must be in range, and flipped for - bigendian. */ - qualifier_lane_quadtup_index = 0x1000, -}; - /* Flags that describe what a function might do. */ const unsigned int FLAG_NONE = 0U; const unsigned int FLAG_READ_FPCR = 1U << 0; @@ -671,44 +632,6 @@ const char *aarch64_scalar_builtin_types[] = { NULL }; -#define ENTRY(E, M, Q, G) E, -enum aarch64_simd_type -{ -#include "aarch64-simd-builtin-types.def" - ARM_NEON_H_TYPES_LAST -}; -#undef ENTRY - -struct GTY(()) aarch64_simd_type_info -{ - enum aarch64_simd_type type; - - /* Internal type name. */ - const char *name; - - /* Internal type name(mangled). The mangled names conform to the - AAPCS64 (see "Procedure Call Standard for the ARM 64-bit Architecture", - Appendix A). To qualify for emission with the mangled names defined in - that document, a vector type must not only be of the correct mode but also - be of the correct internal AdvSIMD vector type (e.g. __Int8x8_t); these - types are registered by aarch64_init_simd_builtin_types (). In other - words, vector types defined in other ways e.g. via vector_size attribute - will get default mangled names. */ - const char *mangle; - - /* Internal type. */ - tree itype; - - /* Element type. */ - tree eltype; - - /* Machine mode the internal type maps to. */ - enum machine_mode mode; - - /* Qualifiers. 
*/ - enum aarch64_type_qualifiers q; -}; - #define ENTRY(E, M, Q, G) \ {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q}, static GTY(()) struct aarch64_simd_type_info aarch64_simd_types [] = { @@ -2826,6 +2749,14 @@ get_mem_type_for_load_store (unsigned int fcode) } } +/* Return aarch64_simd_type_info corresponding to TYPE. */ + +aarch64_simd_type_info +aarch64_get_simd_info_for_type (enum aarch64_simd_type type) +{ + return aarch64_simd_types[type]; +} + /* Try to fold STMT, given that it's a call to the built-in function with subcode FCODE. Return the new statement on success and null on failure. */ diff --git a/gcc/config/aarch64/aarch64-builtins.h b/gcc/config/aarch64/aarch64-builtins.h new file mode 100644 index 00000000000..4d155566dc5 --- /dev/null +++ b/gcc/config/aarch64/aarch64-builtins.h @@ -0,0 +1,101 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +#ifndef AARCH64_BUILTINS_H +#define AARCH64_BUILTINS_H + +#define ENTRY(E, M, Q, G) E, +enum aarch64_simd_type +{ +#include "aarch64-simd-builtin-types.def" + ARM_NEON_H_TYPES_LAST +}; +#undef ENTRY + +enum aarch64_type_qualifiers +{ + /* T foo. */ + qualifier_none = 0x0, + /* unsigned T foo. */ + qualifier_unsigned = 0x1, /* 1 << 0 */ + /* const T foo. */ + qualifier_const = 0x2, /* 1 << 1 */ + /* T *foo. */ + qualifier_pointer = 0x4, /* 1 << 2 */ + /* Used when expanding arguments if an operand could + be an immediate. */ + qualifier_immediate = 0x8, /* 1 << 3 */ + qualifier_maybe_immediate = 0x10, /* 1 << 4 */ + /* void foo (...). */ + qualifier_void = 0x20, /* 1 << 5 */ + /* Some patterns may have internal operands, this qualifier is an + instruction to the initialisation code to skip this operand. */ + qualifier_internal = 0x40, /* 1 << 6 */ + /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum + rather than using the type of the operand. */ + qualifier_map_mode = 0x80, /* 1 << 7 */ + /* qualifier_pointer | qualifier_map_mode */ + qualifier_pointer_map_mode = 0x84, + /* qualifier_const | qualifier_pointer | qualifier_map_mode */ + qualifier_const_pointer_map_mode = 0x86, + /* Polynomial types. */ + qualifier_poly = 0x100, + /* Lane indices - must be in range, and flipped for bigendian. */ + qualifier_lane_index = 0x200, + /* Lane indices for single lane structure loads and stores. */ + qualifier_struct_load_store_lane_index = 0x400, + /* Lane indices selected in pairs. - must be in range, and flipped for + bigendian. */ + qualifier_lane_pair_index = 0x800, + /* Lane indices selected in quadtuplets. - must be in range, and flipped for + bigendian. */ + qualifier_lane_quadtup_index = 0x1000, +}; + +struct GTY(()) aarch64_simd_type_info +{ + enum aarch64_simd_type type; + + /* Internal type name. */ + const char *name; + + /* Internal type name(mangled). 
The mangled names conform to the + AAPCS64 (see "Procedure Call Standard for the ARM 64-bit Architecture", + Appendix A). To qualify for emission with the mangled names defined in + that document, a vector type must not only be of the correct mode but also + be of the correct internal AdvSIMD vector type (e.g. __Int8x8_t); these + types are registered by aarch64_init_simd_builtin_types (). In other + words, vector types defined in other ways e.g. via vector_size attribute + will get default mangled names. */ + const char *mangle; + + /* Internal type. */ + tree itype; + + /* Element type. */ + tree eltype; + + /* Machine mode the internal type maps to. */ + enum machine_mode mode; + + /* Qualifiers. */ + enum aarch64_type_qualifiers q; +}; + +aarch64_simd_type_info aarch64_get_simd_info_for_type (enum aarch64_simd_type); + +#endif /* AARCH64_BUILTINS_H */ diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index c24c0548724..1ef4ea2087b 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -44,6 +44,14 @@ #include "aarch64-sve-builtins-shapes.h" #include "aarch64-sve-builtins-base.h" #include "aarch64-sve-builtins-functions.h" +#include "aarch64-builtins.h" +#include "gimple-ssa.h" +#include "tree-phinodes.h" +#include "tree-ssa-operands.h" +#include "ssa-iterators.h" +#include "stringpool.h" +#include "value-range.h" +#include "tree-ssanames.h" using namespace aarch64_sve; @@ -1207,6 +1215,56 @@ public: insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); return e.use_contiguous_load_insn (icode); } + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + tree arg0 = gimple_call_arg (f.call, 0); + tree arg1 = gimple_call_arg (f.call, 1); + + /* Transform: + lhs = svld1rq ({-1, -1, ... }, arg1) + into: + tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1] + lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>. + on little endian target. */ + + if (!BYTES_BIG_ENDIAN + && integer_all_onesp (arg0)) + { + tree lhs = gimple_call_lhs (f.call); + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t); + + tree elt_ptr_type + = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true); + tree zero = build_zero_cst (elt_ptr_type); + + /* Use element type alignment. 
*/ + tree access_type + = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype)); + + tree tmp = make_ssa_name_fn (cfun, access_type, 0); + gimple *mem_ref_stmt + = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero)); + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); + + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt); + tree vectype = TREE_TYPE (mem_ref_lhs); + tree lhs_type = TREE_TYPE (lhs); + + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1); + for (int i = 0; i < source_nelts; i++) + sel.quick_push (i); + + vec_perm_indices indices (sel, 1, source_nelts); + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices)); + tree mask = vec_perm_indices_to_tree (lhs_type, indices); + return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask); + } + + return NULL; + } }; class svld1ro_impl : public load_replicate diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f650abbc4ce..47810fec804 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) return true; } +/* Try to implement D using SVE dup instruction. */ + +static bool +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d) +{ + if (BYTES_BIG_ENDIAN + || d->perm.length ().is_constant () + || !d->one_vector_p + || d->target == NULL + || d->op0 == NULL + || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant () + || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ()) + return false; + + if (d->testing_p) + return true; + + int npatterns = d->perm.encoding ().npatterns (); + if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0)))) + return false; + + for (int i = 0; i < npatterns; i++) + if (!known_eq (d->perm[i], i)) + return false; + + aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0); + return true; +} + /* Try to implement D using SVE SEL instruction. */ static bool @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) else if (aarch64_evpc_reencode (d)) return true; if (d->vec_flags == VEC_SVE_DATA) - return aarch64_evpc_sve_tbl (d); + { + if (aarch64_evpc_sve_dup (d)) + return true; + else if (aarch64_evpc_sve_tbl (d)) + return true; + } else if (d->vec_flags == VEC_ADVSIMD) return aarch64_evpc_tbl (d); } diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c new file mode 100644 index 00000000000..35100a9e01c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include "arm_neon.h" +#include "arm_sve.h" + +svint32_t f1 (int32x4_t x) +{ + return svld1rq (svptrue_b8 (), &x[0]); +} + +svint32_t f2 (int *x) +{ + return svld1rq (svptrue_b8 (), x); +} + +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
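
(As an illustration of Richard's earlier point that the same thing should
work for all element types, variants along these lines could eventually be
added to the test as well; they are not part of the posted patch. Note that
the fold above currently uses Int32x4_t as the access type for the MEM_REF,
so covering other element types may first require deriving the access type
from the actual argument type:

svfloat32_t f3 (float32x4_t x)
{
  return svld1rq (svptrue_b8 (), &x[0]);
}

svuint8_t f4 (uint8x16_t x)
{
  return svld1rq (svptrue_b8 (), &x[0]);
}

Once that is generalised, each of these should fold to the same
dup z0.q, z0.q[0] pattern on little-endian targets.)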