On Mon, 27 Dec 2021 at 15:54, Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> wrote:
>
> On Fri, 17 Dec 2021 at 17:03, Richard Sandiford
> <richard.sandif...@arm.com> wrote:
> >
> > Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
> > > Hi,
> > > The patch folds:
> > > lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
> > > into:
> > > lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ... }>
> > > and expands above vec_perm_expr using aarch64_expand_sve_dupq.
> > >
> > > With patch, for following test:
> > > #include <arm_sve.h>
> > > #include <arm_neon.h>
> > >
> > > svint32_t
> > > foo (int32x4_t x)
> > > {
> > >   return svld1rq (svptrue_b8 (), &x[0]);
> > > }
> > >
> > > it generates following code:
> > > foo:
> > > .LFB4350:
> > >         dup     z0.q, z0.q[0]
> > >         ret
> > >
> > > and passes bootstrap+test on aarch64-linux-gnu.
> > > But I am not sure if the changes to aarch64_evpc_sve_tbl
> > > are correct.
> >
> > Just in case: I was only using int32x4_t in the PR as an example.
> > The same thing should work for all element types.
> >
> > >
> > > Thanks,
> > > Prathamesh
> > >
> > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > index 02e42a71e5e..e21bbec360c 100644
> > > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > @@ -1207,6 +1207,56 @@ public:
> > >      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> > >      return e.use_contiguous_load_insn (icode);
> > >    }
> > > +
> > > +  gimple *
> > > +  fold (gimple_folder &f) const OVERRIDE
> > > +  {
> > > +    tree arg0 = gimple_call_arg (f.call, 0);
> > > +    tree arg1 = gimple_call_arg (f.call, 1);
> > > +
> > > +    /* Transform:
> > > +       lhs = svld1rq ({-1, -1, ... }, &v[0])
> > > +       into:
> > > +       lhs = vec_perm_expr<v, v, {0, 1, 2, 3, ...}>.
> > > +       on little endian target. */
> > > +
> > > +    if (!BYTES_BIG_ENDIAN
> > > +        && integer_all_onesp (arg0)
> > > +        && TREE_CODE (arg1) == ADDR_EXPR)
> > > +      {
> > > +        tree t = TREE_OPERAND (arg1, 0);
> > > +        if (TREE_CODE (t) == ARRAY_REF)
> > > +          {
> > > +            tree index = TREE_OPERAND (t, 1);
> > > +            t = TREE_OPERAND (t, 0);
> > > +            if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
> > > +              {
> > > +                t = TREE_OPERAND (t, 0);
> > > +                tree vectype = TREE_TYPE (t);
> > > +                if (VECTOR_TYPE_P (vectype)
> > > +                    && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
> > > +                    && wi::to_wide (TYPE_SIZE (vectype)) == 128)
> > > +                  {
> >
> > Since this is quite a specific pattern match, and since we now lower
> > arm_neon.h vld1* to normal gimple accesses, I think we should try the
> > “more generally” approach mentioned in the PR and see what the fallout
> > is. That is, keep:
> >
> >   if (!BYTES_BIG_ENDIAN
> >       && integer_all_onesp (arg0)
> >
> > If those conditions pass, create an Advanced SIMD access at address arg1,
> > using similar code to the handling of:
> >
> >   BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
> >   BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
> >   BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
> >
> > in aarch64_general_gimple_fold_builtin. (Would be good to move the
> > common code to aarch64.c so that both files can use it.)
> >
> > > +                    tree lhs = gimple_call_lhs (f.call);
> > > +                    tree lhs_type = TREE_TYPE (lhs);
> > > +                    int source_nelts = TYPE_VECTOR_SUBPARTS
> > > (vectype).to_constant ();
> > > +                    vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type),
> > > source_nelts, 1);
> > > +                    for (int i = 0; i < source_nelts; i++)
> > > +                      sel.quick_push (i);
> > > +
> > > +                    vec_perm_indices indices (sel, 1, source_nelts);
> > > +                    if (!can_vec_perm_const_p (TYPE_MODE (lhs_type),
> > > indices))
> > > +                      return NULL;
> >
> > I don't think we need to check this: it should always be true.
> > Probably worth keeping as a gcc_checking_assert though.
> >
> > > +
> > > +                    tree mask = vec_perm_indices_to_tree (lhs_type,
> > > indices);
> > > +                    return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t,
> > > mask);
> > > +                  }
> > > +              }
> > > +          }
> > > +      }
> > > +
> > > +    return NULL;
> > > +  }
> > > };
> > >
> > > class svld1ro_impl : public load_replicate
> > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > > index f07330cff4f..af27f550be3 100644
> > > --- a/gcc/config/aarch64/aarch64.c
> > > +++ b/gcc/config/aarch64/aarch64.c
> > > @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d
> > > *d)
> > >
> > >   machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
> > >   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
> > > +
> > >   if (d->one_vector_p)
> > > -    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode,
> > > sel));
> > > +    {
> > > +      bool use_dupq = false;
> > > +      /* Check if sel is dup vector with encoded elements {0, 1, 2, ...
> > > nelts} */
> > > +      if (GET_CODE (sel) == CONST_VECTOR
> > > +          && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
> > > +          && CONST_VECTOR_DUPLICATE_P (sel))
> > > +        {
> > > +          unsigned nelts = const_vector_encoded_nelts (sel);
> > > +          unsigned i;
> > > +          for (i = 0; i < nelts; i++)
> > > +            {
> > > +              rtx elem = CONST_VECTOR_ENCODED_ELT(sel, i);
> > > +              if (!(CONST_INT_P (elem) && INTVAL(elem) == i))
> > > +                break;
> > > +            }
> > > +          if (i == nelts)
> > > +            use_dupq = true;
> > > +        }
> > > +
> > > +      if (use_dupq)
> > > +        aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> > > +      else
> > > +        emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode,
> > > sel));
> > > +    }
> >
> > This shouldn't be a TBL but a new operation, handled by its own
> > aarch64_evpc_sve_* routine. The check for the mask should then
> > be done on d->perm, to detect whether the permutation is one
> > that the new routine supports.
> >
> > I think the requirements are:
> >
> > - !BYTES_BIG_ENDIAN
> > - the source must be an Advanced SIMD vector
> > - the destination must be an SVE vector
> > - the permutation must be a duplicate (tested in the code above)
> > - the number of “patterns” in the permutation must equal the number of
> >   source elements
> > - element X of the permutation must equal X (tested in the code above)
> >
> > The existing aarch64_evpc_* routines expect the source and target modes
> > to be the same, so we should only call them when that's true.
> Hi Richard,
> Thanks for the suggestions, and sorry for late reply.
> Does the following patch look OK (sans the refactoring of building mem_ref) ?
> Passes bootstrap+test on aarch64-linux-gnu.

Hi Richard,
Since stage-1 has reopened, does the attached patch look OK to commit ?
Thanks,
Prathamesh

>
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Richard
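
(For reference, this is roughly the transformation the fold in the attached
patch performs, shown on the f2 function from the new testcase. The SSA
names and the exact dump syntax below are illustrative rather than an actual
-fdump-tree dump. On little endian, the call

    lhs_1 = svld1rq_s32 ({ -1, -1, ... }, x_2(D));

becomes an Advanced SIMD load followed by a variable-length permute:

    tmp_3 = MEM <int32x4_t> [(int * {ref-all}) x_2(D)];
    lhs_1 = VEC_PERM_EXPR <tmp_3, tmp_3, { 0, 1, 2, 3, ... }>;

The VEC_PERM_EXPR is then matched by the new aarch64_evpc_sve_dup routine
and expanded via aarch64_expand_sve_dupq, which is what produces the
dup z0.q, z0.q[0] that the testcase scans for.)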
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index c21476d7ae9..cfcd9117ce3 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -47,6 +47,7 @@ #include "stringpool.h" #include "attribs.h" #include "gimple-fold.h" +#include "aarch64-builtins.h" #define v8qi_UP E_V8QImode #define v8di_UP E_V8DImode @@ -128,46 +129,6 @@ #define SIMD_MAX_BUILTIN_ARGS 5 -enum aarch64_type_qualifiers -{ - /* T foo. */ - qualifier_none = 0x0, - /* unsigned T foo. */ - qualifier_unsigned = 0x1, /* 1 << 0 */ - /* const T foo. */ - qualifier_const = 0x2, /* 1 << 1 */ - /* T *foo. */ - qualifier_pointer = 0x4, /* 1 << 2 */ - /* Used when expanding arguments if an operand could - be an immediate. */ - qualifier_immediate = 0x8, /* 1 << 3 */ - qualifier_maybe_immediate = 0x10, /* 1 << 4 */ - /* void foo (...). */ - qualifier_void = 0x20, /* 1 << 5 */ - /* Some patterns may have internal operands, this qualifier is an - instruction to the initialisation code to skip this operand. */ - qualifier_internal = 0x40, /* 1 << 6 */ - /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum - rather than using the type of the operand. */ - qualifier_map_mode = 0x80, /* 1 << 7 */ - /* qualifier_pointer | qualifier_map_mode */ - qualifier_pointer_map_mode = 0x84, - /* qualifier_const | qualifier_pointer | qualifier_map_mode */ - qualifier_const_pointer_map_mode = 0x86, - /* Polynomial types. */ - qualifier_poly = 0x100, - /* Lane indices - must be in range, and flipped for bigendian. */ - qualifier_lane_index = 0x200, - /* Lane indices for single lane structure loads and stores. */ - qualifier_struct_load_store_lane_index = 0x400, - /* Lane indices selected in pairs. - must be in range, and flipped for - bigendian. */ - qualifier_lane_pair_index = 0x800, - /* Lane indices selected in quadtuplets. - must be in range, and flipped for - bigendian. */ - qualifier_lane_quadtup_index = 0x1000, -}; - /* Flags that describe what a function might do. */ const unsigned int FLAG_NONE = 0U; const unsigned int FLAG_READ_FPCR = 1U << 0; @@ -671,44 +632,6 @@ const char *aarch64_scalar_builtin_types[] = { NULL }; -#define ENTRY(E, M, Q, G) E, -enum aarch64_simd_type -{ -#include "aarch64-simd-builtin-types.def" - ARM_NEON_H_TYPES_LAST -}; -#undef ENTRY - -struct GTY(()) aarch64_simd_type_info -{ - enum aarch64_simd_type type; - - /* Internal type name. */ - const char *name; - - /* Internal type name(mangled). The mangled names conform to the - AAPCS64 (see "Procedure Call Standard for the ARM 64-bit Architecture", - Appendix A). To qualify for emission with the mangled names defined in - that document, a vector type must not only be of the correct mode but also - be of the correct internal AdvSIMD vector type (e.g. __Int8x8_t); these - types are registered by aarch64_init_simd_builtin_types (). In other - words, vector types defined in other ways e.g. via vector_size attribute - will get default mangled names. */ - const char *mangle; - - /* Internal type. */ - tree itype; - - /* Element type. */ - tree eltype; - - /* Machine mode the internal type maps to. */ - enum machine_mode mode; - - /* Qualifiers. 
*/ - enum aarch64_type_qualifiers q; -}; - #define ENTRY(E, M, Q, G) \ {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q}, static GTY(()) struct aarch64_simd_type_info aarch64_simd_types [] = { @@ -2826,6 +2749,14 @@ get_mem_type_for_load_store (unsigned int fcode) } } +/* Return aarch64_simd_type_info corresponding to TYPE. */ + +aarch64_simd_type_info +aarch64_get_simd_info_for_type (enum aarch64_simd_type type) +{ + return aarch64_simd_types[type]; +} + /* Try to fold STMT, given that it's a call to the built-in function with subcode FCODE. Return the new statement on success and null on failure. */ diff --git a/gcc/config/aarch64/aarch64-builtins.h b/gcc/config/aarch64/aarch64-builtins.h new file mode 100644 index 00000000000..4d155566dc5 --- /dev/null +++ b/gcc/config/aarch64/aarch64-builtins.h @@ -0,0 +1,101 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +#ifndef AARCH64_BUILTINS_H +#define AARCH64_BUILTINS_H + +#define ENTRY(E, M, Q, G) E, +enum aarch64_simd_type +{ +#include "aarch64-simd-builtin-types.def" + ARM_NEON_H_TYPES_LAST +}; +#undef ENTRY + +enum aarch64_type_qualifiers +{ + /* T foo. */ + qualifier_none = 0x0, + /* unsigned T foo. */ + qualifier_unsigned = 0x1, /* 1 << 0 */ + /* const T foo. */ + qualifier_const = 0x2, /* 1 << 1 */ + /* T *foo. */ + qualifier_pointer = 0x4, /* 1 << 2 */ + /* Used when expanding arguments if an operand could + be an immediate. */ + qualifier_immediate = 0x8, /* 1 << 3 */ + qualifier_maybe_immediate = 0x10, /* 1 << 4 */ + /* void foo (...). */ + qualifier_void = 0x20, /* 1 << 5 */ + /* Some patterns may have internal operands, this qualifier is an + instruction to the initialisation code to skip this operand. */ + qualifier_internal = 0x40, /* 1 << 6 */ + /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum + rather than using the type of the operand. */ + qualifier_map_mode = 0x80, /* 1 << 7 */ + /* qualifier_pointer | qualifier_map_mode */ + qualifier_pointer_map_mode = 0x84, + /* qualifier_const | qualifier_pointer | qualifier_map_mode */ + qualifier_const_pointer_map_mode = 0x86, + /* Polynomial types. */ + qualifier_poly = 0x100, + /* Lane indices - must be in range, and flipped for bigendian. */ + qualifier_lane_index = 0x200, + /* Lane indices for single lane structure loads and stores. */ + qualifier_struct_load_store_lane_index = 0x400, + /* Lane indices selected in pairs. - must be in range, and flipped for + bigendian. */ + qualifier_lane_pair_index = 0x800, + /* Lane indices selected in quadtuplets. - must be in range, and flipped for + bigendian. */ + qualifier_lane_quadtup_index = 0x1000, +}; + +struct GTY(()) aarch64_simd_type_info +{ + enum aarch64_simd_type type; + + /* Internal type name. */ + const char *name; + + /* Internal type name(mangled). 
The mangled names conform to the + AAPCS64 (see "Procedure Call Standard for the ARM 64-bit Architecture", + Appendix A). To qualify for emission with the mangled names defined in + that document, a vector type must not only be of the correct mode but also + be of the correct internal AdvSIMD vector type (e.g. __Int8x8_t); these + types are registered by aarch64_init_simd_builtin_types (). In other + words, vector types defined in other ways e.g. via vector_size attribute + will get default mangled names. */ + const char *mangle; + + /* Internal type. */ + tree itype; + + /* Element type. */ + tree eltype; + + /* Machine mode the internal type maps to. */ + enum machine_mode mode; + + /* Qualifiers. */ + enum aarch64_type_qualifiers q; +}; + +aarch64_simd_type_info aarch64_get_simd_info_for_type (enum aarch64_simd_type); + +#endif /* AARCH64_BUILTINS_H */ diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index c24c0548724..1ef4ea2087b 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -44,6 +44,14 @@ #include "aarch64-sve-builtins-shapes.h" #include "aarch64-sve-builtins-base.h" #include "aarch64-sve-builtins-functions.h" +#include "aarch64-builtins.h" +#include "gimple-ssa.h" +#include "tree-phinodes.h" +#include "tree-ssa-operands.h" +#include "ssa-iterators.h" +#include "stringpool.h" +#include "value-range.h" +#include "tree-ssanames.h" using namespace aarch64_sve; @@ -1207,6 +1215,56 @@ public: insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); return e.use_contiguous_load_insn (icode); } + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + tree arg0 = gimple_call_arg (f.call, 0); + tree arg1 = gimple_call_arg (f.call, 1); + + /* Transform: + lhs = svld1rq ({-1, -1, ... }, arg1) + into: + tmp = mem_ref<int32x4_t> [(int * {ref-all}) arg1] + lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>. + on little endian target. */ + + if (!BYTES_BIG_ENDIAN + && integer_all_onesp (arg0)) + { + tree lhs = gimple_call_lhs (f.call); + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t); + + tree elt_ptr_type + = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true); + tree zero = build_zero_cst (elt_ptr_type); + + /* Use element type alignment. 
*/ + tree access_type + = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype)); + + tree tmp = make_ssa_name_fn (cfun, access_type, 0); + gimple *mem_ref_stmt + = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, zero)); + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); + + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt); + tree vectype = TREE_TYPE (mem_ref_lhs); + tree lhs_type = TREE_TYPE (lhs); + + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1); + for (int i = 0; i < source_nelts; i++) + sel.quick_push (i); + + vec_perm_indices indices (sel, 1, source_nelts); + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), indices)); + tree mask = vec_perm_indices_to_tree (lhs_type, indices); + return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, mem_ref_lhs, mask); + } + + return NULL; + } }; class svld1ro_impl : public load_replicate diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f650abbc4ce..47810fec804 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) return true; } +/* Try to implement D using SVE dup instruction. */ + +static bool +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d) +{ + if (BYTES_BIG_ENDIAN + || d->perm.length ().is_constant () + || !d->one_vector_p + || d->target == NULL + || d->op0 == NULL + || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant () + || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ()) + return false; + + if (d->testing_p) + return true; + + int npatterns = d->perm.encoding ().npatterns (); + if (!known_eq (npatterns, GET_MODE_NUNITS (GET_MODE (d->op0)))) + return false; + + for (int i = 0; i < npatterns; i++) + if (!known_eq (d->perm[i], i)) + return false; + + aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0); + return true; +} + /* Try to implement D using SVE SEL instruction. */ static bool @@ -24129,7 +24158,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) else if (aarch64_evpc_reencode (d)) return true; if (d->vec_flags == VEC_SVE_DATA) - return aarch64_evpc_sve_tbl (d); + { + if (aarch64_evpc_sve_dup (d)) + return true; + else if (aarch64_evpc_sve_tbl (d)) + return true; + } else if (d->vec_flags == VEC_ADVSIMD) return aarch64_evpc_tbl (d); } diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c new file mode 100644 index 00000000000..35100a9e01c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include "arm_neon.h" +#include "arm_sve.h" + +svint32_t f1 (int32x4_t x) +{ + return svld1rq (svptrue_b8 (), &x[0]); +} + +svint32_t f2 (int *x) +{ + return svld1rq (svptrue_b8 (), x); +} + +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 2 { target aarch64_little_endian } } } */
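
(As an illustration of Richard's earlier point that the same thing should
work for all element types, variants along these lines could eventually be
added to the test as well; they are not part of the posted patch. Note that
the fold above currently uses Int32x4_t as the access type for the MEM_REF,
so covering other element types may first require deriving the access type
from the actual argument type:

svfloat32_t f3 (float32x4_t x)
{
  return svld1rq (svptrue_b8 (), &x[0]);
}

svuint8_t f4 (uint8x16_t x)
{
  return svld1rq (svptrue_b8 (), &x[0]);
}

Once that is generalised, each of these should fold to the same
dup z0.q, z0.q[0] pattern on little-endian targets.)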