On Thu, 7 May 2020, Richard Sandiford wrote: > Richard Biener <rguent...@suse.de> writes: > > This implements patterns combining vector element insertion of > > vector element extraction to a VEC_PERM_EXPR of both vectors > > when supported. Plus it adds the more generic identity transform > > of inserting a piece of itself at the same position. > > > > Richard - is there anything I can do to make this SVE aware? > > I'd need to construct an identity permute and "insert" into > > that permute that element from the other (or same) vector. > > I suppose for most element positions that won't work but > > at least inserting at [0] should? I'm mostly struggling > > on how to use vec_perm_builder here when nelts is not constant, > > since it's derived from vec<> can I simply start with > > a single pattern with 1 stride and then insert by using []? > > I guess for SVE we still want to know that the range is safe > for all VL, so after dropping the is_constant check, we'd > want something like: > > { > poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); > unsigned int min_nelts = constant_lower_bound (nelts); > } > (if (... > && at + elemsz <= min_nelts) > > In theory (hah) it should then just be a case of changing the > vec_perm_builder constructor to: > > vec_perm_builder sel (nelts, min_nelts, 3); > > and then iterating over min_nelts * 3 instead of nelts here: > > > + for (unsigned i = 0; i < nelts; ++i) > > + sel.quick_push (i / elemsz == at > > + ? nelts + elem * elemsz + i % elemsz : i); > > So as far as the encoding goes, the first min_nelts elements are arbitrary > values, and the following two min_nelts sequences form individual linear > series.
OK - not sure why we need exactly three nelts per pattern here. It also looks like all the constant_multiple_p () checks constrain things quite a bit. Oh, and does a BIT_FIELD_REF with poly-int position extract multiple elements in the end?! For the case where we are extracting a sub-vector and thus elemsz != 1 we constrain it so that this sub-vector is not of variable size (err, not "independently" so, whatever that means..)? My brain hurts... how do you write a GIMPLE testcase for aarch64 SVE covering such cases? > This ought to work for both SVE and non-SVE, although obviously > there's a bit of wasted work for non-SVE. > > (And thanks for asking :-)) So like this, it seems to still work on the x86 testcases? Thanks, Richard. This implements patterns combining a vector element insertion of a vector element extraction into a VEC_PERM_EXPR of both vectors when supported. Plus it adds the more generic identity transform of inserting a piece of itself at the same position. 2020-05-07 Richard Biener <rguent...@suse.de> PR tree-optimization/94864 PR tree-optimization/94865 * match.pd ((bit_insert @0 (BIT_FIELD_REF @0 ...) ...) -> @0): New simplification. ((bit_insert @0 (BIT_FIELD_REF @1 ...) ...) -> (vec_perm @0 @1 ...)): Likewise. * gcc.dg/tree-ssa/forwprop-39.c: New testcase. * gcc.dg/tree-ssa/forwprop-40.c: Likewise. 
--- gcc/match.pd | 41 +++++++++++++++++++++++++++++ gcc/testsuite/gcc.dg/tree-ssa/forwprop-39.c | 30 +++++++++++++++++++++ gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c | 18 +++++++++++++ 3 files changed, 89 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-39.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c diff --git a/gcc/match.pd b/gcc/match.pd index 9259dd4ddaa..28d3bc459a7 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -5838,6 +5838,47 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) wi::to_wide (@ipos) + isize)) (BIT_FIELD_REF @0 @rsize @rpos))))) +/* Optimize a bit-insertion of a bit-extraction from the same object, + position and size. */ +(simplify + (bit_insert @0 (BIT_FIELD_REF@1 @0 @size @pos) @pos) + @0) + +/* Optimize an element insertion into a vector that is extracted from + another vector to a permutation of both vectors. */ +(simplify + (bit_insert @0 (BIT_FIELD_REF@1 @2 @size @rpos) @ipos) + (if (VECTOR_TYPE_P (type) + && types_match (type, TREE_TYPE (@2)) + && single_use (@1)) + (with + { + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); + unsigned int min_nelts = constant_lower_bound (nelts); + unsigned int elem_idx, at, n_elem; + unsigned HOST_WIDE_INT elem_sz + = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (type))); + } + (if (constant_multiple_p (tree_to_poly_uint64 (@rpos), + tree_to_poly_uint64 (@size), &elem_idx) + && constant_multiple_p (tree_to_poly_uint64 (@ipos), + tree_to_poly_uint64 (@size), &at) + && constant_multiple_p (tree_to_poly_uint64 (@size), elem_sz, &n_elem) + && at + n_elem <= min_nelts) + (with + { + vec_perm_builder sel (nelts, min_nelts, 3); + for (unsigned i = 0; i < 3 * min_nelts; ++i) + sel.quick_push (i / n_elem == at + ? nelts + elem_idx * n_elem + i % n_elem : i); + vec_perm_indices indices (sel, @0 == @2 ? 
1 : 2, nelts); + } + (if (can_vec_perm_const_p (TYPE_MODE (type), indices)) + (vec_perm @0 @2 { vec_perm_indices_to_tree + (build_vector_type + (build_nonstandard_integer_type + (elem_sz, 1), nelts), indices); }))))))) + (if (canonicalize_math_after_vectorization_p ()) (for fmas (FMA) (simplify diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-39.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-39.c new file mode 100644 index 00000000000..d1aefa9ee60 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-39.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fdump-tree-forwprop1 -fdump-tree-cddce1 -Wno-psabi -w" } */ + +typedef double v2df __attribute__((vector_size(16))); + +/* PR94865 */ +v2df move_sd1(v2df a, v2df b) +{ + v2df result = a; + result[1] = b[1]; + return result; +} + +/* PR94864 */ +v2df move_sd2(v2df a, v2df b) +{ + v2df result = a; + result[0] = b[1]; + return result; +} + +v2df move_nnop(v2df a) +{ + v2df result = a; + result[1] = a[1]; + return result; +} + +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "forwprop1" } } */ +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "cddce1" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c new file mode 100644 index 00000000000..94329437f0d --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fgimple -fdump-tree-forwprop1 -w -Wno-psabi" } */ + +typedef int v4si __attribute__((vector_size(16))); +typedef int v2si __attribute__((vector_size(8))); + +v4si __GIMPLE(ssa) bar (v4si a) +{ + v2si el; + v4si res; + + __BB(2): + el_2 = __BIT_FIELD_REF <v2si> (a_1(D), 64u, 64u); + res_3 = __BIT_INSERT (a_1(D), el_2, 0u); + return res_3; +} + +/* { dg-final { scan-tree-dump "VEC_PERM" "forwprop1" } } */ -- 2.13.7