On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote: > > ) > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > > > For vector init constructor: > > > > > > --- > > > typedef float __v4sf __attribute__ ((__vector_size__ (16))); > > > > > > __v4sf > > > foo (__v4sf x, float f) > > > { > > > __v4sf y = { f, x[1], x[2], x[3] }; > > > return y; > > > } > > > --- > > > > > > we can optimize vector init constructor with vector copy or permute > > > followed by a single scalar insert: > > > > > > __v4sf D.1912; > > > __v4sf D.1913; > > > __v4sf D.1914; > > > __v4sf y; > > > > > > x.0_1 = x; > > > D.1912 = x.0_1; > > > _2 = D.1912; > > > D.1913 = _2; > > > BIT_FIELD_REF <D.1913, 32, 0> = f; > > > y = D.1913; > > > D.1914 = y; > > > return D.1914; > > > > > > instead of > > > > > > __v4sf D.1962; > > > __v4sf y; > > > > > > _1 = BIT_FIELD_REF <x, 32, 32>; > > > _2 = BIT_FIELD_REF <x, 32, 64>; > > > _3 = BIT_FIELD_REF <x, 32, 96>; > > > y = {f, _1, _2, _3}; > > > D.1962 = y; > > > return D.1962; > > > > > > gcc/ > > > > > > PR tree-optimization/88828 > > > * gimplify.c (gimplify_init_constructor): Optimize vector init > > > constructor with vector copy or permute followed by a single > > > scalar insert. > > > > > > Doing this here does not catch things like: > > typedef float __v4sf __attribute__ ((__vector_size__ (16))); > > > > > > __v4sf > > vector_init (float f0,float f1, float f2,float f3) > > { > > __v4sf y = { f, x[1], x[2], x[3] }; > > return y; > > } > > > > __v4sf > > foo (__v4sf x, float f) > > { > > return vector_init (f, x[1], x[2], x[3]) ; > > } > > > > Here is a patch for simplify_vector_constructor to optimize vector init > constructor with vector copy or permute followed by a single scalar > insert.
That's the correct place to fix this indeed. But this doesn't work correcly: > > [hjl@gnu-cfl-2 pr88828]$ cat bar.i > typedef float __v4sf __attribute__ ((__vector_size__ (16))); > > static __v4sf > vector_init (float f0,float f1, float f2,float f3) > { > __v4sf y = { f0, f1, f2, f3 }; > return y; > } > > __v4sf > foo (__v4sf x, float f) > { > return vector_init (f, x[1], x[2], x[3]) ; > } > [hjl@gnu-cfl-2 pr88828]$ make bar.s > /export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/xgcc > -B/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/ -O2 -S > bar.i > [hjl@gnu-cfl-2 pr88828]$ cat bar.s > .file "bar.i" > .text > .p2align 4 > .globl foo > .type foo, @function > foo: > .LFB1: > .cfi_startproc > ret > .cfi_endproc > .LFE1: > .size foo, .-foo > .ident "GCC: (GNU) 9.0.1 20190303 (experimental)" > .section .note.GNU-stack,"",@progbits > [hjl@gnu-cfl-2 pr88828]$ > > Scalar insert is missing. > --- > gcc/tree-ssa-forwprop.c | 77 ++++++++++++++++++++++++++++++++++++----- > 1 file changed, 69 insertions(+), 8 deletions(-) > > diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c > index eeb6281c652..b10cfccf7b8 100644 > --- a/gcc/tree-ssa-forwprop.c > +++ b/gcc/tree-ssa-forwprop.c > @@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) > unsigned elem_size, i; > unsigned HOST_WIDE_INT nelts; > enum tree_code code, conv_code; > - constructor_elt *elt; > + constructor_elt *ce; > bool maybe_ident; > > gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR); > @@ -2027,18 +2027,41 @@ simplify_vector_constructor (gimple_stmt_iterator > *gsi) > orig[1] = NULL; > conv_code = ERROR_MARK; > maybe_ident = true; > - FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt) > + > + tree rhs_vector = NULL; > + /* The single scalar element. */ > + tree scalar_element = NULL; > + unsigned int scalar_idx = 0; > + bool insert = false; > + unsigned int nscalars = 0; > + unsigned int nvectors = 0; > + FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce) > { > tree ref, op1; > > if (i >= nelts) > return false; > > - if (TREE_CODE (elt->value) != SSA_NAME) > + if (TREE_CODE (ce->value) != SSA_NAME) > return false; > - def_stmt = get_prop_source_stmt (elt->value, false, NULL); > + def_stmt = get_prop_source_stmt (ce->value, false, NULL); > if (!def_stmt) > - return false; > + { > + if ( gimple_nop_p (SSA_NAME_DEF_STMT (ce->value))) > + { > + /* Only allow one single scalar insert. */ > + if (nscalars != 0) > + return false; > + > + nscalars = 1; > + insert = true; > + scalar_idx = i; > + scalar_element = ce->value; > + continue; > + } > + else > + return false; > + } > code = gimple_assign_rhs_code (def_stmt); > if (code == FLOAT_EXPR > || code == FIX_TRUNC_EXPR) > @@ -2046,7 +2069,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) > op1 = gimple_assign_rhs1 (def_stmt); > if (conv_code == ERROR_MARK) > { > - if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE > (elt->value))), > + if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))), > GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1))))) > return false; > conv_code = code; > @@ -2095,6 +2118,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) > elt += nelts; > if (elt != i) > maybe_ident = false; > + > + if (type == TREE_TYPE (ref)) > + { > + /* The RHS vector has the same type as LHS. */ > + if (rhs_vector == NULL) > + rhs_vector = ref; > + /* Check if all RHS vector elements come fome the same > + vector. */ > + if (rhs_vector == ref) > + nvectors++; > + } > + > sel.quick_push (elt); > } > if (i < nelts) > @@ -2113,6 +2148,12 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) > || conv_code == CALL_EXPR)) > return false; > > + /* Replace the scalar element with the vector element. */ > + if (insert > + && (TYPE_VECTOR_SUBPARTS (type).to_constant () > + == (nscalars + nvectors))) > + sel.quick_push (scalar_idx); > + > if (maybe_ident) > { > if (conv_code == ERROR_MARK) > @@ -2127,14 +2168,22 @@ simplify_vector_constructor (gimple_stmt_iterator > *gsi) > > vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts); > if (!can_vec_perm_const_p (TYPE_MODE (type), indices)) > - return false; > + { > + if (insert) > + gcc_unreachable (); > + return false; > + } > mask_type > = build_vector_type (build_nonstandard_integer_type (elem_size, 1), > nelts); > if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT > || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)), > GET_MODE_SIZE (TYPE_MODE (type)))) > - return false; > + { > + if (insert) > + gcc_unreachable (); > + return false; > + } > op2 = vec_perm_indices_to_tree (mask_type, indices); > if (!orig[1]) > orig[1] = orig[0]; > @@ -2153,6 +2202,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) > } > } > update_stmt (gsi_stmt (*gsi)); > + if (insert) > + { > + /* Generate a single scalar insert. */ > + /* FIXME: This doesn't work correctly. */ > + tree lhs = gimple_assign_lhs (stmt); > + tree bitfield = build3 (BIT_FIELD_REF, elem_type, lhs, > + bitsize_int (elem_size), > + bitsize_int (scalar_idx * elem_size)); > + gimple *new_stmt = gimple_build_assign (bitfield, scalar_element); I think you want to generate from the original _1 = { .... }; the new _2 = copy or permute to _new_ LHS SSA name _1 = BIT_INSERT_EXPR <_2, scalar_element, scalar_idx * elem_size>; > + gsi_insert_after (gsi, new_stmt, GSI_SAME_STMT); and you want to advance to the _1 = BIT_INSERT_EXPR here. The easiest way is to emit a new stmt for _2 = copy ...; and do the set_rhs with the BIT_INSERT_EXPR. > + update_stmt (gsi_stmt (*gsi)); > + } > return true; > } > > -- > 2.20.1 >