Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> diff --git a/gcc/cse.c b/gcc/cse.c
> index
> 330c1e90ce05b8f95b58f24576ec93e10ec55d89..d76e01b6478e22e9dd5760b7c78cecb536d7daef
> 100644
> --- a/gcc/cse.c
> +++ b/gcc/cse.c
> @@ -44,6 +44,7 @@ along with GCC; see the file COPYING3. If not see
>  #include "regs.h"
>  #include "function-abi.h"
>  #include "rtlanal.h"
> +#include "expr.h"
>
>  /* The basic idea of common subexpression elimination is to go
>     through the code, keeping a record of expressions that would
> @@ -4274,6 +4275,25 @@ find_sets_in_insn (rtx_insn *insn, struct set **psets)
>           someplace else, so it isn't worth cse'ing. */
>        else if (GET_CODE (SET_SRC (x)) == CALL)
>          ;
> +      else if (GET_CODE (SET_SRC (x)) == CONST_VECTOR
> +               && GET_MODE_CLASS (GET_MODE (SET_SRC (x))) != MODE_VECTOR_BOOL)
> +        {
> +          /* First register the vector itself. */
> +          sets[n_sets++].rtl = x;
> +          rtx src = SET_SRC (x);
> +          machine_mode elem_mode = GET_MODE_INNER (GET_MODE (src));
> +          /* Go over the constants of the CONST_VECTOR in forward order, to
> +             put them in the same order in the SETS array. */
> +          for (unsigned i = 0; i < const_vector_encoded_nelts (src) ; i++)
> +            {
> +              /* These are templates and don't actually get emitted but are
> +                 used to tell CSE how to get to a particular constant. */
> +              rtx tmp = gen_rtx_PARALLEL (VOIDmode,
> +                                          gen_rtvec (1, GEN_INT (i)));
> +              rtx y = gen_rtx_VEC_SELECT (elem_mode, SET_DEST (x), tmp);
> +              sets[n_sets++].rtl = gen_rtx_SET (y, CONST_VECTOR_ELT (src, i));
> +            }
> +        }
As mentioned in the 2/2 thread, I think we should use subregs for
the case where they're canonical. It'd probably be worth adding a
simplify-rtx.c helper to extract one element from a vector, e.g.:

  rtx simplify_gen_vec_select (rtx op, unsigned int index);

so that this is easier to do.

Does making the loop above per-element mean that, for 128-bit Advanced
SIMD, the optimisation “only” kicks in for 64-bit element sizes?
Perhaps for other element sizes we could do “top” and “bottom” halves.
(There's obviously no need to do that as part of this work, was just
wondering.)

>        else
>          sets[n_sets++].rtl = x;
>      }
> @@ -4513,7 +4533,14 @@ cse_insn (rtx_insn *insn)
>    struct set *sets = (struct set *) 0;
>
>    if (GET_CODE (x) == SET)
> -    sets = XALLOCA (struct set);
> +    {
> +      /* For CONST_VECTOR we wants to be able to CSE the vector itself along
> with
> +         elements inside the vector if the target says it's cheap. */
> +      if (GET_CODE (SET_SRC (x)) == CONST_VECTOR)
> +        sets = XALLOCAVEC (struct set, const_vector_encoded_nelts (SET_SRC (x))
> + 1);
> +      else
> +        sets = XALLOCA (struct set);
> +    }
>    else if (GET_CODE (x) == PARALLEL)
>      sets = XALLOCAVEC (struct set, XVECLEN (x, 0));

I think this would be easier if “sets” was first converted to an
auto_vec, say auto_vec<struct set, 8>. We then wouldn't need to
predict in advance how many elements are needed.

> @@ -4997,6 +5024,26 @@ cse_insn (rtx_insn *insn)
>        src_related_is_const_anchor = src_related != NULL_RTX;
>      }
>
> +      /* Try to re-materialize a vec_dup with an existing constant. */
> +      if (GET_CODE (src) == CONST_VECTOR
> +          && const_vector_encoded_nelts (src) == 1)
> +        {
> +          rtx const_rtx = CONST_VECTOR_ELT (src, 0);

Would be simpler as:

  rtx src_elt;
  if (const_vec_duplicate_p (src, &src_elt))

I think we should also check !src_eqv_here, or perhaps:

  (!src_eqv_here || CONSTANT_P (src_eqv_here))

so that we don't override any existing reg notes, which could have
more chance of succeeding.
> +          machine_mode const_mode = GET_MODE_INNER (GET_MODE (src));
> +          struct table_elt *related_elt
> +            = lookup (const_rtx, HASH (const_rtx, const_mode), const_mode);
> +          if (related_elt)
> +            {
> +              for (related_elt = related_elt->first_same_value;
> +                   related_elt; related_elt = related_elt->next_same_value)
> +                if (REG_P (related_elt->exp))
> +                  {
> +                    src_eqv_here
> +                        = gen_rtx_VEC_DUPLICATE (GET_MODE (src),
> +                                                 related_elt->exp);
> +                  }

Other similar loops seem to break after the first match, instead of
picking the last match.

Thanks,
Richard

> +            }
> +        }
>
>    if (src == src_folded)
>      src_folded = 0;