Mailing list got lost somewhere, Archiving OK.
> -----Original Message----- > From: Richard Sandiford <richard.sandif...@arm.com> > Sent: Friday, October 29, 2021 4:52 PM > To: Tamar Christina <tamar.christ...@arm.com> > Cc: jeffreya...@gmail.com; rguent...@suse.de; nd <n...@arm.com> > Subject: Re: [PATCH 1/2] middle-end Teach CSE to be able to do vector > extracts. > > Sorry for the slow review. > > Tamar Christina <tamar.christ...@arm.com> writes: > > [this time with patch] > > > > Hi all, > > > > This is a new version which has the rewrite Richard S requested And > > also handles when lowpart_subreg fails. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu > > and no issues. > > > > Ok for master? > > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > * cse.c (add_to_set): New. > > (find_sets_in_insn): Register constants in sets. > > (canonicalize_insn): Use auto_vec instead. > > (cse_insn): Try materializing using vec_dup. > > * rtl.h (simplify_context::simplify_gen_vec_select, > > simplify_gen_vec_select): New. > > * simplify-rtx.c (simplify_context::simplify_gen_vec_select): New. > > > > --- inline copy of patch --- > > > > diff --git a/gcc/cse.c b/gcc/cse.c > > index > > > 4c3988ee430e99cff74c32cdf9b6382505edd415..2c0442484117317e553c92f48fa > c > > 24a0b55063bd 100644 > > --- a/gcc/cse.c > > +++ b/gcc/cse.c > > @@ -44,6 +44,7 @@ along with GCC; see the file COPYING3. If not see > > #include "regs.h" > > #include "function-abi.h" > > #include "rtlanal.h" > > +#include "expr.h" > > > > /* The basic idea of common subexpression elimination is to go > > through the code, keeping a record of expressions that would @@ > > -4240,13 +4241,23 @@ try_back_substitute_reg (rtx set, rtx_insn *insn) > > } > > } > > > > > > + > > Seems like excessive whitespace. > > > +/* Add an entry containing RTL X into SETS. */ static inline void > > +add_to_set (vec<struct set> *sets, rtx x) { > > + struct set entry = {}; > > + entry.rtl = x; > > + sets->safe_push (entry); > > +} > > + > > /* Record all the SETs in this instruction into SETS_PTR, > > and return the number of recorded sets. */ static int > > -find_sets_in_insn (rtx_insn *insn, struct set **psets) > > +find_sets_in_insn (rtx_insn *insn, vec<struct set> *psets) > > { > > - struct set *sets = *psets; > > - int n_sets = 0; > > + vec<struct set> sets = *psets; > > Is this needed? It looks like you convert all uses to pset (which is good). > > > + > > rtx x = PATTERN (insn); > > > > if (GET_CODE (x) == SET) > > @@ -4266,8 +4277,25 @@ find_sets_in_insn (rtx_insn *insn, struct set > **psets) > > someplace else, so it isn't worth cse'ing. */ > > else if (GET_CODE (SET_SRC (x)) == CALL) > > ; > > + else if (GET_CODE (SET_SRC (x)) == CONST_VECTOR > > + && GET_MODE_CLASS (GET_MODE (SET_SRC (x))) != > MODE_VECTOR_BOOL) > > + { > > + /* First register the vector itself. */ > > + add_to_set (psets, x); > > + rtx src = SET_SRC (x); > > + /* Go over the constants of the CONST_VECTOR in forward order, to > > + put them in the same order in the SETS array. */ > > + for (unsigned i = 0; i < const_vector_encoded_nelts (src) ; i++) > > + { > > + /* These are templates and don't actually get emitted but are > > + used to tell CSE how to get to a particular constant. */ > > + rtx y = simplify_gen_vec_select (SET_DEST (x), i); > > + gcc_assert (y); > > + add_to_set (psets, gen_rtx_SET (y, CONST_VECTOR_ELT > > + (src, i))); > > For the record: it looks like everything that uses set::rtl only really cares > about the SET_DEST & SET_SRC individually, so in principle we could save > creating some garbage SETs by splitting it into dest and src fields. I don't > think > that's important enough to be a requirement though. > > > + } > > + } > > else > > - sets[n_sets++].rtl = x; > > + add_to_set (psets, x); > > } > > else if (GET_CODE (x) == PARALLEL) > > { > > @@ -4288,12 +4316,12 @@ find_sets_in_insn (rtx_insn *insn, struct set > **psets) > > else if (GET_CODE (SET_SRC (y)) == CALL) > > ; > > else > > - sets[n_sets++].rtl = y; > > + add_to_set (psets, y); > > } > > } > > } > > > > - return n_sets; > > + return sets.length (); > > } > > > > > > /* Subroutine of canonicalize_insn. X is an ASM_OPERANDS in INSN. > > */ @@ -4341,9 +4369,10 @@ canon_asm_operands (rtx x, rtx_insn *insn) > > see canon_reg. */ > > > > static void > > -canonicalize_insn (rtx_insn *insn, struct set **psets, int n_sets) > > +canonicalize_insn (rtx_insn *insn, vec<struct set> *psets) > > { > > - struct set *sets = *psets; > > + vec<struct set> sets = *psets; > > + int n_sets = sets.length (); > > rtx tem; > > rtx x = PATTERN (insn); > > int i; > > @@ -4502,13 +4531,6 @@ cse_insn (rtx_insn *insn) > > int src_eqv_in_memory = 0; > > unsigned src_eqv_hash = 0; > > > > - struct set *sets = (struct set *) 0; > > - > > - if (GET_CODE (x) == SET) > > - sets = XALLOCA (struct set); > > - else if (GET_CODE (x) == PARALLEL) > > - sets = XALLOCAVEC (struct set, XVECLEN (x, 0)); > > - > > this_insn = insn; > > > > /* Find all regs explicitly clobbered in this insn, @@ -4517,10 > > +4539,11 @@ cse_insn (rtx_insn *insn) > > invalidate_from_sets_and_clobbers (insn); > > > > /* Record all the SETs in this instruction. */ > > - n_sets = find_sets_in_insn (insn, &sets); > > + auto_vec<struct set, 8> sets; > > + n_sets = find_sets_in_insn (insn, (vec<struct set>*)&sets); > > > > /* Substitute the canonical register where possible. */ > > - canonicalize_insn (insn, &sets, n_sets); > > + canonicalize_insn (insn, (vec<struct set>*)&sets); > > > > /* If this insn has a REG_EQUAL note, store the equivalent value in > SRC_EQV, > > if different, or if the DEST is a STRICT_LOW_PART/ZERO_EXTRACT. > > The @@ -4986,6 +5009,27 @@ cse_insn (rtx_insn *insn) > > src_related_is_const_anchor = src_related != NULL_RTX; > > } > > > > + /* Try to re-materialize a vec_dup with an existing constant. */ > > + rtx src_elt; > > + if ((!src_eqv_here || CONSTANT_P (src_eqv_here)) > > + && const_vec_duplicate_p (src, &src_elt)) > > + { > > + machine_mode const_mode = GET_MODE_INNER (GET_MODE > (src)); > > + struct table_elt *related_elt > > + = lookup (src_elt, HASH (src_elt, const_mode), const_mode); > > + if (related_elt) > > + { > > + for (related_elt = related_elt->first_same_value; > > + related_elt; related_elt = related_elt->next_same_value) > > + if (REG_P (related_elt->exp)) > > + { > > + src_eqv_here > > + = gen_rtx_VEC_DUPLICATE (GET_MODE (src), > > + related_elt->exp); > > I wondered whether we should compare the cost of the new vec_duplicate > against any existing constant src_eqv_here, but I guess that isn't necessary > because of: > > if (src_const == 0 > && (CONSTANT_P (src_folded) > /* Consider (minus (label_ref L1) (label_ref L2)) as > "constant" here so we will record it. This allows us > to fold switch statements when an ADDR_DIFF_VEC is used. > */ > || (GET_CODE (src_folded) == MINUS > && GET_CODE (XEXP (src_folded, 0)) == LABEL_REF > && GET_CODE (XEXP (src_folded, 1)) == LABEL_REF))) > src_const = src_folded, src_const_elt = elt; > else if (src_const == 0 && src_eqv_here && CONSTANT_P (src_eqv_here)) > src_const = src_eqv_here, src_const_elt = src_eqv_elt; > > Specifically, if src_folded is nonnull and src_eqv_here is a CONST_VECTOR. it > ought to be the same constant as src_folded, since (unlike symbolic > constants) each constant should have a unique representation. Perhaps > that's worth a comment: > > /* We don't need to compare costs with an existing (constant) > src_eqv_here, since any such src_eqv_here should already be > available in src_const. */ > > Maybe obvious, but still. > > OK with those changes (except the set::rtl one of course). > > Thanks, > Richard > > > + break; > > + } > > + } > > + } > > > > if (src == src_folded) > > src_folded = 0; > > diff --git a/gcc/rtl.h b/gcc/rtl.h > > index > > > 5473cc9f2ddf1863191a3e2b5914ae89598e53b4..6a6de1cf11d0bf5dd38f5ee37 > 901 > > 310d653ce722 100644 > > --- a/gcc/rtl.h > > +++ b/gcc/rtl.h > > @@ -3425,6 +3425,7 @@ public: > > rtx, rtx, rtx); > > rtx simplify_gen_relational (rtx_code, machine_mode, machine_mode, > rtx, rtx); > > rtx simplify_gen_subreg (machine_mode, rtx, machine_mode, > > poly_uint64); > > + rtx simplify_gen_vec_select (rtx, unsigned int); > > > > /* Tracks the level of MEM nesting for the value being simplified: > > 0 means the value is not in a MEM, >0 means it is. This is > > needed @@ -3526,6 +3527,12 @@ simplify_gen_subreg (machine_mode > outermode, rtx op, machine_mode innermode, > > innermode, byte); } > > > > +inline rtx > > +simplify_gen_vec_select (rtx op, unsigned int index) { > > + return simplify_context ().simplify_gen_vec_select (op, index); } > > + > > inline rtx > > lowpart_subreg (machine_mode outermode, rtx op, machine_mode > > innermode) { diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c > > index > > > bbbd6b74942491e9861ce2928f01b9aa7eeab031..19d899e6321e169d9ef92949 > 9e05 > > 7be0b73c2502 100644 > > --- a/gcc/simplify-rtx.c > > +++ b/gcc/simplify-rtx.c > > @@ -7589,6 +7589,28 @@ simplify_context::lowpart_subreg > (machine_mode outer_mode, rtx expr, > > subreg_lowpart_offset (outer_mode, > > inner_mode)); } > > > > +/* Generate RTX to select element at INDEX out of vector OP. */ > > + > > +rtx simplify_context::simplify_gen_vec_select (rtx op, unsigned int > > +index) { > > + > > + if (!VECTOR_MODE_P (GET_MODE (op))) > > + return NULL_RTX; > > + > > + machine_mode imode = GET_MODE_INNER (GET_MODE (op)); > > + > > + if (index == 0) > > + { > > + rtx res = lowpart_subreg (imode, op, GET_MODE (op)); > > + if (res) > > + return res; > > + } > > + > > + rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT > > +(index))); > > + return gen_rtx_VEC_SELECT (imode, op, tmp); } > > + > > + > > /* Simplify X, an rtx expression. > > > > Return the simplified expression or NULL if no simplifications